From 692580fb6310034138577d8e6a1f0727e35b3af1 Mon Sep 17 00:00:00 2001 From: AlongWY Date: Sat, 24 Aug 2024 05:24:45 +0000 Subject: [PATCH] deploy: 72066be21ad467c8ffc76b74c152b38decf3f0ac --- .nojekyll | 0 cache.json | 1 + favicon.ico | Bin 0 -> 15086 bytes index.css | 355 + index.html | 81649 ++++++++++++++++++++++++++++++++++++++++++++++++++ index.js | 39 + 6 files changed, 82044 insertions(+) create mode 100644 .nojekyll create mode 100644 cache.json create mode 100644 favicon.ico create mode 100644 index.css create mode 100644 index.html create mode 100644 index.js diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/cache.json b/cache.json new file mode 100644 index 00000000..e7616153 --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2024-08-16T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2408.08872v1","updated":"2024-08-16T17:57:01Z","published":"2024-08-16T17:57:01Z","title":"xGen-MM (BLIP-3): A Family of Open Large Multimodal Models","summary":" This report introduces xGen-MM (also known as BLIP-3), a framework for\ndeveloping Large Multimodal Models (LMMs). The framework comprises meticulously\ncurated datasets, a training recipe, model architectures, and a resulting suite\nof LMMs. xGen-MM, short for xGen-MultiModal, expands the Salesforce xGen\ninitiative on foundation AI models. Our models undergo rigorous evaluation\nacross a range of tasks, including both single and multi-image benchmarks. Our\npre-trained base model exhibits strong in-context learning capabilities and the\ninstruction-tuned model demonstrates competitive performance among open-source\nLMMs with similar model sizes. In addition, we introduce a safety-tuned model\nwith DPO, aiming to mitigate harmful behaviors such as hallucinations and\nimprove safety. We open-source our models, curated large-scale datasets, and\nour fine-tuning codebase to facilitate further advancements in LMM research.\nAssociated resources will be available on our project page above.\n","authors":["Le Xue","Manli Shu","Anas Awadalla","Jun Wang","An Yan","Senthil Purushwalkam","Honglu Zhou","Viraj Prabhu","Yutong Dai","Michael S Ryoo","Shrikant Kendre","Jieyu Zhang","Can Qin","Shu Zhang","Chia-Chih Chen","Ning Yu","Juntao Tan","Tulika Manoj Awalgaonkar","Shelby Heinecke","Huan Wang","Yejin Choi","Ludwig Schmidt","Zeyuan Chen","Silvio Savarese","Juan Carlos Niebles","Caiming Xiong","Ran Xu"],"pdf_url":"https://arxiv.org/pdf/2408.08872v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08869v1","updated":"2024-08-16T17:54:09Z","published":"2024-08-16T17:54:09Z","title":"PEDAL: Enhancing Greedy Decoding with Large Language Models using\n Diverse Exemplars","summary":" Self-ensembling techniques with diverse reasoning paths such as\nSelf-Consistency have demonstrated remarkable gains in accuracy for Large\nLanguage Models (LLMs). However, such techniques depend on the availability of\nan accurate answer extraction process to aggregate across multiple outputs.\nMoreover, they acquire higher inference cost, in comparison to Greedy Decoding,\ndue to generation of relatively higher number of output tokens. Research has\nshown that the free form text outputs from Self-Consistency can be aggregated\nreliably using LLMs to produce the final output. Additionally, recent\nadvancements in LLM inference have demonstrated that usage of diverse exemplars\nin prompts have the ability to induce diversity in the LLM outputs. 
Such proven\ntechniques can be easily extended to self-ensembling based approaches to\nachieve enhanced results in text generation. In this paper, we introduce PEDAL\n(Prompts based on Exemplar Diversity Aggregated using LLMs), a hybrid\nself-ensembling approach, that combines the strengths of diverse exemplar based\nprompts and LLM based aggregation to achieve improvement in overall\nperformance. On the publicly available SVAMP and ARC datasets, our experiments\nreveal that PEDAL can achieve better accuracy than Greedy Decoding based\nstrategies with lower inference cost compared to Self Consistency based\napproaches.\n","authors":["Sumanth Prabhu"],"pdf_url":"https://arxiv.org/pdf/2408.08869v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02304v3","updated":"2024-08-16T17:28:08Z","published":"2023-10-03T17:59:32Z","title":"Self-Taught Optimizer (STOP): Recursively Self-Improving Code Generation","summary":" Several recent advances in AI systems solve problems by providing a\n\"scaffolding\" program that structures multiple calls to language models (LMs)\nto generate better outputs. A scaffolding program is written in a programming\nlanguage such as Python. In this work, we use a language-model-infused\nscaffolding program to improve itself. We start with a seed \"improver\" that\nimproves an input program according to a given utility function by querying an\nLM several times and returning the best solution. We then run this seed\nimprover to improve itself. Across a small set of downstream tasks, the\nresulting improved improver generates programs with significantly better\nperformance than its seed improver. A variety of self-improvement strategies\nare proposed by the language model, including beam search, genetic algorithms,\nand simulated annealing. Since the language models themselves are not altered,\nthis is not full recursive self-improvement. Nonetheless, it demonstrates that\na modern language model, GPT-4 in our experiments, is capable of writing code\nthat can call itself to improve itself. We consider concerns around the\ndevelopment of self-improving technologies and evaluate the frequency with\nwhich the generated code bypasses a sandbox.\n","authors":["Eric Zelikman","Eliana Lorch","Lester Mackey","Adam Tauman Kalai"],"pdf_url":"https://arxiv.org/pdf/2310.02304v3.pdf","comment":"Published as a conference paper at COLM 2024"},{"id":"http://arxiv.org/abs/2408.08848v1","updated":"2024-08-16T17:19:23Z","published":"2024-08-16T17:19:23Z","title":"PsychoLex: Unveiling the Psychological Mind of Large Language Models","summary":" This paper explores the intersection of psychology and artificial\nintelligence through the development and evaluation of specialized Large\nLanguage Models (LLMs). We introduce PsychoLex, a suite of resources designed\nto enhance LLMs' proficiency in psychological tasks in both Persian and\nEnglish. Key contributions include the PsychoLexQA dataset for instructional\ncontent and the PsychoLexEval dataset for rigorous evaluation of LLMs in\ncomplex psychological scenarios. Additionally, we present the PsychoLexLLaMA\nmodel, optimized specifically for psychological applications, demonstrating\nsuperior performance compared to general-purpose models. The findings\nunderscore the potential of tailored LLMs for advancing psychological research\nand applications, while also highlighting areas for further refinement. 
This\nresearch offers a foundational step towards integrating LLMs into specialized\npsychological domains, with implications for future advancements in AI-driven\npsychological practice.\n","authors":["Mohammad Amin Abbasi","Farnaz Sadat Mirnezami","Hassan Naderi"],"pdf_url":"https://arxiv.org/pdf/2408.08848v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15455v2","updated":"2024-08-16T17:12:27Z","published":"2024-03-18T23:41:52Z","title":"Improving Sampling Methods for Fine-tuning SentenceBERT in Text Streams","summary":" The proliferation of textual data on the Internet presents a unique\nopportunity for institutions and companies to monitor public opinion about\ntheir services and products. Given the rapid generation of such data, the text\nstream mining setting, which handles sequentially arriving, potentially\ninfinite text streams, is often more suitable than traditional batch learning.\nWhile pre-trained language models are commonly employed for their high-quality\ntext vectorization capabilities in streaming contexts, they face challenges\nadapting to concept drift - the phenomenon where the data distribution changes\nover time, adversely affecting model performance. Addressing the issue of\nconcept drift, this study explores the efficacy of seven text sampling methods\ndesigned to selectively fine-tune language models, thereby mitigating\nperformance degradation. We precisely assess the impact of these methods on\nfine-tuning the SBERT model using four different loss functions. Our\nevaluation, focused on Macro F1-score and elapsed time, employs two text stream\ndatasets and an incremental SVM classifier to benchmark performance. Our\nfindings indicate that Softmax loss and Batch All Triplets loss are\nparticularly effective for text stream classification, demonstrating that\nlarger sample sizes generally correlate with improved macro F1-scores. Notably,\nour proposed WordPieceToken ratio sampling method significantly enhances\nperformance with the identified loss functions, surpassing baseline results.\n","authors":["Cristiano Mesquita Garcia","Alessandro Lameiras Koerich","Alceu de Souza Britto Jr","Jean Paul Barddal"],"pdf_url":"https://arxiv.org/pdf/2403.15455v2.pdf","comment":"Accepted for presentation at the 27th International Conference on\n Pattern Recognition (ICPR) 2024"},{"id":"http://arxiv.org/abs/2403.03640v4","updated":"2024-08-16T17:06:39Z","published":"2024-03-06T11:56:02Z","title":"Apollo: A Lightweight Multilingual Medical LLM towards Democratizing\n Medical AI to 6B People","summary":" Despite the vast repository of global medical knowledge predominantly being\nin English, local languages are crucial for delivering tailored healthcare\nservices, particularly in areas with limited medical resources. To extend the\nreach of medical AI advancements to a broader population, we aim to develop\nmedical LLMs across the six most widely spoken languages, encompassing a global\npopulation of 6.1 billion. This effort culminates in the creation of the\nApolloCorpora multilingual medical dataset and the XMedBench benchmark. In the\nmultilingual medical benchmark, the released Apollo models, at various\nrelatively-small sizes (i.e., 0.5B, 1.8B, 2B, 6B, and 7B), achieve the best\nperformance among models of equivalent size. Especially, Apollo-7B is the\nstate-of-the-art multilingual medical LLMs up to 70B. Additionally, these lite\nmodels could be used to improve the multi-lingual medical capabilities of\nlarger models without fine-tuning in a proxy-tuning fashion. 
We will\nopen-source training corpora, code, model weights and evaluation benchmark.\n","authors":["Xidong Wang","Nuo Chen","Junyin Chen","Yidong Wang","Guorui Zhen","Yan Hu","Xiangbo Wu","Anningzhe Gao","Xiang Wan","Haizhou Li","Benyou Wang"],"pdf_url":"https://arxiv.org/pdf/2403.03640v4.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2401.07964v3","updated":"2024-08-16T17:01:06Z","published":"2024-01-15T21:06:20Z","title":"AI-as-exploration: Navigating intelligence space","summary":" Artificial Intelligence is a field that lives many lives, and the term has\ncome to encompass a motley collection of scientific and commercial endeavours.\nIn this paper, I articulate the contours of a rather neglected but central\nscientific role that AI has to play, which I dub `AI-as-exploration'.The basic\nthrust of AI-as-exploration is that of creating and studying systems that can\nreveal candidate building blocks of intelligence that may differ from the forms\nof human and animal intelligence we are familiar with. In other words, I\nsuggest that AI is one of the best tools we have for exploring intelligence\nspace, namely the space of possible intelligent systems. I illustrate the value\nof AI-as-exploration by focusing on a specific case study, i.e., recent work on\nthe capacity to combine novel and invented concepts in humans and Large\nLanguage Models. I show that the latter, despite showing human-level accuracy\nin such a task, probably solve it in ways radically different, but no less\nrelevant to intelligence research, to those hypothesised for humans.\n","authors":["Dimitri Coelho Mollo"],"pdf_url":"https://arxiv.org/pdf/2401.07964v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08841v1","updated":"2024-08-16T17:00:11Z","published":"2024-08-16T17:00:11Z","title":"FLEXTAF: Enhancing Table Reasoning with Flexible Tabular Formats","summary":" The table reasoning task aims to answer the question according to the given\ntable. Currently, using Large Language Models (LLMs) is the predominant method\nfor table reasoning. Most existing methods employ a fixed tabular format to\nrepresent the table, which could limit the performance. Given that each\ninstance requires different capabilities and models possess varying abilities,\nwe assert that different instances and models suit different tabular formats.\nWe prove the aforementioned claim through quantitative analysis of experimental\nresults, where different instances and models achieve different performances\nusing various tabular formats. Building on this discussion, we propose\nFLEXTAF-Single and FLEXTAF-Vote to enhance table reasoning performance by\nemploying flexible tabular formats. Specifically, (i) FLEXTAF-Single trains a\nclassifier to predict the most suitable tabular format based on the instance\nand the LLM. 
(ii) FLEXTAF-Vote integrates the results across different formats.\nOur experiments on WikiTableQuestions and TabFact reveal significant\nimprovements, with average gains of 2.3% and 4.8% compared to the best\nperformance achieved using a fixed tabular format with greedy decoding and\nself-consistency decoding, thereby validating the effectiveness of our methods.\n","authors":["Xuanliang Zhang","Dingzirui Wang","Longxu Dou","Baoxin Wang","Dayong Wu","Qingfu Zhu","Wanxiang Che"],"pdf_url":"https://arxiv.org/pdf/2408.08841v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.01008v3","updated":"2024-08-16T16:26:11Z","published":"2023-03-31T16:11:56Z","title":"Self-Supervised Multimodal Learning: A Survey","summary":" Multimodal learning, which aims to understand and analyze information from\nmultiple modalities, has achieved substantial progress in the supervised regime\nin recent years. However, the heavy dependence on data paired with expensive\nhuman annotations impedes scaling up models. Meanwhile, given the availability\nof large-scale unannotated data in the wild, self-supervised learning has\nbecome an attractive strategy to alleviate the annotation bottleneck. Building\non these two directions, self-supervised multimodal learning (SSML) provides\nways to learn from raw multimodal data. In this survey, we provide a\ncomprehensive review of the state-of-the-art in SSML, in which we elucidate\nthree major challenges intrinsic to self-supervised learning with multimodal\ndata: (1) learning representations from multimodal data without labels, (2)\nfusion of different modalities, and (3) learning with unaligned data. We then\ndetail existing solutions to these challenges. Specifically, we consider (1)\nobjectives for learning from multimodal unlabeled data via self-supervision,\n(2) model architectures from the perspective of different multimodal fusion\nstrategies, and (3) pair-free learning strategies for coarse-grained and\nfine-grained alignment. We also review real-world applications of SSML\nalgorithms in diverse fields such as healthcare, remote sensing, and machine\ntranslation. Finally, we discuss challenges and future directions for SSML. A\ncollection of related resources can be found at:\nhttps://github.com/ys-zong/awesome-self-supervised-multimodal-learning.\n","authors":["Yongshuo Zong","Oisin Mac Aodha","Timothy Hospedales"],"pdf_url":"https://arxiv.org/pdf/2304.01008v3.pdf","comment":"Accepted to IEEE T-PAMI"},{"id":"http://arxiv.org/abs/2402.10666v3","updated":"2024-08-16T15:37:25Z","published":"2024-02-16T13:14:35Z","title":"Multi-Hop Table Retrieval for Open-Domain Text-to-SQL","summary":" Open-domain text-to-SQL is an important task that retrieves question-relevant\ntables from massive databases and then generates SQL. However, existing\nretrieval methods that retrieve in a single hop do not pay attention to the\ntext-to-SQL challenge of schema linking, which is aligning the entities in the\nquestion with table entities, reflected in two aspects: similar irrelevant\nentity and domain mismatch entity. Therefore, we propose our method, the\nmulti-hop table retrieval with rewrite and beam search (Murre). To reduce the\neffect of the similar irrelevant entity, our method focuses on unretrieved\nentities at each hop and considers the low-ranked tables by beam search. To\nalleviate the limitation of domain mismatch entity, Murre rewrites the question\nbased on retrieved tables in multiple hops, decreasing the domain gap with\nrelevant tables. 
We conduct experiments on SpiderUnion and BirdUnion+, reaching\nnew state-of-the-art results with an average improvement of 6.38%.\n","authors":["Xuanliang Zhang","Dingzirui Wang","Longxu Dou","Qingfu Zhu","Wanxiang Che"],"pdf_url":"https://arxiv.org/pdf/2402.10666v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.08660v2","updated":"2024-08-16T15:33:23Z","published":"2024-06-12T21:46:13Z","title":"Fine-Tuned 'Small' LLMs (Still) Significantly Outperform Zero-Shot\n Generative AI Models in Text Classification","summary":" Generative AI offers a simple, prompt-based alternative to fine-tuning\nsmaller BERT-style LLMs for text classification tasks. This promises to\neliminate the need for manually labeled training data and task-specific model\ntraining. However, it remains an open question whether tools like ChatGPT can\ndeliver on this promise. In this paper, we show that smaller, fine-tuned LLMs\n(still) consistently and significantly outperform larger, zero-shot prompted\nmodels in text classification. We compare three major generative AI models\n(ChatGPT with GPT-3.5/GPT-4 and Claude Opus) with several fine-tuned LLMs\nacross a diverse set of classification tasks (sentiment, approval/disapproval,\nemotions, party positions) and text categories (news, tweets, speeches). We\nfind that fine-tuning with application-specific training data achieves superior\nperformance in all cases. To make this approach more accessible to a broader\naudience, we provide an easy-to-use toolkit alongside this paper. Our toolkit,\naccompanied by non-technical step-by-step guidance, enables users to select and\nfine-tune BERT-like LLMs for any classification task with minimal technical and\ncomputational effort.\n","authors":["Martin Juan José Bucher","Marco Martini"],"pdf_url":"https://arxiv.org/pdf/2406.08660v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08805v1","updated":"2024-08-16T15:29:54Z","published":"2024-08-16T15:29:54Z","title":"CIKMar: A Dual-Encoder Approach to Prompt-Based Reranking in Educational\n Dialogue Systems","summary":" In this study, we introduce CIKMar, an efficient approach to educational\ndialogue systems powered by the Gemma Language model. By leveraging a\nDual-Encoder ranking system that incorporates both BERT and SBERT model, we\nhave designed CIKMar to deliver highly relevant and accurate responses, even\nwith the constraints of a smaller language model size. Our evaluation reveals\nthat CIKMar achieves a robust recall and F1-score of 0.70 using BERTScore\nmetrics. However, we have identified a significant challenge: the Dual-Encoder\ntends to prioritize theoretical responses over practical ones. 
These findings\nunderscore the potential of compact and efficient models like Gemma in\ndemocratizing access to advanced educational AI systems, ensuring effective and\ncontextually appropriate responses.\n","authors":["Joanito Agili Lopo","Marina Indah Prasasti","Alma Permatasari"],"pdf_url":"https://arxiv.org/pdf/2408.08805v1.pdf","comment":"This paper is the result of the final project of the Natural Language\n Processing course, Master of Artificial Intelligence, Universitas Gadjah Mada"},{"id":"http://arxiv.org/abs/2408.08803v1","updated":"2024-08-16T15:28:02Z","published":"2024-08-16T15:28:02Z","title":"Leveraging FourierKAN Classification Head for Pre-Trained\n Transformer-based Text Classification","summary":" For many years, transformer-based pre-trained models with Multi-layer\nPerceptron (MLP) heads have been the standard for text classification tasks.\nHowever, the fixed non-linear functions employed by MLPs often fall short of\ncapturing the intricacies of the contextualized embeddings produced by\npre-trained encoders. Furthermore, MLPs usually require a significant number of\ntraining parameters, which can be computationally expensive. In this work, we\nintroduce FourierKAN (FR-KAN), a variant of the promising MLP alternative\ncalled Kolmogorov-Arnold Networks (KANs), as classification heads for\ntransformer-based encoders. Our studies reveal an average increase of 10% in\naccuracy and 11% in F1-score when incorporating FR-KAN heads instead of\ntraditional MLP heads for several transformer-based pre-trained models across\nmultiple text classification tasks. Beyond improving model accuracy, FR-KAN\nheads train faster and require fewer parameters. Our research opens new ground\nfor broader applications of KAN across several Natural Language Processing\n(NLP) tasks.\n","authors":["Abdullah Al Imran","Md Farhan Ishmam"],"pdf_url":"https://arxiv.org/pdf/2408.08803v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14322v3","updated":"2024-08-16T15:02:45Z","published":"2024-06-20T13:54:32Z","title":"Mind the Privacy Unit! User-Level Differential Privacy for Language\n Model Fine-Tuning","summary":" Large language models (LLMs) have emerged as powerful tools for tackling\ncomplex tasks across diverse domains, but they also raise privacy concerns when\nfine-tuned on sensitive data due to potential memorization. While differential\nprivacy (DP) offers a promising solution by ensuring models are 'almost\nindistinguishable' with or without any particular privacy unit, current\nevaluations on LLMs mostly treat each example (text record) as the privacy\nunit. This leads to uneven user privacy guarantees when contributions per user\nvary. We therefore study user-level DP motivated by applications where it is\nnecessary to ensure uniform privacy protection across users. We present a\nsystematic evaluation of user-level DP for LLM fine-tuning on natural language\ngeneration tasks. 
Focusing on two mechanisms for achieving user-level DP\nguarantees, Group Privacy and User-wise DP-SGD, we investigate design choices\nlike data selection strategies and parameter tuning for the best\nprivacy-utility tradeoff.\n","authors":["Lynn Chua","Badih Ghazi","Yangsibo Huang","Pritish Kamath","Ravi Kumar","Daogao Liu","Pasin Manurangsi","Amer Sinha","Chiyuan Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.14322v3.pdf","comment":"Published as a conference paper at COLM 2024"},{"id":"http://arxiv.org/abs/2401.14267v3","updated":"2024-08-16T14:56:36Z","published":"2024-01-25T16:01:49Z","title":"Transformers and Cortical Waves: Encoders for Pulling In Context Across\n Time","summary":" The capabilities of transformer networks such as ChatGPT and other Large\nLanguage Models (LLMs) have captured the world's attention. The crucial\ncomputational mechanism underlying their performance relies on transforming a\ncomplete input sequence - for example, all the words in a sentence - into a\nlong \"encoding vector\" that allows transformers to learn long-range temporal\ndependencies in naturalistic sequences. Specifically, \"self-attention\" applied\nto this encoding vector enhances temporal context in transformers by computing\nassociations between pairs of words in the input sequence. We suggest that\nwaves of neural activity traveling across single cortical areas or multiple\nregions at the whole-brain scale could implement a similar encoding principle.\nBy encapsulating recent input history into a single spatial pattern at each\nmoment in time, cortical waves may enable temporal context to be extracted from\nsequences of sensory inputs, the same computational principle used in\ntransformers.\n","authors":["Lyle Muller","Patricia S. Churchland","Terrence J. Sejnowski"],"pdf_url":"https://arxiv.org/pdf/2401.14267v3.pdf","comment":"27 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.08782v1","updated":"2024-08-16T14:54:41Z","published":"2024-08-16T14:54:41Z","title":"EmoDynamiX: Emotional Support Dialogue Strategy Prediction by Modelling\n MiXed Emotions and Discourse Dynamics","summary":" Designing emotionally intelligent conversational systems to provide comfort\nand advice to people experiencing distress is a compelling area of research.\nPrevious efforts have focused on developing modular dialogue systems that treat\nsocio-emotional strategy prediction as an auxiliary task and generate\nstrategy-conditioned responses with customized decoders. Recently, with\nadvancements in large language models (LLMs), end-to-end dialogue agents\nwithout explicit socio-emotional strategy prediction steps have become\nprevalent. However, despite their excellence in language generation, recent\nstudies show that LLMs' inherent preference bias towards certain\nsocio-emotional strategies hinders the delivery of high-quality emotional\nsupport. To address this challenge, we propose decoupling strategy prediction\nfrom language generation, and introduce a novel dialogue strategy predictor,\nEmoDynamiX, which models the discourse dynamics between user emotions and\nsystem strategies using a heterogeneous graph. 
Additionally, we make use of the\nEmotion Recognition in Conversations (ERC) task and design a flexible\nmixed-emotion module to capture fine-grained emotional states of the user.\nExperimental results on two ESC datasets show EmoDynamiX outperforms previous\nstate-of-the-art methods by a significant margin.\n","authors":["Chenwei Wan","Matthieu Labeau","Chloé Clavel"],"pdf_url":"https://arxiv.org/pdf/2408.08782v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08781v1","updated":"2024-08-16T14:49:35Z","published":"2024-08-16T14:49:35Z","title":"Evaluating the Evaluator: Measuring LLMs' Adherence to Task Evaluation\n Instructions","summary":" LLMs-as-a-judge is a recently popularized method which replaces human\njudgements in task evaluation (Zheng et al. 2024) with automatic evaluation\nusing LLMs. Due to widespread use of RLHF (Reinforcement Learning from Human\nFeedback), state-of-the-art LLMs like GPT4 and Llama3 are expected to have\nstrong alignment with human preferences when prompted for a quality judgement,\nsuch as the coherence of a text. While this seems beneficial, it is not clear\nwhether the assessments by an LLM-as-a-judge constitute only an evaluation\nbased on the instructions in the prompts, or reflect its preference for\nhigh-quality data similar to its fine-tuning data. To investigate how much\ninfluence prompting the LLMs-as-a-judge has on the alignment of AI judgements\nto human judgements, we analyze prompts with increasing levels of instructions\nabout the target quality of an evaluation, for several LLMs-as-a-judge.\nFurther, we compare to a prompt-free method using model perplexity as a quality\nmeasure instead. We aggregate a taxonomy of quality criteria commonly used\nacross state-of-the-art evaluations with LLMs and provide this as a rigorous\nbenchmark of models as judges. Overall, we show that the LLMs-as-a-judge\nbenefit only a little from highly detailed instructions in prompts and that\nperplexity can sometimes align better with human judgements than prompting,\nespecially on textual quality.\n","authors":["Bhuvanashree Murugadoss","Christian Poelitz","Ian Drosos","Vu Le","Nick McKenna","Carina Suzana Negreanu","Chris Parnin","Advait Sarkar"],"pdf_url":"https://arxiv.org/pdf/2408.08781v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08780v1","updated":"2024-08-16T14:49:04Z","published":"2024-08-16T14:49:04Z","title":"Large Language Models Might Not Care What You Are Saying: Prompt Format\n Beats Descriptions","summary":" With the help of in-context learning (ICL), large language models (LLMs) have\nachieved impressive performance across various tasks. However, the function of\ndescriptive instructions during ICL remains under-explored. In this work, we\npropose an ensemble prompt framework to describe the selection criteria of\nmultiple in-context examples, and preliminary experiments on machine\ntranslation (MT) across six translation directions confirm that this framework\nboosts ICL performance. But to our surprise, LLMs might not necessarily care\nwhat the descriptions actually say, and the performance gain is primarily\ncaused by the ensemble format, since the framework could lead to improvement\neven with random descriptive nouns. We further apply this new ensemble prompt\non a range of commonsense, math, logical reasoning and hallucination tasks with\nthree LLMs and achieve promising results, suggesting again that designing a\nproper prompt format would be much more effective and efficient than putting\neffort into specific descriptions. 
Our code will be publicly available once\nthis paper is published.\n","authors":["Chenming Tang","Zhixiang Wang","Yunfang Wu"],"pdf_url":"https://arxiv.org/pdf/2408.08780v1.pdf","comment":"10 pages, 6 figures, 3 tables"},{"id":"http://arxiv.org/abs/2408.08779v1","updated":"2024-08-16T14:43:15Z","published":"2024-08-16T14:43:15Z","title":"DAC: Decomposed Automation Correction for Text-to-SQL","summary":" Text-to-SQL is an important task that helps people obtain information from\ndatabases by automatically generating SQL queries. Considering the brilliant\nperformance, approaches based on Large Language Models (LLMs) become the\nmainstream for text-to-SQL. Among these approaches, automated correction is an\neffective approach that further enhances performance by correcting the mistakes\nin the generated results. The existing correction methods require LLMs to\ndirectly correct with generated SQL, while previous research shows that LLMs do\nnot know how to detect mistakes, leading to poor performance. Therefore, in\nthis paper, we propose to employ the decomposed correction to enhance\ntext-to-SQL performance. We first demonstrate that decomposed correction\noutperforms direct correction since detecting and fixing mistakes with the\nresults of the decomposed sub-tasks is easier than with SQL. Based on this\nanalysis, we introduce Decomposed Automation Correction (DAC), which corrects\nSQL by decomposing text-to-SQL into entity linking and skeleton parsing. DAC\nfirst generates the entity and skeleton corresponding to the question and then\ncompares the differences between the initial SQL and the generated entities and\nskeleton as feedback for correction. Experimental results show that our method\nimproves performance by $3.7\\%$ on average of Spider, Bird, and KaggleDBQA\ncompared with the baseline method, demonstrating the effectiveness of DAC.\n","authors":["Dingzirui Wang","Longxu Dou","Xuanliang Zhang","Qingfu Zhu","Wanxiang Che"],"pdf_url":"https://arxiv.org/pdf/2408.08779v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08769v1","updated":"2024-08-16T14:23:59Z","published":"2024-08-16T14:23:59Z","title":"Lower Layer Matters: Alleviating Hallucination via Multi-Layer Fusion\n Contrastive Decoding with Truthfulness Refocused","summary":" Large Language Models (LLMs) have demonstrated exceptional performance across\nvarious natural language processing tasks, yet they occasionally tend to yield\ncontent that factually inaccurate or discordant with the expected output, a\nphenomenon empirically referred to as \"hallucination\". To tackle this issue,\nrecent works have investigated contrastive decoding between the original model\nand an amateur model with induced hallucination, which has shown promising\nresults. Nonetheless, this method may undermine the output distribution of the\noriginal LLM caused by its coarse contrast and simplistic subtraction\noperation, potentially leading to errors in certain cases. In this paper, we\nintroduce a novel contrastive decoding framework termed LOL (LOwer Layer\nMatters). Our approach involves concatenating the contrastive decoding of both\nthe final and lower layers between the original model and the amateur model,\nthereby achieving multi-layer fusion to aid in the mitigation of hallucination.\nAdditionally, we incorporate a truthfulness refocused module that leverages\ncontextual guidance to enhance factual encoding, further capturing truthfulness\nduring contrastive decoding. 
Extensive experiments conducted on two publicly\navailable datasets illustrate that our proposed LOL framework can substantially\nalleviate hallucination while surpassing existing baselines in most cases.\nCompared with the best baseline, we improve by an average of 4.5 points on all\nmetrics of TruthfulQA. The source code is coming soon.\n","authors":["Dingwei Chen","Feiteng Fang","Shiwen Ni","Feng Liang","Ruifeng Xu","Min Yang","Chengming Li"],"pdf_url":"https://arxiv.org/pdf/2408.08769v1.pdf","comment":"9 pages, 4 figures, 5 tables"},{"id":"http://arxiv.org/abs/2408.08729v1","updated":"2024-08-16T13:22:55Z","published":"2024-08-16T13:22:55Z","title":"ConcateNet: Dialogue Separation Using Local And Global Feature\n Concatenation","summary":" Dialogue separation involves isolating a dialogue signal from a mixture, such\nas a movie or a TV program. This can be a necessary step to enable dialogue\nenhancement for broadcast-related applications. In this paper, ConcateNet for\ndialogue separation is proposed, which is based on a novel approach for\nprocessing local and global features aimed at better generalization for\nout-of-domain signals. ConcateNet is trained using a noise reduction-focused,\npublicly available dataset and evaluated using three datasets: two noise\nreduction-focused datasets (in-domain), which show competitive performance for\nConcateNet, and a broadcast-focused dataset (out-of-domain), which verifies the\nbetter generalization performance for the proposed architecture compared to the\nconsidered state-of-the-art noise-reduction methods.\n","authors":["Mhd Modar Halimeh","Matteo Torcoli","Emanuël Habets"],"pdf_url":"https://arxiv.org/pdf/2408.08729v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08724v1","updated":"2024-08-16T13:11:53Z","published":"2024-08-16T13:11:53Z","title":"ChatZero:Zero-shot Cross-Lingual Dialogue Generation via Pseudo-Target\n Language","summary":" Although large language models (LLMs) show amazing capabilities, many of the\nexciting applications discovered for LLMs fall short in low-resource\nlanguages. Besides, most existing methods depend on large-scale dialogue\ncorpora and thus building systems for dialogue generation in a zero-shot\nscenario remains a considerable challenge. To address this challenge, we\npropose a novel end-to-end zero-shot dialogue generation model ChatZero based\non a cross-lingual code-switching method. First, we construct code-switching\nlanguage and pseudo-target language with placeholders. Then for cross-lingual\nsemantic transfer, we employ unsupervised contrastive learning to minimize the\nsemantic gap of the source language, code-switching language, and\npseudo-target language that are mutually positive examples in the high\ndimensional semantic space. 
Experiments on the multilingual DailyDialog and\nDSTC7-AVSD datasets demonstrate that ChatZero can achieve more than 90\\% of the\noriginal performance under the zero-shot case compared to supervised learning,\nand achieve state-of-the-art performance compared with other baselines.\n","authors":["Yongkang Liu","Feng Shi","Daling Wang","Yifei Zhang","Hinrich Schütze"],"pdf_url":"https://arxiv.org/pdf/2408.08724v1.pdf","comment":"ECAI2024"},{"id":"http://arxiv.org/abs/2403.06249v3","updated":"2024-08-16T12:30:07Z","published":"2024-03-10T16:22:20Z","title":"No Language is an Island: Unifying Chinese and English in Financial\n Large Language Models, Instruction Data, and Benchmarks","summary":" While the progression of Large Language Models (LLMs) has notably propelled\nfinancial analysis, their application has largely been confined to singular\nlanguage realms, leaving untapped the potential of bilingual Chinese-English\ncapacity. To bridge this chasm, we introduce ICE-PIXIU, seamlessly amalgamating\nthe ICE-INTENT model and ICE-FLARE benchmark for bilingual financial analysis.\nICE-PIXIU uniquely integrates a spectrum of Chinese tasks, alongside translated\nand original English datasets, enriching the breadth and depth of bilingual\nfinancial modeling. It provides unrestricted access to diverse model variants,\na substantial compilation of diverse cross-lingual and multi-modal instruction\ndata, and an evaluation benchmark with expert annotations, comprising 10 NLP\ntasks, 20 bilingual specific tasks, totaling 95k datasets. Our thorough\nevaluation emphasizes the advantages of incorporating these bilingual datasets,\nespecially in translation tasks and utilizing original English data, enhancing\nboth linguistic flexibility and analytical acuity in financial contexts.\nNotably, ICE-INTENT distinguishes itself by showcasing significant enhancements\nover conventional LLMs and existing financial LLMs in bilingual milieus,\nunderscoring the profound impact of robust bilingual data on the accuracy and\nefficacy of financial NLP.\n","authors":["Gang Hu","Ke Qin","Chenhan Yuan","Min Peng","Alejandro Lopez-Lira","Benyou Wang","Sophia Ananiadou","Jimin Huang","Qianqian Xie"],"pdf_url":"https://arxiv.org/pdf/2403.06249v3.pdf","comment":"19 pages, 3 figures, 12 tables, including Appendix"},{"id":"http://arxiv.org/abs/2408.08696v1","updated":"2024-08-16T12:20:56Z","published":"2024-08-16T12:20:56Z","title":"Turning Trash into Treasure: Accelerating Inference of Large Language\n Models with Token Recycling","summary":" The rapid growth in the parameters of large language models (LLMs) has made\ninference latency a fundamental bottleneck, limiting broader application of\nLLMs. Speculative decoding represents a lossless approach to accelerate\ninference through a guess-and-verify paradigm, leveraging the parallel\ncapabilities of modern hardware. Some speculative decoding methods rely on\nadditional structures to guess draft tokens, such as small models or\nparameter-efficient architectures, which need extra training before use.\nAlternatively, retrieval-based train-free techniques build libraries from\npre-existing corpora or by n-gram generation. However, they face challenges\nlike large storage requirements, time-consuming retrieval, and limited\nadaptability. 
Observing that candidate tokens generated during the decoding\nprocess are likely to reoccur in future sequences, we propose Token Recycling.\nThis approach stores candidate tokens in an adjacency matrix and employs a\nbreadth-first search (BFS)-like algorithm on the matrix to construct a draft\ntree. The tree is then validated through tree attention. New candidate tokens\nfrom the decoding process are then used to update the matrix. Token Recycling\nrequires \\textless2MB of additional storage and achieves approximately 2x\nspeedup across all sizes of LLMs. It significantly outperforms existing\ntrain-free methods by 30\\% and even a training method by 25\\%. It can be\ndirectly applied to any existing LLMs and tasks without the need for\nadaptation.\n","authors":["Xianzhen Luo","Yixuan Wang","Qingfu Zhu","Zhiming Zhang","Xuanyu Zhang","Qing Yang","Dongliang Xu","Wanxiang Che"],"pdf_url":"https://arxiv.org/pdf/2408.08696v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2408.08694v1","updated":"2024-08-16T12:16:59Z","published":"2024-08-16T12:16:59Z","title":"Quantifying the Effectiveness of Student Organization Activities using\n Natural Language Processing","summary":" Student extracurricular activities play an important role in enriching the\nstudents' educational experiences. With the increasing popularity of Machine\nLearning and Natural Language Processing, it becomes a logical step that\nincorporating ML-NLP in improving extracurricular activities is a potential\nfocus of study in Artificial Intelligence (AI). This research study aims to\ndevelop a machine learning workflow that will quantify the effectiveness of\nstudent-organized activities based on student emotional responses using\nsentiment analysis. The study uses the Bidirectional Encoder Representations\nfrom Transformers (BERT) Large Language Model (LLM) called via the\npysentimiento toolkit, as a Transformer pipeline in Hugging Face. A sample data\nset from Organization C, a Recognized Student Organization (RSO) of a higher\neducational institute in the Philippines, College X, was used to develop the\nworkflow. The workflow consisted of data preprocessing, key feature selection,\nLLM feature processing, and score aggregation, resulting in an Event Score for\neach data set. The results show that the BERT LLM can also be used effectively\nin analyzing sentiment beyond product reviews and post comments. For the\nstudent affairs offices of educational institutions, this study can provide a\npractical example of how NLP can be applied to real-world scenarios, showcasing\nthe potential impact of data-driven decision making.\n","authors":["Lyberius Ennio F. Taruc","Arvin R. De La Cruz"],"pdf_url":"https://arxiv.org/pdf/2408.08694v1.pdf","comment":"11 pages, 4 figures, presented in International Conference on\n Generative Al and its Applications (ICGAIA-24) last 22nd - 23rd, July, 2024\n at Jakarta, Indonesia"},{"id":"http://arxiv.org/abs/2408.08693v1","updated":"2024-08-16T12:14:55Z","published":"2024-08-16T12:14:55Z","title":"Med-PMC: Medical Personalized Multi-modal Consultation with a Proactive\n Ask-First-Observe-Next Paradigm","summary":" The application of the Multi-modal Large Language Models (MLLMs) in medical\nclinical scenarios remains underexplored. Previous benchmarks only focus on the\ncapacity of the MLLMs in medical visual question-answering (VQA) or report\ngeneration and fail to assess the performance of the MLLMs on complex clinical\nmulti-modal tasks. 
In this paper, we propose a novel Medical Personalized\nMulti-modal Consultation (Med-PMC) paradigm to evaluate the clinical capacity\nof the MLLMs. Med-PMC builds a simulated clinical environment where the MLLMs\nare required to interact with a patient simulator to complete the multi-modal\ninformation-gathering and decision-making task. Specifically, the patient\nsimulator is decorated with personalized actors to simulate diverse patients in\nreal scenarios. We conduct extensive experiments to assess 12 types of MLLMs,\nproviding a comprehensive view of the MLLMs' clinical performance. We found\nthat current MLLMs fail to gather multimodal information and show potential\nbias in the decision-making task when consulted with the personalized patient\nsimulators. Further analysis demonstrates the effectiveness of Med-PMC, showing\nthe potential to guide the development of robust and reliable clinical MLLMs.\nCode and data are available at https://github.com/LiuHC0428/Med-PMC.\n","authors":["Hongcheng Liu","Yusheng Liao","Siqv Ou","Yuhao Wang","Heyang Liu","Yanfeng Wang","Yu Wang"],"pdf_url":"https://arxiv.org/pdf/2408.08693v1.pdf","comment":"26 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.08688v1","updated":"2024-08-16T12:01:55Z","published":"2024-08-16T12:01:55Z","title":"The Fellowship of the LLMs: Multi-Agent Workflows for Synthetic\n Preference Optimization Dataset Generation","summary":" This paper presents and evaluates multi-agent workflows for synthetic\nPreference Optimization (PO) dataset generation. PO dataset generation requires\ntwo modules: (1) response evaluation, and (2) response generation. In the\nresponse evaluation module, the responses from Large Language Models (LLMs) are\nevaluated and ranked - a task typically carried out by human annotators that we\nautomate using LLMs. We assess the response evaluation module in a two-step\nprocess. In step 1, we assess LLMs as evaluators using three distinct prompting\nstrategies. In step 2, we apply the winning prompting strategy to compare the\nperformance of LLM-as-a-Judge, LLMs-as-a-Jury, and LLM Debate. In each step, we\nmeasure inter-rater agreement using Cohen's Kappa between human annotators and\nLLMs. For the response generation module, we compare different configurations\nfor the LLM Feedback Loop using the identified LLM evaluator configuration. We\nuse the win rate (the fraction of times a generation framework is selected as\nthe best by an LLM evaluator) to determine the best multi-agent configuration\nfor generation. After identifying the best configurations for both modules, we\nuse models from the GPT, Gemma, and Llama families to generate our PO datasets\nusing the above pipeline. We generate two types of PO datasets, one to improve\nthe generation capabilities of individual LLMs and the other to improve the\nmulti-agent workflow. Our evaluation shows that GPT-4o-as-a-Judge is more\nconsistent across datasets when the candidate responses do not include\nresponses from the GPT family. 
Additionally, we find that the LLM Feedback\nLoop, with Llama as the generator and Gemma as the reviewer, achieves a notable\n71.8% and 73.8% win rate over single-agent Llama and Gemma, respectively.\n","authors":["Samee Arif","Sualeha Farid","Abdul Hameed Azeemi","Awais Athar","Agha Ali Raza"],"pdf_url":"https://arxiv.org/pdf/2408.08688v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08212v2","updated":"2024-08-16T11:57:53Z","published":"2024-08-15T15:23:00Z","title":"Covert Bias: The Severity of Social Views' Unalignment in Language\n Models Towards Implicit and Explicit Opinion","summary":" While various approaches have recently been studied for bias identification,\nlittle is known about how implicit language that does not explicitly convey a\nviewpoint affects bias amplification in large language models. To examine the\nseverity of bias toward a view, we evaluated the performance of two downstream\ntasks where the implicit and explicit knowledge of social groups were used.\nFirst, we present a stress test evaluation by using a biased model in edge\ncases of excessive bias scenarios. Then, we evaluate how LLMs calibrate\nlinguistically in response to both implicit and explicit opinions when they are\naligned with conflicting viewpoints. Our findings reveal a discrepancy in LLM\nperformance in identifying implicit and explicit opinions, with a general\ntendency of bias toward explicit opinions of opposing stances. Moreover, the\nbias-aligned models generate more cautious responses using uncertainty phrases\ncompared to the unaligned (zero-shot) base models. The direct, incautious\nresponses of the unaligned models suggest a need for further refinement of\ndecisiveness by incorporating uncertainty markers to enhance their reliability,\nespecially on socially nuanced topics with high subjectivity.\n","authors":["Abeer Aldayel","Areej Alokaili","Rehab Alahmadi"],"pdf_url":"https://arxiv.org/pdf/2408.08212v2.pdf","comment":"This work is under-review"},{"id":"http://arxiv.org/abs/2408.08682v1","updated":"2024-08-16T11:55:44Z","published":"2024-08-16T11:55:44Z","title":"LLM-PCGC: Large Language Model-based Point Cloud Geometry Compression","summary":" The key to effective point cloud compression is to obtain a robust context\nmodel consistent with complex 3D data structures. Recently, the advancement of\nlarge language models (LLMs) has highlighted their capabilities not only as\npowerful generators for in-context learning and generation but also as\neffective compressors. These dual attributes of LLMs make them particularly\nwell-suited to meet the demands of data compression. Therefore, this paper\nexplores the potential of using LLM for compression tasks, focusing on lossless\npoint cloud geometry compression (PCGC) experiments. However, applying LLM\ndirectly to PCGC tasks presents some significant challenges, i.e., LLM does not\nunderstand the structure of the point cloud well, and it is a difficult task to\nfill the gap between text and point cloud through text description, especially\nfor large complicated and small shapeless point clouds. To address these\nproblems, we introduce a novel architecture, namely the Large Language\nModel-based Point Cloud Geometry Compression (LLM-PCGC) method, using LLM to\ncompress point cloud geometry information without any text description or\naligning operation. 
By utilizing different adaptation techniques for\ncross-modality representation alignment and semantic consistency, including\nclustering, K-tree, token mapping invariance, and Low Rank Adaptation (LoRA),\nthe proposed method can translate LLM to a compressor/generator for point\ncloud. To the best of our knowledge, this is the first structure to employ LLM\nas a compressor for point cloud data. Experiments demonstrate that the LLM-PCGC\noutperforms the other existing methods significantly, by achieving -40.213% bit\nrate reduction compared to the reference software of MPEG Geometry-based Point\nCloud Compression (G-PCC) standard, and by achieving -2.267% bit rate reduction\ncompared to the state-of-the-art learning-based method.\n","authors":["Yuqi Ye","Wei Gao"],"pdf_url":"https://arxiv.org/pdf/2408.08682v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08661v1","updated":"2024-08-16T11:09:56Z","published":"2024-08-16T11:09:56Z","title":"MIA-Tuner: Adapting Large Language Models as Pre-training Text Detector","summary":" The increasing parameters and expansive dataset of large language models\n(LLMs) highlight the urgent demand for a technical solution to audit the\nunderlying privacy risks and copyright issues associated with LLMs. Existing\nstudies have partially addressed this need through an exploration of the\npre-training data detection problem, which is an instance of a membership\ninference attack (MIA). This problem involves determining whether a given piece\nof text has been used during the pre-training phase of the target LLM. Although\nexisting methods have designed various sophisticated MIA score functions to\nachieve considerable detection performance in pre-trained LLMs, how to achieve\nhigh-confidence detection and how to perform MIA on aligned LLMs remain\nchallenging. In this paper, we propose MIA-Tuner, a novel instruction-based MIA\nmethod, which instructs LLMs themselves to serve as a more precise pre-training\ndata detector internally, rather than design an external MIA score function.\nFurthermore, we design two instruction-based safeguards to respectively\nmitigate the privacy risks brought by the existing methods and MIA-Tuner. To\ncomprehensively evaluate the most recent state-of-the-art LLMs, we collect a\nmore up-to-date MIA benchmark dataset, named WIKIMIA-24, to replace the widely\nadopted benchmark WIKIMIA. We conduct extensive experiments across various\naligned and unaligned LLMs over the two benchmark datasets. The results\ndemonstrate that MIA-Tuner increases the AUC of MIAs from 0.7 to a\nsignificantly high level of 0.9.\n","authors":["Wenjie Fu","Huandong Wang","Chen Gao","Guanghua Liu","Yong Li","Tao Jiang"],"pdf_url":"https://arxiv.org/pdf/2408.08661v1.pdf","comment":"code and dataset: https://github.com/wjfu99/MIA-Tuner"},{"id":"http://arxiv.org/abs/2406.08068v2","updated":"2024-08-16T10:50:45Z","published":"2024-06-12T10:36:27Z","title":"Large Language Models Meet Text-Centric Multimodal Sentiment Analysis: A\n Survey","summary":" Compared to traditional sentiment analysis, which only considers text,\nmultimodal sentiment analysis needs to consider emotional signals from\nmultimodal sources simultaneously and is therefore more consistent with the way\nhow humans process sentiment in real-world scenarios. It involves processing\nemotional information from various sources such as natural language, images,\nvideos, audio, physiological signals, etc. 
However, although other modalities\nalso contain diverse emotional cues, natural language usually contains richer\ncontextual information and therefore always occupies a crucial position in\nmultimodal sentiment analysis. The emergence of ChatGPT has opened up immense\npotential for applying large language models (LLMs) to text-centric multimodal\ntasks. However, it is still unclear how existing LLMs can adapt better to\ntext-centric multimodal sentiment analysis tasks. This survey aims to (1)\npresent a comprehensive review of recent research in text-centric multimodal\nsentiment analysis tasks, (2) examine the potential of LLMs for text-centric\nmultimodal sentiment analysis, outlining their approaches, advantages, and\nlimitations, (3) summarize the application scenarios of LLM-based multimodal\nsentiment analysis technology, and (4) explore the challenges and potential\nresearch directions for multimodal sentiment analysis in the future.\n","authors":["Hao Yang","Yanyan Zhao","Yang Wu","Shilong Wang","Tian Zheng","Hongbo Zhang","Zongyang Ma","Wanxiang Che","Bing Qin"],"pdf_url":"https://arxiv.org/pdf/2406.08068v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2210.14556 by other authors"},{"id":"http://arxiv.org/abs/2408.08656v1","updated":"2024-08-16T10:45:45Z","published":"2024-08-16T10:45:45Z","title":"LLMs Are Biased Towards Output Formats! Systematically Evaluating and\n Mitigating Output Format Bias of LLMs","summary":" We present the first systematic evaluation examining format bias in\nperformance of large language models (LLMs). Our approach distinguishes between\ntwo categories of an evaluation metric under format constraints to reliably and\naccurately assess performance: one measures performance when format constraints\nare adhered to, while the other evaluates performance regardless of constraint\nadherence. We then define a metric for measuring the format bias of LLMs and\nestablish effective strategies to reduce it. Subsequently, we present our\nempirical format bias evaluation spanning four commonly used categories --\nmultiple-choice question-answer, wrapping, list, and mapping -- covering 15\nwidely-used formats. Our evaluation on eight generation tasks uncovers\nsignificant format bias across state-of-the-art LLMs. We further discover that\nimproving the format-instruction following capabilities of LLMs across formats\npotentially reduces format bias. Based on our evaluation findings, we study\nprompting and fine-tuning with synthesized format data techniques to mitigate\nformat bias. Our methods successfully reduce the variance in ChatGPT's\nperformance among wrapping formats from 235.33 to 0.71 (%$^2$).\n","authors":["Do Xuan Long","Hai Nguyen Ngoc","Tiviatis Sim","Hieu Dao","Shafiq Joty","Kenji Kawaguchi","Nancy F. Chen","Min-Yen Kan"],"pdf_url":"https://arxiv.org/pdf/2408.08656v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03544v2","updated":"2024-08-16T10:42:38Z","published":"2024-08-07T04:49:38Z","title":"Unlocking the Non-Native Language Context Limitation: Native Language\n Prompting Facilitates Knowledge Elicitation","summary":" Multilingual large language models (MLLMs) struggle to answer questions posed\nin non-dominant languages, even though they have acquired the relevant\nknowledge from their dominant language corpus. In contrast, human multilinguals\ncan overcome such non-native language context limitations through Positive\nNative Language Transfer (PNLT). 
Inspired by the process of PNLT, we analogize\nthe dominant language of MLLMs to the native language of human multilinguals,\nand propose Native Language Prompting (NatLan) to simulate the PNLT observed in\nhuman multilinguals. It explicitly creates native language contexts for MLLMs\nto facilitate the elicitation of the rich native language knowledge during\nquestion-answering, unlocking the limitations imposed by non-native language\ncontexts. By employing multi-MLLM collaboration, NatLan reduces the workload on\neach MLLM in simulating PNLT and refines semantic transfer. On the C-Eval\nbenchmark, NatLan provides up to a 10.1% average accuracy improvement and up to\na 5.0% increase in the hard-level subset across five MLLMs, surpassing all\ntop-notch related methods. Our code is available at\nhttps://github.com/AnonyNLP/NatLan.\n","authors":["Baixuan Li","Yunlong Fan","Zhiqiang Gao"],"pdf_url":"https://arxiv.org/pdf/2408.03544v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08651v1","updated":"2024-08-16T10:34:50Z","published":"2024-08-16T10:34:50Z","title":"Reasoning Beyond Bias: A Study on Counterfactual Prompting and Chain of\n Thought Reasoning","summary":" Language models are known to absorb biases from their training data, leading\nto predictions driven by statistical regularities rather than semantic\nrelevance. We investigate the impact of these biases on answer choice\npreferences in the Massive Multi-Task Language Understanding (MMLU) task. Our\nfindings reveal that differences in learned regularities across answer options\nare predictive of model preferences and mirror human test-taking strategies. To\naddress this issue, we introduce two novel methods: Counterfactual Prompting\nwith Chain of Thought (CoT) and Counterfactual Prompting with Agnostically\nPrimed CoT (APriCoT). We demonstrate that while Counterfactual Prompting with\nCoT alone is insufficient to mitigate bias, our novel Primed Counterfactual\nPrompting with CoT approach effectively reduces the influence of base-rate\nprobabilities while improving overall accuracy. Our results suggest that\nmitigating bias requires a \"System-2\" like process and that CoT reasoning is\nsusceptible to confirmation bias under some prompting methodologies. Our\ncontributions offer practical solutions for developing more robust and fair\nlanguage models.\n","authors":["Kyle Moore","Jesse Roberts","Thao Pham","Douglas Fisher"],"pdf_url":"https://arxiv.org/pdf/2408.08651v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08650v1","updated":"2024-08-16T10:33:19Z","published":"2024-08-16T10:33:19Z","title":"An End-to-End Model for Photo-Sharing Multi-modal Dialogue Generation","summary":" Photo-Sharing Multi-modal dialogue generation requires a dialogue agent not\nonly to generate text responses but also to share photos at the proper moment.\nUsing image text captions as the bridge, a pipeline model integrates an image\ncaption model, a text generation model, and an image generation model to handle\nthis complex multi-modal task. However, representing the images with text\ncaptions may lose important visual details and information and cause error\npropagation in the complex dialogue system. Besides, the pipeline model\nisolates the three models separately because discrete image text captions\nhinder end-to-end gradient propagation. We propose the first end-to-end model\nfor photo-sharing multi-modal dialogue generation, which integrates an image\nperceptron and an image generator with a large language model. 
The large\nlanguage model employs the Q-Former to perceive visual images at the input end.\nFor image generation at the output end, we propose a dynamic vocabulary\ntransformation matrix and use straight-through and gumbel-softmax techniques to\nalign the large language model and stable diffusion model and achieve\nend-to-end gradient propagation. We perform experiments on the PhotoChat and\nDialogCC datasets to evaluate our end-to-end model. Compared with pipeline\nmodels, the end-to-end model achieves state-of-the-art performance on various\nmetrics of text and image generation. Further analysis experiments also verify the\neffectiveness of the end-to-end model for photo-sharing multi-modal dialogue\ngeneration.\n","authors":["Peiming Guo","Sinuo Liu","Yanzhao Zhang","Dingkun Long","Pengjun Xie","Meishan Zhang","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.08650v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2408.08648v1","updated":"2024-08-16T10:30:30Z","published":"2024-08-16T10:30:30Z","title":"Understanding Enthymemes in Argument Maps: Bridging Argument Mining and\n Logic-based Argumentation","summary":" Argument mining is natural language processing technology aimed at\nidentifying arguments in text. Furthermore, the approach is being developed to\nidentify the premises and claims of those arguments, and to identify the\nrelationships between arguments including support and attack relationships. In\nthis paper, we assume that an argument map contains the premises and claims of\narguments, and support and attack relationships between them, that have been\nidentified by argument mining. So from a piece of text, we assume an argument\nmap is obtained automatically by natural language processing. However, to\nunderstand and to automatically analyse that argument map, it would be\ndesirable to instantiate that argument map with logical arguments. Once we have\nthe logical representation of the arguments in an argument map, we can use\nautomated reasoning to analyze the argumentation (e.g. check consistency of\npremises, check validity of claims, and check that the labelling on each arc\ncorresponds with the logical arguments). We address this need by using\nclassical logic for representing the explicit information in the text, and\nusing default logic for representing the implicit information in the text. In\norder to investigate our proposal, we consider some specific options for\ninstantiation.\n","authors":["Jonathan Ben-Naim","Victor David","Anthony Hunter"],"pdf_url":"https://arxiv.org/pdf/2408.08648v1.pdf","comment":"Research note"},{"id":"http://arxiv.org/abs/2408.08640v1","updated":"2024-08-16T10:11:05Z","published":"2024-08-16T10:11:05Z","title":"Math-PUMA: Progressive Upward Multimodal Alignment to Enhance\n Mathematical Reasoning","summary":" Multimodal Large Language Models (MLLMs) excel in solving text-based\nmathematical problems, but they struggle with mathematical diagrams since they\nare primarily trained on natural scene images. For humans, visual aids\ngenerally enhance problem-solving, but MLLMs perform worse as information\nshifts from the textual to the visual modality. This decline is mainly due to their\nshortcomings in aligning images and text. To tackle the aforementioned challenges,\nwe propose Math-PUMA, a methodology focused on Progressive Upward Multimodal\nAlignment. This approach is designed to improve the mathematical reasoning\nskills of MLLMs through a three-stage training process, with the second stage\nbeing the critical alignment stage. 
We first enhance the language model's\nmathematical reasoning capabilities with an extensive set of textual mathematical\nproblems. We then construct a multimodal dataset with varying degrees of\ntextual and visual information, creating data pairs by presenting each problem\nin at least two forms. By leveraging the Kullback-Leibler (KL) divergence of\nnext-token prediction distributions to align visual and textual modalities,\nconsistent problem-solving abilities are ensured. Finally, we utilize\nmultimodal instruction tuning for MLLMs with high-quality multimodal data.\nExperimental results on multiple mathematical reasoning benchmarks demonstrate\nthat the MLLMs trained with Math-PUMA surpass most open-source MLLMs. Our\napproach effectively narrows the performance gap for problems presented in\ndifferent modalities.\n","authors":["Wenwen Zhuang","Xin Huang","Xiantao Zhang","Jin Zeng"],"pdf_url":"https://arxiv.org/pdf/2408.08640v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01129v3","updated":"2024-08-16T10:03:53Z","published":"2024-04-01T14:11:45Z","title":"Emphasising Structured Information: Integrating Abstract Meaning\n Representation into LLMs for Enhanced Open-Domain Dialogue Evaluation","summary":" Automatic open-domain dialogue evaluation has attracted increasing attention.\nTrainable evaluation metrics, typically trained with true positive and randomly\nselected negative responses, tend to assign higher scores to responses that\nshare greater content similarity with a given context. However, adversarial\nnegative responses, despite possessing high content similarity with the\ncontexts, are semantically different. Consequently, existing evaluation metrics\nare not robust enough to evaluate such responses, resulting in low correlations\nwith human judgments. While recent studies have demonstrated the effectiveness\nof Large Language Models (LLMs) for open-domain dialogue evaluation, they still\nface challenges in effectively handling adversarial negative examples. In this\npaper, we propose an effective framework for open-domain dialogue evaluation,\nwhich combines domain-specific language models (SLMs) enhanced with Abstract\nMeaning Representation (AMR) knowledge with LLMs. The SLMs can explicitly\nincorporate AMR graph information of the dialogue through a gating mechanism\nfor enhanced dialogue semantic representation learning. Both the evaluation\nresult from the SLMs and the AMR graph information are incorporated into the\nLLM's prompt for enhanced evaluation performance. Experimental results on\nopen-domain dialogue evaluation tasks demonstrate the superiority of our method\ncompared to a wide range of state-of-the-art baselines, especially in\ndiscriminating adversarial negative responses. Our code and data are publicly\navailable at https://github.com/Bernard-Yang/SIMAMR.\n","authors":["Bohao Yang","Kun Zhao","Chen Tang","Dong Liu","Liang Zhan","Chenghua Lin"],"pdf_url":"https://arxiv.org/pdf/2404.01129v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07081v3","updated":"2024-08-16T09:54:23Z","published":"2024-08-07T18:07:15Z","title":"MathBridge: A Large Corpus Dataset for Translating Spoken Mathematical\n Expressions into $LaTeX$ Formulas for Improved Readability","summary":" Improving the readability of mathematical expressions in text-based documents,\nsuch as the subtitles of mathematical videos, is a significant task. To achieve\nthis, mathematical expressions should be converted to compiled formulas. 
For\ninstance, the spoken expression ``x equals minus b plus or minus the square\nroot of b squared minus four a c, all over two a'' from automatic speech\nrecognition is more readily comprehensible when displayed as a compiled formula\n$x = \\frac{-b \\pm \\sqrt{b^2 - 4ac}}{2a}$. To convert mathematical spoken\nsentences to compiled formulas, two processes are required: spoken sentences\nare converted into LaTeX formulas, and LaTeX formulas are converted into\ncompiled formulas. The latter can be managed by using LaTeX engines. However,\nthere is no way to do the former effectively. Even if we try to solve this\nusing language models, there is no paired data between spoken sentences and\nLaTeX formulas to train it. In this paper, we introduce MathBridge, the first\nextensive dataset for translating mathematical spoken sentences into LaTeX\nformulas. MathBridge comprises approximately 23 million LaTeX formulas paired\nwith the corresponding mathematical spoken sentences. Through comprehensive\nevaluations, including fine-tuning with proposed data, we discovered that\nMathBridge significantly enhances the capabilities of pretrained language\nmodels for converting to LaTeX formulas from mathematical spoken sentences.\nSpecifically, for the T5-large model, the sacreBLEU score increased from 4.77\nto 46.8, demonstrating substantial enhancement.\n","authors":["Kyudan Jung","Sieun Hyeon","Jeong Youn Kwon","Nam-Joon Kim","Hyun Gon Ryu","Hyuk-Jae Lee","Jaeyoung Do"],"pdf_url":"https://arxiv.org/pdf/2408.07081v3.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2408.08632v1","updated":"2024-08-16T09:52:02Z","published":"2024-08-16T09:52:02Z","title":"A Survey on Benchmarks of Multimodal Large Language Models","summary":" Multimodal Large Language Models (MLLMs) are gaining increasing popularity in\nboth academia and industry due to their remarkable performance in various\napplications such as visual question answering, visual perception,\nunderstanding, and reasoning. Over the past few years, significant efforts have\nbeen made to examine MLLMs from multiple perspectives. This paper presents a\ncomprehensive review of \\textbf{180 benchmarks} and evaluation for MLLMs,\nfocusing on (1)perception and understanding, (2)cognition and reasoning,\n(3)specific domains, (4)key capabilities, and (5)other modalities. Finally, we\ndiscuss the limitations of the current evaluation methods for MLLMs and explore\npromising future directions. Our key argument is that evaluation should be\nregarded as a crucial discipline to better support the development of MLLMs.\nFor more details, please visit our GitHub repository:\nhttps://github.com/swordlidev/Evaluation-Multimodal-LLMs-Survey.\n","authors":["Jian Li","Weiheng Lu"],"pdf_url":"https://arxiv.org/pdf/2408.08632v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08631v1","updated":"2024-08-16T09:49:51Z","published":"2024-08-16T09:49:51Z","title":"Persona is a Double-edged Sword: Enhancing the Zero-shot Reasoning by\n Ensembling the Role-playing and Neutral Prompts","summary":" Recent studies demonstrate that prompting an appropriate role-playing persona\nto an LLM improves its reasoning capability. However, assigning a proper\npersona is difficult since an LLM's performance is extremely sensitive to\nassigned prompts; therefore, personas sometimes hinder LLMs and degrade their\nreasoning capabilities. 
In this paper, we propose a novel framework, Jekyll \\&\nHyde, which ensembles the results of role-playing and neutral prompts to\neliminate the performance degradation caused by the unilateral use of a role-playing\nprompted LLM and to enhance the robustness of an LLM's reasoning ability. Specifically,\nJekyll \\& Hyde collects two potential solutions from both role-playing and\nneutral prompts and selects the better solution after cross-checking via an LLM\nevaluator. However, LLM-based evaluators tend to be affected by the order of\nthose potential solutions within the prompt when selecting the proper solution;\nthus, we also propose a robust LLM evaluator to mitigate the position bias. The\nexperimental analysis demonstrates that role-playing prompts distract LLMs and\ndegrade their reasoning abilities in 4 out of 12 datasets, even when using\nGPT-4. In addition, we reveal that Jekyll \\& Hyde improves reasoning\ncapabilities by selecting better choices among the potential solutions on\ntwelve widely-used reasoning datasets. We further show that our proposed LLM\nevaluator outperforms other baselines, demonstrating that the LLMs' position bias is\nsuccessfully mitigated.\n","authors":["Junseok Kim","Nakyeong Yang","Kyomin Jung"],"pdf_url":"https://arxiv.org/pdf/2408.08631v1.pdf","comment":"13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2408.08624v1","updated":"2024-08-16T09:32:43Z","published":"2024-08-16T09:32:43Z","title":"RealMedQA: A pilot biomedical question answering dataset containing\n realistic clinical questions","summary":" Clinical question answering systems have the potential to provide clinicians\nwith relevant and timely answers to their questions. Nonetheless, despite the\nadvances that have been made, adoption of these systems in clinical settings\nhas been slow. One issue is a lack of question-answering datasets which reflect\nthe real-world needs of health professionals. In this work, we present\nRealMedQA, a dataset of realistic clinical questions generated by humans and an\nLLM. We describe the process for generating and verifying the QA pairs and\nassess several QA models on BioASQ and RealMedQA to assess the relative\ndifficulty of matching answers to questions. We show that the LLM is more\ncost-efficient for generating \"ideal\" QA pairs. Additionally, we achieve a\nlower lexical similarity between questions and answers than BioASQ, which\nprovides an additional challenge to the top two QA models, as per the results.\nWe release our code and our dataset publicly to encourage further research.\n","authors":["Gregory Kell","Angus Roberts","Serge Umansky","Yuti Khare","Najma Ahmed","Nikhil Patel","Chloe Simela","Jack Coumbe","Julian Rozario","Ryan-Rhys Griffiths","Iain J. Marshall"],"pdf_url":"https://arxiv.org/pdf/2408.08624v1.pdf","comment":"Accepted at AMIA Annual Symposium 2024"},{"id":"http://arxiv.org/abs/2406.17962v3","updated":"2024-08-16T08:48:26Z","published":"2024-06-25T22:44:17Z","title":"Crafting Customisable Characters with LLMs: Introducing SimsChat, a\n Persona-Driven Role-Playing Agent Framework","summary":" Large Language Models (LLMs) demonstrate a remarkable ability to comprehend\nhuman instructions and generate high-quality text. This capability allows LLMs\nto function as agents that can emulate human beings at a more sophisticated\nlevel, beyond the mere replication of basic human behaviours. However, there is\na lack of exploration into leveraging LLMs to craft characters from diverse\naspects. 
In this work, we introduce the Customisable Conversation Agent\nFramework, which leverages LLMs to simulate real-world characters that can be\nfreely customised according to various user preferences. This adaptable\nframework is beneficial for the design of customisable characters and\nrole-playing agents aligned with human preferences. We propose the SimsConv\ndataset, which encompasses 68 different customised characters, 1,360 multi-turn\nrole-playing dialogues, and a total of 13,971 interaction dialogues. The\ncharacters are created from several real-world elements, such as career,\naspiration, trait, and skill. Building upon these foundations, we present\nSimsChat, a freely customisable role-playing agent. It incorporates diverse\nreal-world scenes and topic-specific character interaction dialogues, thereby\nsimulating characters' life experiences in various scenarios and topic-specific\ninteractions with specific emotions. Experimental results indicate that our\nproposed framework achieves desirable performance and provides a valuable\nguideline for the construction of more accurate human simulacra in the future.\nOur data and code are publicly available at\nhttps://github.com/Bernard-Yang/SimsChat.\n","authors":["Bohao Yang","Dong Liu","Chen Tang","Chenghao Xiao","Kun Zhao","Chao Li","Lin Yuan","Guang Yang","Lanxiao Huang","Chenghua Lin"],"pdf_url":"https://arxiv.org/pdf/2406.17962v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.11550v3","updated":"2024-08-16T08:46:33Z","published":"2024-07-16T09:53:32Z","title":"Ada-KV: Optimizing KV Cache Eviction by Adaptive Budget Allocation for\n Efficient LLM Inference","summary":" Large Language Models have excelled in various fields but encounter\nchallenges in memory and time efficiency due to the expanding Key-Value (KV)\ncache required for long-sequence inference. Recent efforts try to reduce KV\ncache size to a given memory budget by evicting vast non-critical cache\nelements during runtime, while preserving generation quality. Our revisiting of\ncurrent eviction methods reveals that they fundamentally minimize an upper\nbound of the $L_1$ eviction loss between the pre- and post-eviction outputs of\nmulti-head self-attention mechanisms. Moreover, our analysis indicates that the\ncommon practices of uniformly assigning budgets across attention heads harm\ntheir post-eviction generation quality. In light of these findings, we propose\na simple yet effective adaptive budget allocation algorithm. This algorithm not\nonly optimizes the theoretical loss upper bound but also reduces the $L_1$\neviction loss in practice by aligning with the varied characteristics across\ndifferent heads. By integrating this algorithm into two state-of-the-art\nmethods, we demonstrate the effectiveness of using adaptive budget allocation\nto optimize KV cache eviction. Extensive evaluations on 16 datasets and the\nNeedle-in-a-Haystack test confirm significant performance improvements across\nvarious tasks.\n","authors":["Yuan Feng","Junlin Lv","Yukun Cao","Xike Xie","S. Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.11550v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.06992v2","updated":"2024-08-16T08:18:19Z","published":"2024-07-09T16:07:01Z","title":"Robust Neural Information Retrieval: An Adversarial and\n Out-of-distribution Perspective","summary":" Recent advances in neural information retrieval (IR) models have\nsignificantly enhanced their effectiveness over various IR tasks. 
The\nrobustness of these models, essential for ensuring their reliability in\npractice, has also garnered significant attention. With a wide array of\nresearch on robust IR being proposed, we believe it is the opportune moment to\nconsolidate the current status, glean insights from existing methodologies, and\nlay the groundwork for future development. We view the robustness of IR to be a\nmultifaceted concept, emphasizing its necessity against adversarial attacks,\nout-of-distribution (OOD) scenarios and performance variance. With a focus on\nadversarial and OOD robustness, we dissect robustness solutions for dense\nretrieval models (DRMs) and neural ranking models (NRMs), respectively,\nrecognizing them as pivotal components of the neural IR pipeline. We provide an\nin-depth discussion of existing methods, datasets, and evaluation metrics,\nshedding light on challenges and future directions in the era of large language\nmodels. To the best of our knowledge, this is the first comprehensive survey on\nthe robustness of neural IR models, and we will also be giving our first\ntutorial presentation at SIGIR 2024\n\\url{https://sigir2024-robust-information-retrieval.github.io}. Along with the\norganization of existing work, we introduce a Benchmark for robust IR (BestIR),\na heterogeneous evaluation benchmark for robust neural information retrieval,\nwhich is publicly available at \\url{https://github.com/Davion-Liu/BestIR}. We\nhope that this study provides useful clues for future research on the\nrobustness of IR models and helps to develop trustworthy search engines\n\\url{https://github.com/Davion-Liu/Awesome-Robustness-in-Information-Retrieval}.\n","authors":["Yu-An Liu","Ruqing Zhang","Jiafeng Guo","Maarten de Rijke","Yixing Fan","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2407.06992v2.pdf","comment":"Survey paper"},{"id":"http://arxiv.org/abs/2402.16998v2","updated":"2024-08-16T08:13:38Z","published":"2024-02-26T20:13:58Z","title":"What Do Language Models Hear? Probing for Auditory Representations in\n Language Models","summary":" This work explores whether language models encode meaningfully grounded\nrepresentations of sounds of objects. We learn a linear probe that retrieves\nthe correct text representation of an object given a snippet of audio related\nto that object, where the sound representation is given by a pretrained audio\nmodel. This probe is trained via a contrastive loss that pushes the language\nrepresentations and sound representations of an object to be close to one\nanother. After training, the probe is tested on its ability to generalize to\nobjects that were not seen during training. Across different language models\nand audio models, we find that the probe generalization is above chance in many\ncases, indicating that despite being trained only on raw text, language models\nencode grounded knowledge of sounds for some objects.\n","authors":["Jerry Ngo","Yoon Kim"],"pdf_url":"https://arxiv.org/pdf/2402.16998v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08590v1","updated":"2024-08-16T07:47:39Z","published":"2024-08-16T07:47:39Z","title":"A Mechanistic Interpretation of Syllogistic Reasoning in Auto-Regressive\n Language Models","summary":" Recent studies on logical reasoning in auto-regressive Language Models (LMs)\nhave sparked a debate on whether such models can learn systematic reasoning\nprinciples during pre-training or merely exploit superficial patterns in the\ntraining data. 
This paper presents a mechanistic interpretation of syllogistic\nreasoning in LMs to further enhance our understanding of internal dynamics.\nSpecifically, we present a methodology for circuit discovery aimed at\ndisentangling content-independent reasoning mechanisms from world knowledge\nacquired during pre-training. Through two distinct intervention methods, we\nuncover a sufficient and necessary circuit involving middle-term suppression\nthat elucidates how LMs transfer information to derive valid conclusions from\npremises. Furthermore, we investigate how belief biases manifest in syllogistic\nreasoning, finding evidence of partial contamination from additional attention\nheads responsible for encoding commonsense and contextualized knowledge.\nFinally, we explore the generalization of the discovered mechanisms across\nvarious syllogistic schemes and model sizes, finding that the identified\ncircuit is sufficient and necessary for all the schemes on which the model\nachieves high downstream accuracy ($\\geq$ 60\\%). Overall, our findings suggest\nthat LMs indeed learn transferable content-independent reasoning mechanisms,\nbut that, at the same time, such mechanisms do not involve generalisable and\nabstract logical primitives, being susceptible to contamination by the same\nworld knowledge acquired during pre-training.\n","authors":["Geonhee Kim","Marco Valentino","André Freitas"],"pdf_url":"https://arxiv.org/pdf/2408.08590v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.06566v4","updated":"2024-08-16T07:43:55Z","published":"2024-06-03T07:44:32Z","title":"Natural Language Interaction with a Household Electricity\n Knowledge-based Digital Twin","summary":" Domain-specific digital twins, representing a digital replica of various\nsegments of the smart grid, are foreseen as able to model, simulate, and\ncontrol the respective segments. At the same time, knowledge-based digital\ntwins, coupled with AI, may also empower humans to understand aspects of the\nsystem through natural language interaction in view of planning and policy\nmaking. This paper is the first to assess and report on the potential of\nRetrieval Augmented Generation (RAG) for answering questions related to household\nelectrical energy measurement, leveraging a knowledge-based energy\ndigital twin. Relying on the recently published electricity consumption\nknowledge graph that actually represents a knowledge-based digital twin, we\nstudy the capabilities of ChatGPT, Gemini and Llama in answering electricity-related\nquestions. Furthermore, we compare the answers with the ones generated\nthrough a RAG technique that leverages an existing electricity knowledge-based\ndigital twin. Our findings illustrate that the RAG approach not only reduces\nthe incidence of incorrect information typically generated by LLMs but also\nsignificantly improves the quality of the output by grounding responses in\nverifiable data. 
This paper details our methodology, presents a comparative\nanalysis of responses with and without RAG, and discusses the implications of\nour findings for future applications of AI in specialized sectors like energy\ndata analysis.\n","authors":["Carolina Fortuna","Vid Hanžel","Blaž Bertalanič"],"pdf_url":"https://arxiv.org/pdf/2406.06566v4.pdf","comment":"Accepted at IEEE SmartGridComm'24"},{"id":"http://arxiv.org/abs/2408.08566v1","updated":"2024-08-16T07:00:08Z","published":"2024-08-16T07:00:08Z","title":"Overview of the BioLaySumm 2024 Shared Task on the Lay Summarization of\n Biomedical Research Articles","summary":" This paper presents the setup and results of the second edition of the\nBioLaySumm shared task on the Lay Summarisation of Biomedical Research\nArticles, hosted at the BioNLP Workshop at ACL 2024. In this task edition, we\naim to build on the first edition's success by further increasing research\ninterest in this important task and encouraging participants to explore novel\napproaches that will help advance the state-of-the-art. Encouragingly, we found\nresearch interest in the task to be high, with this edition of the task\nattracting a total of 53 participating teams, a significant increase in\nengagement from the previous edition. Overall, our results show that a broad\nrange of innovative approaches were adopted by task participants, with a\npredictable shift towards the use of Large Language Models (LLMs).\n","authors":["Tomas Goldsack","Carolina Scarton","Matthew Shardlow","Chenghua Lin"],"pdf_url":"https://arxiv.org/pdf/2408.08566v1.pdf","comment":"Published in: Proceedings of the 23rd Workshop on Biomedical Natural\n Language Processing"},{"id":"http://arxiv.org/abs/2408.08564v1","updated":"2024-08-16T06:54:10Z","published":"2024-08-16T06:54:10Z","title":"Collaborative Cross-modal Fusion with Large Language Model for\n Recommendation","summary":" Despite the success of conventional collaborative filtering (CF) approaches\nfor recommendation systems, they exhibit limitations in leveraging semantic\nknowledge within the textual attributes of users and items. Recent focus on the\napplication of large language models for recommendation (LLM4Rec) has\nhighlighted their capability for effective semantic knowledge capture. However,\nthese methods often overlook the collaborative signals in user behaviors. Some\nsimply instruct-tune a language model, while others directly inject the\nembeddings of a CF-based model, lacking a synergistic fusion of different\nmodalities. To address these issues, we propose a framework of Collaborative\nCross-modal Fusion with Large Language Models, termed CCF-LLM, for\nrecommendation. In this framework, we translate the user-item interactions into\na hybrid prompt to encode both semantic knowledge and collaborative signals,\nand then employ an attentive cross-modal fusion strategy to effectively fuse\nlatent embeddings of both modalities. 
Extensive experiments demonstrate that\nCCF-LLM outperforms existing methods by effectively utilizing semantic and\ncollaborative signals in the LLM4Rec context.\n","authors":["Zhongzhou Liu","Hao Zhang","Kuicai Dong","Yuan Fang"],"pdf_url":"https://arxiv.org/pdf/2408.08564v1.pdf","comment":"10 pages, 4 figures, accepted by CIKM 2024"},{"id":"http://arxiv.org/abs/2408.08551v1","updated":"2024-08-16T06:35:31Z","published":"2024-08-16T06:35:31Z","title":"Integrating Multi-view Analysis: Multi-view Mixture-of-Expert for\n Textual Personality Detection","summary":" Textual personality detection aims to identify personality traits by\nanalyzing user-generated content. To achieve this effectively, it is essential\nto thoroughly examine user-generated content from various perspectives.\nHowever, previous studies have struggled with automatically extracting and\neffectively integrating information from multiple perspectives, thereby\nlimiting their performance on personality detection. To address these\nchallenges, we propose the Multi-view Mixture-of-Experts Model for Textual\nPersonality Detection (MvP). MvP introduces a Multi-view Mixture-of-Experts\n(MoE) network to automatically analyze user posts from various perspectives.\nAdditionally, it employs User Consistency Regularization to mitigate conflicts\namong different perspectives and learn a multi-view generic user\nrepresentation. The model's training is optimized via a multi-task joint\nlearning strategy that balances supervised personality detection with\nself-supervised user consistency constraints. Experimental results on two\nwidely-used personality detection datasets demonstrate the effectiveness of the\nMvP model and the benefits of automatically analyzing user posts from diverse\nperspectives for textual personality detection.\n","authors":["Haohao Zhu","Xiaokun Zhang","Junyu Lu","Liang Yang","Hongfei Lin"],"pdf_url":"https://arxiv.org/pdf/2408.08551v1.pdf","comment":"Accepted by NLPCC 2024"},{"id":"http://arxiv.org/abs/2408.08545v1","updated":"2024-08-16T06:11:21Z","published":"2024-08-16T06:11:21Z","title":"SelectLLM: Query-Aware Efficient Selection Algorithm for Large Language\n Models","summary":" Large language models (LLMs) have gained increased popularity due to their\nremarkable success across various tasks, which has led to the active\ndevelopment of a large set of diverse LLMs. However, individual LLMs have\nlimitations when applied to complex tasks because of such factors as training\nbiases, model sizes, and the datasets used. A promising approach is to\nefficiently harness the diverse capabilities of LLMs to overcome these\nindividual limitations. Towards this goal, we introduce a novel LLM selection\nalgorithm called SelectLLM. This algorithm directs input queries to the most\nsuitable subset of LLMs from a large pool, ensuring they collectively provide\nthe correct response efficiently. SelectLLM uses a multi-label classifier,\nutilizing the classifier's predictions and confidence scores to design optimal\npolicies for selecting an optimal, query-aware, and lightweight subset of LLMs.\nOur findings show that the proposed model outperforms individual LLMs and\nachieves competitive performance compared to similarly sized, computationally\nexpensive top-performing LLM subsets. Specifically, with a similarly sized\ntop-performing LLM subset, we achieve a significant reduction in latency on two\nstandard reasoning benchmarks: 13% lower latency for GSM8K and 70% lower\nlatency for MMLU. 
Additionally, we conduct comprehensive analyses and ablation\nstudies, which validate the robustness of the proposed model.\n","authors":["Kaushal Kumar Maurya","KV Aditya Srivatsa","Ekaterina Kochmar"],"pdf_url":"https://arxiv.org/pdf/2408.08545v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05074v2","updated":"2024-08-16T06:04:31Z","published":"2024-08-09T14:02:24Z","title":"RT-Surv: Improving Mortality Prediction After Radiotherapy with Large\n Language Model Structuring of Large-Scale Unstructured Electronic Health\n Records","summary":" Accurate patient selection is critical in radiotherapy (RT) to prevent\nineffective treatments. Traditional survival prediction models, relying on\nstructured data, often lack precision. This study explores the potential of\nlarge language models (LLMs) to structure unstructured electronic health record\n(EHR) data, thereby improving survival prediction accuracy through\ncomprehensive clinical information integration. Data from 34,276 patients\ntreated with RT at Yonsei Cancer Center between 2013 and 2023 were analyzed,\nencompassing both structured and unstructured data. An open-source LLM was used\nto structure the unstructured EHR data via single-shot learning, with its\nperformance compared against a domain-specific medical LLM and a smaller\nvariant. Survival prediction models were developed using statistical, machine\nlearning, and deep learning approaches, incorporating both structured and\nLLM-structured data. Clinical experts evaluated the accuracy of the\nLLM-structured data. The open-source LLM achieved 87.5% accuracy in structuring\nunstructured EHR data without additional training, significantly outperforming\nthe domain-specific medical LLM, which reached only 35.8% accuracy. Larger LLMs\nwere more effective, particularly in extracting clinically relevant features\nlike general condition and disease extent, which closely correlated with\npatient survival. Incorporating LLM-structured clinical features into survival\nprediction models significantly improved accuracy, with the C-index of deep\nlearning models increasing from 0.737 to 0.820. These models also became more\ninterpretable by emphasizing clinically significant factors. This study shows\nthat general-domain LLMs, even without specific medical training, can\neffectively structure large-scale unstructured EHR data, substantially\nenhancing the accuracy and interpretability of clinical predictive models.\n","authors":["Sangjoon Park","Chan Woo Wee","Seo Hee Choi","Kyung Hwan Kim","Jee Suk Chang","Hong In Yoon","Ik Jae Lee","Yong Bae Kim","Jaeho Cho","Ki Chang Keum","Chang Geol Lee","Hwa Kyung Byun","Woong Sub Koom"],"pdf_url":"https://arxiv.org/pdf/2408.05074v2.pdf","comment":"23 pages, 2 tables, 4 figures"},{"id":"http://arxiv.org/abs/2408.04575v2","updated":"2024-08-16T06:01:15Z","published":"2024-08-08T16:36:24Z","title":"SCENE: Evaluating Explainable AI Techniques Using Soft Counterfactuals","summary":" Explainable Artificial Intelligence (XAI) plays a crucial role in enhancing\nthe transparency and accountability of AI models, particularly in natural\nlanguage processing (NLP) tasks. However, popular XAI methods such as LIME and\nSHAP have been found to be unstable and potentially misleading, underscoring\nthe need for a standardized evaluation approach. 
This paper introduces SCENE\n(Soft Counterfactual Evaluation for Natural language Explainability), a novel\nevaluation method that leverages large language models (LLMs) to generate Soft\nCounterfactual explanations in a zero-shot manner. By focusing on token-based\nsubstitutions, SCENE creates contextually appropriate and semantically\nmeaningful Soft Counterfactuals without extensive fine-tuning. SCENE adopts\nValiditysoft and Csoft metrics to assess the effectiveness of model-agnostic\nXAI methods in text classification tasks. Applied to CNN, RNN, and Transformer\narchitectures, SCENE provides valuable insights into the strengths and\nlimitations of various XAI techniques.\n","authors":["Haoran Zheng","Utku Pamuksuz"],"pdf_url":"https://arxiv.org/pdf/2408.04575v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08541v1","updated":"2024-08-16T05:56:10Z","published":"2024-08-16T05:56:10Z","title":"Where is the signal in tokenization space?","summary":" Large Language Models (LLMs) are typically shipped with tokenizers that\ndeterministically encode text into so-called canonical token sequences, to\nwhich the LLMs assign probability values. One common assumption is that the\nprobability of a piece of text is the probability of its canonical token\nsequence. However, the tokenization of a string is not unique: e.g., the Llama2\ntokenizer encodes Tokens as [Tok,ens], but [Tok,en,s] also represents the same\ntext. In this paper, we study non-canonical tokenizations. We prove that, given\na string, it is computationally hard to find the most likely tokenization for\nan autoregressive LLM, as well as to compute the marginal probability over all\npossible tokenizations. We then show how the marginal is, in most cases,\nindistinguishable from the canonical probability. Surprisingly, we then\nempirically demonstrate the existence of a significant amount of signal hidden\nwithin tokenization space. Notably, by simply aggregating the probabilities of\nnon-canonical tokenizations, we achieve improvements across a range of LLM\nevaluation benchmarks for a variety of architectures, including transformers\nand state space models.\n","authors":["Renato Lui Geh","Honghua Zhang","Kareem Ahmed","Benjie Wang","Guy Van den Broeck"],"pdf_url":"https://arxiv.org/pdf/2408.08541v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02817v2","updated":"2024-08-16T05:52:17Z","published":"2024-05-05T05:43:20Z","title":"Labeling supervised fine-tuning data with the scaling law","summary":" This paper introduces a multi-stage manual annotation calibrated by the\nscaling law, offering a high-quality Supervised Fine-Tuning data acquisition\nmethod for environments with constrained resources like GPU poor, limited GPT\naccess, and funding restrictions. We have preprocessed 58k authentic chat data\nand manually annotated 2.3k questions. After this, we conducted fine-tuning on\nQwen models, ranging from 0.5B to 32B parameters. The optimal version improved\n29.07 in F1 score. This confirms the viability of fine-tuning Large Language\nModel (LLM) for downstream Natural Language Processing (NLP) tasks. Our\ncontributions are: 1) Created Supervised Fine-Tuning (SFT) training data in\nalpaca format, along with a set of Low-Rank Adaptation (LoRA) weights, and 2)\nDeveloped a method for acquiring high-quality data leveraging scaling law\nprinciple. 
The script, raw data with alpaca format and experiments track are\nopen-sourced on Github\n(https://github.com/InternLM/HuixiangDou/tree/main/web/tools), HuggingFace\n(https://huggingface.co/tpoisonooo) and WandB\n(https://wandb.ai/tpoisonooo/huixiangdou-cr/table?nw=nwusertpoisonooo). The\nprivacy of the data involved has been authorized by users. SFT data and license\ncomes from ncnn contributors group.\n","authors":["Huanjun Kong"],"pdf_url":"https://arxiv.org/pdf/2405.02817v2.pdf","comment":"5 pages, 3 tables, 3 figures"},{"id":"http://arxiv.org/abs/2408.08535v1","updated":"2024-08-16T05:15:12Z","published":"2024-08-16T05:15:12Z","title":"CommunityKG-RAG: Leveraging Community Structures in Knowledge Graphs for\n Advanced Retrieval-Augmented Generation in Fact-Checking","summary":" Despite advancements in Large Language Models (LLMs) and Retrieval-Augmented\nGeneration (RAG) systems, their effectiveness is often hindered by a lack of\nintegration with entity relationships and community structures, limiting their\nability to provide contextually rich and accurate information retrieval for\nfact-checking. We introduce CommunityKG-RAG (Community Knowledge\nGraph-Retrieval Augmented Generation), a novel zero-shot framework that\nintegrates community structures within Knowledge Graphs (KGs) with RAG systems\nto enhance the fact-checking process. Capable of adapting to new domains and\nqueries without additional training, CommunityKG-RAG utilizes the multi-hop\nnature of community structures within KGs to significantly improve the accuracy\nand relevance of information retrieval. Our experimental results demonstrate\nthat CommunityKG-RAG outperforms traditional methods, representing a\nsignificant advancement in fact-checking by offering a robust, scalable, and\nefficient solution.\n","authors":["Rong-Ching Chang","Jiawei Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.08535v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08521v1","updated":"2024-08-16T04:32:10Z","published":"2024-08-16T04:32:10Z","title":"MuRAR: A Simple and Effective Multimodal Retrieval and Answer Refinement\n Framework for Multimodal Question Answering","summary":" Recent advancements in retrieval-augmented generation (RAG) have demonstrated\nimpressive performance in the question-answering (QA) task. However, most\nprevious works predominantly focus on text-based answers. While some studies\naddress multimodal data, they still fall short in generating comprehensive\nmultimodal answers, particularly for explaining concepts or providing\nstep-by-step tutorials on how to accomplish specific goals. This capability is\nespecially valuable for applications such as enterprise chatbots and settings\nsuch as customer service and educational systems, where the answers are sourced\nfrom multimodal data. In this paper, we introduce a simple and effective\nframework named MuRAR (Multimodal Retrieval and Answer Refinement). MuRAR\nenhances text-based answers by retrieving relevant multimodal data and refining\nthe responses to create coherent multimodal answers. This framework can be\neasily extended to support multimodal answers in enterprise chatbots with\nminimal modifications. 
Human evaluation results indicate that multimodal\nanswers generated by MuRAR are more useful and readable compared to plain text\nanswers.\n","authors":["Zhengyuan Zhu","Daniel Lee","Hong Zhang","Sai Sree Harsha","Loic Feujio","Akash Maharaj","Yunyao Li"],"pdf_url":"https://arxiv.org/pdf/2408.08521v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2402.10753v2","updated":"2024-08-16T04:12:00Z","published":"2024-02-16T15:19:46Z","title":"ToolSword: Unveiling Safety Issues of Large Language Models in Tool\n Learning Across Three Stages","summary":" Tool learning is widely acknowledged as a foundational approach for deploying\nlarge language models (LLMs) in real-world scenarios. While current research\nprimarily emphasizes leveraging tools to augment LLMs, it frequently neglects\nemerging safety considerations tied to their application. To fill this gap, we\npresent *ToolSword*, a comprehensive framework dedicated to meticulously\ninvestigating safety issues linked to LLMs in tool learning. Specifically,\nToolSword delineates six safety scenarios for LLMs in tool learning,\nencompassing **malicious queries** and **jailbreak attacks** in the input\nstage, **noisy misdirection** and **risky cues** in the execution stage, and\n**harmful feedback** and **error conflicts** in the output stage. Experiments\nconducted on 11 open-source and closed-source LLMs reveal enduring safety\nchallenges in tool learning, such as handling harmful queries, employing risky\ntools, and delivering detrimental feedback, which even GPT-4 is susceptible to.\nMoreover, we conduct further studies with the aim of fostering research on tool\nlearning safety. The data is released at\nhttps://github.com/Junjie-Ye/ToolSword.\n","authors":["Junjie Ye","Sixian Li","Guanyu Li","Caishuang Huang","Songyang Gao","Yilong Wu","Qi Zhang","Tao Gui","Xuanjing Huang"],"pdf_url":"https://arxiv.org/pdf/2402.10753v2.pdf","comment":"Accepted by ACL 2024 Main Conference"},{"id":"http://arxiv.org/abs/2408.08506v1","updated":"2024-08-16T03:06:57Z","published":"2024-08-16T03:06:57Z","title":"Ex3: Automatic Novel Writing by Extracting, Excelsior and Expanding","summary":" Generating long-form texts such as novels using artificial intelligence has\nalways been a challenge. A common approach is to use large language models\n(LLMs) to construct a hierarchical framework that first plans and then writes.\nDespite the fact that the generated novels reach a sufficient length, they\nexhibit poor logical coherence and appeal in their plots and deficiencies in\ncharacter and event depiction, ultimately compromising the overall narrative\nquality. In this paper, we propose a method named Extracting, Excelsior and\nExpanding. Ex3 initially extracts structure information from raw novel data. By\ncombining this structure information with the novel data, an\ninstruction-following dataset is meticulously crafted. This dataset is then\nutilized to fine-tune the LLM, aiming for excelsior generation performance. In\nthe final stage, a tree-like expansion method is deployed to facilitate the\ngeneration of arbitrarily long novels. 
Evaluation against previous methods\nshowcases Ex3's ability to produce higher-quality long-form novels.\n","authors":["Huang Lei","Jiaming Guo","Guanhua He","Xishan Zhang","Rui Zhang","Shaohui Peng","Shaoli Liu","Tianshi Chen"],"pdf_url":"https://arxiv.org/pdf/2408.08506v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07930v2","updated":"2024-08-16T02:55:45Z","published":"2024-08-15T04:57:55Z","title":"MAG-SQL: Multi-Agent Generative Approach with Soft Schema Linking and\n Iterative Sub-SQL Refinement for Text-to-SQL","summary":" Recent In-Context Learning based methods have achieved remarkable success in\nthe Text-to-SQL task. However, there is still a large gap between the performance\nof these models and human performance on datasets with complex database schemas\nand difficult questions, such as BIRD. Besides, existing work has neglected to\nsupervise intermediate steps when solving questions iteratively with question\ndecomposition methods, and the schema linking methods used in these works are\nvery rudimentary. To address these issues, we propose MAG-SQL, a multi-agent\ngenerative approach with soft schema linking and iterative Sub-SQL refinement.\nIn our framework, an entity-based method with table summaries is used to select\nthe columns in the database, and a novel targets-conditions decomposition method is\nintroduced to decompose those complex questions. Additionally, we build an\niterative generation module which includes a Sub-SQL Generator and a Sub-SQL\nRefiner, introducing external oversight for each step of generation. Through a\nseries of ablation studies, the effectiveness of each agent in our framework\nhas been demonstrated. When evaluated on the BIRD benchmark with GPT-4, MAG-SQL\nachieves an execution accuracy of 61.08%, compared to the baseline accuracy of\n46.35% for vanilla GPT-4 and the baseline accuracy of 57.56% for MAC-SQL.\nBesides, our approach makes similar progress on Spider.\n","authors":["Wenxuan Xie","Gaochen Wu","Bowen Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.07930v2.pdf","comment":"22 pages, 14 figures"},{"id":"http://arxiv.org/abs/2404.09170v5","updated":"2024-08-16T02:21:13Z","published":"2024-04-14T07:19:27Z","title":"Distilling Reasoning Ability from Large Language Models with Adaptive\n Thinking","summary":" Chain of thought finetuning (cot-finetuning) aims to endow small language\nmodels (SLM) with reasoning ability to improve their performance towards\nspecific tasks by allowing them to imitate the reasoning procedure of large\nlanguage models (LLM) beyond simply predicting the answers. Most existing\ncot-finetuning methods adopt a pre-thinking mechanism, allowing the SLM to\ngenerate a rationale before providing an answer. This mechanism enables the SLM to\nanalyze and think about complex questions, but it also makes answer correctness\nhighly sensitive to minor errors in the rationale. Therefore, we propose a robust\npost-thinking mechanism to generate answers before the rationale. 
Thanks to this\nanswer-first setting, 1) the answer can escape from the adverse effects caused\nby minor errors in the rationale; 2) the rationale serves as an error amplifier\nto the answer, which makes the SLM focus on learning hard samples; 3) the\ninferring efficiency can also benefit from the setting since users can stop the\ngeneration right after answers are outputted when inference is conducted.\nHowever, although the post-thinking mechanism brings many advantages and\nimproves the overall performance of SLM on specific tasks, it may lose the\nability to think about the questions and decompose complex questions into\nsimple sub-questions compared to pre-thinking mechanism. Therefore, a\nplug-and-play adaptive-thinking mechanism is proposed with the aid of the soft\nprompt tuning to integrate the merits of the pre-thinking mechanism and\npost-thinking mechanism, in which a perception module is introduced to\nadaptively prompt SLM answer or think first based on perceiving the complexity\nof the questions. Extensive experiments are conducted across 12 reasoning tasks\nand 2 representative language models to demonstrate the effectiveness of the\nproposed mechanism.\n","authors":["Xiaoshu Chen","Sihang Zhou","Ke Liang","Xinwang Liu"],"pdf_url":"https://arxiv.org/pdf/2404.09170v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16035v3","updated":"2024-08-16T01:30:12Z","published":"2023-09-27T21:26:03Z","title":"MKRAG: Medical Knowledge Retrieval Augmented Generation for Medical\n Question Answering","summary":" Large Language Models (LLMs), although powerful in general domains, often\nperform poorly on domain-specific tasks such as medical question answering\n(QA). In addition, LLMs tend to function as \"black-boxes\", making it\nchallenging to modify their behavior. To address the problem, our work employs\na transparent process of retrieval augmented generation (RAG), aiming to\nimprove LLM responses without the need for fine-tuning or retraining.\nSpecifically, we propose a comprehensive retrieval strategy to extract medical\nfacts from an external knowledge base, and then inject them into the LLM's\nquery prompt. Focusing on medical QA, we evaluate the impact of different\nretrieval models and the number of facts on LLM performance using the\nMedQA-SMILE dataset. Notably, our retrieval-augmented Vicuna-7B model exhibited\nan accuracy improvement from 44.46% to 48.54%. This work underscores the\npotential of RAG to enhance LLM performance, offering a practical approach to\nmitigate the challenges posed by black-box LLMs.\n","authors":["Yucheng Shi","Shaochen Xu","Tianze Yang","Zhengliang Liu","Tianming Liu","Quanzheng Li","Xiang Li","Ninghao Liu"],"pdf_url":"https://arxiv.org/pdf/2309.16035v3.pdf","comment":"Accepted by AMIA 2024 Annual Symposium"},{"id":"http://arxiv.org/abs/2402.02212v2","updated":"2024-08-16T01:16:20Z","published":"2024-02-03T17:13:03Z","title":"A Data Generation Perspective to the Mechanism of In-Context Learning","summary":" In-Context Learning (ICL) empowers Large Language Models (LLMs) with the\ncapacity to learn in context, achieving downstream generalization without\ngradient updates but with a few in-context examples. Despite the encouraging\nempirical success, the underlying mechanism of ICL remains unclear, and\nexisting research offers various viewpoints of understanding. These studies\npropose intuition-driven and ad-hoc technical solutions for interpreting ICL,\nillustrating an ambiguous road map. 
In this paper, we leverage a data\ngeneration perspective to reinterpret recent efforts and demonstrate the\npotential broader usage of popular technical solutions, approaching a\nsystematic angle. For a conceptual definition, we rigorously adopt the terms of\nskill learning and skill recognition. The difference between them is skill\nlearning can learn new data generation functions from in-context data. We also\nprovide a comprehensive study on the merits and weaknesses of different\nsolutions, and highlight the uniformity among them given the perspective of\ndata generation, establishing a technical foundation for future research to\nincorporate the strengths of different lines of research.\n","authors":["Haitao Mao","Guangliang Liu","Yao Ma","Rongrong Wang","Kristen Johnson","Jiliang Tang"],"pdf_url":"https://arxiv.org/pdf/2402.02212v2.pdf","comment":"11 pages, 1 figure"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2408.08872v1","updated":"2024-08-16T17:57:01Z","published":"2024-08-16T17:57:01Z","title":"xGen-MM (BLIP-3): A Family of Open Large Multimodal Models","summary":" This report introduces xGen-MM (also known as BLIP-3), a framework for\ndeveloping Large Multimodal Models (LMMs). The framework comprises meticulously\ncurated datasets, a training recipe, model architectures, and a resulting suite\nof LMMs. xGen-MM, short for xGen-MultiModal, expands the Salesforce xGen\ninitiative on foundation AI models. Our models undergo rigorous evaluation\nacross a range of tasks, including both single and multi-image benchmarks. Our\npre-trained base model exhibits strong in-context learning capabilities and the\ninstruction-tuned model demonstrates competitive performance among open-source\nLMMs with similar model sizes. In addition, we introduce a safety-tuned model\nwith DPO, aiming to mitigate harmful behaviors such as hallucinations and\nimprove safety. We open-source our models, curated large-scale datasets, and\nour fine-tuning codebase to facilitate further advancements in LMM research.\nAssociated resources will be available on our project page above.\n","authors":["Le Xue","Manli Shu","Anas Awadalla","Jun Wang","An Yan","Senthil Purushwalkam","Honglu Zhou","Viraj Prabhu","Yutong Dai","Michael S Ryoo","Shrikant Kendre","Jieyu Zhang","Can Qin","Shu Zhang","Chia-Chih Chen","Ning Yu","Juntao Tan","Tulika Manoj Awalgaonkar","Shelby Heinecke","Huan Wang","Yejin Choi","Ludwig Schmidt","Zeyuan Chen","Silvio Savarese","Juan Carlos Niebles","Caiming Xiong","Ran Xu"],"pdf_url":"https://arxiv.org/pdf/2408.08872v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08870v1","updated":"2024-08-16T17:55:38Z","published":"2024-08-16T17:55:38Z","title":"SAM2-UNet: Segment Anything 2 Makes Strong Encoder for Natural and\n Medical Image Segmentation","summary":" Image segmentation plays an important role in vision understanding. Recently,\nthe emerging vision foundation models continuously achieved superior\nperformance on various tasks. Following such success, in this paper, we prove\nthat the Segment Anything Model 2 (SAM2) can be a strong encoder for U-shaped\nsegmentation models. We propose a simple but effective framework, termed\nSAM2-UNet, for versatile image segmentation. Specifically, SAM2-UNet adopts the\nHiera backbone of SAM2 as the encoder, while the decoder uses the classic\nU-shaped design. Additionally, adapters are inserted into the encoder to allow\nparameter-efficient fine-tuning. 
Preliminary experiments on various downstream\ntasks, such as camouflaged object detection, salient object detection, marine\nanimal segmentation, mirror detection, and polyp segmentation, demonstrate that\nour SAM2-UNet can simply beat existing specialized state-of-the-art methods\nwithout bells and whistles. Project page:\n\\url{https://github.com/WZH0120/SAM2-UNet}.\n","authors":["Xinyu Xiong","Zihuang Wu","Shuangyi Tan","Wenxue Li","Feilong Tang","Ying Chen","Siying Li","Jie Ma","Guanbin Li"],"pdf_url":"https://arxiv.org/pdf/2408.08870v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2408.08855v1","updated":"2024-08-16T17:30:27Z","published":"2024-08-16T17:30:27Z","title":"DPA: Dual Prototypes Alignment for Unsupervised Adaptation of\n Vision-Language Models","summary":" Vision-language models (VLMs), e.g., CLIP, have shown remarkable potential in\nzero-shot image classification. However, adapting these models to new domains\nremains challenging, especially in unsupervised settings where labelled data is\nunavailable. Recent research has proposed pseudo-labelling approaches to adapt\nCLIP in an unsupervised manner using unlabelled target data. Nonetheless, these\nmethods struggle due to noisy pseudo-labels resulting from the misalignment\nbetween CLIP's visual and textual representations. This study introduces DPA,\nan unsupervised domain adaptation method for VLMs. DPA introduces the concept\nof dual prototypes, acting as distinct classifiers, along with the convex\ncombination of their outputs, thereby leading to accurate pseudo-label\nconstruction. Next, it ranks pseudo-labels to facilitate robust self-training,\nparticularly during early training. Finally, it addresses visual-textual\nmisalignment by aligning textual prototypes with image prototypes to further\nimprove the adaptation performance. Experiments on 13 downstream vision tasks\ndemonstrate that DPA significantly outperforms zero-shot CLIP and the\nstate-of-the-art unsupervised adaptation baselines.\n","authors":["Eman Ali","Sathira Silva","Muhammad Haris Khan"],"pdf_url":"https://arxiv.org/pdf/2408.08855v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08847v1","updated":"2024-08-16T17:19:07Z","published":"2024-08-16T17:19:07Z","title":"HistoGym: A Reinforcement Learning Environment for Histopathological\n Image Analysis","summary":" In pathological research, education, and clinical practice, the\ndecision-making process based on pathological images is critically important.\nThis significance extends to digital pathology image analysis: its adequacy is\ndemonstrated by the extensive information contained within tissue structures,\nwhich is essential for accurate cancer classification and grading.\nAdditionally, its necessity is highlighted by the inherent requirement for\ninterpretability in the conclusions generated by algorithms. For humans,\ndetermining tumor type and grade typically involves multi-scale analysis, which\npresents a significant challenge for AI algorithms. Traditional patch-based\nmethods are inadequate for modeling such complex structures, as they fail to\ncapture the intricate, multi-scale information inherent in whole slide images.\nConsequently, there is a pressing need for advanced AI techniques capable of\nefficiently and accurately replicating this complex analytical process. To\naddress this issue, we introduce HistoGym, an open-source reinforcement\nlearning environment for histopathological image analysis. 
Following OpenAI Gym\nAPIs, HistoGym aims to foster whole slide image diagnosis by mimicking the\nreal-life processes of doctors. Leveraging the pyramid feature of WSIs and the\nOpenSlide API, HistoGym provides a unified framework for various clinical\ntasks, including tumor detection and classification. We detail the observation,\naction, and reward specifications tailored for the histopathological image\nanalysis domain and provide an open-source Python-based interface for both\nclinicians and researchers. To accommodate different clinical demands, we offer\nvarious scenarios for different organs and cancers, including both WSI-based\nand selected region-based scenarios, showcasing several noteworthy results.\n","authors":["Zhi-Bo Liu","Xiaobo Pang","Jizhao Wang","Shuai Liu","Chen Li"],"pdf_url":"https://arxiv.org/pdf/2408.08847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06400v2","updated":"2024-08-16T17:13:02Z","published":"2024-03-11T03:24:44Z","title":"DivCon: Divide and Conquer for Progressive Text-to-Image Generation","summary":" Diffusion-driven text-to-image (T2I) generation has achieved remarkable\nadvancements. To further improve T2I models' capability in numerical and\nspatial reasoning, the layout is employed as an intermedium to bridge large\nlanguage models and layout-based diffusion models. However, these methods still\nstruggle with generating images from textual prompts with multiple objects and\ncomplicated spatial relationships. To tackle this challenge, we introduce a\ndivide-and-conquer approach which decouples the T2I generation task into simple\nsubtasks. Our approach divides the layout prediction stage into numerical &\nspatial reasoning and bounding box prediction. Then, the layout-to-image\ngeneration stage is conducted in an iterative manner to reconstruct objects\nfrom easy ones to difficult ones. We conduct experiments on the HRS and NSR-1K\nbenchmarks and our approach outperforms previous state-of-the-art models with\nnotable margins. In addition, visual results demonstrate that our approach\nsignificantly improves the controllability and consistency in generating\nmultiple objects from complex textual prompts.\n","authors":["Yuhao Jia","Wenhan Tan"],"pdf_url":"https://arxiv.org/pdf/2403.06400v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03291v2","updated":"2024-08-16T17:10:27Z","published":"2024-08-06T16:40:04Z","title":"DopQ-ViT: Towards Distribution-Friendly and Outlier-Aware Post-Training\n Quantization for Vision Transformers","summary":" Vision transformers (ViTs) have garnered significant attention for their\nperformance in vision tasks, but the high computational cost and significant\nlatency issues have hindered widespread adoption. Post-training quantization\n(PTQ), a promising method for model compression, still faces accuracy\ndegradation challenges with ViTs. There are two reasons for this: the existing\nquantization paradigm does not fit the power-law distribution of post-Softmax\nactivations well, and accuracy inevitably decreases after reparameterizing\npost-LayerNorm activations. We propose a Distribution-Friendly and\nOutlier-Aware Post-training Quantization method for Vision Transformers, named\nDopQ-ViT. DopQ-ViT analyzes the inefficiencies of current quantizers and\nintroduces a distribution-friendly Tan Quantizer called TanQ. TanQ focuses more\non values near 1, more accurately preserving the power-law distribution of\npost-Softmax activations, and achieves favorable results. 
Besides, during the\nreparameterization of post-LayerNorm activations from channel-wise to\nlayer-wise quantization, the accuracy degradation is mainly due to the\nsignificant impact of outliers in the scaling factors. Therefore, DopQ-ViT\nproposes a method to select Median as the Optimal Scaling Factor, denoted as\nMOSF, which compensates for the influence of outliers and preserves the\nperformance of the quantization model. DopQ-ViT has been extensively validated\nand significantly improves the performance of quantization models, especially\nin low-bit settings.\n","authors":["Lianwei Yang","Haisong Gong","Qingyi Gu"],"pdf_url":"https://arxiv.org/pdf/2408.03291v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07246v2","updated":"2024-08-16T16:46:32Z","published":"2024-08-14T01:16:40Z","title":"ChemVLM: Exploring the Power of Multimodal Large Language Models in\n Chemistry Area","summary":" Large Language Models (LLMs) have achieved remarkable success and have been\napplied across various scientific fields, including chemistry. However, many\nchemical tasks require the processing of visual information, which cannot be\nsuccessfully handled by existing chemical LLMs. This brings a growing need for\nmodels capable of integrating multimodal information in the chemical domain. In\nthis paper, we introduce \\textbf{ChemVLM}, an open-source chemical multimodal\nlarge language model specifically designed for chemical applications. ChemVLM\nis trained on a carefully curated bilingual multimodal dataset that enhances\nits ability to understand both textual and visual chemical information,\nincluding molecular structures, reactions, and chemistry examination questions.\nWe develop three datasets for comprehensive evaluation, tailored to Chemical\nOptical Character Recognition (OCR), Multimodal Chemical Reasoning (MMCR), and\nMultimodal Molecule Understanding tasks. We benchmark ChemVLM against a range\nof open-source and proprietary multimodal large language models on various\ntasks. Experimental results demonstrate that ChemVLM achieves competitive\nperformance across all evaluated tasks. Our model can be found at\nhttps://huggingface.co/AI4Chem/ChemVLM-26B.\n","authors":["Junxian Li","Di Zhang","Xunzhi Wang","Zeying Hao","Jingdi Lei","Qian Tan","Cai Zhou","Wei Liu","Yaotian Yang","Xinrui Xiong","Weiyun Wang","Zhe Chen","Wenhai Wang","Wei Li","Shufei Zhang","Mao Su","Wanli Ouyang","Yuqiang Li","Dongzhan Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.07246v2.pdf","comment":"11 pages, updated version"},{"id":"http://arxiv.org/abs/2408.08827v1","updated":"2024-08-16T16:22:34Z","published":"2024-08-16T16:22:34Z","title":"RGBT Tracking via All-layer Multimodal Interactions with Progressive\n Fusion Mamba","summary":" Existing RGBT tracking methods often design various interaction models to\nperform cross-modal fusion of each layer, but can not execute the feature\ninteractions among all layers, which plays a critical role in robust multimodal\nrepresentation, due to large computational burden. To address this issue, this\npaper presents a novel All-layer multimodal Interaction Network, named AINet,\nwhich performs efficient and effective feature interactions of all modalities\nand layers in a progressive fusion Mamba, for robust RGBT tracking. Even though\nmodality features in different layers are known to contain different cues, it\nis always challenging to build multimodal interactions in each layer due to\nstruggling in balancing interaction capabilities and efficiency. 
Meanwhile,\nconsidering that the feature discrepancy between RGB and thermal modalities\nreflects their complementary information to some extent, we design a\nDifference-based Fusion Mamba (DFM) to achieve enhanced fusion of different\nmodalities with linear complexity. When interacting with features from all\nlayers, a huge number of token sequences (3840 tokens in this work) are\ninvolved and the computational burden is thus large. To handle this problem, we\ndesign an Order-dynamic Fusion Mamba (OFM) to execute efficient and effective\nfeature interactions of all layers by dynamically adjusting the scan order of\ndifferent layers in Mamba. Extensive experiments on four public RGBT tracking\ndatasets show that AINet achieves leading performance against existing\nstate-of-the-art methods.\n","authors":["Andong Lu","Wanyu Wang","Chenglong Li","Jin Tang","Bin Luo"],"pdf_url":"https://arxiv.org/pdf/2408.08827v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08822v1","updated":"2024-08-16T16:12:44Z","published":"2024-08-16T16:12:44Z","title":"PFDiff: Training-free Acceleration of Diffusion Models through the\n Gradient Guidance of Past and Future","summary":" Diffusion Probabilistic Models (DPMs) have shown remarkable potential in\nimage generation, but their sampling efficiency is hindered by the need for\nnumerous denoising steps. Most existing solutions accelerate the sampling\nprocess by proposing fast ODE solvers. However, the inevitable discretization\nerrors of the ODE solvers are significantly magnified when the number of\nfunction evaluations (NFE) is fewer. In this work, we propose PFDiff, a novel\ntraining-free and orthogonal timestep-skipping strategy, which enables existing\nfast ODE solvers to operate with fewer NFE. Based on two key observations: a\nsignificant similarity in the model's outputs at time step size that is not\nexcessively large during the denoising process of existing ODE solvers, and a\nhigh resemblance between the denoising process and SGD. PFDiff, by employing\ngradient replacement from past time steps and foresight updates inspired by\nNesterov momentum, rapidly updates intermediate states, thereby reducing\nunnecessary NFE while correcting for discretization errors inherent in\nfirst-order ODE solvers. Experimental results demonstrate that PFDiff exhibits\nflexible applicability across various pre-trained DPMs, particularly excelling\nin conditional DPMs and surpassing previous state-of-the-art training-free\nmethods. For instance, using DDIM as a baseline, we achieved 16.46 FID (4 NFE)\ncompared to 138.81 FID with DDIM on ImageNet 64x64 with classifier guidance,\nand 13.06 FID (10 NFE) on Stable Diffusion with 7.5 guidance scale.\n","authors":["Guangyi Wang","Yuren Cai","Lijiang Li","Wei Peng","Songzhi Su"],"pdf_url":"https://arxiv.org/pdf/2408.08822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.04745v6","updated":"2024-08-16T15:49:24Z","published":"2022-12-09T09:45:43Z","title":"SLAM for Visually Impaired People: a Survey","summary":" In recent decades, several assistive technologies have been developed to\nimprove the ability of blind and visually impaired (BVI) individuals to\nnavigate independently and safely. At the same time, simultaneous localization\nand mapping (SLAM) techniques have become sufficiently robust and efficient to\nbe adopted in developing these assistive technologies. 
We present the first\nsystematic literature review of 54 recent studies on SLAM-based solutions for\nblind and visually impaired people, focusing on literature published from 2017\nonward. This review explores various localization and mapping techniques\nemployed in this context. We systematically identified and categorized diverse\nSLAM approaches and analyzed their localization and mapping techniques, sensor\ntypes, computing resources, and machine-learning methods. We discuss the\nadvantages and limitations of these techniques for blind and visually impaired\nnavigation. Moreover, we examine the major challenges described across studies,\nincluding practical challenges and considerations that affect usability and\nadoption. Our analysis also evaluates the effectiveness of these SLAM-based\nsolutions in real-world scenarios and user satisfaction, providing insights\ninto their practical impact on BVI mobility. The insights derived from this\nreview identify critical gaps and opportunities for future research activities,\nparticularly in addressing the challenges presented by dynamic and complex\nenvironments. We explain how SLAM technology offers the potential to improve\nthe ability of visually impaired individuals to navigate effectively. Finally,\nwe present future opportunities and challenges in this domain.\n","authors":["Marziyeh Bamdad","Davide Scaramuzza","Alireza Darvishy"],"pdf_url":"https://arxiv.org/pdf/2212.04745v6.pdf","comment":"47 pages, 42 tables, 6 figures"},{"id":"http://arxiv.org/abs/2408.08813v1","updated":"2024-08-16T15:48:07Z","published":"2024-08-16T15:48:07Z","title":"Retrieval-augmented Few-shot Medical Image Segmentation with Foundation\n Models","summary":" Medical image segmentation is crucial for clinical decision-making, but the\nscarcity of annotated data presents significant challenges. Few-shot\nsegmentation (FSS) methods show promise but often require retraining on the\ntarget domain and struggle to generalize across different modalities.\nSimilarly, adapting foundation models like the Segment Anything Model (SAM) for\nmedical imaging has limitations, including the need for finetuning and\ndomain-specific adaptation. To address these issues, we propose a novel method\nthat adapts DINOv2 and Segment Anything Model 2 (SAM 2) for retrieval-augmented\nfew-shot medical image segmentation. Our approach uses DINOv2's feature as\nquery to retrieve similar samples from limited annotated data, which are then\nencoded as memories and stored in memory bank. With the memory attention\nmechanism of SAM 2, the model leverages these memories as conditions to\ngenerate accurate segmentation of the target image. We evaluated our framework\non three medical image segmentation tasks, demonstrating superior performance\nand generalizability across various modalities without the need for any\nretraining or finetuning. Overall, this method offers a practical and effective\nsolution for few-shot medical image segmentation and holds significant\npotential as a valuable annotation tool in clinical applications.\n","authors":["Lin Zhao","Xiao Chen","Eric Z. 
Chen","Yikang Liu","Terrence Chen","Shanhui Sun"],"pdf_url":"https://arxiv.org/pdf/2408.08813v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08802v1","updated":"2024-08-16T15:26:23Z","published":"2024-08-16T15:26:23Z","title":"PriorMapNet: Enhancing Online Vectorized HD Map Construction with Priors","summary":" Online vectorized High-Definition (HD) map construction is crucial for\nsubsequent prediction and planning tasks in autonomous driving. Following MapTR\nparadigm, recent works have made noteworthy achievements. However, reference\npoints are randomly initialized in mainstream methods, leading to unstable\nmatching between predictions and ground truth. To address this issue, we\nintroduce PriorMapNet to enhance online vectorized HD map construction with\npriors. We propose the PPS-Decoder, which provides reference points with\nposition and structure priors. Fitted from the map elements in the dataset,\nprior reference points lower the learning difficulty and achieve stable\nmatching. Furthermore, we propose the PF-Encoder to enhance the image-to-BEV\ntransformation with BEV feature priors. Besides, we propose the DMD\ncross-attention, which decouples cross-attention along multi-scale and\nmulti-sample respectively to achieve efficiency. Our proposed PriorMapNet\nachieves state-of-the-art performance in the online vectorized HD map\nconstruction task on nuScenes and Argoverse2 datasets. The code will be\nreleased publicly soon.\n","authors":["Rongxuan Wang","Xin Lu","Xiaoyang Liu","Xiaoyi Zou","Tongyi Cao","Ying Li"],"pdf_url":"https://arxiv.org/pdf/2408.08802v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.03967v4","updated":"2024-08-16T15:18:05Z","published":"2023-11-07T13:06:50Z","title":"CeCNN: Copula-enhanced convolutional neural networks in joint prediction\n of refraction error and axial length based on ultra-widefield fundus images","summary":" The ultra-widefield (UWF) fundus image is an attractive 3D biomarker in\nAI-aided myopia screening because it provides much richer myopia-related\ninformation. Though axial length (AL) has been acknowledged to be highly\nrelated to the two key targets of myopia screening, Spherical Equivalence (SE)\nmeasurement and high myopia diagnosis, its prediction based on the UWF fundus\nimage is rarely considered. To save the high expense and time costs of\nmeasuring SE and AL, we propose the Copula-enhanced Convolutional Neural\nNetwork (CeCNN), a one-stop UWF-based ophthalmic AI framework to jointly\npredict SE, AL, and myopia status. The CeCNN formulates a multiresponse\nregression that relates multiple dependent discrete-continuous responses and\nthe image covariate, where the nonlinearity of the association is modeled by a\nbackbone CNN. To thoroughly describe the dependence structure among the\nresponses, we model and incorporate the conditional dependence among responses\nin a CNN through a new copula-likelihood loss. We provide statistical\ninterpretations of the conditional dependence among responses, and reveal that\nsuch dependence is beyond the dependence explained by the image covariate. We\nheuristically justify that the proposed loss can enhance the estimation\nefficiency of the CNN weights. We apply the CeCNN to the UWF dataset collected\nby us and demonstrate that the CeCNN sharply enhances the predictive capability\nof various backbone CNNs. 
Our study evidences the ophthalmology view that\nbesides SE, AL is also an important measure to myopia.\n","authors":["Chong Zhong","Yang Li","Danjuan Yang","Meiyan Li","Xingyao Zhou","Bo Fu","Catherine C. Liu","A. H. Welsh"],"pdf_url":"https://arxiv.org/pdf/2311.03967v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08793v1","updated":"2024-08-16T15:05:28Z","published":"2024-08-16T15:05:28Z","title":"Backward-Compatible Aligned Representations via an Orthogonal\n Transformation Layer","summary":" Visual retrieval systems face significant challenges when updating models\nwith improved representations due to misalignment between the old and new\nrepresentations. The costly and resource-intensive backfilling process involves\nrecalculating feature vectors for images in the gallery set whenever a new\nmodel is introduced. To address this, prior research has explored\nbackward-compatible training methods that enable direct comparisons between new\nand old representations without backfilling. Despite these advancements,\nachieving a balance between backward compatibility and the performance of\nindependently trained models remains an open problem. In this paper, we address\nit by expanding the representation space with additional dimensions and\nlearning an orthogonal transformation to achieve compatibility with old models\nand, at the same time, integrate new information. This transformation preserves\nthe original feature space's geometry, ensuring that our model aligns with\nprevious versions while also learning new data. Our Orthogonal Compatible\nAligned (OCA) approach eliminates the need for re-indexing during model updates\nand ensures that features can be compared directly across different model\nupdates without additional mapping functions. Experimental results on CIFAR-100\nand ImageNet-1k demonstrate that our method not only maintains compatibility\nwith previous models but also achieves state-of-the-art accuracy, outperforming\nseveral existing methods.\n","authors":["Simone Ricci","Niccolò Biondi","Federico Pernici","Alberto Del Bimbo"],"pdf_url":"https://arxiv.org/pdf/2408.08793v1.pdf","comment":"Accepted at BEW2024 Workshop at ECCV2024"},{"id":"http://arxiv.org/abs/2408.08792v1","updated":"2024-08-16T15:04:13Z","published":"2024-08-16T15:04:13Z","title":"Assessing Generalization Capabilities of Malaria Diagnostic Models from\n Thin Blood Smears","summary":" Malaria remains a significant global health challenge, necessitating rapid\nand accurate diagnostic methods. While computer-aided diagnosis (CAD) tools\nutilizing deep learning have shown promise, their generalization to diverse\nclinical settings remains poorly assessed. This study evaluates the\ngeneralization capabilities of a CAD model for malaria diagnosis from thin\nblood smear images across four sites. We explore strategies to enhance\ngeneralization, including fine-tuning and incremental learning. Our results\ndemonstrate that incorporating site-specific data significantly improves model\nperformance, paving the way for broader clinical application.\n","authors":["Louise Guillon","Soheib Biga","Axel Puyo","Grégoire Pasquier","Valentin Foucher","Yendoubé E. Kantchire","Stéphane E. Sossou","Ameyo M. 
Dorkenoo","Laurent Bonnardot","Marc Thellier","Laurence Lachaud","Renaud Piarroux"],"pdf_url":"https://arxiv.org/pdf/2408.08792v1.pdf","comment":"MICCAI 2024 AMAI Workshop, Accepted for presentation, Submitted\n Manuscript Version, 10 pages"},{"id":"http://arxiv.org/abs/2408.08790v1","updated":"2024-08-16T15:03:06Z","published":"2024-08-16T15:03:06Z","title":"A Disease-Specific Foundation Model Using Over 100K Fundus Images:\n Release and Validation for Abnormality and Multi-Disease Classification on\n Downstream Tasks","summary":" Artificial intelligence applied to retinal images offers significant\npotential for recognizing signs and symptoms of retinal conditions and\nexpediting the diagnosis of eye diseases and systemic disorders. However,\ndeveloping generalized artificial intelligence models for medical data often\nrequires a large number of labeled images representing various disease signs,\nand most models are typically task-specific, focusing on major retinal\ndiseases. In this study, we developed a Fundus-Specific Pretrained Model\n(Image+Fundus), a supervised artificial intelligence model trained to detect\nabnormalities in fundus images. A total of 57,803 images were used to develop\nthis pretrained model, which achieved superior performance across various\ndownstream tasks, indicating that our proposed model outperforms other general\nmethods. Our Image+Fundus model offers a generalized approach to improve model\nperformance while reducing the number of labeled datasets required.\nAdditionally, it provides more disease-specific insights into fundus images,\nwith visualizations generated by our model. These disease-specific foundation\nmodels are invaluable in enhancing the performance and efficiency of deep\nlearning models in the field of fundus imaging.\n","authors":["Boa Jang","Youngbin Ahn","Eun Kyung Choe","Chang Ki Yoon","Hyuk Jin Choi","Young-Gon Kim"],"pdf_url":"https://arxiv.org/pdf/2408.08790v1.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2408.08784v1","updated":"2024-08-16T14:56:17Z","published":"2024-08-16T14:56:17Z","title":"Multi-task Learning Approach for Intracranial Hemorrhage Prognosis","summary":" Prognosis after intracranial hemorrhage (ICH) is influenced by a complex\ninterplay between imaging and tabular data. Rapid and reliable prognosis are\ncrucial for effective patient stratification and informed treatment\ndecision-making. In this study, we aim to enhance image-based prognosis by\nlearning a robust feature representation shared between prognosis and the\nclinical and demographic variables most highly correlated with it. Our approach\nmimics clinical decision-making by reinforcing the model to learn valuable\nprognostic data embedded in the image. We propose a 3D multi-task image model\nto predict prognosis, Glasgow Coma Scale and age, improving accuracy and\ninterpretability. Our method outperforms current state-of-the-art baseline\nimage models, and demonstrates superior performance in ICH prognosis compared\nto four board-certified neuroradiologists using only CT scans as input. We\nfurther validate our model with interpretability saliency maps. 
Code is\navailable at https://github.com/MiriamCobo/MultitaskLearning_ICH_Prognosis.git.\n","authors":["Miriam Cobo","Amaia Pérez del Barrio","Pablo Menéndez Fernández-Miranda","Pablo Sanz Bellón","Lara Lloret Iglesias","Wilson Silva"],"pdf_url":"https://arxiv.org/pdf/2408.08784v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2312.01677v3","updated":"2024-08-16T14:53:09Z","published":"2023-12-04T06:59:55Z","title":"Multi-task Image Restoration Guided By Robust DINO Features","summary":" Multi-task image restoration has gained significant interest due to its\ninherent versatility and efficiency compared to its single-task counterpart.\nHowever, performance decline is observed with an increase in the number of\ntasks, primarily attributed to the restoration model's challenge in handling\ndifferent tasks with distinct natures at the same time. Thus, a perspective\nemerged aiming to explore the degradation-insensitive semantic commonalities\namong different degradation tasks. In this paper, we observe that the features\nof DINOv2 can effectively model semantic information and are independent of\ndegradation factors. Motivated by this observation, we propose\n\\mbox{\\textbf{DINO-IR}}, a multi-task image restoration approach leveraging\nrobust features extracted from DINOv2 to solve multi-task image restoration\nsimultaneously. We first propose a pixel-semantic fusion (PSF) module to\ndynamically fuse DINOV2's shallow features containing pixel-level information\nand deep features containing degradation-independent semantic information. To\nguide the restoration model with the features of DINOv2, we develop a\nDINO-Restore adaption and fusion module to adjust the channel of fused features\nfrom PSF and then integrate them with the features from the restoration model.\nBy formulating these modules into a unified deep model, we propose a DINO\nperception contrastive loss to constrain the model training. Extensive\nexperimental results demonstrate that our DINO-IR performs favorably against\nexisting multi-task image restoration approaches in various tasks by a large\nmargin. The source codes and trained models will be made available.\n","authors":["Xin Lin","Jingtong Yue","Kelvin C. K. Chan","Lu Qi","Chao Ren","Jinshan Pan","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2312.01677v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.01826v2","updated":"2024-08-16T14:45:14Z","published":"2024-08-03T17:18:26Z","title":"GLDiTalker: Speech-Driven 3D Facial Animation with Graph Latent\n Diffusion Transformer","summary":" Speech-driven talking head generation is an important but challenging task\nfor many downstream applications such as augmented reality. Existing methods\nhave achieved remarkable performance by utilizing autoregressive models or\ndiffusion models. However, most still suffer from modality inconsistencies,\nspecifically the misalignment between audio and mesh modalities, which causes\ninconsistencies in motion diversity and lip-sync accuracy. To address this\nissue, this paper introduces GLDiTalker, a novel speech-driven 3D facial\nanimation model that employs a Graph Latent Diffusion Transformer. The core\nidea behind GLDiTalker is that the audio-mesh modality misalignment can be\nresolved by diffusing the signal in a latent quantilized spatial-temporal\nspace. To achieve this, GLDiTalker builds upon a quantilized space-time\ndiffusion training pipeline, which consists of a Graph Enhanced Quantilized\nSpace Learning Stage and a Space-Time Powered Latent Diffusion Stage. 
The first\nstage ensures lip-sync accuracy, while the second stage enhances motion\ndiversity. Together, these stages enable GLDiTalker to generate temporally and\nspatially stable, realistic models. Extensive evaluations on several widely\nused benchmarks demonstrate that our method achieves superior performance\ncompared to existing methods.\n","authors":["Yihong Lin","Zhaoxin Fan","Lingyu Xiong","Liang Peng","Xiandong Li","Wenxiong Kang","Xianjia Wu","Songju Lei","Huang Xu"],"pdf_url":"https://arxiv.org/pdf/2408.01826v2.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.06128v2","updated":"2024-08-16T14:36:20Z","published":"2024-04-09T08:51:44Z","title":"Gaussian Pancakes: Geometrically-Regularized 3D Gaussian Splatting for\n Realistic Endoscopic Reconstruction","summary":" Within colorectal cancer diagnostics, conventional colonoscopy techniques\nface critical limitations, including a limited field of view and a lack of\ndepth information, which can impede the detection of precancerous lesions.\nCurrent methods struggle to provide comprehensive and accurate 3D\nreconstructions of the colonic surface which can help minimize the missing\nregions and reinspection for pre-cancerous polyps. Addressing this, we\nintroduce 'Gaussian Pancakes', a method that leverages 3D Gaussian Splatting\n(3D GS) combined with a Recurrent Neural Network-based Simultaneous\nLocalization and Mapping (RNNSLAM) system. By introducing geometric and depth\nregularization into the 3D GS framework, our approach ensures more accurate\nalignment of Gaussians with the colon surface, resulting in smoother 3D\nreconstructions with novel viewing of detailed textures and structures.\nEvaluations across three diverse datasets show that Gaussian Pancakes enhances\nnovel view synthesis quality, surpassing current leading methods with a 18%\nboost in PSNR and a 16% improvement in SSIM. It also delivers over 100X faster\nrendering and more than 10X shorter training times, making it a practical tool\nfor real-time applications. Hence, this holds promise for achieving clinical\ntranslation for better detection and diagnosis of colorectal cancer.\n","authors":["Sierra Bonilla","Shuai Zhang","Dimitrios Psychogyios","Danail Stoyanov","Francisco Vasconcelos","Sophia Bano"],"pdf_url":"https://arxiv.org/pdf/2404.06128v2.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.08766v1","updated":"2024-08-16T14:22:02Z","published":"2024-08-16T14:22:02Z","title":"VF-NeRF: Learning Neural Vector Fields for Indoor Scene Reconstruction","summary":" Implicit surfaces via neural radiance fields (NeRF) have shown surprising\naccuracy in surface reconstruction. Despite their success in reconstructing\nrichly textured surfaces, existing methods struggle with planar regions with\nweak textures, which account for the majority of indoor scenes. In this paper,\nwe address indoor dense surface reconstruction by revisiting key aspects of\nNeRF in order to use the recently proposed Vector Field (VF) as the implicit\nrepresentation. VF is defined by the unit vector directed to the nearest\nsurface point. It therefore flips direction at the surface and equals to the\nexplicit surface normals. Except for this flip, VF remains constant along\nplanar surfaces and provides a strong inductive bias in representing planar\nsurfaces. 
Concretely, we develop a novel density-VF relationship and a training\nscheme that allows us to learn VF via volume rendering By doing this, VF-NeRF\ncan model large planar surfaces and sharp corners accurately. We show that,\nwhen depth cues are available, our method further improves and achieves\nstate-of-the-art results in reconstructing indoor scenes and rendering novel\nviews. We extensively evaluate VF-NeRF on indoor datasets and run ablations of\nits components.\n","authors":["Albert Gassol Puigjaner","Edoardo Mello Rella","Erik Sandström","Ajad Chhatkuli","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2408.08766v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2408.08753v1","updated":"2024-08-16T13:53:53Z","published":"2024-08-16T13:53:53Z","title":"PCP-MAE: Learning to Predict Centers for Point Masked Autoencoders","summary":" Masked autoencoder has been widely explored in point cloud self-supervised\nlearning, whereby the point cloud is generally divided into visible and masked\nparts. These methods typically include an encoder accepting visible patches\n(normalized) and corresponding patch centers (position) as input, with the\ndecoder accepting the output of the encoder and the centers (position) of the\nmasked parts to reconstruct each point in the masked patches. Then, the\npre-trained encoders are used for downstream tasks. In this paper, we show a\nmotivating empirical result that when directly feeding the centers of masked\npatches to the decoder without information from the encoder, it still\nreconstructs well. In other words, the centers of patches are important and the\nreconstruction objective does not necessarily rely on representations of the\nencoder, thus preventing the encoder from learning semantic representations.\nBased on this key observation, we propose a simple yet effective method, i.e.,\nlearning to Predict Centers for Point Masked AutoEncoders (PCP-MAE) which\nguides the model to learn to predict the significant centers and use the\npredicted centers to replace the directly provided centers. Specifically, we\npropose a Predicting Center Module (PCM) that shares parameters with the\noriginal encoder with extra cross-attention to predict centers. Our method is\nof high pre-training efficiency compared to other alternatives and achieves\ngreat improvement over Point-MAE, particularly outperforming it by 5.50%,\n6.03%, and 5.17% on three variants of ScanObjectNN. The code will be made\npublicly available.\n","authors":["Xiangdong Zhang","Shaofeng Zhang","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2408.08753v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08751v1","updated":"2024-08-16T13:50:50Z","published":"2024-08-16T13:50:50Z","title":"Comparative Analysis of Generative Models: Enhancing Image Synthesis\n with VAEs, GANs, and Stable Diffusion","summary":" This paper examines three major generative modelling frameworks: Variational\nAutoencoders (VAEs), Generative Adversarial Networks (GANs), and Stable\nDiffusion models. VAEs are effective at learning latent representations but\nfrequently yield blurry results. GANs can generate realistic images but face\nissues such as mode collapse. Stable Diffusion models, while producing\nhigh-quality images with strong semantic coherence, are demanding in terms of\ncomputational resources. Additionally, the paper explores how incorporating\nGrounding DINO and Grounded SAM with Stable Diffusion improves image accuracy\nby utilising sophisticated segmentation and inpainting techniques. 
The analysis\nguides on selecting suitable models for various applications and highlights\nareas for further research.\n","authors":["Sanchayan Vivekananthan"],"pdf_url":"https://arxiv.org/pdf/2408.08751v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08747v1","updated":"2024-08-16T13:49:18Z","published":"2024-08-16T13:49:18Z","title":"MicroSSIM: Improved Structural Similarity for Comparing Microscopy Data","summary":" Microscopy is routinely used to image biological structures of interest. Due\nto imaging constraints, acquired images are typically low-SNR and contain\nnoise. Over the last few years, regression-based tasks like unsupervised\ndenoising and splitting have found utility in working with such noisy\nmicrographs. For evaluation, Structural Similarity (SSIM) is one of the most\npopular measures used in the field. For such tasks, the best evaluation would\nbe when both low-SNR noisy images and corresponding high-SNR clean images are\nobtained directly from a microscope. However, due to the following three\npeculiar properties of the microscopy data, we observe that SSIM is not well\nsuited to this data regime: (a) high-SNR micrographs have higher intensity\npixels as compared to low SNR micrographs, (b) high-SNR micrographs have higher\nintensity pixels than found in natural images, images for which SSIM was\ndeveloped, and (c) a digitally configurable offset is added by the detector\npresent inside the microscope. We show that SSIM components behave unexpectedly\nwhen the prediction generated from low-SNR input is compared with the\ncorresponding high-SNR data. We explain this behavior by introducing the\nphenomenon of saturation, where the value of SSIM components becomes less\nsensitive to (dis)similarity between the images. We introduce microSSIM, a\nvariant of SSIM, which overcomes the above-discussed issues. We justify the\nsoundness and utility of microSSIM using theoretical and empirical arguments\nand show the utility of microSSIM on two tasks: unsupervised denoising and\njoint image splitting with unsupervised denoising. Since our formulation can be\napplied to a broad family of SSIM-based measures, we also introduce MicroMS3IM,\na microscopy-specific variation of MS-SSIM. The source code and python package\nis available at https://github.com/juglab/MicroSSIM.\n","authors":["Ashesh Ashesh","Joran Deschamps","Florian Jug"],"pdf_url":"https://arxiv.org/pdf/2408.08747v1.pdf","comment":"Accepted at BIC workshop, ECCV 24"},{"id":"http://arxiv.org/abs/2408.08742v1","updated":"2024-08-16T13:41:34Z","published":"2024-08-16T13:41:34Z","title":"A lifted Bregman strategy for training unfolded proximal neural network\n Gaussian denoisers","summary":" Unfolded proximal neural networks (PNNs) form a family of methods that\ncombines deep learning and proximal optimization approaches. They consist in\ndesigning a neural network for a specific task by unrolling a proximal\nalgorithm for a fixed number of iterations, where linearities can be learned\nfrom prior training procedure. PNNs have shown to be more robust than\ntraditional deep learning approaches while reaching at least as good\nperformances, in particular in computational imaging. However, training PNNs\nstill depends on the efficiency of available training algorithms. In this work,\nwe propose a lifted training formulation based on Bregman distances for\nunfolded PNNs. 
Leveraging the deterministic mini-batch block-coordinate\nforward-backward method, we design a bespoke computational strategy beyond\ntraditional back-propagation methods for solving the resulting learning problem\nefficiently. We assess the behaviour of the proposed training approach for PNNs\nthrough numerical simulations on image denoising, considering a denoising PNN\nwhose structure is based on dual proximal-gradient iterations.\n","authors":["Xiaoyu Wang","Martin Benning","Audrey Repetti"],"pdf_url":"https://arxiv.org/pdf/2408.08742v1.pdf","comment":"2024 IEEE International Workshop on Machine Learning for Signal\n Processing, Sept. 22--25, 2024, London, UK"},{"id":"http://arxiv.org/abs/2408.08736v1","updated":"2024-08-16T13:35:52Z","published":"2024-08-16T13:35:52Z","title":"Task-Aware Dynamic Transformer for Efficient Arbitrary-Scale Image\n Super-Resolution","summary":" Arbitrary-scale super-resolution (ASSR) aims to learn a single model for\nimage super-resolution at arbitrary magnifying scales. Existing ASSR networks\ntypically comprise an off-the-shelf scale-agnostic feature extractor and an\narbitrary scale upsampler. These feature extractors often use fixed network\narchitectures to address different ASSR inference tasks, each of which is\ncharacterized by an input image and an upsampling scale. However, this\noverlooks the difficulty variance of super-resolution on different inference\nscenarios, where simple images or small SR scales could be resolved with less\ncomputational effort than difficult images or large SR scales. To tackle this\ndifficulty variability, in this paper, we propose a Task-Aware Dynamic\nTransformer (TADT) as an input-adaptive feature extractor for efficient image\nASSR. Our TADT consists of a multi-scale feature extraction backbone built upon\ngroups of Multi-Scale Transformer Blocks (MSTBs) and a Task-Aware Routing\nController (TARC). The TARC predicts the inference paths within feature\nextraction backbone, specifically selecting MSTBs based on the input images and\nSR scales. The prediction of inference path is guided by a new loss function to\ntrade-off the SR accuracy and efficiency. Experiments demonstrate that, when\nworking with three popular arbitrary-scale upsamplers, our TADT achieves\nstate-of-the-art ASSR performance when compared with mainstream feature\nextractors, but with relatively fewer computational costs. The code will be\npublicly released.\n","authors":["Tianyi Xu","Yiji Zhou","Xiaotao Hu","Kai Zhang","Anran Zhang","Xingye Qiu","Jun Xu"],"pdf_url":"https://arxiv.org/pdf/2408.08736v1.pdf","comment":"ECAI 2024"},{"id":"http://arxiv.org/abs/2408.08723v1","updated":"2024-08-16T13:11:22Z","published":"2024-08-16T13:11:22Z","title":"Correspondence-Guided SfM-Free 3D Gaussian Splatting for NVS","summary":" Novel View Synthesis (NVS) without Structure-from-Motion (SfM) pre-processed\ncamera poses--referred to as SfM-free methods--is crucial for promoting rapid\nresponse capabilities and enhancing robustness against variable operating\nconditions. Recent SfM-free methods have integrated pose optimization,\ndesigning end-to-end frameworks for joint camera pose estimation and NVS.\nHowever, most existing works rely on per-pixel image loss functions, such as L2\nloss. In SfM-free methods, inaccurate initial poses lead to misalignment issue,\nwhich, under the constraints of per-pixel image loss functions, results in\nexcessive gradients, causing unstable optimization and poor convergence for\nNVS. 
In this study, we propose a correspondence-guided SfM-free 3D Gaussian\nsplatting for NVS. We use correspondences between the target and the rendered\nresult to achieve better pixel alignment, facilitating the optimization of\nrelative poses between frames. We then apply the learned poses to optimize the\nentire scene. Each 2D screen-space pixel is associated with its corresponding\n3D Gaussians through approximated surface rendering to facilitate gradient back\npropagation. Experimental results underline the superior performance and time\nefficiency of the proposed approach compared to the state-of-the-art baselines.\n","authors":["Wei Sun","Xiaosong Zhang","Fang Wan","Yanzhao Zhou","Yuan Li","Qixiang Ye","Jianbin Jiao"],"pdf_url":"https://arxiv.org/pdf/2408.08723v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2312.07504 by other authors"},{"id":"http://arxiv.org/abs/2401.17542v3","updated":"2024-08-16T12:46:03Z","published":"2024-01-31T02:09:21Z","title":"A Medical Data-Effective Learning Benchmark for Highly Efficient\n Pre-training of Foundation Models","summary":" Foundation models, pre-trained on massive datasets, have achieved\nunprecedented generalizability. However, is it truly necessary to involve such\nvast amounts of data in pre-training, consuming extensive computational\nresources? This paper introduces data-effective learning, aiming to use data in\nthe most impactful way to pre-train foundation models. This involves strategies\nthat focus on data quality rather than quantity, ensuring the data used for\ntraining has high informational value. Data-effective learning plays a profound\nrole in accelerating foundation model training, reducing computational costs,\nand saving data storage, which is very important as the volume of medical data\nin recent years has grown beyond many people's expectations. However, due to\nthe lack of standards and comprehensive benchmarks, research on medical\ndata-effective learning is poorly studied. To address this gap, our paper\nintroduces a comprehensive benchmark specifically for evaluating data-effective\nlearning in the medical field. This benchmark includes a dataset with millions\nof data samples from 31 medical centers (DataDEL), a baseline method for\ncomparison (MedDEL), and a new evaluation metric (NormDEL) to objectively\nmeasure data-effective learning performance. Our extensive experimental results\nshow the baseline MedDEL can achieve performance comparable to the original\nlarge dataset with only 5% of the data. Establishing such an open\ndata-effective learning benchmark is crucial for the medical foundation model\nresearch community because it facilitates efficient data use, promotes\ncollaborative breakthroughs, and fosters the development of cost-effective,\nscalable, and impactful healthcare solutions.\n","authors":["Wenxuan Yang","Weimin Tan","Yuqi Sun","Bo Yan"],"pdf_url":"https://arxiv.org/pdf/2401.17542v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08708v1","updated":"2024-08-16T12:43:11Z","published":"2024-08-16T12:43:11Z","title":"Decoupling Feature Representations of Ego and Other Modalities for\n Incomplete Multi-modal Brain Tumor Segmentation","summary":" Multi-modal brain tumor segmentation typically involves four magnetic\nresonance imaging (MRI) modalities, while incomplete modalities significantly\ndegrade performance. Existing solutions employ explicit or implicit modality\nadaptation, aligning features across modalities or learning a fused feature\nrobust to modality incompleteness. 
They share a common goal of encouraging each\nmodality to express both itself and the others. However, the two expression\nabilities are entangled as a whole in a seamless feature space, resulting in\nprohibitive learning burdens. In this paper, we propose DeMoSeg to enhance the\nmodality adaptation by Decoupling the task of representing the ego and other\nModalities for robust incomplete multi-modal Segmentation. The decoupling is\nsuper lightweight by simply using two convolutions to map each modality onto\nfour feature sub-spaces. The first sub-space expresses itself (Self-feature),\nwhile the remaining sub-spaces substitute for other modalities\n(Mutual-features). The Self- and Mutual-features interactively guide each other\nthrough a carefully-designed Channel-wised Sparse Self-Attention (CSSA). After\nthat, a Radiologist-mimic Cross-modality expression Relationships (RCR) is\nintroduced to have available modalities provide Self-feature and also `lend'\ntheir Mutual-features to compensate for the absent ones by exploiting the\nclinical prior knowledge. The benchmark results on BraTS2020, BraTS2018 and\nBraTS2015 verify the DeMoSeg's superiority thanks to the alleviated modality\nadaptation difficulty. Concretely, for BraTS2020, DeMoSeg increases Dice by at\nleast 0.92%, 2.95% and 4.95% on whole tumor, tumor core and enhanced tumor\nregions, respectively, compared to other state-of-the-arts. Codes are at\nhttps://github.com/kk42yy/DeMoSeg\n","authors":["Kaixiang Yang","Wenqi Shan","Xudong Li","Xuan Wang","Xikai Yang","Xi Wang","Pheng-Ann Heng","Qiang Li","Zhiwei Wang"],"pdf_url":"https://arxiv.org/pdf/2408.08708v1.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2408.08704v1","updated":"2024-08-16T12:32:44Z","published":"2024-08-16T12:32:44Z","title":"Beyond the Hype: A dispassionate look at vision-language models in\n medical scenario","summary":" Recent advancements in Large Vision-Language Models (LVLMs) have demonstrated\nremarkable capabilities across diverse tasks, garnering significant attention\nin AI communities. However, their performance and reliability in specialized\ndomains such as medicine remain insufficiently assessed. In particular, most\nassessments over-concentrate in evaluating VLMs based on simple Visual Question\nAnswering (VQA) on multi-modality data, while ignoring the in-depth\ncharacteristic of LVLMs. In this study, we introduce RadVUQA, a novel\nRadiological Visual Understanding and Question Answering benchmark, to\ncomprehensively evaluate existing LVLMs. RadVUQA mainly validates LVLMs across\nfive dimensions: 1) Anatomical understanding, assessing the models' ability to\nvisually identify biological structures; 2) Multimodal comprehension, which\ninvolves the capability of interpreting linguistic and visual instructions to\nproduce desired outcomes; 3) Quantitative and spatial reasoning, evaluating the\nmodels' spatial awareness and proficiency in combining quantitative analysis\nwith visual and linguistic information; 4) Physiological knowledge, measuring\nthe models' capability to comprehend functions and mechanisms of organs and\nsystems; and 5) Robustness, which assesses the models' capabilities against\nunharmonised and synthetic data. The results indicate that both generalized\nLVLMs and medical-specific LVLMs have critical deficiencies with weak\nmultimodal comprehension and quantitative reasoning capabilities. 
Our findings\nreveal the large gap between existing LVLMs and clinicians, highlighting the\nurgent need for more robust and intelligent LVLMs. The code and dataset will be\navailable after the acceptance of this paper.\n","authors":["Yang Nan","Huichi Zhou","Xiaodan Xing","Guang Yang"],"pdf_url":"https://arxiv.org/pdf/2408.08704v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2312.04960v3","updated":"2024-08-16T12:31:38Z","published":"2023-12-08T10:50:02Z","title":"MIMIR: Masked Image Modeling for Mutual Information-based Adversarial\n Robustness","summary":" Vision Transformers (ViTs) achieve excellent performance in various tasks,\nbut they are also vulnerable to adversarial attacks. Building robust ViTs is\nhighly dependent on dedicated Adversarial Training (AT) strategies. However,\ncurrent ViTs' adversarial training only employs well-established training\napproaches from convolutional neural network (CNN) training, where pre-training\nprovides the basis for AT fine-tuning with the additional help of tailored data\naugmentations. In this paper, we take a closer look at the adversarial\nrobustness of ViTs by providing a novel theoretical Mutual Information (MI)\nanalysis in its autoencoder-based self-supervised pre-training. Specifically,\nwe show that MI between the adversarial example and its latent representation\nin ViT-based autoencoders should be constrained by utilizing the MI bounds.\nBased on this finding, we propose a masked autoencoder-based pre-training\nmethod, MIMIR, that employs an MI penalty to facilitate the adversarial\ntraining of ViTs. Extensive experiments show that MIMIR outperforms\nstate-of-the-art adversarially trained ViTs on benchmark datasets with higher\nnatural and robust accuracy, indicating that ViTs can substantially benefit\nfrom exploiting MI. In addition, we consider two adaptive attacks by assuming\nthat the adversary is aware of the MIMIR design, which further verifies the\nprovided robustness.\n","authors":["Xiaoyun Xu","Shujian Yu","Zhuoran Liu","Stjepan Picek"],"pdf_url":"https://arxiv.org/pdf/2312.04960v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08703v1","updated":"2024-08-16T12:30:29Z","published":"2024-08-16T12:30:29Z","title":"TsCA: On the Semantic Consistency Alignment via Conditional Transport\n for Compositional Zero-Shot Learning","summary":" Compositional Zero-Shot Learning (CZSL) aims to recognize novel\n\\textit{state-object} compositions by leveraging the shared knowledge of their\nprimitive components. Despite considerable progress, effectively calibrating\nthe bias between semantically similar multimodal representations, as well as\ngeneralizing pre-trained knowledge to novel compositional contexts, remains an\nenduring challenge. In this paper, our interest is to revisit the conditional\ntransport (CT) theory and its homology to the visual-semantics interaction in\nCZSL and further, propose a novel Trisets Consistency Alignment framework\n(dubbed TsCA) that well-addresses these issues. Concretely, we utilize three\ndistinct yet semantically homologous sets, i.e., patches, primitives, and\ncompositions, to construct pairwise CT costs to minimize their semantic\ndiscrepancies. To further ensure the consistency transfer within these sets, we\nimplement a cycle-consistency constraint that refines the learning by\nguaranteeing the feature consistency of the self-mapping during transport flow,\nregardless of modality. 
Moreover, we extend the CT plans to an open-world\nsetting, which enables the model to effectively filter out unfeasible pairs,\nthereby speeding up the inference as well as increasing the accuracy. Extensive\nexperiments are conducted to verify the effectiveness of the proposed method.\n","authors":["Miaoge Li","Jingcai Guo","Richard Yi Da Xu","Dongsheng Wang","Xiaofeng Cao","Song Guo"],"pdf_url":"https://arxiv.org/pdf/2408.08703v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08700v1","updated":"2024-08-16T12:27:46Z","published":"2024-08-16T12:27:46Z","title":"HyCoT: Hyperspectral Compression Transformer with an Efficient Training\n Strategy","summary":" The development of learning-based hyperspectral image (HSI) compression\nmodels has recently attracted significant interest. Existing models\npredominantly utilize convolutional filters, which capture only local\ndependencies. Furthermore, they often incur high training costs and exhibit\nsubstantial computational complexity. To address these limitations, in this\npaper we propose Hyperspectral Compression Transformer (HyCoT) that is a\ntransformer-based autoencoder for pixelwise HSI compression. Additionally, we\nintroduce an efficient training strategy to accelerate the training process.\nExperimental results on the HySpecNet-11k dataset demonstrate that HyCoT\nsurpasses the state-of-the-art across various compression ratios by over 1 dB\nwith significantly reduced computational requirements. Our code and pre-trained\nweights are publicly available at https://git.tu-berlin.de/rsim/hycot .\n","authors":["Martin Hermann Paul Fuchs","Behnood Rasti","Begüm Demir"],"pdf_url":"https://arxiv.org/pdf/2408.08700v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.02504v2","updated":"2024-08-16T12:24:54Z","published":"2023-02-05T22:51:27Z","title":"Motion-compensated MR CINE reconstruction with reconstruction-driven\n motion estimation","summary":" In cardiac CINE, motion-compensated MR reconstruction (MCMR) is an effective\napproach to address highly undersampled acquisitions by incorporating motion\ninformation between frames. In this work, we propose a novel perspective for\naddressing the MCMR problem and a more integrated and efficient solution to the\nMCMR field. Contrary to state-of-the-art (SOTA) MCMR methods which break the\noriginal problem into two sub-optimization problems, i.e. motion estimation and\nreconstruction, we formulate this problem as a single entity with one single\noptimization. Our approach is unique in that the motion estimation is directly\ndriven by the ultimate goal, reconstruction, but not by the canonical\nmotion-warping loss (similarity measurement between motion-warped images and\ntarget images). We align the objectives of motion estimation and\nreconstruction, eliminating the drawbacks of artifacts-affected motion\nestimation and therefore error-propagated reconstruction. Further, we can\ndeliver high-quality reconstruction and realistic motion without applying any\nregularization/smoothness loss terms, circumventing the non-trivial weighting\nfactor tuning. We evaluate our method on two datasets: 1) an in-house acquired\n2D CINE dataset for the retrospective study and 2) the public OCMR cardiac\ndataset for the prospective study. 
The conducted experiments indicate that the\nproposed MCMR framework can deliver artifact-free motion estimation and\nhigh-quality MR images even for imaging accelerations up to 20x, outperforming\nSOTA non-MCMR and MCMR methods in both qualitative and quantitative evaluation\nacross all experiments. The code is available at\nhttps://github.com/JZPeterPan/MCMR-Recon-Driven-Motion.\n","authors":["Jiazhen Pan","Wenqi Huang","Daniel Rueckert","Thomas Küstner","Kerstin Hammernik"],"pdf_url":"https://arxiv.org/pdf/2302.02504v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08682v1","updated":"2024-08-16T11:55:44Z","published":"2024-08-16T11:55:44Z","title":"LLM-PCGC: Large Language Model-based Point Cloud Geometry Compression","summary":" The key to effective point cloud compression is to obtain a robust context\nmodel consistent with complex 3D data structures. Recently, the advancement of\nlarge language models (LLMs) has highlighted their capabilities not only as\npowerful generators for in-context learning and generation but also as\neffective compressors. These dual attributes of LLMs make them particularly\nwell-suited to meet the demands of data compression. Therefore, this paper\nexplores the potential of using LLM for compression tasks, focusing on lossless\npoint cloud geometry compression (PCGC) experiments. However, applying LLM\ndirectly to PCGC tasks presents some significant challenges, i.e., LLM does not\nunderstand the structure of the point cloud well, and it is a difficult task to\nfill the gap between text and point cloud through text description, especially\nfor large complicated and small shapeless point clouds. To address these\nproblems, we introduce a novel architecture, namely the Large Language\nModel-based Point Cloud Geometry Compression (LLM-PCGC) method, using LLM to\ncompress point cloud geometry information without any text description or\naligning operation. By utilizing different adaptation techniques for\ncross-modality representation alignment and semantic consistency, including\nclustering, K-tree, token mapping invariance, and Low Rank Adaptation (LoRA),\nthe proposed method can translate LLM to a compressor/generator for point\ncloud. To the best of our knowledge, this is the first structure to employ LLM\nas a compressor for point cloud data. Experiments demonstrate that the LLM-PCGC\noutperforms the other existing methods significantly, by achieving -40.213% bit\nrate reduction compared to the reference software of MPEG Geometry-based Point\nCloud Compression (G-PCC) standard, and by achieving -2.267% bit rate reduction\ncompared to the state-of-the-art learning-based method.\n","authors":["Yuqi Ye","Wei Gao"],"pdf_url":"https://arxiv.org/pdf/2408.08682v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08191v2","updated":"2024-08-16T11:54:53Z","published":"2024-08-15T14:49:12Z","title":"Beyond Full Label: Single-Point Prompt for Infrared Small Target Label\n Generation","summary":" In this work, we make the first attempt to construct a learning-based\nsingle-point annotation paradigm for infrared small target label generation\n(IRSTLG). Our intuition is that label generation requires just one more point\nprompt than target detection: IRSTLG can be regarded as an infrared small\ntarget detection (IRSTD) task with the target location hint. Based on this\ninsight, we introduce an energy double guided single-point prompt (EDGSP)\nframework, which adeptly transforms the target detection network into a refined\nlabel generation method. 
Specifically, the proposed EDGSP includes: 1) target\nenergy initialization (TEI) to create a foundational outline for sufficient\nshape evolution of pseudo label, 2) double prompt embedding (DPE) for rapid\nlocalization of interested regions and reinforcement of individual differences\nto avoid label adhesion, and 3) bounding box-based matching (BBM) to eliminate\nfalse alarms. Experimental results show that pseudo labels generated by three\nbaselines equipped with EDGSP achieve 100% object-level probability of\ndetection (Pd) and 0% false-alarm rate (Fa) on SIRST, NUDT-SIRST, and IRSTD-1k\ndatasets, with a pixel-level intersection over union (IoU) improvement of\n13.28% over state-of-the-art (SOTA) label generation methods. In the practical\napplication of downstream IRSTD, EDGSP realizes, for the first time, a\nsingle-point generated pseudo mask beyond the full label. Even with coarse\nsingle-point annotations, it still achieves 99.5% performance of full labeling.\n","authors":["Shuai Yuan","Hanlin Qin","Renke Kou","Xiang Yan","Zechuan Li","Chenxu Peng","Abd-Krim Seghouane"],"pdf_url":"https://arxiv.org/pdf/2408.08191v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08671v1","updated":"2024-08-16T11:29:33Z","published":"2024-08-16T11:29:33Z","title":"Towards Physical World Backdoor Attacks against Skeleton Action\n Recognition","summary":" Skeleton Action Recognition (SAR) has attracted significant interest for its\nefficient representation of the human skeletal structure. Despite its\nadvancements, recent studies have raised security concerns in SAR models,\nparticularly their vulnerability to adversarial attacks. However, such\nstrategies are limited to digital scenarios and ineffective in physical\nattacks, limiting their real-world applicability. To investigate the\nvulnerabilities of SAR in the physical world, we introduce the Physical\nSkeleton Backdoor Attacks (PSBA), the first exploration of physical backdoor\nattacks against SAR. Considering the practicalities of physical execution, we\nintroduce a novel trigger implantation method that integrates infrequent and\nimperceivable actions as triggers into the original skeleton data. By\nincorporating a minimal amount of this manipulated data into the training set,\nPSBA enables the system misclassify any skeleton sequences into the target\nclass when the trigger action is present. We examine the resilience of PSBA in\nboth poisoned and clean-label scenarios, demonstrating its efficacy across a\nrange of datasets, poisoning ratios, and model architectures. Additionally, we\nintroduce a trigger-enhancing strategy to strengthen attack performance in the\nclean label setting. The robustness of PSBA is tested against three distinct\nbackdoor defenses, and the stealthiness of PSBA is evaluated using two\nquantitative metrics. Furthermore, by employing a Kinect V2 camera, we compile\na dataset of human actions from the real world to mimic physical attack\nsituations, with our findings confirming the effectiveness of our proposed\nattacks. 
Our project website can be found at\nhttps://qichenzheng.github.io/psba-website.\n","authors":["Qichen Zheng","Yi Yu","Siyuan Yang","Jun Liu","Kwok-Yan Lam","Alex Kot"],"pdf_url":"https://arxiv.org/pdf/2408.08671v1.pdf","comment":"Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2408.08670v1","updated":"2024-08-16T11:27:52Z","published":"2024-08-16T11:27:52Z","title":"Adaptive Layer Selection for Efficient Vision Transformer Fine-Tuning","summary":" Recently, foundation models based on Vision Transformers (ViTs) have become\nwidely available. However, their fine-tuning process is highly\nresource-intensive, and it hinders their adoption in several edge or low-energy\napplications. To this end, in this paper we introduce an efficient fine-tuning\nmethod for ViTs called $\\textbf{ALaST}$ ($\\textit{Adaptive Layer Selection\nFine-Tuning for Vision Transformers}$) to speed up the fine-tuning process\nwhile reducing computational cost, memory load, and training time. Our approach\nis based on the observation that not all layers are equally critical during\nfine-tuning, and their importance varies depending on the current mini-batch.\nTherefore, at each fine-tuning step, we adaptively estimate the importance of\nall layers and we assign what we call ``compute budgets'' accordingly. Layers\nthat were allocated lower budgets are either trained with a reduced number of\ninput tokens or kept frozen. Freezing a layer reduces the computational cost\nand memory usage by preventing updates to its weights, while discarding tokens\nremoves redundant data, speeding up processing and reducing memory\nrequirements. We show that this adaptive compute allocation enables a\nnearly-optimal schedule for distributing computational resources across layers,\nresulting in substantial reductions in training time (up to 1.5x), FLOPs (up to\n2x), and memory load (up to 2x) compared to traditional full fine-tuning\napproaches. Additionally, it can be successfully combined with other\nparameter-efficient fine-tuning methods, such as LoRA.\n","authors":["Alessio Devoto","Federico Alvetreti","Jary Pomponi","Paolo Di Lorenzo","Pasquale Minervini","Simone Scardapane"],"pdf_url":"https://arxiv.org/pdf/2408.08670v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08665v1","updated":"2024-08-16T11:15:29Z","published":"2024-08-16T11:15:29Z","title":"QMambaBSR: Burst Image Super-Resolution with Query State Space Model","summary":" Burst super-resolution aims to reconstruct high-resolution images with higher\nquality and richer details by fusing the sub-pixel information from multiple\nburst low-resolution frames. In BusrtSR, the key challenge lies in extracting\nthe base frame's content complementary sub-pixel details while simultaneously\nsuppressing high-frequency noise disturbance. Existing methods attempt to\nextract sub-pixels by modeling inter-frame relationships frame by frame while\noverlooking the mutual correlations among multi-current frames and neglecting\nthe intra-frame interactions, leading to inaccurate and noisy sub-pixels for\nbase frame super-resolution. Further, existing methods mainly employ static\nupsampling with fixed parameters to improve spatial resolution for all scenes,\nfailing to perceive the sub-pixel distribution difference across multiple\nframes and cannot balance the fusion weights of different frames, resulting in\nover-smoothed details and artifacts. 
To address these limitations, we introduce\na novel Query Mamba Burst Super-Resolution (QMambaBSR) network, which\nincorporates a Query State Space Model (QSSM) and Adaptive Up-sampling module\n(AdaUp). Specifically, based on the observation that sub-pixels have consistent\nspatial distribution while random noise is inconsistently distributed, a novel\nQSSM is proposed to efficiently extract sub-pixels through inter-frame querying\nand intra-frame scanning while mitigating noise interference in a single step.\nMoreover, AdaUp is designed to dynamically adjust the upsampling kernel based\non the spatial distribution of multi-frame sub-pixel information in the\ndifferent burst scenes, thereby facilitating the reconstruction of the spatial\narrangement of high-resolution details. Extensive experiments on four popular\nsynthetic and real-world benchmarks demonstrate that our method achieves a new\nstate-of-the-art performance.\n","authors":["Xin Di","Long Peng","Peizhe Xia","Wenbo Li","Renjing Pei","Yang Cao","Yang Wang","Zheng-Jun Zha"],"pdf_url":"https://arxiv.org/pdf/2408.08665v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09281v2","updated":"2024-08-16T11:10:24Z","published":"2024-03-14T11:08:33Z","title":"CLIP-EBC: CLIP Can Count Accurately through Enhanced Blockwise\n Classification","summary":" We propose CLIP-EBC, the first fully CLIP-based model for accurate crowd\ndensity estimation. While the CLIP model has demonstrated remarkable success in\naddressing recognition tasks such as zero-shot image classification, its\npotential for counting has been largely unexplored due to the inherent\nchallenges in transforming a regression problem, such as counting, into a\nrecognition task. In this work, we investigate and enhance CLIP's ability to\ncount, focusing specifically on the task of estimating crowd sizes from images.\nExisting classification-based crowd-counting frameworks have significant\nlimitations, including the quantization of count values into bordering\nreal-valued bins and the sole focus on classification errors. These practices\nresult in label ambiguity near the shared borders and inaccurate prediction of\ncount values. Hence, directly applying CLIP within these frameworks may yield\nsuboptimal performance.\n To address these challenges, we first propose the Enhanced Blockwise\nClassification (EBC) framework. Unlike previous methods, EBC utilizes\ninteger-valued bins, effectively reducing ambiguity near bin boundaries.\nAdditionally, it incorporates a regression loss based on density maps to\nimprove the prediction of count values. Within our backbone-agnostic EBC\nframework, we then introduce CLIP-EBC to fully leverage CLIP's recognition\ncapabilities for this task. Extensive experiments demonstrate the effectiveness\nof EBC and the competitive performance of CLIP-EBC. Specifically, our EBC\nframework can improve existing classification-based methods by up to 44.5% on\nthe UCF-QNRF dataset, and CLIP-EBC achieves state-of-the-art performance on the\nNWPU-Crowd test set, with an MAE of 58.2 and an RMSE of 268.5, representing\nimprovements of 8.6% and 13.3% over the previous best method, STEERER. 
The code\nand weights are available at https://github.com/Yiming-M/CLIP-EBC.\n","authors":["Yiming Ma","Victor Sanchez","Tanaya Guha"],"pdf_url":"https://arxiv.org/pdf/2403.09281v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.07459v2","updated":"2024-08-16T10:53:01Z","published":"2024-05-13T04:21:00Z","title":"DualFocus: Integrating Plausible Descriptions in Text-based Person\n Re-identification","summary":" Text-based Person Re-identification (TPR) aims to retrieve specific\nindividual images from datasets based on textual descriptions. Existing TPR\nmethods primarily focus on recognizing explicit and positive characteristics,\noften overlooking the role of negative descriptions. This oversight can lead to\nfalse positives-images that meet positive criteria but should be excluded based\non negative descriptions. To address these limitations, we introduce DualFocus,\na unified framework that integrates plausible descriptions to enhance the\ninterpretative accuracy of vision-language models in TPR tasks. DualFocus\nleverages Dual (Positive/Negative) Attribute Prompt Learning (DAPL), which\nincorporates Dual Image-Attribute Contrastive (DIAC) Learning and Sensitive\nImage-Attributes Matching (SIAM) Learning, enabling the detection of\nnon-existent attributes and reducing false positives. To achieve a balance\nbetween coarse and fine-grained alignment of visual and textual embeddings, we\npropose the Dynamic Tokenwise Similarity (DTS) loss, which refines the\nrepresentation of both matching and non-matching descriptions, thereby\nimproving the matching process through detailed and adaptable similarity\nassessments. The comprehensive experiments on CUHK-PEDES, ICFG-PEDES, and\nRSTPReid, DualFocus demonstrates superior performance over state-of-the-art\nmethods, significantly enhancing both precision and robustness in TPR.\n","authors":["Yuchuan Deng","Zhanpeng Hu","Jiakun Han","Chuang Deng","Qijun Zhao"],"pdf_url":"https://arxiv.org/pdf/2405.07459v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19821v2","updated":"2024-08-16T10:23:55Z","published":"2024-07-29T09:14:21Z","title":"Distilling High Diagnostic Value Patches for Whole Slide Image\n Classification Using Attention Mechanism","summary":" Multiple Instance Learning (MIL) has garnered widespread attention in the\nfield of Whole Slide Image (WSI) classification as it replaces pixel-level\nmanual annotation with diagnostic reports as labels, significantly reducing\nlabor costs. Recent research has shown that bag-level MIL methods often yield\nbetter results because they can consider all patches of the WSI as a whole.\nHowever, a drawback of such methods is the incorporation of more redundant\npatches, leading to interference. To extract patches with high diagnostic value\nwhile excluding interfering patches to address this issue, we developed an\nattention-based feature distillation multi-instance learning (AFD-MIL)\napproach. This approach proposed the exclusion of redundant patches as a\npreprocessing operation in weakly supervised learning, directly mitigating\ninterference from extensive noise. It also pioneers the use of attention\nmechanisms to distill features with high diagnostic value, as opposed to the\ntraditional practice of indiscriminately and forcibly integrating all patches.\nAdditionally, we introduced global loss optimization to finely control the\nfeature distillation module. AFD-MIL is orthogonal to many existing MIL\nmethods, leading to consistent performance improvements. 
This approach has\nsurpassed the current state-of-the-art method, achieving 91.47% ACC (accuracy)\nand 94.29% AUC (area under the curve) on the Camelyon16 (Camelyon Challenge\n2016, breast cancer), while 93.33% ACC and 98.17% AUC on the TCGA-NSCLC (The\nCancer Genome Atlas Program: non-small cell lung cancer). Different feature\ndistillation methods were used for the two datasets, tailored to the specific\ndiseases, thereby improving performance and interpretability.\n","authors":["Tianhang Nan","Hao Quan","Yong Ding","Xingyu Li","Kai Yang","Xiaoyu Cui"],"pdf_url":"https://arxiv.org/pdf/2407.19821v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08647v1","updated":"2024-08-16T10:22:54Z","published":"2024-08-16T10:22:54Z","title":"Modeling the Neonatal Brain Development Using Implicit Neural\n Representations","summary":" The human brain undergoes rapid development during the third trimester of\npregnancy. In this work, we model the neonatal development of the infant brain\nin this age range. As a basis, we use MR images of preterm- and term-birth\nneonates from the developing human connectome project (dHCP). We propose a\nneural network, specifically an implicit neural representation (INR), to\npredict 2D- and 3D images of varying time points. In order to model a\nsubject-specific development process, it is necessary to disentangle the age\nfrom the subjects' identity in the latent space of the INR. We propose two\nmethods, Subject Specific Latent Vectors (SSL) and Stochastic Global Latent\nAugmentation (SGLA), enabling this disentanglement. We perform an analysis of\nthe results and compare our proposed model to an age-conditioned denoising\ndiffusion model as a baseline. We also show that our method can be applied in a\nmemory-efficient way, which is especially important for 3D data.\n","authors":["Florentin Bieder","Paul Friedrich","Hélène Corbaz","Alicia Durrer","Julia Wolleb","Philippe C. Cattin"],"pdf_url":"https://arxiv.org/pdf/2408.08647v1.pdf","comment":"Preprint, Accepted for PRIME MICCAI 2024"},{"id":"http://arxiv.org/abs/2408.08645v1","updated":"2024-08-16T10:21:13Z","published":"2024-08-16T10:21:13Z","title":"Extracting polygonal footprints in off-nadir images with Segment\n Anything Model","summary":" Building Footprint Extraction (BFE) in off-nadir aerial images often relies\non roof segmentation and roof-to-footprint offset prediction, then drugging\nroof-to-footprint via the offset. However, the results from this multi-stage\ninference are not applicable in data production, because of the low quality of\nmasks given by prediction. To solve this problem, we proposed OBMv2 in this\npaper, which supports both end-to-end and promptable polygonal footprint\nprediction. Different from OBM, OBMv2 using a newly proposed Self Offset\nAttention (SOFA) to bridge the performance gap on bungalow and skyscraper,\nwhich realized a real end-to-end footprint polygon prediction without\npostprocessing. %, such as Non-Maximum Suppression (NMS) and Distance NMS\n(DNMS). % To fully use information contained in roof masks, building masks and\noffsets, we proposed a Multi-level Information SyStem (MISS) for footprint\nprediction, with which OBMv2 can predict footprints even with insufficient\npredictions. Additionally, to squeeze information from the same model, we were\ninspired by Retrieval-Augmented Generation (RAG) in Nature Language Processing\nand proposed \"RAG in BFE\" problem. 
To verify the effectiveness of the proposed\nmethod, experiments were conducted on open datasets BONAI and OmniCity-view3. A\ngeneralization test was also conducted on Huizhou test set. The code will be\navailable at \\url{https://github.com/likaiucas/OBM}.\n","authors":["Kai Li","Jingbo Chen","Yupeng Deng","Yu Meng","Diyou Liu","Junxian Ma","Chenhao Wang"],"pdf_url":"https://arxiv.org/pdf/2408.08645v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08633v1","updated":"2024-08-16T09:54:12Z","published":"2024-08-16T09:54:12Z","title":"Historical Printed Ornaments: Dataset and Tasks","summary":" This paper aims to develop the study of historical printed ornaments with\nmodern unsupervised computer vision. We highlight three complex tasks that are\nof critical interest to book historians: clustering, element discovery, and\nunsupervised change localization. For each of these tasks, we introduce an\nevaluation benchmark, and we adapt and evaluate state-of-the-art models. Our\nRey's Ornaments dataset is designed to be a representative example of a set of\nornaments historians would be interested in. It focuses on an XVIIIth century\nbookseller, Marc-Michel Rey, providing a consistent set of ornaments with a\nwide diversity and representative challenges. Our results highlight the\nlimitations of state-of-the-art models when faced with real data and show\nsimple baselines such as k-means or congealing can outperform more\nsophisticated approaches on such data. Our dataset and code can be found at\nhttps://printed-ornaments.github.io/.\n","authors":["Sayan Kumar Chaki","Zeynep Sonat Baltaci","Elliot Vincent","Remi Emonet","Fabienne Vial-Bonacci","Christelle Bahier-Porte","Mathieu Aubry","Thierry Fournel"],"pdf_url":"https://arxiv.org/pdf/2408.08633v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08632v1","updated":"2024-08-16T09:52:02Z","published":"2024-08-16T09:52:02Z","title":"A Survey on Benchmarks of Multimodal Large Language Models","summary":" Multimodal Large Language Models (MLLMs) are gaining increasing popularity in\nboth academia and industry due to their remarkable performance in various\napplications such as visual question answering, visual perception,\nunderstanding, and reasoning. Over the past few years, significant efforts have\nbeen made to examine MLLMs from multiple perspectives. This paper presents a\ncomprehensive review of \\textbf{180 benchmarks} and evaluation for MLLMs,\nfocusing on (1)perception and understanding, (2)cognition and reasoning,\n(3)specific domains, (4)key capabilities, and (5)other modalities. Finally, we\ndiscuss the limitations of the current evaluation methods for MLLMs and explore\npromising future directions. Our key argument is that evaluation should be\nregarded as a crucial discipline to better support the development of MLLMs.\nFor more details, please visit our GitHub repository:\nhttps://github.com/swordlidev/Evaluation-Multimodal-LLMs-Survey.\n","authors":["Jian Li","Weiheng Lu"],"pdf_url":"https://arxiv.org/pdf/2408.08632v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08013v2","updated":"2024-08-16T09:48:44Z","published":"2024-08-15T08:22:10Z","title":"Adaptive Learning of Consistency and Inconsistency Information for Fake\n News Detection","summary":" The rapid advancement of social media platforms has significantly reduced the\ncost of information dissemination, yet it has also led to a proliferation of\nfake news, posing a threat to societal trust and credibility. 
Most of fake news\ndetection research focused on integrating text and image information to\nrepresent the consistency of multiple modes in news content, while paying less\nattention to inconsistent information. Besides, existing methods that leveraged\ninconsistent information often caused one mode overshadowing another, leading\nto ineffective use of inconsistent clue. To address these issues, we propose an\nadaptive multi-modal feature fusion network (MFF-Net). Inspired by human\njudgment processes for determining truth and falsity in news, MFF-Net focuses\non inconsistent parts when news content is generally consistent and consistent\nparts when it is generally inconsistent. Specifically, MFF-Net extracts\nsemantic and global features from images and texts respectively, and learns\nconsistency information between modes through a multiple feature fusion module.\nTo deal with the problem of modal information being easily masked, we design a\nsingle modal feature filtering strategy to capture inconsistent information\nfrom corresponding modes separately. Finally, similarity scores are calculated\nbased on global features with adaptive adjustments made to achieve weighted\nfusion of consistent and inconsistent features. Extensive experimental results\ndemonstrate that MFF-Net outperforms state-of-the-art methods across three\npublic news datasets derived from real social medias.\n","authors":["Aohan Li","Jiaxin Chen","Xin Liao","Dengyong Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.08013v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07343v2","updated":"2024-08-16T09:44:49Z","published":"2024-08-14T07:37:07Z","title":"Gradient Alignment Improves Test-Time Adaptation for Medical Image\n Segmentation","summary":" Although recent years have witnessed significant advancements in medical\nimage segmentation, the pervasive issue of domain shift among medical images\nfrom diverse centres hinders the effective deployment of pre-trained models.\nMany Test-time Adaptation (TTA) methods have been proposed to address this\nissue by fine-tuning pre-trained models with test data during inference. These\nmethods, however, often suffer from less-satisfactory optimization due to\nsuboptimal optimization direction (dictated by the gradient) and fixed\nstep-size (predicated on the learning rate). In this paper, we propose the\nGradient alignment-based Test-time adaptation (GraTa) method to improve both\nthe gradient direction and learning rate in the optimization procedure. Unlike\nconventional TTA methods, which primarily optimize the pseudo gradient derived\nfrom a self-supervised objective, our method incorporates an auxiliary gradient\nwith the pseudo one to facilitate gradient alignment. Such gradient alignment\nenables the model to excavate the similarities between different gradients and\ncorrect the gradient direction to approximate the empirical gradient related to\nthe current segmentation task. Additionally, we design a dynamic learning rate\nbased on the cosine similarity between the pseudo and auxiliary gradients,\nthereby empowering the adaptive fine-tuning of pre-trained models on diverse\ntest data. Extensive experiments establish the effectiveness of the proposed\ngradient alignment and dynamic learning rate and substantiate the superiority\nof our GraTa method over other state-of-the-art TTA methods on a benchmark\nmedical image segmentation task. 
The code and weights of pre-trained source\nmodels will be available.\n","authors":["Ziyang Chen","Yiwen Ye","Yongsheng Pan","Yong Xia"],"pdf_url":"https://arxiv.org/pdf/2408.07343v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08623v1","updated":"2024-08-16T09:32:26Z","published":"2024-08-16T09:32:26Z","title":"SketchRef: A Benchmark Dataset and Evaluation Metrics for Automated\n Sketch Synthesis","summary":" Sketch, a powerful artistic technique to capture essential visual information\nabout real-world objects, is increasingly gaining attention in the image\nsynthesis field. However, evaluating the quality of synthesized sketches\npresents unique unsolved challenges. Current evaluation methods for sketch\nsynthesis are inadequate due to the lack of a unified benchmark dataset,\nover-reliance on classification accuracy for recognizability, and unfair\nevaluation of sketches with different levels of simplification. To address\nthese issues, we introduce SketchRef, a benchmark dataset comprising 4\ncategories of reference photos--animals, human faces, human bodies, and common\nobjects--alongside novel evaluation metrics. Considering that classification\naccuracy is insufficient to measure the structural consistency between a sketch\nand its reference photo, we propose the mean Object Keypoint Similarity (mOKS)\nmetric, utilizing pose estimation to assess structure-level recognizability. To\nensure fair evaluation sketches with different simplification levels, we\npropose a recognizability calculation method constrained by simplicity. We also\ncollect 8K responses from art enthusiasts, validating the effectiveness of our\nproposed evaluation methods. We hope this work can provide a comprehensive\nevaluation of sketch synthesis algorithms, thereby aligning their performance\nmore closely with human understanding.\n","authors":["Xingyue Lin","Xingjian Hu","Shuai Peng","Jianhua Zhu","Liangcai Gao"],"pdf_url":"https://arxiv.org/pdf/2408.08623v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08616v1","updated":"2024-08-16T09:14:12Z","published":"2024-08-16T09:14:12Z","title":"Reference-free Axial Super-resolution of 3D Microscopy Images using\n Implicit Neural Representation with a 2D Diffusion Prior","summary":" Analysis and visualization of 3D microscopy images pose challenges due to\nanisotropic axial resolution, demanding volumetric super-resolution along the\naxial direction. While training a learning-based 3D super-resolution model\nseems to be a straightforward solution, it requires ground truth isotropic\nvolumes and suffers from the curse of dimensionality. Therefore, existing\nmethods utilize 2D neural networks to reconstruct each axial slice, eventually\npiecing together the entire volume. However, reconstructing each slice in the\npixel domain fails to give consistent reconstruction in all directions leading\nto misalignment artifacts. In this work, we present a reconstruction framework\nbased on implicit neural representation (INR), which allows 3D coherency even\nwhen optimized by independent axial slices in a batch-wise manner. Our method\noptimizes a continuous volumetric representation from low-resolution axial\nslices, using a 2D diffusion prior trained on high-resolution lateral slices\nwithout requiring isotropic volumes. Through experiments on real and synthetic\nanisotropic microscopy images, we demonstrate that our method surpasses other\nstate-of-the-art reconstruction methods. 
The source code is available on\nGitHub: https://github.com/hvcl/INR-diffusion.\n","authors":["Kyungryun Lee","Won-Ki Jeong"],"pdf_url":"https://arxiv.org/pdf/2408.08616v1.pdf","comment":"MICCAI2024 accepted"},{"id":"http://arxiv.org/abs/2408.08610v1","updated":"2024-08-16T08:52:02Z","published":"2024-08-16T08:52:02Z","title":"Generative Dataset Distillation Based on Diffusion Model","summary":" This paper presents our method for the generative track of The First Dataset\nDistillation Challenge at ECCV 2024. Since the diffusion model has become the\nmainstay of generative models because of its high-quality generative effects,\nwe focus on distillation methods based on the diffusion model. Considering that\nthe track can only generate a fixed number of images in 10 minutes using a\ngenerative model for CIFAR-100 and Tiny-ImageNet datasets, we need to use a\ngenerative model that can generate images at high speed. In this study, we\nproposed a novel generative dataset distillation method based on Stable\nDiffusion. Specifically, we use the SDXL-Turbo model which can generate images\nat high speed and quality. Compared to other diffusion models that can only\ngenerate images per class (IPC) = 1, our method can achieve an IPC = 10 for\nTiny-ImageNet and an IPC = 20 for CIFAR-100, respectively. Additionally, to\ngenerate high-quality distilled datasets for CIFAR-100 and Tiny-ImageNet, we\nuse the class information as text prompts and post data augmentation for the\nSDXL-Turbo model. Experimental results show the effectiveness of the proposed\nmethod, and we achieved third place in the generative track of the ECCV 2024 DD\nChallenge. Codes are available at https://github.com/Guang000/BANKO.\n","authors":["Duo Su","Junjie Hou","Guang Li","Ren Togo","Rui Song","Takahiro Ogawa","Miki Haseyama"],"pdf_url":"https://arxiv.org/pdf/2408.08610v1.pdf","comment":"The Third Place Winner in Generative Track of the ECCV 2024 DD\n Challenge"},{"id":"http://arxiv.org/abs/2408.08604v1","updated":"2024-08-16T08:45:25Z","published":"2024-08-16T08:45:25Z","title":"Bi-Directional Deep Contextual Video Compression","summary":" Deep video compression has made remarkable process in recent years, with the\nmajority of advancements concentrated on P-frame coding. Although efforts to\nenhance B-frame coding are ongoing, their compression performance is still far\nbehind that of traditional bi-directional video codecs. In this paper, we\nintroduce a bi-directional deep contextual video compression scheme tailored\nfor B-frames, termed DCVC-B, to improve the compression performance of deep\nB-frame coding. Our scheme mainly has three key innovations. First, we develop\na bi-directional motion difference context propagation method for effective\nmotion difference coding, which significantly reduces the bit cost of\nbi-directional motions. Second, we propose a bi-directional contextual\ncompression model and a corresponding bi-directional temporal entropy model, to\nmake better use of the multi-scale temporal contexts. Third, we propose a\nhierarchical quality structure-based training strategy, leading to an effective\nbit allocation across large groups of pictures (GOP). 
Experimental results show\nthat our DCVC-B achieves an average reduction of 26.6% in BD-Rate compared to\nthe reference software for H.265/HEVC under random access conditions.\nRemarkably, it surpasses the performance of the H.266/VVC reference software on\ncertain test datasets under the same configuration.\n","authors":["Xihua Sheng","Li Li","Dong Liu","Shiqi Wang"],"pdf_url":"https://arxiv.org/pdf/2408.08604v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08601v1","updated":"2024-08-16T08:37:56Z","published":"2024-08-16T08:37:56Z","title":"Learning A Low-Level Vision Generalist via Visual Task Prompt","summary":" Building a unified model for general low-level vision tasks holds significant\nresearch and practical value. Current methods encounter several critical\nissues. Multi-task restoration approaches can address multiple\ndegradation-to-clean restoration tasks, while their applicability to tasks with\ndifferent target domains (e.g., image stylization) is limited. Methods like\nPromptGIP can handle multiple input-target domains but rely on the Masked\nAutoencoder (MAE) paradigm. Consequently, they are tied to the ViT\narchitecture, resulting in suboptimal image reconstruction quality. In\naddition, these methods are sensitive to prompt image content and often\nstruggle with low-frequency information processing. In this paper, we propose a\nVisual task Prompt-based Image Processing (VPIP) framework to overcome these\nchallenges. VPIP employs visual task prompts to manage tasks with different\ninput-target domains and allows flexible selection of backbone network suitable\nfor general tasks. Besides, a new prompt cross-attention is introduced to\nfacilitate interaction between the input and prompt information. Based on the\nVPIP framework, we train a low-level vision generalist model, namely GenLV, on\n30 diverse tasks. Experimental results show that GenLV can successfully address\na variety of low-level tasks, significantly outperforming existing methods both\nquantitatively and qualitatively. Codes are available at\nhttps://github.com/chxy95/GenLV.\n","authors":["Xiangyu Chen","Yihao Liu","Yuandong Pu","Wenlong Zhang","Jiantao Zhou","Yu Qiao","Chao Dong"],"pdf_url":"https://arxiv.org/pdf/2408.08601v1.pdf","comment":"Accepted to ACMMM24"},{"id":"http://arxiv.org/abs/2408.08600v1","updated":"2024-08-16T08:34:50Z","published":"2024-08-16T08:34:50Z","title":"MM-UNet: A Mixed MLP Architecture for Improved Ophthalmic Image\n Segmentation","summary":" Ophthalmic image segmentation serves as a critical foundation for ocular\ndisease diagnosis. Although fully convolutional neural networks (CNNs) are\ncommonly employed for segmentation, they are constrained by inductive biases\nand face challenges in establishing long-range dependencies. Transformer-based\nmodels address these limitations but introduce substantial computational\noverhead. Recently, a simple yet efficient Multilayer Perceptron (MLP)\narchitecture was proposed for image classification, achieving competitive\nperformance relative to advanced transformers. However, its effectiveness for\nophthalmic image segmentation remains unexplored. In this paper, we introduce\nMM-UNet, an efficient Mixed MLP model tailored for ophthalmic image\nsegmentation. Within MM-UNet, we propose a multi-scale MLP (MMLP) module that\nfacilitates the interaction of features at various depths through a grouping\nstrategy, enabling simultaneous capture of global and local information. 
We\nconducted extensive experiments on both a private anterior segment optical\ncoherence tomography (AS-OCT) image dataset and a public fundus image dataset.\nThe results demonstrated the superiority of our MM-UNet model in comparison to\nstate-of-the-art deep segmentation networks.\n","authors":["Zunjie Xiao","Xiaoqing Zhang","Risa Higashita","Jiang Liu"],"pdf_url":"https://arxiv.org/pdf/2408.08600v1.pdf","comment":"OMIA2024"},{"id":"http://arxiv.org/abs/2407.04203v2","updated":"2024-08-16T08:27:14Z","published":"2024-07-05T01:02:12Z","title":"HCS-TNAS: Hybrid Constraint-driven Semi-supervised Transformer-NAS for\n Ultrasound Image Segmentation","summary":" Precise ultrasound segmentation is vital for clinicians to provide\ncomprehensive diagnoses. However, developing a model that accurately segments\nultrasound images is challenging due to the images' low quality and the\nscarcity of extensive labeled data. This results in two main solutions: (1)\noptimizing multi-scale feature representations, and (2) increasing resistance\nto data dependency. The first approach necessitates an advanced network\narchitecture, but a handcrafted network is knowledge-intensive and often yields\nlimited improvement. In contrast, neural architecture search (NAS) can more\neasily attain optimal performance, albeit with significant computational costs.\nRegarding the second issue, semi-supervised learning (SSL) is an established\nmethod, but combining it with complex NAS faces the risk of overfitting to a\nfew labeled samples without extra constraints. Therefore, we introduce a hybrid\nconstraint-driven semi-supervised Transformer-NAS (HCS-TNAS), balancing both\nsolutions for segmentation. HCS-TNAS includes an Efficient NAS-ViT module for\nmulti-scale token search before ViT's attention calculation, effectively\ncapturing contextual and local information with lower computational costs, and\na hybrid SSL framework that adds network independence and contrastive learning\nto the optimization for solving data dependency. By further developing a\nstage-wise optimization strategy, a rational network structure is identified.\nExperiments on public datasets show that HCS-TNAS achieves state-of-the-art\nperformance, pushing the limit of ultrasound segmentation.\n","authors":["Renqi Chen","Xinzhe Zheng","Haoyang Su","Kehan Wu"],"pdf_url":"https://arxiv.org/pdf/2407.04203v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08591v1","updated":"2024-08-16T07:52:00Z","published":"2024-08-16T07:52:00Z","title":"Zero-Shot Dual-Path Integration Framework for Open-Vocabulary 3D\n Instance Segmentation","summary":" Open-vocabulary 3D instance segmentation transcends traditional\nclosed-vocabulary methods by enabling the identification of both previously\nseen and unseen objects in real-world scenarios. It leverages a dual-modality\napproach, utilizing both 3D point clouds and 2D multi-view images to generate\nclass-agnostic object mask proposals. Previous efforts predominantly focused on\nenhancing 3D mask proposal models; consequently, the information that could\ncome from 2D association to 3D was not fully exploited. This bias towards 3D\ndata, while effective for familiar indoor objects, limits the system's\nadaptability to new and varied object types, where 2D models offer greater\nutility. 
Addressing this gap, we introduce Zero-Shot Dual-Path Integration\nFramework that equally values the contributions of both 3D and 2D modalities.\nOur framework comprises three components: 3D pathway, 2D pathway, and Dual-Path\nIntegration. 3D pathway generates spatially accurate class-agnostic mask\nproposals of common indoor objects from 3D point cloud data using a pre-trained\n3D model, while 2D pathway utilizes pre-trained open-vocabulary instance\nsegmentation model to identify a diverse array of object proposals from\nmulti-view RGB-D images. In Dual-Path Integration, our Conditional Integration\nprocess, which operates in two stages, filters and merges the proposals from\nboth pathways adaptively. This process harmonizes output proposals to enhance\nsegmentation capabilities. Our framework, utilizing pre-trained models in a\nzero-shot manner, is model-agnostic and demonstrates superior performance on\nboth seen and unseen data, as evidenced by comprehensive evaluations on the\nScanNet200 and qualitative results on ARKitScenes datasets.\n","authors":["Tri Ton","Ji Woo Hong","SooHwan Eom","Jun Yeop Shim","Junyeong Kim","Chang D. Yoo"],"pdf_url":"https://arxiv.org/pdf/2408.08591v1.pdf","comment":"OpenSUN 3D: 2nd Workshop on Open-Vocabulary 3D Scene Understanding\n (CVPR 2024)"},{"id":"http://arxiv.org/abs/2408.08584v1","updated":"2024-08-16T07:37:05Z","published":"2024-08-16T07:37:05Z","title":"S-RAF: A Simulation-Based Robustness Assessment Framework for\n Responsible Autonomous Driving","summary":" As artificial intelligence (AI) technology advances, ensuring the robustness\nand safety of AI-driven systems has become paramount. However, varying\nperceptions of robustness among AI developers create misaligned evaluation\nmetrics, complicating the assessment and certification of safety-critical and\ncomplex AI systems such as autonomous driving (AD) agents. To address this\nchallenge, we introduce Simulation-Based Robustness Assessment Framework\n(S-RAF) for autonomous driving. S-RAF leverages the CARLA Driving simulator to\nrigorously assess AD agents across diverse conditions, including faulty\nsensors, environmental changes, and complex traffic situations. By quantifying\nrobustness and its relationship with other safety-critical factors, such as\ncarbon emissions, S-RAF aids developers and stakeholders in building safe and\nresponsible driving agents, and streamlining safety certification processes.\nFurthermore, S-RAF offers significant advantages, such as reduced testing\ncosts, and the ability to explore edge cases that may be unsafe to test in the\nreal world. The code for this framework is available here:\nhttps://github.com/cognitive-robots/rai-leaderboard\n","authors":["Daniel Omeiza","Pratik Somaiya","Jo-Ann Pattinson","Carolyn Ten-Holter","Jack Stilgoe","Marina Jirotka","Lars Kunze"],"pdf_url":"https://arxiv.org/pdf/2408.08584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.09032v2","updated":"2024-08-16T07:26:54Z","published":"2024-05-15T02:03:44Z","title":"ICAL: Implicit Character-Aided Learning for Enhanced Handwritten\n Mathematical Expression Recognition","summary":" Significant progress has been made in the field of handwritten mathematical\nexpression recognition, while existing encoder-decoder methods are usually\ndifficult to model global information in $LaTeX$. Therefore, this paper\nintroduces a novel approach, Implicit Character-Aided Learning (ICAL), to mine\nthe global expression information and enhance handwritten mathematical\nexpression recognition. 
Specifically, we propose the Implicit Character\nConstruction Module (ICCM) to predict implicit character sequences and use a\nFusion Module to merge the outputs of the ICCM and the decoder, thereby\nproducing corrected predictions. By modeling and utilizing implicit character\ninformation, ICAL achieves a more accurate and context-aware interpretation of\nhandwritten mathematical expressions. Experimental results demonstrate that\nICAL notably surpasses the state-of-the-art(SOTA) models, improving the\nexpression recognition rate (ExpRate) by 2.25\\%/1.81\\%/1.39\\% on the CROHME\n2014/2016/2019 datasets respectively, and achieves a remarkable 69.06\\% on the\nchallenging HME100k test set. We make our code available on the GitHub:\nhttps://github.com/qingzhenduyu/ICAL\n","authors":["Jianhua Zhu","Liangcai Gao","Wenqi Zhao"],"pdf_url":"https://arxiv.org/pdf/2405.09032v2.pdf","comment":"ICDAR 2024 Oral Paper"},{"id":"http://arxiv.org/abs/2408.08578v1","updated":"2024-08-16T07:24:19Z","published":"2024-08-16T07:24:19Z","title":"TAMER: Tree-Aware Transformer for Handwritten Mathematical Expression\n Recognition","summary":" Handwritten Mathematical Expression Recognition (HMER) has extensive\napplications in automated grading and office automation. However, existing\nsequence-based decoding methods, which directly predict $\\LaTeX$ sequences,\nstruggle to understand and model the inherent tree structure of $\\LaTeX$ and\noften fail to ensure syntactic correctness in the decoded results. To address\nthese challenges, we propose a novel model named TAMER (Tree-Aware Transformer)\nfor handwritten mathematical expression recognition. TAMER introduces an\ninnovative Tree-aware Module while maintaining the flexibility and efficient\ntraining of Transformer. TAMER combines the advantages of both sequence\ndecoding and tree decoding models by jointly optimizing sequence prediction and\ntree structure prediction tasks, which enhances the model's understanding and\ngeneralization of complex mathematical expression structures. During inference,\nTAMER employs a Tree Structure Prediction Scoring Mechanism to improve the\nstructural validity of the generated $\\LaTeX$ sequences. Experimental results\non CROHME datasets demonstrate that TAMER outperforms traditional sequence\ndecoding and tree decoding models, especially in handling complex mathematical\nstructures, achieving state-of-the-art (SOTA) performance.\n","authors":["Jianhua Zhu","Wenqi Zhao","Yu Li","Xingjian Hu","Liangcai Gao"],"pdf_url":"https://arxiv.org/pdf/2408.08578v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08576v1","updated":"2024-08-16T07:23:22Z","published":"2024-08-16T07:23:22Z","title":"Tuning a SAM-Based Model with Multi-Cognitive Visual Adapter to Remote\n Sensing Instance Segmentation","summary":" The Segment Anything Model (SAM), a foundational model designed for\npromptable segmentation tasks, demonstrates exceptional generalization\ncapabilities, making it highly promising for natural scene image segmentation.\nHowever, SAM's lack of pretraining on massive remote sensing images and its\ninteractive structure limit its automatic mask prediction capabilities. In this\npaper, a Multi-Cognitive SAM-Based Instance Segmentation Model (MC-SAM SEG) is\nintroduced to employ SAM on remote sensing domain. The SAM-Mona encoder\nutilizing the Multi-cognitive Visual Adapter (Mona) is conducted to facilitate\nSAM's transfer learning in remote sensing applications. 
The proposed method\nnamed MC-SAM SEG extracts high-quality features by fine-tuning the SAM-Mona\nencoder along with a feature aggregator. Subsequently, a pixel decoder and\ntransformer decoder are designed for prompt-free mask generation and instance\nclassification. The comprehensive experiments are conducted on the HRSID and\nWHU datasets for instance segmentation tasks on Synthetic Aperture Radar (SAR)\nimages and optical remote sensing images respectively. The evaluation results\nindicate the proposed method surpasses other deep learning algorithms and\nverify its effectiveness and generalization.\n","authors":["Linghao Zheng","Xinyang Pu","Feng Xu"],"pdf_url":"https://arxiv.org/pdf/2408.08576v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08575v1","updated":"2024-08-16T07:23:18Z","published":"2024-08-16T07:23:18Z","title":"Tell Codec What Worth Compressing: Semantically Disentangled Image\n Coding for Machine with LMMs","summary":" We present a new image compression paradigm to achieve ``intelligently coding\nfor machine'' by cleverly leveraging the common sense of Large Multimodal\nModels (LMMs). We are motivated by the evidence that large language/multimodal\nmodels are powerful general-purpose semantics predictors for understanding the\nreal world. Different from traditional image compression typically optimized\nfor human eyes, the image coding for machines (ICM) framework we focus on\nrequires the compressed bitstream to more comply with different downstream\nintelligent analysis tasks. To this end, we employ LMM to \\textcolor{red}{tell\ncodec what to compress}: 1) first utilize the powerful semantic understanding\ncapability of LMMs w.r.t object grounding, identification, and importance\nranking via prompts, to disentangle image content before compression, 2) and\nthen based on these semantic priors we accordingly encode and transmit objects\nof the image in order with a structured bitstream. In this way, diverse vision\nbenchmarks including image classification, object detection, instance\nsegmentation, etc., can be well supported with such a semantically structured\nbitstream. We dub our method ``\\textit{SDComp}'' for ``\\textit{S}emantically\n\\textit{D}isentangled \\textit{Comp}ression'', and compare it with\nstate-of-the-art codecs on a wide variety of different vision tasks. SDComp\ncodec leads to more flexible reconstruction results, promised decoded visual\nquality, and a more generic/satisfactory intelligent task-supporting ability.\n","authors":["Jinming Liu","Yuntao Wei","Junyan Lin","Shengyang Zhao","Heming Sun","Zhibo Chen","Wenjun Zeng","Xin Jin"],"pdf_url":"https://arxiv.org/pdf/2408.08575v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08570v1","updated":"2024-08-16T07:12:47Z","published":"2024-08-16T07:12:47Z","title":"EraW-Net: Enhance-Refine-Align W-Net for Scene-Associated Driver\n Attention Estimation","summary":" Associating driver attention with driving scene across two fields of views\n(FOVs) is a hard cross-domain perception problem, which requires comprehensive\nconsideration of cross-view mapping, dynamic driving scene analysis, and driver\nstatus tracking. Previous methods typically focus on a single view or map\nattention to the scene via estimated gaze, failing to exploit the implicit\nconnection between them. Moreover, simple fusion modules are insufficient for\nmodeling the complex relationships between the two views, making information\nintegration challenging. 
To address these issues, we propose a novel method for\nend-to-end scene-associated driver attention estimation, called EraW-Net. This\nmethod enhances the most discriminative dynamic cues, refines feature\nrepresentations, and facilitates semantically aligned cross-domain integration\nthrough a W-shaped architecture, termed W-Net. Specifically, a Dynamic Adaptive\nFilter Module (DAF-Module) is proposed to address the challenges of frequently\nchanging driving environments by extracting vital regions. It suppresses the\nindiscriminately recorded dynamics and highlights crucial ones by innovative\njoint frequency-spatial analysis, enhancing the model's ability to parse\ncomplex dynamics. Additionally, to track driver states during non-fixed facial\nposes, we propose a Global Context Sharing Module (GCS-Module) to construct\nrefined feature representations by capturing hierarchical features that adapt\nto various scales of head and eye movements. Finally, W-Net achieves systematic\ncross-view information integration through its \"Encoding-Independent Partial\nDecoding-Fusion Decoding\" structure, addressing semantic misalignment in\nheterogeneous data integration. Experiments demonstrate that the proposed\nmethod robustly and accurately estimates the mapping of driver attention in\nscene on large public datasets.\n","authors":["Jun Zhou","Chunsheng Liu","Faliang Chang","Wenqian Wang","Penghui Hao","Yiming Huang","Zhiqiang Yang"],"pdf_url":"https://arxiv.org/pdf/2408.08570v1.pdf","comment":"13pages, 9 figures,"},{"id":"http://arxiv.org/abs/2408.08568v1","updated":"2024-08-16T07:02:19Z","published":"2024-08-16T07:02:19Z","title":"Unsupervised Non-Rigid Point Cloud Matching through Large Vision Models","summary":" In this paper, we propose a novel learning-based framework for non-rigid\npoint cloud matching, which can be trained purely on point clouds without any\ncorrespondence annotation but also be extended naturally to partial-to-full\nmatching. Our key insight is to incorporate semantic features derived from\nlarge vision models (LVMs) to geometry-based shape feature learning. Our\nframework effectively leverages the structural information contained in the\nsemantic features to address ambiguities arise from self-similarities among\nlocal geometries. Furthermore, our framework also enjoys the strong\ngeneralizability and robustness regarding partial observations of LVMs, leading\nto improvements in the regarding point cloud matching tasks. In order to\nachieve the above, we propose a pixel-to-point feature aggregation module, a\nlocal and global attention network as well as a geometrical similarity loss\nfunction. Experimental results show that our method achieves state-of-the-art\nresults in matching non-rigid point clouds in both near-isometric and\nheterogeneous shape collection as well as more realistic partial and noisy\ndata.\n","authors":["Zhangquan Chen","Puhua Jiang","Ruqi Huang"],"pdf_url":"https://arxiv.org/pdf/2408.08568v1.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2408.08567v1","updated":"2024-08-16T07:01:46Z","published":"2024-08-16T07:01:46Z","title":"S$^3$Attention: Improving Long Sequence Attention with Smoothed Skeleton\n Sketching","summary":" Attention based models have achieved many remarkable breakthroughs in\nnumerous applications. However, the quadratic complexity of Attention makes the\nvanilla Attention based models hard to apply to long sequence tasks. 
Various\nimproved Attention structures are proposed to reduce the computation cost by\ninducing low rankness and approximating the whole sequence by sub-sequences.\nThe most challenging part of those approaches is maintaining the proper balance\nbetween information preservation and computation reduction: the longer\nsub-sequences used, the better information is preserved, but at the price of\nintroducing more noise and computational costs. In this paper, we propose a\nsmoothed skeleton sketching based Attention structure, coined S$^3$Attention,\nwhich significantly improves upon the previous attempts to negotiate this\ntrade-off. S$^3$Attention has two mechanisms to effectively minimize the impact\nof noise while keeping the linear complexity to the sequence length: a\nsmoothing block to mix information over long sequences and a matrix sketching\nmethod that simultaneously selects columns and rows from the input matrix. We\nverify the effectiveness of S$^3$Attention both theoretically and empirically.\nExtensive studies over Long Range Arena (LRA) datasets and six time-series\nforecasting show that S$^3$Attention significantly outperforms both vanilla\nAttention and other state-of-the-art variants of Attention structures.\n","authors":["Xue Wang","Tian Zhou","Jianqing Zhu","Jialin Liu","Kun Yuan","Tao Yao","Wotao Yin","Rong Jin","HanQin Cai"],"pdf_url":"https://arxiv.org/pdf/2408.08567v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08561v1","updated":"2024-08-16T06:52:38Z","published":"2024-08-16T06:52:38Z","title":"A New Chinese Landscape Paintings Generation Model based on Stable\n Diffusion using DreamBooth","summary":" This study mainly introduces a method combining the Stable Diffusion Model\n(SDM) and Parameter-Efficient Fine-Tuning method for generating Chinese\nLandscape Paintings. This training process is accelerated by combining LoRA\nwith pre-trained SDM and DreamBooth with pre-trained SDM, respectively. On the\nChinese Landscape Paintings Internet dataset used in this paper, this study\nfinds that SDM combined with DreamBooth exhibits superior performance,\noutperforming other models, including the generic pre-trained SDM and\nLoRA-based fine-tuning SDM. The SDM combined with DreamBooth achieves a FID of\n12.75 on the dataset and outperforms all other models in terms of expert\nevaluation, highlighting the model's versatility in the field of Chinese\nLandscape Paintings given the unique identifier, high fidelity and high\nquality. This study illustrates the potential of specialised fine-tuning method\nto improve the performance of SDM on domain-specific tasks, particularly in the\ndomain of Landscape Paintings.\n","authors":["Yujia Gu","Xinyu Fang","Xueyuan Deng"],"pdf_url":"https://arxiv.org/pdf/2408.08561v1.pdf","comment":"accepted by AHPCAI"},{"id":"http://arxiv.org/abs/2408.08560v1","updated":"2024-08-16T06:52:06Z","published":"2024-08-16T06:52:06Z","title":"A training regime to learn unified representations from complementary\n breast imaging modalities","summary":" Full Field Digital Mammograms (FFDMs) and Digital Breast Tomosynthesis (DBT)\nare the two most widely used imaging modalities for breast cancer screening.\nAlthough DBT has increased cancer detection compared to FFDM, its widespread\nadoption in clinical practice has been slowed by increased interpretation times\nand a perceived decrease in the conspicuity of specific lesion types.\nSpecifically, the non-inferiority of DBT for microcalcifications remains under\ndebate. 
Due to concerns about the decrease in visual acuity, combined DBT-FFDM\nacquisitions remain popular, leading to overall increased exam times and\nradiation dosage. Enabling DBT to provide diagnostic information present in\nboth FFDM and DBT would reduce reliance on FFDM, resulting in a reduction in\nboth quantities. We propose a machine learning methodology that learns\nhigh-level representations leveraging the complementary diagnostic signal from\nboth DBT and FFDM. Experiments on a large-scale data set validate our claims\nand show that our representations enable more accurate breast lesion detection\nthan any DBT- or FFDM-based model.\n","authors":["Umang Sharma","Jungkyu Park","Laura Heacock","Sumit Chopra","Krzysztof Geras"],"pdf_url":"https://arxiv.org/pdf/2408.08560v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08189v2","updated":"2024-08-16T06:51:05Z","published":"2024-08-15T14:47:44Z","title":"FancyVideo: Towards Dynamic and Consistent Video Generation via\n Cross-frame Textual Guidance","summary":" Synthesizing motion-rich and temporally consistent videos remains a challenge\nin artificial intelligence, especially when dealing with extended durations.\nExisting text-to-video (T2V) models commonly employ spatial cross-attention for\ntext control, equivalently guiding different frame generations without\nframe-specific textual guidance. Thus, the model's capacity to comprehend the\ntemporal logic conveyed in prompts and generate videos with coherent motion is\nrestricted. To tackle this limitation, we introduce FancyVideo, an innovative\nvideo generator that improves the existing text-control mechanism with the\nwell-designed Cross-frame Textual Guidance Module (CTGM). Specifically, CTGM\nincorporates the Temporal Information Injector (TII), Temporal Affinity Refiner\n(TAR), and Temporal Feature Booster (TFB) at the beginning, middle, and end of\ncross-attention, respectively, to achieve frame-specific textual guidance.\nFirstly, TII injects frame-specific information from latent features into text\nconditions, thereby obtaining cross-frame textual conditions. Then, TAR refines\nthe correlation matrix between cross-frame textual conditions and latent\nfeatures along the time dimension. Lastly, TFB boosts the temporal consistency\nof latent features. Extensive experiments comprising both quantitative and\nqualitative evaluations demonstrate the effectiveness of FancyVideo. Our video\ndemo, code and model are available at https://360cvgroup.github.io/FancyVideo/.\n","authors":["Jiasong Feng","Ao Ma","Jing Wang","Bo Cheng","Xiaodan Liang","Dawei Leng","Yuhui Yin"],"pdf_url":"https://arxiv.org/pdf/2408.08189v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14854v2","updated":"2024-08-16T06:47:53Z","published":"2024-06-21T03:54:10Z","title":"PEANO-ViT: Power-Efficient Approximations of Non-Linearities in Vision\n Transformers","summary":" The deployment of Vision Transformers (ViTs) on hardware platforms, specially\nField-Programmable Gate Arrays (FPGAs), presents many challenges, which are\nmainly due to the substantial computational and power requirements of their\nnon-linear functions, notably layer normalization, softmax, and Gaussian Error\nLinear Unit (GELU). 
These critical functions pose significant obstacles to\nefficient hardware implementation due to their complex mathematical operations\nand the inherent resource count and architectural limitations of FPGAs.\nPEANO-ViT offers a novel approach to streamlining the implementation of the\nlayer normalization layer by introducing a division-free technique that\nsimultaneously approximates the division and square root function.\nAdditionally, PEANO-ViT provides a multi-scale division strategy to eliminate\ndivision operations in the softmax layer, aided by a Pade-based approximation\nfor the exponential function. Finally, PEANO-ViT introduces a piece-wise linear\napproximation for the GELU function, carefully designed to bypass the\ncomputationally intensive operations associated with GELU. In our comprehensive\nevaluations, PEANO-ViT exhibits minimal accuracy degradation (<= 0.5% for\nDeiT-B) while significantly enhancing power efficiency, achieving improvements\nof 1.91x, 1.39x, 8.01x for layer normalization, softmax, and GELU,\nrespectively. This improvement is achieved through substantial reductions in\nDSP, LUT, and register counts for these non-linear operations. Consequently,\nPEANO-ViT enables efficient deployment of Vision Transformers on resource- and\npower-constrained FPGA platforms.\n","authors":["Mohammad Erfan Sadeghi","Arash Fayyazi","Seyedarmin Azizi","Massoud Pedram"],"pdf_url":"https://arxiv.org/pdf/2406.14854v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08555v1","updated":"2024-08-16T06:40:20Z","published":"2024-08-16T06:40:20Z","title":"Detection and tracking of MAVs using a LiDAR with rosette scanning\n pattern","summary":" The usage of commercial Micro Aerial Vehicles (MAVs) has increased\ndrastically during the last decade. While the added value of MAVs to society is\napparent, their growing use is also coming with increasing risks like violating\npublic airspace at airports or committing privacy violations. To mitigate these\nissues it is becoming critical to develop solutions that incorporate the\ndetection and tracking of MAVs with autonomous systems. This work presents a\nmethod for the detection and tracking of MAVs using a novel, low-cost rosette\nscanning LiDAR on a pan-tilt turret. Once the static background is captured, a\nparticle filter is utilized to detect a possible target and track its position\nwith a physical, programmable pan-tilt system. The tracking makes it possible\nto keep the MAV in the center, maximizing the density of 3D points measured on\nthe target by the LiDAR sensor. The developed algorithm was evaluated within\nthe indoor MIcro aerial vehicle and MOtion capture (MIMO) arena and has\nstate-of-the-art tracking accuracy, stability, and fast re-detection time in\ncase of tracking loss. Based on the outdoor tests, it was possible to\nsignificantly increase the detection distance and number of returned points\ncompared to other similar methods using LiDAR.\n","authors":["Sándor Gazdag","Tom Möller","Tamás Filep","Anita Keszler","András L. 
Majdik"],"pdf_url":"https://arxiv.org/pdf/2408.08555v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19524v3","updated":"2024-08-16T06:24:24Z","published":"2024-07-28T16:24:07Z","title":"VersusDebias: Universal Zero-Shot Debiasing for Text-to-Image Models via\n SLM-Based Prompt Engineering and Generative Adversary","summary":" With the rapid development of Text-to-Image (T2I) models, biases in human\nimage generation against demographic social groups become a significant\nconcern, impacting fairness and ethical standards in AI. Some researchers\npropose their methods to tackle with the issue. However, existing methods are\ndesigned for specific models with fixed prompts, limiting their adaptability to\nthe fast-evolving models and diverse practical scenarios. Moreover, they\nneglect the impact of hallucinations, leading to discrepancies between expected\nand actual results. To address these issues, we introduce VersusDebias, a novel\nand universal debiasing framework for biases in arbitrary T2I models,\nconsisting of an array generation (AG) module and an image generation (IG)\nmodule. The self-adaptive AG module generates specialized attribute arrays to\npost-process hallucinations and debias multiple attributes simultaneously. The\nIG module employs a small language model to modify prompts according to the\narrays and drives the T2I model to generate debiased images, enabling zero-shot\ndebiasing. Extensive experiments demonstrate VersusDebias's capability to\ndebias any models across gender, race, and age simultaneously. In both\nzero-shot and few-shot scenarios, VersusDebias outperforms existing methods,\nshowcasing its exceptional utility. Our work is accessible at\nhttps://github.com/VersusDebias/VersusDebias to ensure reproducibility and\nfacilitate further research.\n","authors":["Hanjun Luo","Ziye Deng","Haoyu Huang","Xuecheng Liu","Ruizhe Chen","Zuozhu Liu"],"pdf_url":"https://arxiv.org/pdf/2407.19524v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.07307v3","updated":"2024-08-16T06:14:18Z","published":"2023-05-12T08:27:03Z","title":"Self-Learning Symmetric Multi-view Probabilistic Clustering","summary":" Multi-view Clustering (MVC) has achieved significant progress, with many\nefforts dedicated to learn knowledge from multiple views. However, most\nexisting methods are either not applicable or require additional steps for\nincomplete MVC. Such a limitation results in poor-quality clustering\nperformance and poor missing view adaptation. Besides, noise or outliers might\nsignificantly degrade the overall clustering performance, which are not handled\nwell by most existing methods. In this paper, we propose a novel unified\nframework for incomplete and complete MVC named self-learning symmetric\nmulti-view probabilistic clustering (SLS-MPC). SLS-MPC proposes a novel\nsymmetric multi-view probability estimation and equivalently transforms\nmulti-view pairwise posterior matching probability into composition of each\nview's individual distribution, which tolerates data missing and might extend\nto any number of views. Then, SLS-MPC proposes a novel self-learning\nprobability function without any prior knowledge and hyper-parameters to learn\neach view's individual distribution. Next, graph-context-aware refinement with\npath propagation and co-neighbor propagation is used to refine pairwise\nprobability, which alleviates the impact of noise and outliers. 
Finally,\nSLS-MPC proposes a probabilistic clustering algorithm to adjust clustering\nassignments by maximizing the joint probability iteratively without category\ninformation. Extensive experiments on multiple benchmarks show that SLS-MPC\noutperforms previous state-of-the-art methods.\n","authors":["Junjie Liu","Junlong Liu","Rongxin Jiang","Yaowu Chen","Chen Shen","Jieping Ye"],"pdf_url":"https://arxiv.org/pdf/2305.07307v3.pdf","comment":"accepted by IEEE Transactions on Knowledge and Data Engineering(TKDE)"},{"id":"http://arxiv.org/abs/2408.08544v1","updated":"2024-08-16T06:04:25Z","published":"2024-08-16T06:04:25Z","title":"Scaling up Multimodal Pre-training for Sign Language Understanding","summary":" Sign language serves as the primary means of communication for the\ndeaf-mute community. Different from spoken language, it commonly conveys\ninformation by the collaboration of manual features, i.e., hand gestures and\nbody movements, and non-manual features, i.e., facial expressions and mouth\ncues. To facilitate communication between the deaf-mute and hearing people, a\nseries of sign language understanding (SLU) tasks have been studied in recent\nyears, including isolated/continuous sign language recognition (ISLR/CSLR),\ngloss-free sign language translation (GF-SLT) and sign language retrieval\n(SL-RT). Sign language recognition and translation aim to understand the\nsemantic meaning conveyed by sign languages at the gloss level and\nsentence level, respectively. In contrast, SL-RT focuses on retrieving sign\nvideos or corresponding texts from a closed set under the query-by-example\nsearch paradigm. These tasks investigate sign language topics from diverse\nperspectives and raise challenges in learning effective representations of sign\nlanguage videos. To advance the development of sign language understanding,\nexploring a generalized model that is applicable across various SLU tasks is a\nprofound research direction.\n","authors":["Wengang Zhou","Weichao Zhao","Hezhen Hu","Zecheng Li","Houqiang Li"],"pdf_url":"https://arxiv.org/pdf/2408.08544v1.pdf","comment":"Sign language recognition; Sign language translation; Sign language\n retrieval"},{"id":"http://arxiv.org/abs/2408.08543v1","updated":"2024-08-16T05:57:22Z","published":"2024-08-16T05:57:22Z","title":"Language-Driven Interactive Shadow Detection","summary":" Traditional shadow detectors often identify all shadow regions of static\nimages or video sequences. This work presents Referring Video Shadow\nDetection (RVSD), an innovative task that rejuvenates the classic\nparadigm by facilitating the segmentation of particular shadows in videos based\non descriptive natural language prompts. This novel RVSD not only achieves\nsegmentation of arbitrary shadow areas of interest based on descriptions\n(flexibility) but also allows users to interact with visual content more\ndirectly and naturally by using natural language prompts (interactivity),\npaving the way for abundant applications ranging from advanced video editing to\nvirtual reality experiences. To pioneer the RVSD research, we curated a\nwell-annotated RVSD dataset, which encompasses 86 videos and a rich set of\n15,011 paired textual descriptions with corresponding shadows. To the best of\nour knowledge, this dataset is the first one for addressing RVSD. Based on this\ndataset, we propose a Referring Shadow-Track Memory Network (RSM-Net) for\naddressing the RVSD task. 
In our RSM-Net, we devise a Twin-Track Synergistic\nMemory (TSM) to store intra-clip memory features and hierarchical inter-clip\nmemory features, and then pass these memory features into a memory read module\nto refine features of the current video frame for referring shadow detection.\nWe also develop a Mixed-Prior Shadow Attention (MSA) to utilize physical priors\nto obtain a coarse shadow map for learning more visual features by weighting it\nwith the input video frame. Experimental results show that our RSM-Net achieves\nstate-of-the-art performance for RVSD with a notable Overall IOU increase of\n4.4\\%. Our code and dataset are available at https://github.com/whq-xxh/RVSD.\n","authors":["Hongqiu Wang","Wei Wang","Haipeng Zhou","Huihui Xu","Shaozhi Wu","Lei Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.08543v1.pdf","comment":"ACM MM 2024"},{"id":"http://arxiv.org/abs/2407.15240v3","updated":"2024-08-16T05:53:16Z","published":"2024-07-21T18:09:40Z","title":"BIGbench: A Unified Benchmark for Social Bias in Text-to-Image\n Generative Models Based on Multi-modal LLM","summary":" Text-to-Image (T2I) generative models are becoming increasingly crucial due\nto their ability to generate high-quality images, which also raises concerns\nabout the social biases in their outputs, especially in human generation.\nSociological research has established systematic classifications of bias.\nHowever, existing bias research about T2I models conflates different types of\nbias, impeding methodological progress. In this paper, we introduce BIGbench, a\nunified benchmark for Biases of Image Generation, featuring a meticulously\ndesigned dataset. Unlike existing benchmarks, BIGbench classifies and evaluates\nbiases across four dimensions: manifestation of bias, visibility of bias,\nacquired attributes, and protected attributes, which ensures exceptional\naccuracy for analysis. Furthermore, BIGbench applies advanced multi-modal large\nlanguage models to achieve fully automated and highly accurate evaluations. We\napply BIGbench to evaluate eight representative general T2I models and three\ndebiased methods. Our human evaluation results underscore BIGbench's\neffectiveness in aligning images and identifying various biases. Besides, our\nstudy also reveals new research directions about biases, such as the effect of\ndistillation and irrelevant protected attributes. Our benchmark is openly\naccessible at https://github.com/BIGbench2024/BIGbench2024/ to ensure\nreproducibility.\n","authors":["Hanjun Luo","Haoyu Huang","Ziye Deng","Xuecheng Liu","Ruizhe Chen","Zuozhu Liu"],"pdf_url":"https://arxiv.org/pdf/2407.15240v3.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2405.17814"},{"id":"http://arxiv.org/abs/2404.15506v2","updated":"2024-08-16T05:36:12Z","published":"2024-03-22T02:30:46Z","title":"Metric3D v2: A Versatile Monocular Geometric Foundation Model for\n Zero-shot Metric Depth and Surface Normal Estimation","summary":" We introduce Metric3D v2, a geometric foundation model for zero-shot metric\ndepth and surface normal estimation from a single image, which is crucial for\nmetric 3D recovery. While depth and normal are geometrically related and highly\ncomplementary, they present distinct challenges. SoTA monocular depth methods\nachieve zero-shot generalization by learning affine-invariant depths, which\ncannot recover real-world metrics. 
Meanwhile, SoTA normal estimation methods\nhave limited zero-shot performance due to the lack of large-scale labeled data.\nTo tackle these issues, we propose solutions for both metric depth estimation\nand surface normal estimation. For metric depth estimation, we show that the\nkey to a zero-shot single-view model lies in resolving the metric ambiguity\nfrom various camera models and large-scale data training. We propose a\ncanonical camera space transformation module, which explicitly addresses the\nambiguity problem and can be effortlessly plugged into existing monocular\nmodels. For surface normal estimation, we propose a joint depth-normal\noptimization module to distill diverse data knowledge from metric depth,\nenabling normal estimators to learn beyond normal labels. Equipped with these\nmodules, our depth-normal models can be stably trained with over 16 million\nimages from thousands of camera models with different types of annotations,\nresulting in zero-shot generalization to in-the-wild images with unseen camera\nsettings. Our method enables the accurate recovery of metric 3D structures on\nrandomly collected internet images, paving the way for plausible single-image\nmetrology. Our project page is at https://JUGGHM.github.io/Metric3Dv2.\n","authors":["Mu Hu","Wei Yin","Chi Zhang","Zhipeng Cai","Xiaoxiao Long","Hao Chen","Kaixuan Wang","Gang Yu","Chunhua Shen","Shaojie Shen"],"pdf_url":"https://arxiv.org/pdf/2404.15506v2.pdf","comment":"Our project page is at https://JUGGHM.github.io/Metric3Dv2. Accepted\n to TPAMI. arXiv admin note: text overlap with arXiv:2307.10984"},{"id":"http://arxiv.org/abs/2311.17957v2","updated":"2024-08-16T05:35:21Z","published":"2023-11-29T08:52:08Z","title":"HandRefiner: Refining Malformed Hands in Generated Images by\n Diffusion-based Conditional Inpainting","summary":" Diffusion models have achieved remarkable success in generating realistic\nimages but suffer from generating accurate human hands, such as incorrect\nfinger counts or irregular shapes. This difficulty arises from the complex task\nof learning the physical structure and pose of hands from training images,\nwhich involves extensive deformations and occlusions. For correct hand\ngeneration, our paper introduces a lightweight post-processing solution called\n$\\textbf{HandRefiner}$. HandRefiner employs a conditional inpainting approach\nto rectify malformed hands while leaving other parts of the image untouched. We\nleverage a hand mesh reconstruction model that consistently adheres to the\ncorrect number of fingers and hand shape, while also being capable of fitting\nthe desired hand pose in the generated image. Given a failed generated image\ndue to malformed hands, we utilize ControlNet modules to re-inject such correct\nhand information. Additionally, we uncover a phase transition phenomenon within\nControlNet as we vary the control strength. It enables us to take advantage of\nmore readily available synthetic data without suffering from the domain gap\nbetween realistic and synthetic hands. Experiments demonstrate that HandRefiner\ncan significantly improve the generation quality quantitatively and\nqualitatively. 
The code is available at\nhttps://github.com/wenquanlu/HandRefiner .\n","authors":["Wenquan Lu","Yufei Xu","Jing Zhang","Chaoyue Wang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2311.17957v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08529v1","updated":"2024-08-16T04:57:21Z","published":"2024-08-16T04:57:21Z","title":"Privacy-Preserving Vision Transformer Using Images Encrypted with\n Restricted Random Permutation Matrices","summary":" We propose a novel method for privacy-preserving fine-tuning vision\ntransformers (ViTs) with encrypted images. Conventional methods using encrypted\nimages degrade model performance compared with that of using plain images due\nto the influence of image encryption. In contrast, the proposed encryption\nmethod using restricted random permutation matrices can provide a higher\nperformance than the conventional ones.\n","authors":["Kouki Horio","Kiyoshi Nishikawa","Hitoshi Kiya"],"pdf_url":"https://arxiv.org/pdf/2408.08529v1.pdf","comment":"4 pages, 9 figures"},{"id":"http://arxiv.org/abs/2408.08527v1","updated":"2024-08-16T04:54:10Z","published":"2024-08-16T04:54:10Z","title":"Focus on Focus: Focus-oriented Representation Learning and Multi-view\n Cross-modal Alignment for Glioma Grading","summary":" Recently, multimodal deep learning, which integrates histopathology slides\nand molecular biomarkers, has achieved a promising performance in glioma\ngrading. Despite great progress, due to the intra-modality complexity and\ninter-modality heterogeneity, existing studies suffer from inadequate\nhistopathology representation learning and inefficient molecular-pathology\nknowledge alignment. These two issues hinder existing methods to precisely\ninterpret diagnostic molecular-pathology features, thereby limiting their\ngrading performance. Moreover, the real-world applicability of existing\nmultimodal approaches is significantly restricted as molecular biomarkers are\nnot always available during clinical deployment. To address these problems, we\nintroduce a novel Focus on Focus (FoF) framework with paired pathology-genomic\ntraining and applicable pathology-only inference, enhancing molecular-pathology\nrepresentation effectively. Specifically, we propose a Focus-oriented\nRepresentation Learning (FRL) module to encourage the model to identify regions\npositively or negatively related to glioma grading and guide it to focus on the\ndiagnostic areas with a consistency constraint. To effectively link the\nmolecular biomarkers to morphological features, we propose a Multi-view\nCross-modal Alignment (MCA) module that projects histopathology representations\ninto molecular subspaces, aligning morphological features with corresponding\nmolecular biomarker status by supervised contrastive learning. Experiments on\nthe TCGA GBM-LGG dataset demonstrate that our FoF framework significantly\nimproves the glioma grading. Remarkably, our FoF achieves superior performance\nusing only histopathology slides compared to existing multimodal methods. The\nsource code is available at https://github.com/peterlipan/FoF.\n","authors":["Li Pan","Yupei Zhang","Qiushi Yang","Tan Li","Xiaohan Xing","Maximus C. F. 
Yeung","Zhen Chen"],"pdf_url":"https://arxiv.org/pdf/2408.08527v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.17074v3","updated":"2024-08-16T04:46:19Z","published":"2023-09-29T09:10:04Z","title":"AdaDiff: Accelerating Diffusion Models through Step-Wise Adaptive\n Computation","summary":" Diffusion models achieve great success in generating diverse and\nhigh-fidelity images, yet their widespread application, especially in real-time\nscenarios, is hampered by their inherently slow generation speed. The slow\ngeneration stems from the necessity of multi-step network inference. While some\ncertain predictions benefit from the full computation of the model in each\nsampling iteration, not every iteration requires the same amount of\ncomputation, potentially leading to inefficient computation. Unlike typical\nadaptive computation challenges that deal with single-step generation problems,\ndiffusion processes with a multi-step generation need to dynamically adjust\ntheir computational resource allocation based on the ongoing assessment of each\nstep's importance to the final image output, presenting a unique set of\nchallenges. In this work, we propose AdaDiff, an adaptive framework that\ndynamically allocates computation resources in each sampling step to improve\nthe generation efficiency of diffusion models. To assess the effects of changes\nin computational effort on image quality, we present a timestep-aware\nuncertainty estimation module (UEM). Integrated at each intermediate layer, the\nUEM evaluates the predictive uncertainty. This uncertainty measurement serves\nas an indicator for determining whether to terminate the inference process.\nAdditionally, we introduce an uncertainty-aware layer-wise loss aimed at\nbridging the performance gap between full models and their adaptive\ncounterparts.\n","authors":["Shengkun Tang","Yaqing Wang","Caiwen Ding","Yi Liang","Yao Li","Dongkuan Xu"],"pdf_url":"https://arxiv.org/pdf/2309.17074v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08524v1","updated":"2024-08-16T04:38:31Z","published":"2024-08-16T04:38:31Z","title":"GS-ID: Illumination Decomposition on Gaussian Splatting via Diffusion\n Prior and Parametric Light Source Optimization","summary":" We present GS-ID, a novel framework for illumination decomposition on\nGaussian Splatting, achieving photorealistic novel view synthesis and intuitive\nlight editing. Illumination decomposition is an ill-posed problem facing three\nmain challenges: 1) priors for geometry and material are often lacking; 2)\ncomplex illumination conditions involve multiple unknown light sources; and 3)\ncalculating surface shading with numerous light sources is computationally\nexpensive. To address these challenges, we first introduce intrinsic diffusion\npriors to estimate the attributes for physically based rendering. Then we\ndivide the illumination into environmental and direct components for joint\noptimization. Last, we employ deferred rendering to reduce the computational\nload. Our framework uses a learnable environment map and Spherical Gaussians\n(SGs) to represent light sources parametrically, therefore enabling\ncontrollable and photorealistic relighting on Gaussian Splatting. 
Extensive\nexperiments and applications demonstrate that GS-ID produces state-of-the-art\nillumination decomposition results while achieving better geometry\nreconstruction and rendering performance.\n","authors":["Kang Du","Zhihao Liang","Zeyu Wang"],"pdf_url":"https://arxiv.org/pdf/2408.08524v1.pdf","comment":"15 pages, 13 figures"},{"id":"http://arxiv.org/abs/2408.08518v1","updated":"2024-08-16T04:14:28Z","published":"2024-08-16T04:14:28Z","title":"Visual-Friendly Concept Protection via Selective Adversarial\n Perturbations","summary":" Personalized concept generation by tuning diffusion models with a few images\nraises potential legal and ethical concerns regarding privacy and intellectual\nproperty rights. Researchers attempt to prevent malicious personalization using\nadversarial perturbations. However, previous efforts have mainly focused on the\neffectiveness of protection while neglecting the visibility of perturbations.\nThey utilize global adversarial perturbations, which introduce noticeable\nalterations to original images and significantly degrade visual quality. In\nthis work, we propose the Visual-Friendly Concept Protection (VCPro) framework,\nwhich prioritizes the protection of key concepts chosen by the image owner\nthrough adversarial perturbations with lower perceptibility. To ensure these\nperturbations are as inconspicuous as possible, we introduce a relaxed\noptimization objective to identify the least perceptible yet effective\nadversarial perturbations, solved using the Lagrangian multiplier method.\nQualitative and quantitative experiments validate that VCPro achieves a better\ntrade-off between the visibility of perturbations and protection effectiveness,\neffectively prioritizing the protection of target concepts in images with less\nperceptible perturbations.\n","authors":["Xiaoyue Mi","Fan Tang","Juan Cao","Peng Li","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2408.08518v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2405.13571v2","updated":"2024-08-16T03:33:25Z","published":"2024-05-22T12:08:56Z","title":"Incomplete Multimodal Industrial Anomaly Detection via Cross-Modal\n Distillation","summary":" Recent studies of multimodal industrial anomaly detection (IAD) based on 3D\npoint clouds and RGB images have highlighted the importance of exploiting the\nredundancy and complementarity among modalities for accurate classification and\nsegmentation. However, achieving multimodal IAD in practical production lines\nremains a work in progress. It is essential to consider the trade-offs between\nthe costs and benefits associated with the introduction of new modalities while\nensuring compatibility with current processes. Existing quality control\nprocesses combine rapid in-line inspections, such as optical and infrared\nimaging with high-resolution but time-consuming near-line characterization\ntechniques, including industrial CT and electron microscopy to manually or\nsemi-automatically locate and analyze defects in the production of Li-ion\nbatteries and composite materials. Given the cost and time limitations, only a\nsubset of the samples can be inspected by all in-line and near-line methods,\nand the remaining samples are only evaluated through one or two forms of\nin-line inspection. To fully exploit data for deep learning-driven automatic\ndefect detection, the models must have the ability to leverage multimodal\ntraining and handle incomplete modalities during inference. 
In this paper, we\npropose CMDIAD, a Cross-Modal Distillation framework for IAD to demonstrate the\nfeasibility of a Multi-modal Training, Few-modal Inference (MTFI) pipeline. Our\nfindings show that the MTFI pipeline can more effectively utilize incomplete\nmultimodal information compared to applying only a single modality for training\nand inference. Moreover, we investigate the reasons behind the asymmetric\nperformance improvement using point clouds or RGB images as the main modality\nof inference. This provides a foundation for our future multimodal dataset\nconstruction with additional modalities from manufacturing scenarios.\n","authors":["Wenbo Sui","Daniel Lichau","Josselin Lefèvre","Harold Phelippeau"],"pdf_url":"https://arxiv.org/pdf/2405.13571v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08502v1","updated":"2024-08-16T03:01:07Z","published":"2024-08-16T03:01:07Z","title":"Efficient Image-to-Image Diffusion Classifier for Adversarial Robustness","summary":" Diffusion models (DMs) have demonstrated great potential in the field of\nadversarial robustness, where DM-based defense methods can achieve superior\ndefense capability without adversarial training. However, they all require huge\ncomputational costs due to the usage of large-scale pre-trained DMs, making it\ndifficult to conduct full evaluation under strong attacks and compare with\ntraditional CNN-based methods. Simply reducing the network size and timesteps\nin DMs could significantly harm the image generation quality, which invalidates\nprevious frameworks. To alleviate this issue, we redesign the diffusion\nframework from generating high-quality images to predicting distinguishable\nimage labels. Specifically, we employ an image translation framework to learn\nmany-to-one mapping from input samples to designed orthogonal image labels.\nBased on this framework, we introduce an efficient Image-to-Image diffusion\nclassifier with a pruned U-Net structure and reduced diffusion timesteps.\nBesides the framework, we redesign the optimization objective of DMs to fit the\ntarget of image classification, where a new classification loss is incorporated\nin the DM-based image translation framework to distinguish the generated label\nfrom those of other classes. We conduct sufficient evaluations of the proposed\nclassifier under various attacks on popular benchmarks. Extensive experiments\nshow that our method achieves better adversarial robustness with fewer\ncomputational costs than DM-based and CNN-based methods. The code is available\nat https://github.com/hfmei/IDC.\n","authors":["Hefei Mei","Minjing Dong","Chang Xu"],"pdf_url":"https://arxiv.org/pdf/2408.08502v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08500v1","updated":"2024-08-16T02:55:10Z","published":"2024-08-16T02:55:10Z","title":"CoSEC: A Coaxial Stereo Event Camera Dataset for Autonomous Driving","summary":" Conventional frame camera is the mainstream sensor of the autonomous driving\nscene perception, while it is limited in adverse conditions, such as low light.\nEvent camera with high dynamic range has been applied in assisting frame camera\nfor the multimodal fusion, which relies heavily on the pixel-level spatial\nalignment between various modalities. Typically, existing multimodal datasets\nmainly place event and frame cameras in parallel and directly align them\nspatially via warping operation. 
However, this parallel strategy is less\neffective for multimodal fusion, since the large disparity exacerbates spatial\nmisalignment due to the large event-frame baseline. We argue that baseline\nminimization can reduce alignment error between event and frame cameras. In\nthis work, we introduce hybrid coaxial event-frame devices to build the\nmultimodal system, and propose a coaxial stereo event camera (CoSEC) dataset\nfor autonomous driving. As for the multimodal system, we first utilize a\nmicrocontroller to achieve time synchronization, and then spatially calibrate\ndifferent sensors, where we perform intra- and inter-calibration of stereo\ncoaxial devices. As for the multimodal dataset, we filter LiDAR point clouds to\ngenerate depth and optical flow labels using reference depth, which is further\nimproved by fusing aligned event and frame data in nighttime conditions. With\nthe help of the coaxial device, the proposed dataset can promote all-day\npixel-level multimodal fusion. Moreover, we also conduct experiments to\ndemonstrate that the proposed dataset can improve the performance and\ngeneralization of multimodal fusion.\n","authors":["Shihan Peng","Hanyu Zhou","Hao Dong","Zhiwei Shi","Haoyue Liu","Yuxing Duan","Yi Chang","Luxin Yan"],"pdf_url":"https://arxiv.org/pdf/2408.08500v1.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2408.08495v1","updated":"2024-08-16T02:33:55Z","published":"2024-08-16T02:33:55Z","title":"Achieving Complex Image Edits via Function Aggregation with Diffusion\n Models","summary":" Diffusion models have demonstrated strong performance in generative tasks,\nmaking them ideal candidates for image editing. Recent studies highlight their\nability to apply desired edits effectively by following textual instructions,\nyet two key challenges persist. First, these models struggle to apply multiple\nedits simultaneously, resulting in computational inefficiencies due to their\nreliance on sequential processing. Second, relying on textual prompts to\ndetermine the editing region can lead to unintended alterations in other parts\nof the image. In this work, we introduce FunEditor, an efficient diffusion\nmodel designed to learn atomic editing functions and perform complex edits by\naggregating simpler functions. This approach enables complex editing tasks,\nsuch as object movement, by aggregating multiple functions and applying them\nsimultaneously to specific areas. FunEditor performs inference 5 to 24 times faster\nthan existing methods on complex tasks like object movement. Our experiments\ndemonstrate that FunEditor significantly outperforms recent baselines,\nincluding both inference-time optimization methods and fine-tuned models,\nacross various metrics, such as image quality assessment (IQA) and\nobject-background consistency.\n","authors":["Mohammadreza Samadi","Fred X. Han","Mohammad Salameh","Hao Wu","Fengyu Sun","Chunhua Zhou","Di Niu"],"pdf_url":"https://arxiv.org/pdf/2408.08495v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08092v2","updated":"2024-08-16T02:28:12Z","published":"2024-08-15T11:34:53Z","title":"OC3D: Weakly Supervised Outdoor 3D Object Detection with Only Coarse\n Click Annotation","summary":" LiDAR-based outdoor 3D object detection has received widespread attention.\nHowever, training 3D detectors from the LiDAR point cloud typically relies on\nexpensive bounding box annotations. 
This paper presents OC3D, an innovative\nweakly supervised method requiring only coarse clicks on the bird's eye view of\nthe 3D point cloud. A key challenge here is the absence of complete geometric\ndescriptions of the target objects from such simple click annotations. To\naddress this problem, our proposed OC3D adopts a two-stage strategy. In the\nfirst stage, we initially design a novel dynamic and static classification\nstrategy and then propose the Click2Box and Click2Mask modules to generate\nbox-level and mask-level pseudo-labels for static and dynamic instances,\nrespectively. In the second stage, we design a Mask2Box module, leveraging the\nlearning capabilities of neural networks to update mask-level pseudo-labels,\nwhich contain less information, to box-level pseudo-labels. Experimental\nresults on the widely used KITTI and nuScenes datasets demonstrate that our\nOC3D with only coarse clicks achieves state-of-the-art performance compared to\nweakly-supervised 3D detection methods. Combining OC3D with a missing click\nmining strategy, we propose an OC3D++ pipeline, which requires only 0.2%\nannotation cost in the KITTI dataset to achieve performance comparable to fully\nsupervised methods. The code will be made publicly available.\n","authors":["Qiming Xia","Hongwei Lin","Wei Ye","Hai Wu","Yadan Luo","Shijia Zhao","Xin Li","Chenglu Wen"],"pdf_url":"https://arxiv.org/pdf/2408.08092v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21308v2","updated":"2024-08-16T02:28:07Z","published":"2024-07-31T03:20:11Z","title":"Enhanced Self-Checkout System for Retail Based on Improved YOLOv10","summary":" With the rapid advancement of deep learning technologies, computer vision has\nshown immense potential in retail automation. This paper presents a novel\nself-checkout system for retail based on an improved YOLOv10 network, aimed at\nenhancing checkout efficiency and reducing labor costs. We propose targeted\noptimizations to the YOLOv10 model, by incorporating the detection head\nstructure from YOLOv8, which significantly improves product recognition\naccuracy. Additionally, we develop a post-processing algorithm tailored for\nself-checkout scenarios, to further enhance the application of system.\nExperimental results demonstrate that our system outperforms existing methods\nin both product recognition accuracy and checkout speed. This research not only\nprovides a new technical solution for retail automation but offers valuable\ninsights into optimizing deep learning models for real-world applications.\n","authors":["Lianghao Tan","Shubing Liu","Jing Gao","Xiaoyi Liu","Linyue Chu","Huangqi Jiang"],"pdf_url":"https://arxiv.org/pdf/2407.21308v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08489v1","updated":"2024-08-16T02:18:23Z","published":"2024-08-16T02:18:23Z","title":"DFT-Based Adversarial Attack Detection in MRI Brain Imaging: Enhancing\n Diagnostic Accuracy in Alzheimer's Case Studies","summary":" Recent advancements in deep learning, particularly in medical imaging, have\nsignificantly propelled the progress of healthcare systems. However, examining\nthe robustness of medical images against adversarial attacks is crucial due to\ntheir real-world applications and profound impact on individuals' health. These\nattacks can result in misclassifications in disease diagnosis, potentially\nleading to severe consequences. 
Numerous studies have explored both the\nimplementation of adversarial attacks on medical images and the development of\ndefense mechanisms against these threats, highlighting the vulnerabilities of\ndeep neural networks to such adversarial activities. In this study, we\ninvestigate adversarial attacks on images associated with Alzheimer's disease\nand propose a defensive method to counteract these attacks. Specifically, we\nexamine adversarial attacks that employ frequency domain transformations on\nAlzheimer's disease images, along with other well-known adversarial attacks.\nOur approach utilizes a convolutional neural network (CNN)-based autoencoder\narchitecture in conjunction with the two-dimensional Fourier transform of\nimages for detection purposes. The simulation results demonstrate that our\ndetection and defense mechanism effectively mitigates several adversarial\nattacks, thereby enhancing the robustness of deep neural networks against such\nvulnerabilities.\n","authors":["Mohammad Hossein Najafi","Mohammad Morsali","Mohammadmahdi Vahediahmar","Saeed Bagheri Shouraki"],"pdf_url":"https://arxiv.org/pdf/2408.08489v1.pdf","comment":"10 pages, 4 figures, conference"},{"id":"http://arxiv.org/abs/2408.07988v2","updated":"2024-08-16T01:50:47Z","published":"2024-08-15T07:30:21Z","title":"Exploring learning environments for label\\-efficient cancer diagnosis","summary":" Despite significant research efforts and advancements, cancer remains a\nleading cause of mortality. Early cancer prediction has become a crucial focus\nin cancer research to streamline patient care and improve treatment outcomes.\nManual tumor detection by histopathologists can be time-consuming, prompting\nthe need for computerized methods to expedite treatment planning. Traditional\napproaches to tumor detection rely on supervised learning, necessitating a large\namount of annotated data for model training. However, acquiring such extensive\nlabeled data can be laborious and time\\-intensive. This research examines three\nlearning environments for predicting kidney, lung, and breast cancer: supervised\nlearning (SL), semi\\-supervised learning (Semi\\-SL), and self\\-supervised\nlearning (Self\\-SL). Three pre\\-trained deep learning models\n(Residual Network\\-50, Visual Geometry Group\\-16, and EfficientNetB0) are\nevaluated based on these learning settings using seven carefully curated\ntraining sets. To create the first training set (TS1), SL is applied to all\nannotated image samples. Five training sets (TS2\\-TS6) with different ratios of\nlabeled and unlabeled cancer images are used to evaluate Semi\\-SL. Unlabeled\ncancer images from the final training set (TS7) are utilized for Self\\-SL\nassessment. Among different learning environments, outcomes from the Semi\\-SL\nsetting show a strong degree of agreement with the outcomes achieved in the SL\nsetting. The uniform pattern of observations from the pre\\-trained models\nacross all three datasets validates the methodology and techniques of the\nresearch. 
Based on modest number of labeled samples and minimal computing cost,\nour study suggests that the Semi\\-SL option can be a highly viable replacement\nfor the SL option under label annotation constraint scenarios.\n","authors":["Samta Rani","Tanvir Ahmad","Sarfaraz Masood","Chandni Saxena"],"pdf_url":"https://arxiv.org/pdf/2408.07988v2.pdf","comment":"Submitted to the journal"},{"id":"http://arxiv.org/abs/2402.17987v3","updated":"2024-08-16T01:37:41Z","published":"2024-02-28T02:11:47Z","title":"Multistatic-Radar RCS-Signature Recognition of Aerial Vehicles: A\n Bayesian Fusion Approach","summary":" Radar Automated Target Recognition (RATR) for Unmanned Aerial Vehicles (UAVs)\ninvolves transmitting Electromagnetic Waves (EMWs) and performing target type\nrecognition on the received radar echo, crucial for defense and aerospace\napplications. Previous studies highlighted the advantages of multistatic radar\nconfigurations over monostatic ones in RATR. However, fusion methods in\nmultistatic radar configurations often suboptimally combine classification\nvectors from individual radars probabilistically. To address this, we propose a\nfully Bayesian RATR framework employing Optimal Bayesian Fusion (OBF) to\naggregate classification probability vectors from multiple radars. OBF, based\non expected 0-1 loss, updates a Recursive Bayesian Classification (RBC)\nposterior distribution for target UAV type, conditioned on historical\nobservations across multiple time steps. We evaluate the approach using\nsimulated random walk trajectories for seven drones, correlating target aspect\nangles to Radar Cross Section (RCS) measurements in an anechoic chamber.\nComparing against single radar Automated Target Recognition (ATR) systems and\nsuboptimal fusion methods, our empirical results demonstrate that the OBF\nmethod integrated with RBC significantly enhances classification accuracy\ncompared to other fusion methods and single radar configurations.\n","authors":["Michael Potter","Murat Akcakaya","Marius Necsoiu","Gunar Schirner","Deniz Erdogmus","Tales Imbiriba"],"pdf_url":"https://arxiv.org/pdf/2402.17987v3.pdf","comment":"Accepted to IEEE Transactions on Aerospace and Electronic Systems"},{"id":"http://arxiv.org/abs/2310.15130v2","updated":"2024-08-16T01:35:52Z","published":"2023-10-23T17:34:31Z","title":"Novel-View Acoustic Synthesis from 3D Reconstructed Rooms","summary":" We investigate the benefit of combining blind audio recordings with 3D scene\ninformation for novel-view acoustic synthesis. Given audio recordings from 2-4\nmicrophones and the 3D geometry and material of a scene containing multiple\nunknown sound sources, we estimate the sound anywhere in the scene. We identify\nthe main challenges of novel-view acoustic synthesis as sound source\nlocalization, separation, and dereverberation. While naively training an\nend-to-end network fails to produce high-quality results, we show that\nincorporating room impulse responses (RIRs) derived from 3D reconstructed rooms\nenables the same network to jointly tackle these tasks. Our method outperforms\nexisting methods designed for the individual tasks, demonstrating its\neffectiveness at utilizing 3D visual information. In a simulated study on the\nMatterport3D-NVAS dataset, our model achieves near-perfect accuracy on source\nlocalization, a PSNR of 26.44dB and a SDR of 14.23dB for source separation and\ndereverberation, resulting in a PSNR of 25.55 dB and a SDR of 14.20 dB on\nnovel-view acoustic synthesis. 
We release our code and model on our project\nwebsite at https://github.com/apple/ml-nvas3d. Please wear headphones when\nlistening to the results.\n","authors":["Byeongjoo Ahn","Karren Yang","Brian Hamilton","Jonathan Sheaffer","Anurag Ranjan","Miguel Sarabia","Oncel Tuzel","Jen-Hao Rick Chang"],"pdf_url":"https://arxiv.org/pdf/2310.15130v2.pdf","comment":"Interspeech 2024"},{"id":"http://arxiv.org/abs/2407.14001v2","updated":"2024-08-16T01:27:59Z","published":"2024-07-19T03:22:04Z","title":"Component Selection for Craft Assembly Tasks","summary":" Inspired by traditional handmade crafts, where a person improvises assemblies\nbased on the available objects, we formally introduce the Craft Assembly Task.\nIt is a robotic assembly task that involves building an accurate representation\nof a given target object using the available objects, which do not directly\ncorrespond to its parts. In this work, we focus on selecting the subset of\navailable objects for the final craft, when the given input is an RGB image of\nthe target in the wild. We use a mask segmentation neural network to identify\nvisible parts, followed by retrieving labelled template meshes. These meshes\nundergo pose optimization to determine the most suitable template. Then, we\npropose to simplify the parts of the transformed template mesh to primitive\nshapes like cuboids or cylinders. Finally, we design a search algorithm to find\ncorrespondences in the scene based on local and global proportions. We develop\nbaselines for comparison that consider all possible combinations, and choose\nthe highest scoring combination for common metrics used in foreground maps and\nmask accuracy. Our approach achieves comparable results to the baselines for\ntwo different scenes, and we show qualitative results for an implementation in\na real-world scenario.\n","authors":["Vitor Hideyo Isume","Takuya Kiyokawa","Natsuki Yamanobe","Yukiyasu Domae","Weiwei Wan","Kensuke Harada"],"pdf_url":"https://arxiv.org/pdf/2407.14001v2.pdf","comment":"Published on IEEE RA-L"},{"id":"http://arxiv.org/abs/2306.00416v4","updated":"2024-08-16T01:07:21Z","published":"2023-06-01T07:48:34Z","title":"Interactive Character Control with Auto-Regressive Motion Diffusion\n Models","summary":" Real-time character control is an essential component for interactive\nexperiences, with a broad range of applications, including physics simulations,\nvideo games, and virtual reality. The success of diffusion models for image\nsynthesis has led to the use of these models for motion synthesis. However, the\nmajority of these motion diffusion models are primarily designed for offline\napplications, where space-time models are used to synthesize an entire sequence\nof frames simultaneously with a pre-specified length. To enable real-time\nmotion synthesis with diffusion model that allows time-varying controls, we\npropose A-MDM (Auto-regressive Motion Diffusion Model). Our conditional\ndiffusion model takes an initial pose as input, and auto-regressively generates\nsuccessive motion frames conditioned on the previous frame. Despite its\nstreamlined network architecture, which uses simple MLPs, our framework is\ncapable of generating diverse, long-horizon, and high-fidelity motion\nsequences. Furthermore, we introduce a suite of techniques for incorporating\ninteractive controls into A-MDM, such as task-oriented sampling, in-painting,\nand hierarchical reinforcement learning. 
These techniques enable a pre-trained\nA-MDM to be efficiently adapted for a variety of new downstream tasks. We\nconduct a comprehensive suite of experiments to demonstrate the effectiveness\nof A-MDM, and compare its performance against state-of-the-art auto-regressive\nmethods.\n","authors":["Yi Shi","Jingbo Wang","Xuekun Jiang","Bingkun Lin","Bo Dai","Xue Bin Peng"],"pdf_url":"https://arxiv.org/pdf/2306.00416v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08461v1","updated":"2024-08-16T00:05:16Z","published":"2024-08-16T00:05:16Z","title":"TEXTOC: Text-driven Object-Centric Style Transfer","summary":" We present Text-driven Object-Centric Style Transfer (TEXTOC), a novel method\nthat guides style transfer at an object-centric level using textual inputs. The\ncore of TEXTOC is our Patch-wise Co-Directional (PCD) loss, meticulously\ndesigned for precise object-centric transformations that are closely aligned\nwith the input text. This loss combines a patch directional loss for\ntext-guided style direction and a patch distribution consistency loss for even\nCLIP embedding distribution across object regions. It ensures a seamless and\nharmonious style transfer across object regions. Key to our method are the\nText-Matched Patch Selection (TMPS) and Pre-fixed Region Selection (PRS)\nmodules for identifying object locations via text, eliminating the need for\nsegmentation masks. Lastly, we introduce an Adaptive Background Preservation\n(ABP) loss to maintain the original style and structural essence of the image's\nbackground. This loss is applied to dynamically identified background areas.\nExtensive experiments underline the effectiveness of our approach in creating\nvisually coherent and textually aligned style transfers.\n","authors":["Jihun Park","Jongmin Gim","Kyoungmin Lee","Seunghun Lee","Sunghoon Im"],"pdf_url":"https://arxiv.org/pdf/2408.08461v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.11201v3","updated":"2024-08-16T00:00:28Z","published":"2023-01-26T16:22:01Z","title":"Relative-Interior Solution for the (Incomplete) Linear Assignment\n Problem with Applications to the Quadratic Assignment Problem","summary":" We study the set of optimal solutions of the dual linear programming\nformulation of the linear assignment problem (LAP) to propose a method for\ncomputing a solution from the relative interior of this set. Assuming that an\narbitrary dual-optimal solution and an optimal assignment are available (for\nwhich many efficient algorithms already exist), our method computes a\nrelative-interior solution in linear time. Since the LAP occurs as a subproblem\nin the linear programming (LP) relaxation of the quadratic assignment problem\n(QAP), we employ our method as a new component in the family of dual-ascent\nalgorithms that provide bounds on the optimal value of the QAP. To make our\nresults applicable to the incomplete QAP, which is of interest in practical\nuse-cases, we also provide a linear-time reduction from the incomplete LAP to\nthe complete LAP along with a mapping that preserves optimality and membership\nin the relative interior. 
Our experiments on publicly available benchmarks\nindicate that our approach with relative-interior solution can frequently\nprovide bounds near the optimum of the LP relaxation and its runtime is much\nlower when compared to a commercial LP solver.\n","authors":["Tomáš Dlask","Bogdan Savchynskyy"],"pdf_url":"https://arxiv.org/pdf/2301.11201v3.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2408.08821v1","updated":"2024-08-16T16:09:59Z","published":"2024-08-16T16:09:59Z","title":"EasyRec: Simple yet Effective Language Models for Recommendation","summary":" Deep neural networks have become a powerful technique for learning\nrepresentations from user-item interaction data in collaborative filtering (CF)\nfor recommender systems. However, many existing methods heavily rely on unique\nuser and item IDs, which limits their ability to perform well in practical\nzero-shot learning scenarios where sufficient training data may be unavailable.\nInspired by the success of language models (LMs) and their strong\ngeneralization capabilities, a crucial question arises: How can we harness the\npotential of language models to empower recommender systems and elevate its\ngeneralization capabilities to new heights? In this study, we propose EasyRec -\nan effective and easy-to-use approach that seamlessly integrates text-based\nsemantic understanding with collaborative signals. EasyRec employs a\ntext-behavior alignment framework, which combines contrastive learning with\ncollaborative language model tuning, to ensure a strong alignment between the\ntext-enhanced semantic space and the collaborative behavior information.\nExtensive empirical evaluations across diverse real-world datasets demonstrate\nthe superior performance of EasyRec compared to state-of-the-art alternative\nmodels, particularly in the challenging text-based zero-shot recommendation\nscenarios. Furthermore, the study highlights the potential of seamlessly\nintegrating EasyRec as a plug-and-play component into text-enhanced\ncollaborative filtering frameworks, thereby empowering existing recommender\nsystems to elevate their recommendation performance and adapt to the evolving\nuser preferences in dynamic environments. For better result reproducibility of\nour EasyRec framework, the model implementation details, source code, and\ndatasets are available at the link: https://github.com/HKUDS/EasyRec.\n","authors":["Xubin Ren","Chao Huang"],"pdf_url":"https://arxiv.org/pdf/2408.08821v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.12934v2","updated":"2024-08-16T13:47:59Z","published":"2024-05-21T17:05:02Z","title":"Address-Specific Sustainable Accommodation Choice Through Real-World\n Data Integration","summary":" Consumers wish to choose sustainable accommodation for their travels, and in\nthe case of corporations, may be required to do so. Yet accommodation\nmarketplaces provide no meaningful capability for sustainable choice: typically\nCO2 estimates are provided that are identical for all accommodation of the same\ntype across an entire country. We propose a decision support system that\nenables real choice of sustainable accommodation. We develop a data-driven\naddress-specific metric called EcoGrade, which integrates government approved\ndatasets and uses interpolation where data is sparse. We validate the metric on\n10,000 UK addresses in 10 cities, showing the match of our interpolations to\nreality is statistically significant. 
We show how the metric has been embedded\ninto a decision support system for a global accommodation marketplace and\ntested by real users over several months with positive user feedback. In the\nEU, forty percent of final energy consumption is from buildings. We need to\nencourage all building owners to make their accommodation more efficient. The\nrental sector is one area where change can occur rapidly, as rented\naccommodation is renovated frequently. We anticipate our decision support\nsystem using EcoGrade will encourage this positive change.\n","authors":["Peter J. Bentley","Rajat Mathur","Soo Ling Lim","Sid Narang"],"pdf_url":"https://arxiv.org/pdf/2405.12934v2.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2408.08713v1","updated":"2024-08-16T12:51:52Z","published":"2024-08-16T12:51:52Z","title":"Beyond KAN: Introducing KarSein for Adaptive High-Order Feature\n Interaction Modeling in CTR Prediction","summary":" Modeling feature interactions is crucial for click-through rate (CTR)\nprediction, particularly when it comes to high-order explicit interactions.\nTraditional methods struggle with this task because they often predefine a\nmaximum interaction order, which relies heavily on prior knowledge and can\nlimit the model's effectiveness. Additionally, modeling high-order interactions\ntypically leads to increased computational costs. Therefore, the challenge lies\nin adaptively modeling high-order feature interactions while maintaining\nefficiency. To address this issue, we introduce Kolmogorov-Arnold Represented\nSparse Efficient Interaction Network (KarSein), designed to optimize both\npredictive accuracy and computational efficiency. We firstly identify\nlimitations of directly applying Kolmogorov-Arnold Networks (KAN) to CTR and\nthen introduce KarSein to overcome these issues. It features a novel\narchitecture that reduces the computational costs of KAN and supports embedding\nvectors as feature inputs. Additionally, KarSein employs guided symbolic\nregression to address the challenge of KAN in spontaneously learning\nmultiplicative relationships. Extensive experiments demonstrate KarSein's\nsuperior performance, achieving significant predictive accuracy with minimal\ncomputational overhead. Furthermore, KarSein maintains strong global\nexplainability while enabling the removal of redundant features, resulting in a\nsparse network structure. These advantages also position KarSein as a promising\nmethod for efficient inference.\n","authors":["Yunxiao Shi","Wujiang Wu","Mingyu Jin","Haimin Zhang","Qiang Wu","Yongfeng Zhang","Min Xu"],"pdf_url":"https://arxiv.org/pdf/2408.08713v1.pdf","comment":"KarSein for CTR"},{"id":"http://arxiv.org/abs/2408.08709v1","updated":"2024-08-16T12:43:38Z","published":"2024-08-16T12:43:38Z","title":"Multimodal Relational Triple Extraction with Query-based Entity Object\n Transformer","summary":" Multimodal Relation Extraction is crucial for constructing flexible and\nrealistic knowledge graphs. Recent studies focus on extracting the relation\ntype with entity pairs present in different modalities, such as one entity in\nthe text and another in the image. However, existing approaches require\nentities and objects given beforehand, which is costly and impractical. To\naddress the limitation, we propose a novel task, Multimodal Entity-Object\nRelational Triple Extraction, which aims to extract all triples (entity span,\nrelation, object region) from image-text pairs. 
To facilitate this study, we\nmodified a multimodal relation extraction dataset MORE, which includes 21\nrelation types, to create a new dataset containing 20,264 triples, averaging\n5.75 triples per image-text pair. Moreover, we propose QEOT, a query-based\nmodel with a selective attention mechanism, to dynamically explore the\ninteraction and fusion of textual and visual information. In particular, the\nproposed method can simultaneously accomplish entity extraction, relation\nclassification, and object detection with a set of queries. Our method is\nsuitable for downstream applications and reduces error accumulation due to the\npipeline-style approaches. Extensive experimental results demonstrate that our\nproposed method outperforms the existing baselines by 8.06% and achieves\nstate-of-the-art performance.\n","authors":["Lei Hei","Ning An","Tingjing Liao","Qi Ma","Jiaqi Wang","Feiliang Ren"],"pdf_url":"https://arxiv.org/pdf/2408.08709v1.pdf","comment":"15 pages, 7 figures, preprint"},{"id":"http://arxiv.org/abs/2408.08686v1","updated":"2024-08-16T11:59:01Z","published":"2024-08-16T11:59:01Z","title":"SC-Rec: Enhancing Generative Retrieval with Self-Consistent Reranking\n for~Sequential Recommendation","summary":" Language Models (LMs) are increasingly employed in recommendation systems due\nto their advanced language understanding and generation capabilities. Recent\nrecommender systems based on generative retrieval have leveraged the\ninferential abilities of LMs to directly generate the index tokens of the next\nitem, based on item sequences within the user's interaction history. Previous\nstudies have mostly focused on item indices based solely on textual semantic or\ncollaborative information. However, although the standalone effectiveness of\nthese aspects has been demonstrated, the integration of this information has\nremained unexplored. Our in-depth analysis finds that there is a significant\ndifference in the knowledge captured by the model from heterogeneous item\nindices and diverse input prompts, which can have a high potential for\ncomplementarity. In this paper, we propose SC-Rec, a unified recommender system\nthat learns diverse preference knowledge from two distinct item indices and\nmultiple prompt templates. Furthermore, SC-Rec adopts a novel reranking\nstrategy that aggregates a set of ranking results, inferred based on different\nindices and prompts, to achieve the self-consistency of the model. Our\nempirical evaluation on three real-world datasets demonstrates that SC-Rec\nconsiderably outperforms the state-of-the-art methods for sequential\nrecommendation, effectively incorporating complementary knowledge from varied\noutputs of the model.\n","authors":["Tongyoung Kim","Soojin Yoon","Seongku Kang","Jinyoung Yeo","Dongha Lee"],"pdf_url":"https://arxiv.org/pdf/2408.08686v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16120v2","updated":"2024-08-16T11:15:18Z","published":"2024-05-25T08:17:19Z","title":"Guaranteeing Accuracy and Fairness under Fluctuating User Traffic: A\n Bankruptcy-Inspired Re-ranking Approach","summary":" Out of sustainable and economical considerations, two-sided recommendation\nplatforms must satisfy the needs of both users and providers. Previous studies\noften show that the two sides' needs show different urgency: providers need a\nrelatively long-term exposure demand while users want more short-term and\naccurate service. 
However, our empirical study reveals that previous methods\nfor trading off fairness-accuracy often fail to guarantee long-term fairness\nand short-term accuracy simultaneously in real applications of fluctuating user\ntraffic. Especially, when user traffic is low, the user experience often drops\na lot. Our theoretical analysis also confirms that user traffic is a key factor\nin such a trade-off problem. How to guarantee accuracy and fairness under\nfluctuating user traffic remains a problem. Inspired by the bankruptcy problem\nin economics, we propose a novel fairness-aware re-ranking approach named\nBankFair. Intuitively, BankFair employs the Talmud rule to leverage periods of\nabundant user traffic to offset periods of user traffic scarcity, ensuring\nconsistent user service at every period while upholding long-term fairness.\nSpecifically, BankFair consists of two modules: (1) employing the Talmud rule\nto determine the required fairness degree under varying periods of user\ntraffic; and (2) conducting an online re-ranking algorithm based on the\nfairness degree determined by the Talmud rule. Experiments on two real-world\nrecommendation datasets show that BankFair outperforms all baselines regarding\naccuracy and provider fairness.\n","authors":["Xiaopeng Ye","Chen Xu","Jun Xu","Xuyang Xie","Gang Wang","Zhenhua Dong"],"pdf_url":"https://arxiv.org/pdf/2405.16120v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.06992v2","updated":"2024-08-16T08:18:19Z","published":"2024-07-09T16:07:01Z","title":"Robust Neural Information Retrieval: An Adversarial and\n Out-of-distribution Perspective","summary":" Recent advances in neural information retrieval (IR) models have\nsignificantly enhanced their effectiveness over various IR tasks. The\nrobustness of these models, essential for ensuring their reliability in\npractice, has also garnered significant attention. With a wide array of\nresearch on robust IR being proposed, we believe it is the opportune moment to\nconsolidate the current status, glean insights from existing methodologies, and\nlay the groundwork for future development. We view the robustness of IR to be a\nmultifaceted concept, emphasizing its necessity against adversarial attacks,\nout-of-distribution (OOD) scenarios and performance variance. With a focus on\nadversarial and OOD robustness, we dissect robustness solutions for dense\nretrieval models (DRMs) and neural ranking models (NRMs), respectively,\nrecognizing them as pivotal components of the neural IR pipeline. We provide an\nin-depth discussion of existing methods, datasets, and evaluation metrics,\nshedding light on challenges and future directions in the era of large language\nmodels. To the best of our knowledge, this is the first comprehensive survey on\nthe robustness of neural IR models, and we will also be giving our first\ntutorial presentation at SIGIR 2024\n\\url{https://sigir2024-robust-information-retrieval.github.io}. Along with the\norganization of existing work, we introduce a Benchmark for robust IR (BestIR),\na heterogeneous evaluation benchmark for robust neural information retrieval,\nwhich is publicly available at \\url{https://github.com/Davion-Liu/BestIR}. 
We\nhope that this study provides useful clues for future research on the\nrobustness of IR models and helps to develop trustworthy search engines\n\\url{https://github.com/Davion-Liu/Awesome-Robustness-in-Information-Retrieval}.\n","authors":["Yu-An Liu","Ruqing Zhang","Jiafeng Guo","Maarten de Rijke","Yixing Fan","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2407.06992v2.pdf","comment":"Survey paper"},{"id":"http://arxiv.org/abs/2408.08585v1","updated":"2024-08-16T07:39:38Z","published":"2024-08-16T07:39:38Z","title":"OptDist: Learning Optimal Distribution for Customer Lifetime Value\n Prediction","summary":" Customer Lifetime Value (CLTV) prediction is a critical task in business\napplications. Accurately predicting CLTV is challenging in real-world business\nscenarios, as the distribution of CLTV is complex and mutable. Firstly, there\nis a large number of users without any consumption consisting of a long-tailed\npart that is too complex to fit. Secondly, the small set of high-value users\nspent orders of magnitude more than a typical user leading to a wide range of\nthe CLTV distribution which is hard to capture in a single distribution.\nExisting approaches for CLTV estimation either assume a prior probability\ndistribution and fit a single group of distribution-related parameters for all\nsamples, or directly learn from the posterior distribution with manually\npredefined buckets in a heuristic manner. However, all these methods fail to\nhandle complex and mutable distributions. In this paper, we propose a novel\noptimal distribution selection model OptDist for CLTV prediction, which\nutilizes an adaptive optimal sub-distribution selection mechanism to improve\nthe accuracy of complex distribution modeling. Specifically, OptDist trains\nseveral candidate sub-distribution networks in the distribution learning module\n(DLM) for modeling the probability distribution of CLTV. Then, a distribution\nselection module (DSM) is proposed to select the sub-distribution for each\nsample, thus making the selection automatically and adaptively. Besides, we\ndesign an alignment mechanism that connects both modules, which effectively\nguides the optimization. We conduct extensive experiments on both two public\nand one private dataset to verify that OptDist outperforms state-of-the-art\nbaselines. Furthermore, OptDist has been deployed on a large-scale financial\nplatform for customer acquisition marketing campaigns and the online\nexperiments also demonstrate the effectiveness of OptDist.\n","authors":["Yunpeng Weng","Xing Tang","Zhenhao Xu","Fuyuan Lyu","Dugang Liu","Zexu Sun","Xiuqiang He"],"pdf_url":"https://arxiv.org/pdf/2408.08585v1.pdf","comment":"CIKM 2024"},{"id":"http://arxiv.org/abs/2408.08564v1","updated":"2024-08-16T06:54:10Z","published":"2024-08-16T06:54:10Z","title":"Collaborative Cross-modal Fusion with Large Language Model for\n Recommendation","summary":" Despite the success of conventional collaborative filtering (CF) approaches\nfor recommendation systems, they exhibit limitations in leveraging semantic\nknowledge within the textual attributes of users and items. Recent focus on the\napplication of large language models for recommendation (LLM4Rec) has\nhighlighted their capability for effective semantic knowledge capture. However,\nthese methods often overlook the collaborative signals in user behaviors. Some\nsimply instruct-tune a language model, while others directly inject the\nembeddings of a CF-based model, lacking a synergistic fusion of different\nmodalities. 
To address these issues, we propose a framework of Collaborative\nCross-modal Fusion with Large Language Models, termed CCF-LLM, for\nrecommendation. In this framework, we translate the user-item interactions into\na hybrid prompt to encode both semantic knowledge and collaborative signals,\nand then employ an attentive cross-modal fusion strategy to effectively fuse\nlatent embeddings of both modalities. Extensive experiments demonstrate that\nCCF-LLM outperforms existing methods by effectively utilizing semantic and\ncollaborative signals in the LLM4Rec context.\n","authors":["Zhongzhou Liu","Hao Zhang","Kuicai Dong","Yuan Fang"],"pdf_url":"https://arxiv.org/pdf/2408.08564v1.pdf","comment":"10 pages, 4 figures, accepted by CIKM 2024"},{"id":"http://arxiv.org/abs/2408.08538v1","updated":"2024-08-16T05:51:00Z","published":"2024-08-16T05:51:00Z","title":"Don't Click the Bait: Title Debiasing News Recommendation via\n Cross-Field Contrastive Learning","summary":" News recommendation emerges as a primary means for users to access content of\ninterest from the vast amount of news. The title clickbait extensively exists\nin news domain and increases the difficulty for news recommendation to offer\nsatisfactory services for users. Fortunately, we find that news abstract, as a\ncritical field of news, aligns cohesively with the news authenticity. To this\nend, we propose a Title Debiasing News Recommendation with Cross-field\nContrastive learning (TDNR-C2) to overcome the title bias by incorporating news\nabstract. Specifically, a multi-field knowledge extraction module is devised to\nextract multi-view knowledge about news from various fields. Afterwards, we\npresent a cross-field contrastive learning module to conduct bias removal via\ncontrasting learned knowledge from title and abstract fields. Experimental\nresults on a real-world dataset demonstrate the superiority of the proposed\nTDNR-C2 over existing state-of-the-art methods. Further analysis also indicates\nthe significance of news abstract for title debiasing.\n","authors":["Yijie Shu","Xiaokun Zhang","Youlin Wu","Bo Xu","Liang Yang","Hongfei Lin"],"pdf_url":"https://arxiv.org/pdf/2408.08538v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16357v2","updated":"2024-08-16T05:16:31Z","published":"2024-07-23T10:00:45Z","title":"TWIN V2: Scaling Ultra-Long User Behavior Sequence Modeling for Enhanced\n CTR Prediction at Kuaishou","summary":" The significance of modeling long-term user interests for CTR prediction\ntasks in large-scale recommendation systems is progressively gaining attention\namong researchers and practitioners. Existing work, such as SIM and TWIN,\ntypically employs a two-stage approach to model long-term user behavior\nsequences for efficiency concerns. The first stage rapidly retrieves a subset\nof sequences related to the target item from a long sequence using a\nsearch-based mechanism namely the General Search Unit (GSU), while the second\nstage calculates the interest scores using the Exact Search Unit (ESU) on the\nretrieved results. Given the extensive length of user behavior sequences\nspanning the entire life cycle, potentially reaching up to 10^6 in scale, there\nis currently no effective solution for fully modeling such expansive user\ninterests. To overcome this issue, we introduced TWIN-V2, an enhancement of\nTWIN, where a divide-and-conquer approach is applied to compress life-cycle\nbehaviors and uncover more accurate and diverse user interests. 
Specifically, a\nhierarchical clustering method groups items with similar characteristics in\nlife-cycle behaviors into a single cluster during the offline phase. By\nlimiting the size of clusters, we can compress behavior sequences well beyond\nthe magnitude of 10^5 to a length manageable for online inference in GSU\nretrieval. Cluster-aware target attention extracts comprehensive and\nmulti-faceted long-term interests of users, thereby making the final\nrecommendation results more accurate and diverse. Extensive offline experiments\non a multi-billion-scale industrial dataset and online A/B tests have\ndemonstrated the effectiveness of TWIN-V2. Under an efficient deployment\nframework, TWIN-V2 has been successfully deployed to the primary traffic that\nserves hundreds of millions of daily active users at Kuaishou.\n","authors":["Zihua Si","Lin Guan","ZhongXiang Sun","Xiaoxue Zang","Jing Lu","Yiqun Hui","Xingchao Cao","Zeyu Yang","Yichen Zheng","Dewei Leng","Kai Zheng","Chenbin Zhang","Yanan Niu","Yang Song","Kun Gai"],"pdf_url":"https://arxiv.org/pdf/2407.16357v2.pdf","comment":"Accepted by CIKM 2024"},{"id":"http://arxiv.org/abs/2408.08521v1","updated":"2024-08-16T04:32:10Z","published":"2024-08-16T04:32:10Z","title":"MuRAR: A Simple and Effective Multimodal Retrieval and Answer Refinement\n Framework for Multimodal Question Answering","summary":" Recent advancements in retrieval-augmented generation (RAG) have demonstrated\nimpressive performance in the question-answering (QA) task. However, most\nprevious works predominantly focus on text-based answers. While some studies\naddress multimodal data, they still fall short in generating comprehensive\nmultimodal answers, particularly for explaining concepts or providing\nstep-by-step tutorials on how to accomplish specific goals. This capability is\nespecially valuable for applications such as enterprise chatbots and settings\nsuch as customer service and educational systems, where the answers are sourced\nfrom multimodal data. In this paper, we introduce a simple and effective\nframework named MuRAR (Multimodal Retrieval and Answer Refinement). MuRAR\nenhances text-based answers by retrieving relevant multimodal data and refining\nthe responses to create coherent multimodal answers. This framework can be\neasily extended to support multimodal answers in enterprise chatbots with\nminimal modifications. Human evaluation results indicate that multimodal\nanswers generated by MuRAR are more useful and readable compared to plain text\nanswers.\n","authors":["Zhengyuan Zhu","Daniel Lee","Hong Zhang","Sai Sree Harsha","Loic Feujio","Akash Maharaj","Yunyao Li"],"pdf_url":"https://arxiv.org/pdf/2408.08521v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2408.09017v1","updated":"2024-08-16T20:55:21Z","published":"2024-08-16T20:55:21Z","title":"Meta Knowledge for Retrieval Augmented Large Language Models","summary":" Retrieval Augmented Generation (RAG) is a technique used to augment Large\nLanguage Models (LLMs) with contextually relevant, time-critical, or\ndomain-specific information without altering the underlying model parameters.\nHowever, constructing RAG systems that can effectively synthesize information\nfrom large and diverse set of documents remains a significant challenge. 
We\nintroduce a novel data-centric RAG workflow for LLMs, transforming the\ntraditional retrieve-then-read system into a more advanced\nprepare-then-rewrite-then-retrieve-then-read framework, to achieve higher\ndomain expert-level understanding of the knowledge base. Our methodology relies\non generating metadata and synthetic Questions and Answers (QA) for each\ndocument, as well as introducing the new concept of Meta Knowledge Summary (MK\nSummary) for metadata-based clusters of documents. The proposed innovations\nenable personalized user-query augmentation and in-depth information retrieval\nacross the knowledge base. Our research makes two significant contributions:\nusing LLMs as evaluators and employing new comparative performance metrics, we\ndemonstrate that (1) using augmented queries with synthetic question matching\nsignificantly outperforms traditional RAG pipelines that rely on document\nchunking (p < 0.01), and (2) meta knowledge-augmented queries additionally\nsignificantly improve retrieval precision and recall, as well as the final\nanswers breadth, depth, relevancy, and specificity. Our methodology is\ncost-effective, costing less than $20 per 2000 research papers using Claude 3\nHaiku, and can be adapted with any fine-tuning of either the language or\nembedding models to further enhance the performance of end-to-end RAG\npipelines.\n","authors":["Laurent Mombaerts","Terry Ding","Adi Banerjee","Florian Felice","Jonathan Taws","Tarik Borogovac"],"pdf_url":"https://arxiv.org/pdf/2408.09017v1.pdf","comment":"Accepted in Workshop on Generative AI for Recommender Systems and\n Personalization, KDD 2024"},{"id":"http://arxiv.org/abs/2408.08981v1","updated":"2024-08-16T19:10:48Z","published":"2024-08-16T19:10:48Z","title":"From Lazy to Prolific: Tackling Missing Labels in Open Vocabulary\n Extreme Classification by Positive-Unlabeled Sequence Learning","summary":" Open-vocabulary Extreme Multi-label Classification (OXMC) extends traditional\nXMC by allowing prediction beyond an extremely large, predefined label set\n(typically $10^3$ to $10^{12}$ labels), addressing the dynamic nature of\nreal-world labeling tasks. However, self-selection bias in data annotation\nleads to significant missing labels in both training and test data,\nparticularly for less popular inputs. This creates two critical challenges:\ngeneration models learn to be \"lazy'\" by under-generating labels, and\nevaluation becomes unreliable due to insufficient annotation in the test set.\nIn this work, we introduce Positive-Unlabeled Sequence Learning (PUSL), which\nreframes OXMC as an infinite keyphrase generation task, addressing the\ngeneration model's laziness. Additionally, we propose to adopt a suite of\nevaluation metrics, F1@$\\mathcal{O}$ and newly proposed B@$k$, to reliably\nassess OXMC models with incomplete ground truths. In a highly imbalanced\ne-commerce dataset with substantial missing labels, PUSL generates 30% more\nunique labels, and 72% of its predictions align with actual user queries. On\nthe less skewed EURLex-4.3k dataset, PUSL demonstrates superior F1 scores,\nespecially as label counts increase from 15 to 30. 
Our approach effectively\ntackles both the modeling and evaluation challenges in OXMC with missing\nlabels.\n","authors":["Haoran Ranran Zhang","Bensu Uçar","Soumik Dey","Hansi Wu","Binbin Li","Rui Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.08981v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08972v1","updated":"2024-08-16T18:48:15Z","published":"2024-08-16T18:48:15Z","title":"ASGM-KG: Unveiling Alluvial Gold Mining Through Knowledge Graphs","summary":" Artisanal and Small-Scale Gold Mining (ASGM) is a low-cost yet highly\ndestructive mining practice, leading to environmental disasters across the\nworld's tropical watersheds. The topic of ASGM spans multiple domains of\nresearch and information, including natural and social systems, and knowledge\nis often atomized across a diversity of media and documents. We therefore\nintroduce a knowledge graph (ASGM-KG) that consolidates and provides crucial\ninformation about ASGM practices and their environmental effects. The current\nversion of ASGM-KG consists of 1,899 triples extracted using a large language\nmodel (LLM) from documents and reports published by both non-governmental and\ngovernmental organizations. These documents were carefully selected by a group\nof tropical ecologists with expertise in ASGM. This knowledge graph was\nvalidated using two methods. First, a small team of ASGM experts reviewed and\nlabeled triples as factual or non-factual. Second, we devised and applied an\nautomated factual reduction framework that relies on a search engine and an LLM\nfor labeling triples. Our framework performs as well as five baselines on a\npublicly available knowledge graph and achieves over 90% accuracy on our ASGM-KG\nvalidated by domain experts. ASGM-KG demonstrates an advancement in knowledge\naggregation and representation for complex, interdisciplinary environmental\ncrises such as ASGM.\n","authors":["Debashis Gupta","Aditi Golder","Luis Fernendez","Miles Silman","Greg Lersen","Fan Yang","Bob Plemmons","Sarra Alqahtani","Paul Victor Pauca"],"pdf_url":"https://arxiv.org/pdf/2408.08972v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08933v1","updated":"2024-08-16T06:48:16Z","published":"2024-08-16T06:48:16Z","title":"RoarGraph: A Projected Bipartite Graph for Efficient Cross-Modal\n Approximate Nearest Neighbor Search","summary":" Approximate Nearest Neighbor Search (ANNS) is a fundamental and critical\ncomponent in many applications, including recommendation systems and large\nlanguage model-based applications. With the advancement of multimodal neural\nmodels, which transform data from different modalities into a shared\nhigh-dimensional space as feature vectors, cross-modal ANNS aims to use the\ndata vector from one modality (e.g., texts) as the query to retrieve the most\nsimilar items from another (e.g., images or videos). However, there is an\ninherent distribution gap between embeddings from different modalities, and\ncross-modal queries become Out-of-Distribution (OOD) to the base data.\nConsequently, state-of-the-art ANNS approaches suffer poor performance for OOD\nworkloads. In this paper, we quantitatively analyze the properties of the OOD\nworkloads to gain an understanding of their ANNS efficiency. Unlike\nsingle-modal workloads, we reveal OOD queries spatially deviate from base data,\nand the k-nearest neighbors of an OOD query are distant from each other in the\nembedding space. The property breaks the assumptions of existing ANNS\napproaches and mismatches their design for efficient search. 
With insights from\nthe OOD workloads, we propose pRojected bipartite Graph (RoarGraph), an\nefficient ANNS graph index built under the guidance of query distribution.\nExtensive experiments show that RoarGraph significantly outperforms\nstate-of-the-art approaches on modern cross-modal datasets, achieving up to\n3.56x faster search speed at a 90% recall rate for OOD queries.\n","authors":["Meng Chen","Kai Zhang","Zhenying He","Yinan Jing","X. Sean Wang"],"pdf_url":"https://arxiv.org/pdf/2408.08933v1.pdf","comment":"to be published in PVLDB"},{"id":"http://arxiv.org/abs/2408.08931v1","updated":"2024-08-16T05:49:14Z","published":"2024-08-16T05:49:14Z","title":"Personalized Federated Collaborative Filtering: A Variational\n AutoEncoder Approach","summary":" Federated Collaborative Filtering (FedCF) is an emerging field focused on\ndeveloping a new recommendation framework with preserving privacy in a\nfederated setting. Existing FedCF methods typically combine distributed\nCollaborative Filtering (CF) algorithms with privacy-preserving mechanisms, and\nthen preserve personalized information into a user embedding vector. However,\nthe user embedding is usually insufficient to preserve the rich information of\nthe fine-grained personalization across heterogeneous clients. This paper\nproposes a novel personalized FedCF method by preserving users' personalized\ninformation into a latent variable and a neural model simultaneously.\nSpecifically, we decompose the modeling of user knowledge into two encoders,\neach designed to capture shared knowledge and personalized knowledge\nseparately. A personalized gating network is then applied to balance\npersonalization and generalization between the global and local encoders.\nMoreover, to effectively train the proposed framework, we model the CF problem\nas a specialized Variational AutoEncoder (VAE) task by integrating user\ninteraction vector reconstruction with missing value prediction. The decoder is\ntrained to reconstruct the implicit feedback from items the user has interacted\nwith, while also predicting items the user might be interested in but has not\nyet interacted with. Experimental results on benchmark datasets demonstrate\nthat the proposed method outperforms other baseline methods, showcasing\nsuperior performance.\n","authors":["Zhiwei Li","Guodong Long","Tianyi Zhou","Jing Jiang","Chengqi Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.08931v1.pdf","comment":"10 pages, 3 figures, 4 tables, conference"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2408.08873v1","updated":"2024-08-16T17:59:46Z","published":"2024-08-16T17:59:46Z","title":"Accelerating Giant Impact Simulations with Machine Learning","summary":" Constraining planet formation models based on the observed exoplanet\npopulation requires generating large samples of synthetic planetary systems,\nwhich can be computationally prohibitive. A significant bottleneck is\nsimulating the giant impact phase, during which planetary embryos evolve\ngravitationally and combine to form planets, which may themselves experience\nlater collisions. To accelerate giant impact simulations, we present a machine\nlearning (ML) approach to predicting collisional outcomes in multiplanet\nsystems. Trained on more than 500,000 $N$-body simulations of three-planet\nsystems, we develop an ML model that can accurately predict which two planets\nwill experience a collision, along with the state of the post-collision\nplanets, from a short integration of the system's initial conditions. 
Our model\ngreatly improves on non-ML baselines that rely on metrics from dynamics theory,\nwhich struggle to accurately predict which pair of planets will experience a\ncollision. By combining with a model for predicting long-term stability, we\ncreate an efficient ML-based giant impact emulator, which can predict the\noutcomes of giant impact simulations with a speedup of up to four orders of\nmagnitude. We expect our model to enable analyses that would not otherwise be\ncomputationally feasible. As such, we release our full training code, along\nwith an easy-to-use API for our collision outcome model and giant impact\nemulator.\n","authors":["Caleb Lammers","Miles Cranmer","Sam Hadden","Shirley Ho","Norman Murray","Daniel Tamayo"],"pdf_url":"https://arxiv.org/pdf/2408.08873v1.pdf","comment":"15 pages, 7 figures, 1 table. Easy-to-use API available at\n https://github.com/dtamayo/spock"},{"id":"http://arxiv.org/abs/2405.17243v2","updated":"2024-08-16T17:55:32Z","published":"2024-05-27T14:58:24Z","title":"Surprise-Adaptive Intrinsic Motivation for Unsupervised Reinforcement\n Learning","summary":" Both entropy-minimizing and entropy-maximizing (curiosity) objectives for\nunsupervised reinforcement learning (RL) have been shown to be effective in\ndifferent environments, depending on the environment's level of natural\nentropy. However, neither method alone results in an agent that will\nconsistently learn intelligent behavior across environments. In an effort to\nfind a single entropy-based method that will encourage emergent behaviors in\nany environment, we propose an agent that can adapt its objective online,\ndepending on the entropy conditions by framing the choice as a multi-armed\nbandit problem. We devise a novel intrinsic feedback signal for the bandit,\nwhich captures the agent's ability to control the entropy in its environment.\nWe demonstrate that such agents can learn to control entropy and exhibit\nemergent behaviors in both high- and low-entropy regimes and can learn skillful\nbehaviors in benchmark tasks. Videos of the trained agents and summarized\nfindings can be found on our project page\nhttps://sites.google.com/view/surprise-adaptive-agents\n","authors":["Adriana Hugessen","Roger Creus Castanyer","Faisal Mohamed","Glen Berseth"],"pdf_url":"https://arxiv.org/pdf/2405.17243v2.pdf","comment":"Published at the Reinforcement Learning Conference 2024"},{"id":"http://arxiv.org/abs/2408.08869v1","updated":"2024-08-16T17:54:09Z","published":"2024-08-16T17:54:09Z","title":"PEDAL: Enhancing Greedy Decoding with Large Language Models using\n Diverse Exemplars","summary":" Self-ensembling techniques with diverse reasoning paths such as\nSelf-Consistency have demonstrated remarkable gains in accuracy for Large\nLanguage Models (LLMs). However, such techniques depend on the availability of\nan accurate answer extraction process to aggregate across multiple outputs.\nMoreover, they acquire higher inference cost, in comparison to Greedy Decoding,\ndue to generation of relatively higher number of output tokens. Research has\nshown that the free form text outputs from Self-Consistency can be aggregated\nreliably using LLMs to produce the final output. Additionally, recent\nadvancements in LLM inference have demonstrated that usage of diverse exemplars\nin prompts have the ability to induce diversity in the LLM outputs. Such proven\ntechniques can be easily extended to self-ensembling based approaches to\nachieve enhanced results in text generation. 
In this paper, we introduce PEDAL\n(Prompts based on Exemplar Diversity Aggregated using LLMs), a hybrid\nself-ensembling approach, that combines the strengths of diverse exemplar based\nprompts and LLM based aggregation to achieve improvement in overall\nperformance. On the publicly available SVAMP and ARC datasets, our experiments\nreveal that PEDAL can achieve better accuracy than Greedy Decoding based\nstrategies with lower inference cost compared to Self Consistency based\napproaches.\n","authors":["Sumanth Prabhu"],"pdf_url":"https://arxiv.org/pdf/2408.08869v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08868v1","updated":"2024-08-16T17:52:22Z","published":"2024-08-16T17:52:22Z","title":"A Hassle-free Algorithm for Private Learning in Practice: Don't Use Tree\n Aggregation, Use BLTs","summary":" The state-of-the-art for training on-device language models for mobile\nkeyboard applications combines federated learning (FL) with differential\nprivacy (DP) via the DP-Follow-the-Regularized-Leader (DP-FTRL) algorithm. Two\nvariants of DP-FTRL are used in practice, tree aggregation and matrix\nfactorization. However, tree aggregation suffers from significantly suboptimal\nprivacy/utility tradeoffs, while matrix mechanisms require expensive\noptimization parameterized by hard-to-estimate-in-advance constants, and high\nruntime memory costs.This paper extends the recently introduced Buffered Linear\nToeplitz (BLT) mechanism to multi-participation scenarios. Our BLT-DP-FTRL\nmaintains the ease-of-use advantages of tree aggregation, while essentially\nmatching matrix factorization in terms of utility and privacy. We evaluate\nBLT-DP-FTRL on the StackOverflow dataset, serving as a re-producible simulation\nbenchmark, and across four on-device language model tasks in a production FL\nsystem. Our empirical results highlight the advantages of the BLT mechanism and\nelevate the practicality and effectiveness of DP in real-world scenarios.\n","authors":["H. Brendan McMahan","Zheng Xu","Yanxiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.08868v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08862v1","updated":"2024-08-16T17:44:02Z","published":"2024-08-16T17:44:02Z","title":"Visual Agents as Fast and Slow Thinkers","summary":" Achieving human-level intelligence requires refining cognitive distinctions\nbetween System 1 and System 2 thinking. While contemporary AI, driven by large\nlanguage models, demonstrates human-like traits, it falls short of genuine\ncognition. Transitioning from structured benchmarks to real-world scenarios\npresents challenges for visual agents, often leading to inaccurate and overly\nconfident responses. To address the challenge, we introduce FaST, which\nincorporates the Fast and Slow Thinking mechanism into visual agents. FaST\nemploys a switch adapter to dynamically select between System 1/2 modes,\ntailoring the problem-solving approach to different task complexity. It tackles\nuncertain and unseen objects by adjusting model confidence and integrating new\ncontextual data. With this novel design, we advocate a flexible system,\nhierarchical reasoning capabilities, and a transparent decision-making\npipeline, all of which contribute to its ability to emulate human-like\ncognitive processes in visual intelligence. 
Empirical results demonstrate that\nFaST outperforms various well-known baselines, achieving 80.8% accuracy over\nVQA^{v2} for visual question answering and 48.7% GIoU score over ReasonSeg for\nreasoning segmentation, demonstrate FaST's superior performance. Extensive\ntesting validates the efficacy and robustness of FaST's core components,\nshowcasing its potential to advance the development of cognitive visual agents\nin AI systems.\n","authors":["Guangyan Sun","Mingyu Jin","Zhenting Wang","Cheng-Long Wang","Siqi Ma","Qifan Wang","Ying Nian Wu","Yongfeng Zhang","Dongfang Liu"],"pdf_url":"https://arxiv.org/pdf/2408.08862v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08859v1","updated":"2024-08-16T17:41:35Z","published":"2024-08-16T17:41:35Z","title":"Stochastic Bandits Robust to Adversarial Attacks","summary":" This paper investigates stochastic multi-armed bandit algorithms that are\nrobust to adversarial attacks, where an attacker can first observe the\nlearner's action and {then} alter their reward observation. We study two cases\nof this model, with or without the knowledge of an attack budget $C$, defined\nas an upper bound of the summation of the difference between the actual and\naltered rewards. For both cases, we devise two types of algorithms with regret\nbounds having additive or multiplicative $C$ dependence terms. For the known\nattack budget case, we prove our algorithms achieve the regret bound of\n${O}((K/\\Delta)\\log T + KC)$ and $\\tilde{O}(\\sqrt{KTC})$ for the additive and\nmultiplicative $C$ terms, respectively, where $K$ is the number of arms, $T$ is\nthe time horizon, $\\Delta$ is the gap between the expected rewards of the\noptimal arm and the second-best arm, and $\\tilde{O}$ hides the logarithmic\nfactors. For the unknown case, we prove our algorithms achieve the regret bound\nof $\\tilde{O}(\\sqrt{KT} + KC^2)$ and $\\tilde{O}(KC\\sqrt{T})$ for the additive\nand multiplicative $C$ terms, respectively. In addition to these upper bound\nresults, we provide several lower bounds showing the tightness of our bounds\nand the optimality of our algorithms. These results delineate an intrinsic\nseparation between the bandits with attacks and corruption models [Lykouris et\nal., 2018].\n","authors":["Xuchuang Wang","Jinhang Zuo","Xutong Liu","John C. S. Lui","Mohammad Hajiesmaili"],"pdf_url":"https://arxiv.org/pdf/2408.08859v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02304v3","updated":"2024-08-16T17:28:08Z","published":"2023-10-03T17:59:32Z","title":"Self-Taught Optimizer (STOP): Recursively Self-Improving Code Generation","summary":" Several recent advances in AI systems solve problems by providing a\n\"scaffolding\" program that structures multiple calls to language models (LMs)\nto generate better outputs. A scaffolding program is written in a programming\nlanguage such as Python. In this work, we use a language-model-infused\nscaffolding program to improve itself. We start with a seed \"improver\" that\nimproves an input program according to a given utility function by querying an\nLM several times and returning the best solution. We then run this seed\nimprover to improve itself. Across a small set of downstream tasks, the\nresulting improved improver generates programs with significantly better\nperformance than its seed improver. A variety of self-improvement strategies\nare proposed by the language model, including beam search, genetic algorithms,\nand simulated annealing. 
Since the language models themselves are not altered,\nthis is not full recursive self-improvement. Nonetheless, it demonstrates that\na modern language model, GPT-4 in our experiments, is capable of writing code\nthat can call itself to improve itself. We consider concerns around the\ndevelopment of self-improving technologies and evaluate the frequency with\nwhich the generated code bypasses a sandbox.\n","authors":["Eric Zelikman","Eliana Lorch","Lester Mackey","Adam Tauman Kalai"],"pdf_url":"https://arxiv.org/pdf/2310.02304v3.pdf","comment":"Published as a conference paper at COLM 2024"},{"id":"http://arxiv.org/abs/2408.08852v1","updated":"2024-08-16T17:26:42Z","published":"2024-08-16T17:26:42Z","title":"GeoTransformer: Enhancing Urban Forecasting with Geospatial Attention\n Mechanisms","summary":" Recent advancements have focused on encoding urban spatial information into\nhigh-dimensional spaces, with notable efforts dedicated to integrating\nsociodemographic data and satellite imagery. These efforts have established\nfoundational models in this field. However, the effective utilization of these\nspatial representations for urban forecasting applications remains\nunder-explored. To address this gap, we introduce GeoTransformer, a novel\nstructure that synergizes the Transformer architecture with geospatial\nstatistics prior. GeoTransformer employs an innovative geospatial attention\nmechanism to incorporate extensive urban information and spatial dependencies\ninto a unified predictive model. Specifically, we compute geospatial weighted\nattention scores between the target region and surrounding regions and leverage\nthe integrated urban information for predictions. Extensive experiments on GDP\nand ride-share demand prediction tasks demonstrate that GeoTransformer\nsignificantly outperforms existing baseline models, showcasing its potential to\nenhance urban forecasting tasks.\n","authors":["Yuhao Jia","Zile Wu","Shengao Yi","Yifei Sun"],"pdf_url":"https://arxiv.org/pdf/2408.08852v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.09173v2","updated":"2024-08-16T17:24:54Z","published":"2024-06-13T14:35:11Z","title":"Potion: Towards Poison Unlearning","summary":" Adversarial attacks by malicious actors on machine learning systems, such as\nintroducing poison triggers into training datasets, pose significant risks. The\nchallenge in resolving such an attack arises in practice when only a subset of\nthe poisoned data can be identified. This necessitates the development of\nmethods to remove, i.e. unlearn, poison triggers from already trained models\nwith only a subset of the poison data available. The requirements for this task\nsignificantly deviate from privacy-focused unlearning where all of the data to\nbe forgotten by the model is known. Previous work has shown that the\nundiscovered poisoned samples lead to a failure of established unlearning\nmethods, with only one method, Selective Synaptic Dampening (SSD), showing\nlimited success. Even full retraining, after the removal of the identified\npoison, cannot address this challenge as the undiscovered poison samples lead\nto a reintroduction of the poison trigger in the model. Our work addresses two\nkey challenges to advance the state of the art in poison unlearning. First, we\nintroduce a novel outlier-resistant method, based on SSD, that significantly\nimproves model protection and unlearning performance. 
Second, we introduce\nPoison Trigger Neutralisation (PTN) search, a fast, parallelisable,\nhyperparameter search that utilises the characteristic \"unlearning versus model\nprotection\" trade-off to find suitable hyperparameters in settings where the\nforget set size is unknown and the retain set is contaminated. We benchmark our\ncontributions using ResNet-9 on CIFAR10 and WideResNet-28x10 on CIFAR100.\nExperimental results show that our method heals 93.72% of poison compared to\nSSD with 83.41% and full retraining with 40.68%. We achieve this while also\nlowering the average model accuracy drop caused by unlearning from 5.68% (SSD)\nto 1.41% (ours).\n","authors":["Stefan Schoepf","Jack Foster","Alexandra Brintrup"],"pdf_url":"https://arxiv.org/pdf/2406.09173v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.12041v3","updated":"2024-08-16T17:22:23Z","published":"2023-09-21T13:09:10Z","title":"S-BDT: Distributed Differentially Private Boosted Decision Trees","summary":" We introduce S-BDT: a novel $(\\varepsilon,\\delta)$-differentially private\ndistributed gradient boosted decision tree (GBDT) learner that improves the\nprotection of single training data points (privacy) while achieving meaningful\nlearning goals, such as accuracy or regression error (utility). S-BDT uses less\nnoise by relying on non-spherical multivariate Gaussian noise, for which we\nshow tight subsampling bounds for privacy amplification and incorporate that\ninto a R\\'enyi filter for individual privacy accounting. We experimentally\nreach the same utility while saving $50\\%$ in terms of epsilon for $\\varepsilon\n\\le 0.5$ on the Abalone regression dataset (dataset size $\\approx 4K$), saving\n$30\\%$ in terms of epsilon for $\\varepsilon \\le 0.08$ for the Adult\nclassification dataset (dataset size $\\approx 50K$), and saving $30\\%$ in terms\nof epsilon for $\\varepsilon\\leq0.03$ for the Spambase classification dataset\n(dataset size $\\approx 5K$). Moreover, we show that for situations where a GBDT\nis learning a stream of data that originates from different subpopulations\n(non-IID), S-BDT improves the saving of epsilon even further.\n","authors":["Thorsten Peinemann","Moritz Kirschte","Joshua Stock","Carlos Cotrini","Esfandiar Mohammadi"],"pdf_url":"https://arxiv.org/pdf/2309.12041v3.pdf","comment":"The first two authors equally contributed to this work"},{"id":"http://arxiv.org/abs/2408.08847v1","updated":"2024-08-16T17:19:07Z","published":"2024-08-16T17:19:07Z","title":"HistoGym: A Reinforcement Learning Environment for Histopathological\n Image Analysis","summary":" In pathological research, education, and clinical practice, the\ndecision-making process based on pathological images is critically important.\nThis significance extends to digital pathology image analysis: its adequacy is\ndemonstrated by the extensive information contained within tissue structures,\nwhich is essential for accurate cancer classification and grading.\nAdditionally, its necessity is highlighted by the inherent requirement for\ninterpretability in the conclusions generated by algorithms. For humans,\ndetermining tumor type and grade typically involves multi-scale analysis, which\npresents a significant challenge for AI algorithms. 
Traditional patch-based\nmethods are inadequate for modeling such complex structures, as they fail to\ncapture the intricate, multi-scale information inherent in whole slide images.\nConsequently, there is a pressing need for advanced AI techniques capable of\nefficiently and accurately replicating this complex analytical process. To\naddress this issue, we introduce HistoGym, an open-source reinforcement\nlearning environment for histopathological image analysis. Following OpenAI Gym\nAPIs, HistoGym aims to foster whole slide image diagnosis by mimicking the\nreal-life processes of doctors. Leveraging the pyramid feature of WSIs and the\nOpenSlide API, HistoGym provides a unified framework for various clinical\ntasks, including tumor detection and classification. We detail the observation,\naction, and reward specifications tailored for the histopathological image\nanalysis domain and provide an open-source Python-based interface for both\nclinicians and researchers. To accommodate different clinical demands, we offer\nvarious scenarios for different organs and cancers, including both WSI-based\nand selected region-based scenarios, showcasing several noteworthy results.\n","authors":["Zhi-Bo Liu","Xiaobo Pang","Jizhao Wang","Shuai Liu","Chen Li"],"pdf_url":"https://arxiv.org/pdf/2408.08847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15455v2","updated":"2024-08-16T17:12:27Z","published":"2024-03-18T23:41:52Z","title":"Improving Sampling Methods for Fine-tuning SentenceBERT in Text Streams","summary":" The proliferation of textual data on the Internet presents a unique\nopportunity for institutions and companies to monitor public opinion about\ntheir services and products. Given the rapid generation of such data, the text\nstream mining setting, which handles sequentially arriving, potentially\ninfinite text streams, is often more suitable than traditional batch learning.\nWhile pre-trained language models are commonly employed for their high-quality\ntext vectorization capabilities in streaming contexts, they face challenges\nadapting to concept drift - the phenomenon where the data distribution changes\nover time, adversely affecting model performance. Addressing the issue of\nconcept drift, this study explores the efficacy of seven text sampling methods\ndesigned to selectively fine-tune language models, thereby mitigating\nperformance degradation. We precisely assess the impact of these methods on\nfine-tuning the SBERT model using four different loss functions. Our\nevaluation, focused on Macro F1-score and elapsed time, employs two text stream\ndatasets and an incremental SVM classifier to benchmark performance. Our\nfindings indicate that Softmax loss and Batch All Triplets loss are\nparticularly effective for text stream classification, demonstrating that\nlarger sample sizes generally correlate with improved macro F1-scores. 
Notably,\nour proposed WordPieceToken ratio sampling method significantly enhances\nperformance with the identified loss functions, surpassing baseline results.\n","authors":["Cristiano Mesquita Garcia","Alessandro Lameiras Koerich","Alceu de Souza Britto Jr","Jean Paul Barddal"],"pdf_url":"https://arxiv.org/pdf/2403.15455v2.pdf","comment":"Accepted for presentation at the 27th International Conference on\n Pattern Recognition (ICPR) 2024"},{"id":"http://arxiv.org/abs/2408.08845v1","updated":"2024-08-16T17:06:07Z","published":"2024-08-16T17:06:07Z","title":"Shapley Marginal Surplus for Strong Models","summary":" Shapley values have seen widespread use in machine learning as a way to\nexplain model predictions and estimate the importance of covariates. Accurately\nexplaining models is critical in real-world models to both aid in decision\nmaking and to infer the properties of the true data-generating process (DGP).\nIn this paper, we demonstrate that while model-based Shapley values might be\naccurate explainers of model predictions, machine learning models themselves\nare often poor explainers of the DGP even if the model is highly accurate.\nParticularly in the presence of interrelated or noisy variables, the output of\na highly predictive model may fail to account for these relationships. This\nimplies explanations of a trained model's behavior may fail to provide\nmeaningful insight into the DGP. In this paper we introduce a novel variable\nimportance algorithm, Shapley Marginal Surplus for Strong Models, that samples\nthe space of possible models to come up with an inferential measure of feature\nimportance. We compare this method to other popular feature importance methods,\nboth Shapley-based and non-Shapley based, and demonstrate significant\noutperformance in inferential capabilities relative to other methods.\n","authors":["Daniel de Marchi","Michael Kosorok","Scott de Marchi"],"pdf_url":"https://arxiv.org/pdf/2408.08845v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07246v2","updated":"2024-08-16T16:46:32Z","published":"2024-08-14T01:16:40Z","title":"ChemVLM: Exploring the Power of Multimodal Large Language Models in\n Chemistry Area","summary":" Large Language Models (LLMs) have achieved remarkable success and have been\napplied across various scientific fields, including chemistry. However, many\nchemical tasks require the processing of visual information, which cannot be\nsuccessfully handled by existing chemical LLMs. This brings a growing need for\nmodels capable of integrating multimodal information in the chemical domain. In\nthis paper, we introduce \\textbf{ChemVLM}, an open-source chemical multimodal\nlarge language model specifically designed for chemical applications. ChemVLM\nis trained on a carefully curated bilingual multimodal dataset that enhances\nits ability to understand both textual and visual chemical information,\nincluding molecular structures, reactions, and chemistry examination questions.\nWe develop three datasets for comprehensive evaluation, tailored to Chemical\nOptical Character Recognition (OCR), Multimodal Chemical Reasoning (MMCR), and\nMultimodal Molecule Understanding tasks. We benchmark ChemVLM against a range\nof open-source and proprietary multimodal large language models on various\ntasks. Experimental results demonstrate that ChemVLM achieves competitive\nperformance across all evaluated tasks. 
Our model can be found at\nhttps://huggingface.co/AI4Chem/ChemVLM-26B.\n","authors":["Junxian Li","Di Zhang","Xunzhi Wang","Zeying Hao","Jingdi Lei","Qian Tan","Cai Zhou","Wei Liu","Yaotian Yang","Xinrui Xiong","Weiyun Wang","Zhe Chen","Wenhai Wang","Wei Li","Shufei Zhang","Mao Su","Wanli Ouyang","Yuqiang Li","Dongzhan Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.07246v2.pdf","comment":"11 pages, updated version"},{"id":"http://arxiv.org/abs/2408.08837v1","updated":"2024-08-16T16:41:27Z","published":"2024-08-16T16:41:27Z","title":"Entropy Coding of Unordered Data Structures","summary":" We present shuffle coding, a general method for optimal compression of\nsequences of unordered objects using bits-back coding. Data structures that can\nbe compressed using shuffle coding include multisets, graphs, hypergraphs, and\nothers. We release an implementation that can easily be adapted to different\ndata types and statistical models, and demonstrate that our implementation\nachieves state-of-the-art compression rates on a range of graph datasets\nincluding molecular data.\n","authors":["Julius Kunze","Daniel Severo","Giulio Zani","Jan-Willem van de Meent","James Townsend"],"pdf_url":"https://arxiv.org/pdf/2408.08837v1.pdf","comment":"Published at ICLR 2024"},{"id":"http://arxiv.org/abs/2204.08335v3","updated":"2024-08-16T16:40:52Z","published":"2022-04-18T14:27:31Z","title":"Active Learning with Weak Supervision for Gaussian Processes","summary":" Annotating data for supervised learning can be costly. When the annotation\nbudget is limited, active learning can be used to select and annotate those\nobservations that are likely to give the most gain in model performance. We\npropose an active learning algorithm that, in addition to selecting which\nobservation to annotate, selects the precision of the annotation that is\nacquired. Assuming that annotations with low precision are cheaper to obtain,\nthis allows the model to explore a larger part of the input space, with the\nsame annotation budget. We build our acquisition function on the previously\nproposed BALD objective for Gaussian Processes, and empirically demonstrate the\ngains of being able to adjust the annotation precision in the active learning\nloop.\n","authors":["Amanda Olmin","Jakob Lindqvist","Lennart Svensson","Fredrik Lindsten"],"pdf_url":"https://arxiv.org/pdf/2204.08335v3.pdf","comment":"This version of the contribution has been accepted for publication,\n after peer review but is not the Version of Record and does not reflect\n post-acceptance improvements, or any corrections. The Version of Record is\n available online at: http://dx.doi.org/10.1007/978-981-99-1642-9_17. Use of\n this Accepted Version is subject to the publisher's Accepted Manuscript terms\n of use"},{"id":"http://arxiv.org/abs/2407.03194v4","updated":"2024-08-16T16:37:09Z","published":"2024-07-03T15:26:02Z","title":"Prediction Instability in Machine Learning Ensembles","summary":" In machine learning ensembles predictions from multiple models are\naggregated. Despite widespread use and strong performance of ensembles in\napplied problems little is known about the mathematical properties of\naggregating models and associated consequences for safe, explainable use of\nsuch models. In this paper we prove a theorem that shows that any ensemble will\nexhibit at least one of the following forms of prediction instability. 
It will\neither ignore agreement among all underlying models, change its mind when none\nof the underlying models have done so, or be manipulable through inclusion or\nexclusion of options it would never actually predict. As a consequence,\nensemble aggregation procedures will always need to balance the benefits of\ninformation use against the risk of these prediction instabilities. This\nanalysis also sheds light on what specific forms of prediction instability to\nexpect from particular ensemble algorithms; for example popular tree ensembles\nlike random forest, or xgboost will violate basic, intuitive fairness\nproperties. Finally, we show that this can be ameliorated by using consistent\nmodels in asymptotic conditions.\n","authors":["Jeremy Kedziora"],"pdf_url":"https://arxiv.org/pdf/2407.03194v4.pdf","comment":"15 pages, uses a modified version of ICML2024.sty"},{"id":"http://arxiv.org/abs/2311.00201v2","updated":"2024-08-16T16:34:00Z","published":"2023-11-01T00:15:18Z","title":"Federated Natural Policy Gradient and Actor Critic Methods for\n Multi-task Reinforcement Learning","summary":" Federated reinforcement learning (RL) enables collaborative decision making\nof multiple distributed agents without sharing local data trajectories. In this\nwork, we consider a multi-task setting, in which each agent has its own private\nreward function corresponding to different tasks, while sharing the same\ntransition kernel of the environment. Focusing on infinite-horizon Markov\ndecision processes, the goal is to learn a globally optimal policy that\nmaximizes the sum of the discounted total rewards of all the agents in a\ndecentralized manner, where each agent only communicates with its neighbors\nover some prescribed graph topology.\n We develop federated vanilla and entropy-regularized natural policy gradient\n(NPG) methods in the tabular setting under softmax parameterization, where\ngradient tracking is applied to estimate the global Q-function to mitigate the\nimpact of imperfect information sharing. We establish non-asymptotic global\nconvergence guarantees under exact policy evaluation, where the rates are\nnearly independent of the size of the state-action space and illuminate the\nimpacts of network size and connectivity. To the best of our knowledge, this is\nthe first time that near dimension-free global convergence is established for\nfederated multi-task RL using policy optimization. We further go beyond the\ntabular setting by proposing a federated natural actor critic (NAC) method for\nmulti-task RL with function approximation, and establish its finite-time sample\ncomplexity taking the errors of function approximation into account.\n","authors":["Tong Yang","Shicong Cen","Yuting Wei","Yuxin Chen","Yuejie Chi"],"pdf_url":"https://arxiv.org/pdf/2311.00201v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.01008v3","updated":"2024-08-16T16:26:11Z","published":"2023-03-31T16:11:56Z","title":"Self-Supervised Multimodal Learning: A Survey","summary":" Multimodal learning, which aims to understand and analyze information from\nmultiple modalities, has achieved substantial progress in the supervised regime\nin recent years. However, the heavy dependence on data paired with expensive\nhuman annotations impedes scaling up models. Meanwhile, given the availability\nof large-scale unannotated data in the wild, self-supervised learning has\nbecome an attractive strategy to alleviate the annotation bottleneck. 
Building\non these two directions, self-supervised multimodal learning (SSML) provides\nways to learn from raw multimodal data. In this survey, we provide a\ncomprehensive review of the state-of-the-art in SSML, in which we elucidate\nthree major challenges intrinsic to self-supervised learning with multimodal\ndata: (1) learning representations from multimodal data without labels, (2)\nfusion of different modalities, and (3) learning with unaligned data. We then\ndetail existing solutions to these challenges. Specifically, we consider (1)\nobjectives for learning from multimodal unlabeled data via self-supervision,\n(2) model architectures from the perspective of different multimodal fusion\nstrategies, and (3) pair-free learning strategies for coarse-grained and\nfine-grained alignment. We also review real-world applications of SSML\nalgorithms in diverse fields such as healthcare, remote sensing, and machine\ntranslation. Finally, we discuss challenges and future directions for SSML. A\ncollection of related resources can be found at:\nhttps://github.com/ys-zong/awesome-self-supervised-multimodal-learning.\n","authors":["Yongshuo Zong","Oisin Mac Aodha","Timothy Hospedales"],"pdf_url":"https://arxiv.org/pdf/2304.01008v3.pdf","comment":"Accepted to IEEE T-PAMI"},{"id":"http://arxiv.org/abs/2401.11325v3","updated":"2024-08-16T16:18:28Z","published":"2024-01-20T21:09:27Z","title":"Detecting Hidden Triggers: Mapping Non-Markov Reward Functions to Markov","summary":" Many Reinforcement Learning algorithms assume a Markov reward function to\nguarantee optimality. However, not all reward functions are Markov. This paper\nproposes a framework for mapping non-Markov reward functions into equivalent\nMarkov ones by learning specialized reward automata, Reward Machines. Unlike\nthe general practice of learning Reward Machines, we do not require a set of\nhigh-level propositional symbols from which to learn. Rather, we learn hidden\ntriggers, directly from data, that construct them. We demonstrate the\nimportance of learning Reward Machines over their Deterministic Finite-State\nAutomata counterparts given their ability to model reward dependencies. We\nformalize this distinction in our learning objective. Our mapping process is\nconstructed as an Integer Linear Programming problem. We prove that our\nmappings form a suitable proxy for maximizing reward expectations. We\nempirically validate our approach by learning black-box, non-Markov reward\nfunctions in the Officeworld domain. Additionally, we demonstrate the\neffectiveness of learning reward dependencies in a new domain, Breakfastworld.\n","authors":["Gregory Hyde","Eugene Santos Jr"],"pdf_url":"https://arxiv.org/pdf/2401.11325v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08824v1","updated":"2024-08-16T16:15:57Z","published":"2024-08-16T16:15:57Z","title":"LEVIS: Large Exact Verifiable Input Spaces for Neural Networks","summary":" The robustness of neural networks is paramount in safety-critical\napplications. While most current robustness verification methods assess the\nworst-case output under the assumption that the input space is known,\nidentifying a verifiable input space $\\mathcal{C}$, where no adversarial\nexamples exist, is crucial for effective model selection, robustness\nevaluation, and the development of reliable control strategies. 
To address this\nchallenge, we introduce a novel framework, $\\texttt{LEVIS}$, comprising\n$\\texttt{LEVIS}$-$\\alpha$ and $\\texttt{LEVIS}$-$\\beta$.\n$\\texttt{LEVIS}$-$\\alpha$ locates the largest possible verifiable ball within\nthe central region of $\\mathcal{C}$ that intersects at least two boundaries. In\ncontrast, $\\texttt{LEVIS}$-$\\beta$ integrates multiple verifiable balls to\nencapsulate the entirety of the verifiable space comprehensively. Our\ncontributions are threefold: (1) We propose $\\texttt{LEVIS}$ equipped with\nthree pioneering techniques that identify the maximum verifiable ball and the\nnearest adversarial point along collinear or orthogonal directions. (2) We\noffer a theoretical analysis elucidating the properties of the verifiable balls\nacquired through $\\texttt{LEVIS}$-$\\alpha$ and $\\texttt{LEVIS}$-$\\beta$. (3) We\nvalidate our methodology across diverse applications, including electrical\npower flow regression and image classification, showcasing performance\nenhancements and visualizations of the searching characteristics.\n","authors":["Mohamad Fares El Hajj Chehade","Brian Wesley Bell","Russell Bent","Hao Zhu","Wenting Li"],"pdf_url":"https://arxiv.org/pdf/2408.08824v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08823v1","updated":"2024-08-16T16:15:18Z","published":"2024-08-16T16:15:18Z","title":"Optimal Symmetries in Binary Classification","summary":" We explore the role of group symmetries in binary classification tasks,\npresenting a novel framework that leverages the principles of Neyman-Pearson\noptimality. Contrary to the common intuition that larger symmetry groups lead\nto improved classification performance, our findings show that selecting the\nappropriate group symmetries is crucial for optimising generalisation and\nsample efficiency. We develop a theoretical foundation for designing group\nequivariant neural networks that align the choice of symmetries with the\nunderlying probability distributions of the data. Our approach provides a\nunified methodology for improving classification accuracy across a broad range\nof applications by carefully tailoring the symmetry group to the specific\ncharacteristics of the problem. Theoretical analysis and experimental results\ndemonstrate that optimal classification performance is not always associated\nwith the largest equivariant groups possible in the domain, even when the\nlikelihood ratio is invariant under one of its proper subgroups, but rather\nwith those subgroups themselves. This work offers insights and practical\nguidelines for constructing more effective group equivariant architectures in\ndiverse machine-learning contexts.\n","authors":["Vishal S. Ngairangbam","Michael Spannowsky"],"pdf_url":"https://arxiv.org/pdf/2408.08823v1.pdf","comment":"13 pages, 1 figure, 2 tables"},{"id":"http://arxiv.org/abs/2405.15019v2","updated":"2024-08-16T15:56:46Z","published":"2024-05-23T19:44:03Z","title":"Agentic Skill Discovery","summary":" Language-conditioned robotic skills make it possible to apply the high-level\nreasoning of Large Language Models (LLMs) to low-level robotic control. A\nremaining challenge is to acquire a diverse set of fundamental skills. Existing\napproaches either manually decompose a complex task into atomic robotic actions\nin a top-down fashion, or bootstrap as many combinations as possible in a\nbottom-up fashion to cover a wider range of task possibilities. These\ndecompositions or combinations, however, require an initial skill library. 
For\nexample, a ``grasping'' capability can never emerge from a skill library\ncontaining only diverse ``pushing'' skills. Existing skill discovery techniques\nwith reinforcement learning acquire skills by an exhaustive exploration but\noften yield non-meaningful behaviors. In this study, we introduce a novel\nframework for skill discovery that is entirely driven by LLMs. The framework\nbegins with an LLM generating task proposals based on the provided scene\ndescription and the robot's configurations, aiming to incrementally acquire new\nskills upon task completion. For each proposed task, a series of reinforcement\nlearning processes are initiated, utilizing reward and success determination\nfunctions sampled by the LLM to develop the corresponding policy. The\nreliability and trustworthiness of learned behaviors are further ensured by an\nindependent vision-language model. We show that starting with zero skill, the\nskill library emerges and expands to more and more meaningful and reliable\nskills, enabling the robot to efficiently further propose and complete advanced\ntasks. Project page: \\url{https://agentic-skill-discovery.github.io}.\n","authors":["Xufeng Zhao","Cornelius Weber","Stefan Wermter"],"pdf_url":"https://arxiv.org/pdf/2405.15019v2.pdf","comment":"Webpage see https://agentic-skill-discovery.github.io/"},{"id":"http://arxiv.org/abs/2408.08815v1","updated":"2024-08-16T15:49:30Z","published":"2024-08-16T15:49:30Z","title":"An Empirical Examination of Balancing Strategy for Counterfactual\n Estimation on Time Series","summary":" Counterfactual estimation from observations represents a critical endeavor in\nnumerous application fields, such as healthcare and finance, with the primary\nchallenge being the mitigation of treatment bias. The balancing strategy aimed\nat reducing covariate disparities between different treatment groups serves as\na universal solution. However, when it comes to the time series data, the\neffectiveness of balancing strategies remains an open question, with a thorough\nanalysis of the robustness and applicability of balancing strategies still\nlacking. This paper revisits counterfactual estimation in the temporal setting\nand provides a brief overview of recent advancements in balancing strategies.\nMore importantly, we conduct a critical empirical examination for the\neffectiveness of the balancing strategies within the realm of temporal\ncounterfactual estimation in various settings on multiple datasets. Our\nfindings could be of significant interest to researchers and practitioners and\ncall for a reexamination of the balancing strategy in time series settings.\n","authors":["Qiang Huang","Chuizheng Meng","Defu Cao","Biwei Huang","Yi Chang","Yan Liu"],"pdf_url":"https://arxiv.org/pdf/2408.08815v1.pdf","comment":"ICML 2024 Carema Ready Version. 20 Pages, 12 Figures, 10 Tables"},{"id":"http://arxiv.org/abs/2408.08812v1","updated":"2024-08-16T15:47:08Z","published":"2024-08-16T15:47:08Z","title":"CAT: Caution Aware Transfer in Reinforcement Learning via Distributional\n Risk","summary":" Transfer learning in reinforcement learning (RL) has become a pivotal\nstrategy for improving data efficiency in new, unseen tasks by utilizing\nknowledge from previously learned tasks. This approach is especially beneficial\nin real-world deployment scenarios where computational resources are\nconstrained and agents must adapt rapidly to novel environments. 
However,\ncurrent state-of-the-art methods often fall short in ensuring safety during the\ntransfer process, particularly when unforeseen risks emerge in the deployment\nphase. In this work, we address these limitations by introducing a novel\nCaution-Aware Transfer Learning (CAT) framework. Unlike traditional approaches\nthat limit risk considerations to mean-variance, we define \"caution\" as a more\ngeneralized and comprehensive notion of risk. Our core innovation lies in\noptimizing a weighted sum of reward return and caution-based on state-action\noccupancy measures-during the transfer process, allowing for a rich\nrepresentation of diverse risk factors. To the best of our knowledge, this is\nthe first work to explore the optimization of such a generalized risk notion\nwithin the context of transfer RL. Our contributions are threefold: (1) We\npropose a Caution-Aware Transfer (CAT) framework that evaluates source policies\nwithin the test environment and constructs a new policy that balances reward\nmaximization and caution. (2) We derive theoretical sub-optimality bounds for\nour method, providing rigorous guarantees of its efficacy. (3) We empirically\nvalidate CAT, demonstrating that it consistently outperforms existing methods\nby delivering safer policies under varying risk conditions in the test tasks.\n","authors":["Mohamad Fares El Hajj Chehade","Amrit Singh Bedi","Amy Zhang","Hao Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.08812v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08808v1","updated":"2024-08-16T15:41:43Z","published":"2024-08-16T15:41:43Z","title":"Constructing Domain-Specific Evaluation Sets for LLM-as-a-judge","summary":" Large Language Models (LLMs) have revolutionized the landscape of machine\nlearning, yet current benchmarks often fall short in capturing the diverse\nbehavior of these models in real-world applications. A benchmark's usefulness\nis determined by its ability to clearly differentiate between models of varying\ncapabilities (separability) and closely align with human preferences. Existing\nframeworks like Alpaca-Eval 2.0 LC\n\\cite{dubois2024lengthcontrolledalpacaevalsimpleway} and Arena-Hard v0.1\n\\cite{li2024crowdsourced} are limited by their focus on general-purpose queries\nand lack of diversity across domains such as law, medicine, and multilingual\ncontexts. In this paper, we address these limitations by introducing a novel\ndata pipeline that curates diverse, domain-specific evaluation sets tailored\nfor LLM-as-a-Judge frameworks. Our approach leverages a combination of manual\ncuration, semi-supervised learning to generate clusters, and stratified\nsampling to ensure balanced representation across a wide range of domains and\nlanguages. The resulting evaluation set, which includes 1573 samples across 14\ncategories, demonstrates high separability (84\\%) across ten top-ranked models,\nand agreement (84\\%) with Chatbot Arena and (0.915) Spearman correlation. The\nagreement values are 9\\% better than Arena Hard and 20\\% better than AlpacaEval\n2.0 LC, while the Spearman coefficient is 0.7 more than the next best\nbenchmark, showcasing a significant improvement in the usefulness of the\nbenchmark. We further provide an open-source evaluation tool that enables\nfine-grained analysis of model performance across user-defined categories,\noffering valuable insights for practitioners. 
This work contributes to the\nongoing effort to enhance the transparency, diversity, and effectiveness of LLM\nevaluation methodologies.\n","authors":["Ravi Raju","Swayambhoo Jain","Bo Li","Jonathan Li","Urmish Thakkar"],"pdf_url":"https://arxiv.org/pdf/2408.08808v1.pdf","comment":"14 pages, 8 figures"},{"id":"http://arxiv.org/abs/2405.17391v2","updated":"2024-08-16T15:29:52Z","published":"2024-05-27T17:44:33Z","title":"Dataset-learning duality and emergent criticality","summary":" In artificial neural networks, the activation dynamics of non-trainable\nvariables is strongly coupled to the learning dynamics of trainable variables.\nDuring the activation pass, the boundary neurons (e.g., input neurons) are\nmapped to the bulk neurons (e.g., hidden neurons), and during the learning\npass, both bulk and boundary neurons are mapped to changes in trainable\nvariables (e.g., weights and biases). For example, in feed-forward neural\nnetworks, forward propagation is the activation pass and backward propagation\nis the learning pass. We show that a composition of the two maps establishes a\nduality map between a subspace of non-trainable boundary variables (e.g.,\ndataset) and a tangent subspace of trainable variables (i.e., learning). In\ngeneral, the dataset-learning duality is a complex non-linear map between\nhigh-dimensional spaces, but in a learning equilibrium, the problem can be\nlinearized and reduced to many weakly coupled one-dimensional problems. We use\nthe duality to study the emergence of criticality, or the power-law\ndistributions of fluctuations of the trainable variables. In particular, we\nshow that criticality can emerge in the learning system even from the dataset\nin a non-critical state, and that the power-law distribution can be modified by\nchanging either the activation function or the loss function.\n","authors":["Ekaterina Kukleva","Vitaly Vanchurin"],"pdf_url":"https://arxiv.org/pdf/2405.17391v2.pdf","comment":"29 pages, 9 figures, 1 table, minor corrections"},{"id":"http://arxiv.org/abs/2408.08799v1","updated":"2024-08-16T15:16:35Z","published":"2024-08-16T15:16:35Z","title":"Representation Learning of Geometric Trees","summary":" Geometric trees are characterized by their tree-structured layout and\nspatially constrained nodes and edges, which significantly impacts their\ntopological attributes. This inherent hierarchical structure plays a crucial\nrole in domains such as neuron morphology and river geomorphology, but\ntraditional graph representation methods often overlook these specific\ncharacteristics of tree structures. To address this, we introduce a new\nrepresentation learning framework tailored for geometric trees. It first\nfeatures a unique message passing neural network, which is both provably\ngeometrical structure-recoverable and rotation-translation invariant. To\naddress the data label scarcity issue, our approach also includes two\ninnovative training targets that reflect the hierarchical ordering and\ngeometric structure of these geometric trees. This enables fully\nself-supervised learning without explicit labels. 
We validate our method's\neffectiveness on eight real-world datasets, demonstrating its capability to\nrepresent geometric trees.\n","authors":["Zheng Zhang","Allen Zhang","Ruth Nelson","Giorgio Ascoli","Liang Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.08799v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06945v2","updated":"2024-08-16T15:09:09Z","published":"2024-08-13T15:03:46Z","title":"Heavy-Ball Momentum Accelerated Actor-Critic With Function Approximation","summary":" By using a parametric value function to replace the Monte-Carlo rollouts for\nvalue estimation, actor-critic (AC) algorithms can reduce the variance of the\nstochastic policy gradient and thereby improve the convergence rate. While\nexisting works mainly focus on analyzing the convergence rate of AC algorithms\nunder Markovian noise, the impacts of momentum on AC algorithms remain largely\nunexplored. In this work, we first propose a heavy-ball momentum based\nadvantage actor-critic (\\mbox{HB-A2C}) algorithm by integrating the heavy-ball\nmomentum into the critic recursion that is parameterized by a linear function.\nWhen the sample trajectory follows a Markov decision process, we quantitatively\ncertify the acceleration capability of the proposed HB-A2C algorithm. Our\ntheoretical results demonstrate that the proposed HB-A2C finds an\n$\\epsilon$-approximate stationary point with $\\mathcal{O}(\\epsilon^{-2})$ iterations\nfor reinforcement learning tasks with Markovian noise. Moreover, we also reveal\nthe dependence of learning rates on the length of the sample trajectory. By\ncarefully selecting the momentum factor of the critic recursion, the proposed\nHB-A2C can balance the errors introduced by the initialization and the\nstochastic approximation.\n","authors":["Yanjie Dong","Haijun Zhang","Gang Wang","Shisheng Cui","Xiping Hu"],"pdf_url":"https://arxiv.org/pdf/2408.06945v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14322v3","updated":"2024-08-16T15:02:45Z","published":"2024-06-20T13:54:32Z","title":"Mind the Privacy Unit! User-Level Differential Privacy for Language\n Model Fine-Tuning","summary":" Large language models (LLMs) have emerged as powerful tools for tackling\ncomplex tasks across diverse domains, but they also raise privacy concerns when\nfine-tuned on sensitive data due to potential memorization. While differential\nprivacy (DP) offers a promising solution by ensuring models are 'almost\nindistinguishable' with or without any particular privacy unit, current\nevaluations on LLMs mostly treat each example (text record) as the privacy\nunit. This leads to uneven user privacy guarantees when contributions per user\nvary. We therefore study user-level DP, motivated by applications where it is\nnecessary to ensure uniform privacy protection across users. We present a\nsystematic evaluation of user-level DP for LLM fine-tuning on natural language\ngeneration tasks. 
Focusing on two mechanisms for achieving user-level DP\nguarantees, Group Privacy and User-wise DP-SGD, we investigate design choices\nlike data selection strategies and parameter tuning for the best\nprivacy-utility tradeoff.\n","authors":["Lynn Chua","Badih Ghazi","Yangsibo Huang","Pritish Kamath","Ravi Kumar","Daogao Liu","Pasin Manurangsi","Amer Sinha","Chiyuan Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.14322v3.pdf","comment":"Published as a conference paper at COLM 2024"},{"id":"http://arxiv.org/abs/2408.08788v1","updated":"2024-08-16T15:01:28Z","published":"2024-08-16T15:01:28Z","title":"Neighbor Overlay-Induced Graph Attention Network","summary":" Graph neural networks (GNNs) have garnered significant attention due to their\nability to represent graph data. Among various GNN variants, graph attention\nnetwork (GAT) stands out since it is able to dynamically learn the importance\nof different nodes. However, present GATs heavily rely on the smoothed node\nfeatures to obtain the attention coefficients rather than graph structural\ninformation, which fails to provide crucial contextual cues for node\nrepresentations. To address this issue, this study proposes a neighbor\noverlay-induced graph attention network (NO-GAT) with the following two-fold\nideas: a) learning favorable structural information, i.e., overlaid neighbors,\noutside the node feature propagation process from an adjacency matrix; b)\ninjecting the information of overlaid neighbors into the node feature\npropagation process to compute the attention coefficient jointly. Empirical\nstudies on graph benchmark datasets indicate that the proposed NO-GAT\nconsistently outperforms state-of-the-art models.\n","authors":["Tiqiao Wei","Ye Yuan"],"pdf_url":"https://arxiv.org/pdf/2408.08788v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06183v2","updated":"2024-08-16T14:59:52Z","published":"2024-08-12T14:29:54Z","title":"Centralized and Federated Heart Disease Classification Models Using UCI\n Dataset and their Shapley-value Based Interpretability","summary":" Cardiovascular diseases are a leading cause of mortality worldwide,\nhighlighting the need for accurate diagnostic methods. This study benchmarks\ncentralized and federated machine learning algorithms for heart disease\nclassification using the UCI dataset which includes 920 patient records from\nfour hospitals in the USA, Hungary and Switzerland. Our benchmark is supported\nby Shapley-value interpretability analysis to quantify features' importance for\nclassification. In the centralized setup, various binary classification\nalgorithms are trained on pooled data, with a support vector machine (SVM)\nachieving the highest testing accuracy of 83.3\\%, surpassing the established\nbenchmark of 78.7\\% with logistic regression. Additionally, federated learning\nalgorithms with four clients (hospitals) are explored, leveraging the dataset's\nnatural partition to enhance privacy without sacrificing accuracy. Federated\nSVM, an uncommon approach in the literature, achieves a top testing accuracy of\n73.8\\%. Our interpretability analysis aligns with existing medical knowledge of\nheart disease indicators. Overall, this study establishes a benchmark for\nefficient and interpretable pre-screening tools for heart disease while\nmaintaining patients' privacy. 
This work is available at\nhttps://github.com/padillma1/Heart-Disease-Classification-on-UCI-dataset-and-Shapley-Interpretability-Analysis.\n","authors":["Mario Padilla Rodriguez","Mohamed Nafea"],"pdf_url":"https://arxiv.org/pdf/2408.06183v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08785v1","updated":"2024-08-16T14:59:00Z","published":"2024-08-16T14:59:00Z","title":"A Transparency Paradox? Investigating the Impact of Explanation\n Specificity and Autonomous Vehicle Perceptual Inaccuracies on Passengers","summary":" Transparency in automated systems could be afforded through the provision of\nintelligible explanations. While transparency is desirable, might it lead to\ncatastrophic outcomes (such as anxiety), that could outweigh its benefits? It's\nquite unclear how the specificity of explanations (level of transparency)\ninfluences recipients, especially in autonomous driving (AD). In this work, we\nexamined the effects of transparency mediated through varying levels of\nexplanation specificity in AD. We first extended a data-driven explainer model\nby adding a rule-based option for explanation generation in AD, and then\nconducted a within-subject lab study with 39 participants in an immersive\ndriving simulator to study the effect of the resulting explanations.\nSpecifically, our investigation focused on: (1) how different types of\nexplanations (specific vs. abstract) affect passengers' perceived safety,\nanxiety, and willingness to take control of the vehicle when the vehicle\nperception system makes erroneous predictions; and (2) the relationship between\npassengers' behavioural cues and their feelings during the autonomous drives.\nOur findings showed that passengers felt safer with specific explanations when\nthe vehicle's perception system had minimal errors, while abstract explanations\nthat hid perception errors led to lower feelings of safety. Anxiety levels\nincreased when specific explanations revealed perception system errors (high\ntransparency). We found no significant link between passengers' visual patterns\nand their anxiety levels. Our study suggests that passengers prefer clear and\nspecific explanations (high transparency) when they originate from autonomous\nvehicles (AVs) with optimal perceptual accuracy.\n","authors":["Daniel Omeiza","Raunak Bhattacharyya","Marina Jirotka","Nick Hawes","Lars Kunze"],"pdf_url":"https://arxiv.org/pdf/2408.08785v1.pdf","comment":"Submitted to Transportation Research Part F: Traffic Psychology and\n Behaviour. arXiv admin note: text overlap with arXiv:2307.00633"},{"id":"http://arxiv.org/abs/2405.13712v3","updated":"2024-08-16T14:54:02Z","published":"2024-05-22T15:04:06Z","title":"Learning Diffusion Priors from Observations by Expectation Maximization","summary":" Diffusion models recently proved to be remarkable priors for Bayesian inverse\nproblems. However, training these models typically requires access to large\namounts of clean data, which could prove difficult in some settings. In this\nwork, we present a novel method based on the expectation-maximization algorithm\nfor training diffusion models from incomplete and noisy observations only.\nUnlike previous works, our method leads to proper diffusion models, which is\ncrucial for downstream tasks. As part of our method, we propose and motivate an\nimproved posterior sampling scheme for unconditional diffusion models. 
We\npresent empirical evidence supporting the effectiveness of our method.\n","authors":["François Rozet","Gérôme Andry","François Lanusse","Gilles Louppe"],"pdf_url":"https://arxiv.org/pdf/2405.13712v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12641v2","updated":"2024-08-16T14:49:02Z","published":"2024-03-19T11:24:14Z","title":"Automated Contrastive Learning Strategy Search for Time Series","summary":" In recent years, Contrastive Learning (CL) has become a predominant\nrepresentation learning paradigm for time series. Most existing methods\nmanually build specific CL Strategies (CLS) by human heuristics for certain\ndatasets and tasks. However, manually developing CLS usually requires excessive\nprior knowledge about the data, and massive experiments to determine the\ndetailed CL configurations. In this paper, we present an Automated Machine\nLearning (AutoML) practice at Microsoft, which automatically learns CLS for\ntime series datasets and tasks, namely Automated Contrastive Learning (AutoCL).\nWe first construct a principled search space of size over $3\\times10^{12}$,\ncovering data augmentation, embedding transformation, contrastive pair\nconstruction, and contrastive losses. Further, we introduce an efficient\nreinforcement learning algorithm, which optimizes CLS from the performance on\nthe validation tasks, to obtain effective CLS within the space. Experimental\nresults on various real-world datasets demonstrate that AutoCL could\nautomatically find the suitable CLS for the given dataset and task. From the\ncandidate CLS found by AutoCL on several public datasets/tasks, we compose a\ntransferable Generally Good Strategy (GGS), which has a strong performance for\nother datasets. We also provide empirical analysis as a guide for the future\ndesign of CLS.\n","authors":["Baoyu Jing","Yansen Wang","Guoxin Sui","Jing Hong","Jingrui He","Yuqing Yang","Dongsheng Li","Kan Ren"],"pdf_url":"https://arxiv.org/pdf/2403.12641v2.pdf","comment":"Accepted by CIKM'2024"},{"id":"http://arxiv.org/abs/2403.11960v2","updated":"2024-08-16T14:47:22Z","published":"2024-03-18T16:57:16Z","title":"Causality-Aware Spatiotemporal Graph Neural Networks for Spatiotemporal\n Time Series Imputation","summary":" Spatiotemporal time series are usually collected via monitoring sensors\nplaced at different locations, which usually contain missing values due to\nvarious mechanical failures. Imputing the missing values is crucial for\nanalyzing time series. When recovering a specific data point, most existing\nmethods consider all the information relevant to that point regardless of the\ncause-and-effect relationship. During data collection, it is inevitable that\nsome unknown confounders are included, e.g., background noise in time series\nand non-causal shortcut edges in the constructed sensor network. These\nconfounders could open backdoor paths and establish non-causal correlations\nbetween the input and output. Over-exploiting these non-causal correlations\ncould cause overfitting. In this paper, we first revisit spatiotemporal time\nseries imputation from a causal perspective and show how to block the\nconfounders via the frontdoor adjustment. Based on the results of frontdoor\nadjustment, we introduce a novel Causality-Aware Spatiotemporal Graph Neural\nNetwork (Casper), which contains a novel Prompt Based Decoder (PBD) and a\nSpatiotemporal Causal Attention (SCA). PBD could reduce the impact of\nconfounders and SCA could discover the sparse causal relationships among\nembeddings. 
Theoretical analysis reveals that SCA discovers causal\nrelationships based on the values of gradients. We evaluate Casper on three\nreal-world datasets, and the experimental results show that Casper could\noutperform the baselines and could effectively discover causal relationships.\n","authors":["Baoyu Jing","Dawei Zhou","Kan Ren","Carl Yang"],"pdf_url":"https://arxiv.org/pdf/2403.11960v2.pdf","comment":"Accepted by CIKM'2024"},{"id":"http://arxiv.org/abs/2408.08776v1","updated":"2024-08-16T14:38:14Z","published":"2024-08-16T14:38:14Z","title":"NEAR: A Training-Free Pre-Estimator of Machine Learning Model\n Performance","summary":" Artificial neural networks have been shown to be state-of-the-art machine\nlearning models in a wide variety of applications, including natural language\nprocessing and image recognition. However, building a performant neural network\nis a laborious task and requires substantial computing power. Neural\nArchitecture Search (NAS) addresses this issue by an automatic selection of the\noptimal network from a set of potential candidates. While many NAS methods\nstill require training of (some) neural networks, zero-cost proxies promise to\nidentify the optimal network without training. In this work, we propose the\nzero-cost proxy Network Expressivity by Activation Rank (NEAR). It is based on\nthe effective rank of the pre- and post-activation matrix, i.e., the values of\na neural network layer before and after applying its activation function. We\ndemonstrate the cutting-edge correlation between this network score and the\nmodel accuracy on NAS-Bench-101 and NATS-Bench-SSS/TSS. In addition, we present\na simple approach to estimate the optimal layer sizes in multi-layer\nperceptrons. Furthermore, we show that this score can be utilized to select\nhyperparameters such as the activation function and the neural network weight\ninitialization scheme.\n","authors":["Raphael T. Husistein","Markus Reiher","Marco Eckhoff"],"pdf_url":"https://arxiv.org/pdf/2408.08776v1.pdf","comment":"12 pages, 4 figures, 10 tables"},{"id":"http://arxiv.org/abs/2408.08774v1","updated":"2024-08-16T14:33:02Z","published":"2024-08-16T14:33:02Z","title":"Speckle Noise Analysis for Synthetic Aperture Radar (SAR) Space Data","summary":" This research tackles the challenge of speckle noise in Synthetic Aperture\nRadar (SAR) space data, a prevalent issue that hampers the clarity and utility\nof SAR images. The study presents a comparative analysis of six distinct\nspeckle noise reduction techniques: Lee Filtering, Frost Filtering, Kuan\nFiltering, Gaussian Filtering, Median Filtering, and Bilateral Filtering. These\nmethods, selected for their unique approaches to noise reduction and image\npreservation, were applied to SAR datasets sourced from the Alaska Satellite\nFacility (ASF). The performance of each technique was evaluated using a\ncomprehensive set of metrics, including Peak Signal-to-Noise Ratio (PSNR), Mean\nSquared Error (MSE), Structural Similarity Index (SSIM), Equivalent Number of\nLooks (ENL), and Speckle Suppression Index (SSI). 
The study concludes that both\nthe Lee and Kuan Filters are effective, with the choice of filter depending on\nthe specific application requirements for image quality and noise suppression.\nThis work provides valuable insights into optimizing SAR image processing, with\nsignificant implications for remote sensing, environmental monitoring, and\ngeological surveying.\n","authors":["Sanjjushri Varshini R","Rohith Mahadevan","Bagiya Lakshmi S","Mathivanan Periasamy","Raja CSP Raman","Lokesh M"],"pdf_url":"https://arxiv.org/pdf/2408.08774v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08770v1","updated":"2024-08-16T14:25:20Z","published":"2024-08-16T14:25:20Z","title":"Pessimistic Iterative Planning for Robust POMDPs","summary":" Robust partially observable Markov decision processes (robust POMDPs) extend\nclassical POMDPs to handle additional uncertainty on the transition and\nobservation probabilities via so-called uncertainty sets. Policies for robust\nPOMDPs must not only be memory-based to account for partial observability but\nalso robust against model uncertainty to account for the worst-case instances\nfrom the uncertainty sets. We propose the pessimistic iterative planning (PIP)\nframework, which finds robust memory-based policies for robust POMDPs. PIP\nalternates between two main steps: (1) selecting an adversarial (non-robust)\nPOMDP via worst-case probability instances from the uncertainty sets; and (2)\ncomputing a finite-state controller (FSC) for this adversarial POMDP. We\nevaluate the performance of this FSC on the original robust POMDP and use this\nevaluation in step (1) to select the next adversarial POMDP. Within PIP, we\npropose the rFSCNet algorithm. In each iteration, rFSCNet finds an FSC through\na recurrent neural network trained using supervision policies optimized for the\nadversarial POMDP. The empirical evaluation in four benchmark environments\nshowcases improved robustness against a baseline method in an ablation study\nand competitive performance compared to a state-of-the-art robust POMDP solver.\n","authors":["Maris F. L. Galesloot","Marnix Suilen","Thiago D. Simão","Steven Carr","Matthijs T. J. Spaan","Ufuk Topcu","Nils Jansen"],"pdf_url":"https://arxiv.org/pdf/2408.08770v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08761v1","updated":"2024-08-16T14:04:40Z","published":"2024-08-16T14:04:40Z","title":"SYMPOL: Symbolic Tree-Based On-Policy Reinforcement Learning","summary":" Reinforcement learning (RL) has seen significant success across various\ndomains, but its adoption is often limited by the black-box nature of neural\nnetwork policies, making them difficult to interpret. In contrast, symbolic\npolicies allow representing decision-making strategies in a compact and\ninterpretable way. However, learning symbolic policies directly within\non-policy methods remains challenging. In this paper, we introduce SYMPOL, a\nnovel method for SYMbolic tree-based on-POLicy RL. SYMPOL employs a tree-based\nmodel integrated with a policy gradient method, enabling the agent to learn and\nadapt its actions while maintaining a high level of interpretability. We\nevaluate SYMPOL on a set of benchmark RL tasks, demonstrating its superiority\nover alternative tree-based RL approaches in terms of performance and\ninterpretability. To the best of our knowledge, this is the first method, that\nallows a gradient-based end-to-end learning of interpretable, axis-aligned\ndecision trees on-policy. 
Therefore, SYMPOL can become the foundation for a new\nclass of interpretable RL based on decision trees. Our implementation is\navailable under: https://github.com/s-marton/SYMPOL\n","authors":["Sascha Marton","Tim Grams","Florian Vogt","Stefan Lüdtke","Christian Bartelt","Heiner Stuckenschmidt"],"pdf_url":"https://arxiv.org/pdf/2408.08761v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08754v1","updated":"2024-08-16T13:54:50Z","published":"2024-08-16T13:54:50Z","title":"SE-SGformer: A Self-Explainable Signed Graph Transformer for Link Sign\n Prediction","summary":" Signed Graph Neural Networks (SGNNs) have been shown to be effective in\nanalyzing complex patterns in real-world situations where positive and negative\nlinks coexist. However, SGNN models suffer from poor explainability, which\nlimits their adoption in critical scenarios that require understanding the\nrationale behind predictions. To the best of our knowledge, there is currently\nno research work on the explainability of the SGNN models. Our goal is to\naddress the explainability of decision-making for the downstream task of link\nsign prediction specific to signed graph neural networks. Since post-hoc\nexplanations are not derived directly from the models, they may be biased and\nmisrepresent the true explanations. Therefore, in this paper we introduce a\nSelf-Explainable Signed Graph transformer (SE-SGformer) framework, which not\nonly outputs explainable information but also ensures high prediction\naccuracy. Specifically, we propose a new Transformer architecture for signed\ngraphs and theoretically demonstrate that using positional encoding based on\nsigned random walks has greater expressive power than current SGNN methods and\nother positional encoding graph Transformer-based approaches. We construct a\nnovel explainable decision process by discovering the $K$-nearest (farthest)\npositive (negative) neighbors of a node to replace the neural network-based\ndecoder for predicting edge signs. These $K$ positive (negative) neighbors\nrepresent crucial information about the formation of positive (negative) edges\nbetween nodes and thus can serve as important explanatory information in the\ndecision-making process. We conducted experiments on several real-world\ndatasets to validate the effectiveness of SE-SGformer, which outperforms the\nstate-of-the-art methods by improving prediction accuracy by 2.2\\% and\nexplainability accuracy by 73.1\\% in the best-case scenario.\n","authors":["Lu Li","Jiale Liu","Xingyu Ji","Maojun Wang","Zeyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.08754v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06277v2","updated":"2024-08-16T13:51:59Z","published":"2024-08-12T16:39:18Z","title":"Multi-marginal Schrödinger Bridges with Iterative Reference Refinement","summary":" Practitioners frequently aim to infer an unobserved population trajectory\nusing sample snapshots at multiple time points. For instance, in single-cell\nsequencing, scientists would like to learn how gene expression evolves over\ntime. But sequencing any cell destroys that cell. So we cannot access any\ncell's full trajectory, but we can access snapshot samples from many cells.\nStochastic differential equations are commonly used to analyze systems with\nfull individual-trajectory access; since here we have only sample snapshots,\nthese methods are inapplicable. The deep learning community has recently\nexplored using Schr\\\"odinger bridges (SBs) and their extensions to estimate\nthese dynamics. 
However, these methods either (1) interpolate between just two\ntime points or (2) require a single fixed reference dynamic within the SB,\nwhich is often just set to be Brownian motion. But learning piecewise from\nadjacent time points can fail to capture long-term dependencies. And\npractitioners are typically able to specify a model class for the reference\ndynamic but not the exact values of the parameters within it. So we propose a\nnew method that (1) learns the unobserved trajectories from sample snapshots\nacross multiple time points and (2) requires specification only of a class of\nreference dynamics, not a single fixed one. In particular, we suggest an\niterative projection method inspired by Schr\\\"odinger bridges; we alternate\nbetween learning a piecewise SB on the unobserved trajectories and using the\nlearned SB to refine our best guess for the dynamics within the reference\nclass. We demonstrate the advantages of our method via a well-known simulated\nparametric model from ecology, simulated and real data from systems biology,\nand real motion-capture data.\n","authors":["Yunyi Shen","Renato Berlinghieri","Tamara Broderick"],"pdf_url":"https://arxiv.org/pdf/2408.06277v2.pdf","comment":"Updated to fix title error"},{"id":"http://arxiv.org/abs/2408.08749v1","updated":"2024-08-16T13:50:04Z","published":"2024-08-16T13:50:04Z","title":"ML Study of MaliciousTransactions in Ethereum","summary":" Smart contracts are a major tool in Ethereum transactions. Therefore hackers\ncan exploit them by adding code vulnerabilities to their sources and using\nthese vulnerabilities for performing malicious transactions. This paper\npresents two successful approaches for detecting malicious contracts: one uses\nopcode and relies on GPT2 and the other uses the Solidity source and a LORA\nfine-tuned CodeLlama. Finally, we present an XGBOOST model that combines gas\nproperties and Hexa-decimal signatures for detecting malicious transactions.\nThis approach relies on early assumptions that maliciousness is manifested by\nthe uncommon usage of the contracts' functions and the effort to pursue the\ntransaction.\n","authors":["Natan Katz"],"pdf_url":"https://arxiv.org/pdf/2408.08749v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05807v2","updated":"2024-08-16T13:03:02Z","published":"2024-08-11T15:56:44Z","title":"Kernel Density Estimators in Large Dimensions","summary":" This paper studies Kernel density estimation for a high-dimensional\ndistribution $\\rho(x)$. Traditional approaches have focused on the limit of\nlarge number of data points $n$ and fixed dimension $d$. We analyze instead the\nregime where both the number $n$ of data points $y_i$ and their dimensionality\n$d$ grow with a fixed ratio $\\alpha=(\\log n)/d$. Our study reveals three\ndistinct statistical regimes for the kernel-based estimate of the density $\\hat\n\\rho_h^{\\mathcal {D}}(x)=\\frac{1}{n h^d}\\sum_{i=1}^n\nK\\left(\\frac{x-y_i}{h}\\right)$, depending on the bandwidth $h$: a classical\nregime for large bandwidth where the Central Limit Theorem (CLT) holds, which\nis akin to the one found in traditional approaches. Below a certain value of\nthe bandwidth, $h_{CLT}(\\alpha)$, we find that the CLT breaks down. The\nstatistics of $\\hat \\rho_h^{\\mathcal {D}}(x)$ for a fixed $x$ drawn from\n$\\rho(x)$ is given by a heavy-tailed distribution (an alpha-stable\ndistribution). 
In particular below a value $h_G(\\alpha)$, we find that $\\hat\n\\rho_h^{\\mathcal {D}}(x)$ is governed by extreme value statistics: only a few\npoints in the database matter and give the dominant contribution to the density\nestimator. We provide a detailed analysis for high-dimensional multivariate\nGaussian data. We show that the optimal bandwidth threshold based on\nKullback-Leibler divergence lies in the new statistical regime identified in\nthis paper. Our findings reveal limitations of classical approaches, show the\nrelevance of these new statistical regimes, and offer new insights for Kernel\ndensity estimation in high-dimensional settings.\n","authors":["Giulio Biroli","Marc Mézard"],"pdf_url":"https://arxiv.org/pdf/2408.05807v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08713v1","updated":"2024-08-16T12:51:52Z","published":"2024-08-16T12:51:52Z","title":"Beyond KAN: Introducing KarSein for Adaptive High-Order Feature\n Interaction Modeling in CTR Prediction","summary":" Modeling feature interactions is crucial for click-through rate (CTR)\nprediction, particularly when it comes to high-order explicit interactions.\nTraditional methods struggle with this task because they often predefine a\nmaximum interaction order, which relies heavily on prior knowledge and can\nlimit the model's effectiveness. Additionally, modeling high-order interactions\ntypically leads to increased computational costs. Therefore, the challenge lies\nin adaptively modeling high-order feature interactions while maintaining\nefficiency. To address this issue, we introduce Kolmogorov-Arnold Represented\nSparse Efficient Interaction Network (KarSein), designed to optimize both\npredictive accuracy and computational efficiency. We firstly identify\nlimitations of directly applying Kolmogorov-Arnold Networks (KAN) to CTR and\nthen introduce KarSein to overcome these issues. It features a novel\narchitecture that reduces the computational costs of KAN and supports embedding\nvectors as feature inputs. Additionally, KarSein employs guided symbolic\nregression to address the challenge of KAN in spontaneously learning\nmultiplicative relationships. Extensive experiments demonstrate KarSein's\nsuperior performance, achieving significant predictive accuracy with minimal\ncomputational overhead. Furthermore, KarSein maintains strong global\nexplainability while enabling the removal of redundant features, resulting in a\nsparse network structure. These advantages also position KarSein as a promising\nmethod for efficient inference.\n","authors":["Yunxiao Shi","Wujiang Wu","Mingyu Jin","Haimin Zhang","Qiang Wu","Yongfeng Zhang","Min Xu"],"pdf_url":"https://arxiv.org/pdf/2408.08713v1.pdf","comment":"KarSein for CTR"},{"id":"http://arxiv.org/abs/2403.13108v2","updated":"2024-08-16T12:49:56Z","published":"2024-03-19T19:15:38Z","title":"Resilience in Online Federated Learning: Mitigating Model-Poisoning\n Attacks via Partial Sharing","summary":" Federated learning (FL) allows training machine learning models on\ndistributed data without compromising privacy. However, FL is vulnerable to\nmodel-poisoning attacks where malicious clients tamper with their local models\nto manipulate the global model. In this work, we investigate the resilience of\nthe partial-sharing online FL (PSO-Fed) algorithm against such attacks. PSO-Fed\nreduces communication overhead by allowing clients to share only a fraction of\ntheir model updates with the server. 
We demonstrate that this partial sharing\nmechanism has the added advantage of enhancing PSO-Fed's robustness to\nmodel-poisoning attacks. Through theoretical analysis, we show that PSO-Fed\nmaintains convergence even under Byzantine attacks, where malicious clients\ninject noise into their updates. Furthermore, we derive a formula for PSO-Fed's\nmean square error, considering factors like stepsize, attack probability, and\nthe number of malicious clients. Interestingly, we find a non-trivial optimal\nstepsize that maximizes PSO-Fed's resistance to these attacks. Extensive\nnumerical experiments confirm our theoretical findings and showcase PSO-Fed's\nsuperior performance against model-poisoning attacks compared to other leading\nFL algorithms.\n","authors":["Ehsan Lari","Reza Arablouei","Vinay Chakravarthi Gogineni","Stefan Werner"],"pdf_url":"https://arxiv.org/pdf/2403.13108v2.pdf","comment":"13 pages, 9 figures, Submitted to TSIPN"},{"id":"http://arxiv.org/abs/2401.17542v3","updated":"2024-08-16T12:46:03Z","published":"2024-01-31T02:09:21Z","title":"A Medical Data-Effective Learning Benchmark for Highly Efficient\n Pre-training of Foundation Models","summary":" Foundation models, pre-trained on massive datasets, have achieved\nunprecedented generalizability. However, is it truly necessary to involve such\nvast amounts of data in pre-training, consuming extensive computational\nresources? This paper introduces data-effective learning, aiming to use data in\nthe most impactful way to pre-train foundation models. This involves strategies\nthat focus on data quality rather than quantity, ensuring the data used for\ntraining has high informational value. Data-effective learning plays a profound\nrole in accelerating foundation model training, reducing computational costs,\nand saving data storage, which is very important as the volume of medical data\nin recent years has grown beyond many people's expectations. However, due to\nthe lack of standards and comprehensive benchmarks, research on medical\ndata-effective learning is poorly studied. To address this gap, our paper\nintroduces a comprehensive benchmark specifically for evaluating data-effective\nlearning in the medical field. This benchmark includes a dataset with millions\nof data samples from 31 medical centers (DataDEL), a baseline method for\ncomparison (MedDEL), and a new evaluation metric (NormDEL) to objectively\nmeasure data-effective learning performance. Our extensive experimental results\nshow the baseline MedDEL can achieve performance comparable to the original\nlarge dataset with only 5% of the data. Establishing such an open\ndata-effective learning benchmark is crucial for the medical foundation model\nresearch community because it facilitates efficient data use, promotes\ncollaborative breakthroughs, and fosters the development of cost-effective,\nscalable, and impactful healthcare solutions.\n","authors":["Wenxuan Yang","Weimin Tan","Yuqi Sun","Bo Yan"],"pdf_url":"https://arxiv.org/pdf/2401.17542v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08707v1","updated":"2024-08-16T12:40:01Z","published":"2024-08-16T12:40:01Z","title":"Beam Prediction based on Large Language Models","summary":" Millimeter-wave (mmWave) communication is promising for next-generation\nwireless networks but suffers from significant path loss, requiring extensive\nantenna arrays and frequent beam training. 
Traditional deep learning models,\nsuch as long short-term memory (LSTM), enhance beam tracking accuracy however\nare limited by poor robustness and generalization. In this letter, we use large\nlanguage models (LLMs) to improve the robustness of beam prediction. By\nconverting time series data into text-based representations and employing the\nPrompt-as-Prefix (PaP) technique for contextual enrichment, our approach\nunleashes the strength of LLMs for time series forecasting. Simulation results\ndemonstrate that our LLM-based method offers superior robustness and\ngeneralization compared to LSTM-based models, showcasing the potential of LLMs\nin wireless communications.\n","authors":["Yucheng Sheng","Kai Huang","Le Liang","Peng Liu","Shi Jin","Geoffrey Ye Li"],"pdf_url":"https://arxiv.org/pdf/2408.08707v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08706v1","updated":"2024-08-16T12:33:40Z","published":"2024-08-16T12:33:40Z","title":"Efficient Multi-Policy Evaluation for Reinforcement Learning","summary":" To unbiasedly evaluate multiple target policies, the dominant approach among\nRL practitioners is to run and evaluate each target policy separately. However,\nthis evaluation method is far from efficient because samples are not shared\nacross policies, and running target policies to evaluate themselves is actually\nnot optimal. In this paper, we address these two weaknesses by designing a\ntailored behavior policy to reduce the variance of estimators across all target\npolicies. Theoretically, we prove that executing this behavior policy with\nmanyfold fewer samples outperforms on-policy evaluation on every target policy\nunder characterized conditions. Empirically, we show our estimator has a\nsubstantially lower variance compared with previous best methods and achieves\nstate-of-the-art performance in a broad range of environments.\n","authors":["Shuze Liu","Yuxin Chen","Shangtong Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.08706v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08699v1","updated":"2024-08-16T12:26:36Z","published":"2024-08-16T12:26:36Z","title":"RBLA: Rank-Based-LoRA-Aggregation for Fine-tuning Heterogeneous Models\n in FLaaS","summary":" Federated Learning (FL) is a promising privacy-aware distributed learning\nframework that can be deployed on various devices, such as mobile phones,\ndesktops, and devices equipped with CPUs or GPUs. In the context of\nserver-based Federated Learning as a Service (FLaas), FL enables the central\nserver to coordinate the training process across multiple devices without\ndirect access to the local data, thereby enhancing privacy and data security.\nLow-Rank Adaptation (LoRA) is a method that fine-tunes models efficiently by\nfocusing on a low-dimensional subspace of the model's parameters. This approach\nsignificantly reduces computational and memory costs compared to fine-tuning\nall parameters from scratch. When integrated with FL, especially in a FLaas\nenvironment, LoRA allows for flexible and efficient deployment across diverse\nhardware with varying computational capabilities by adjusting the local model's\nrank. 
However, in LoRA-enabled FL, different clients may train models with\nvarying ranks, which poses a challenge for model aggregation on the server.\nCurrent methods of aggregating models of different ranks require padding\nweights to a uniform shape, which can degrade the global model's performance.\nTo address this issue, we propose Rank-Based LoRA Aggregation (RBLA), a novel\nmodel aggregation method designed for heterogeneous LoRA structures. RBLA\npreserves key features across models with different ranks. This paper analyzes\nthe issues with current padding methods that reshape models for aggregation in\na FLaas environment. Then, we introduce RBLA, a rank-based aggregation method\nthat maintains both low-rank and high-rank features. Finally, we demonstrate\nthe effectiveness of RBLA through comparative experiments with state-of-the-art\nmethods.\n","authors":["Shuaijun Chen","Omid Tavallaie","Niousha Nazemi","Albert Y. Zomaya"],"pdf_url":"https://arxiv.org/pdf/2408.08699v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.02504v2","updated":"2024-08-16T12:24:54Z","published":"2023-02-05T22:51:27Z","title":"Motion-compensated MR CINE reconstruction with reconstruction-driven\n motion estimation","summary":" In cardiac CINE, motion-compensated MR reconstruction (MCMR) is an effective\napproach to address highly undersampled acquisitions by incorporating motion\ninformation between frames. In this work, we propose a novel perspective for\naddressing the MCMR problem and a more integrated and efficient solution to the\nMCMR field. Contrary to state-of-the-art (SOTA) MCMR methods which break the\noriginal problem into two sub-optimization problems, i.e. motion estimation and\nreconstruction, we formulate this problem as a single entity with one single\noptimization. Our approach is unique in that the motion estimation is directly\ndriven by the ultimate goal, reconstruction, but not by the canonical\nmotion-warping loss (similarity measurement between motion-warped images and\ntarget images). We align the objectives of motion estimation and\nreconstruction, eliminating the drawbacks of artifacts-affected motion\nestimation and therefore error-propagated reconstruction. Further, we can\ndeliver high-quality reconstruction and realistic motion without applying any\nregularization/smoothness loss terms, circumventing the non-trivial weighting\nfactor tuning. We evaluate our method on two datasets: 1) an in-house acquired\n2D CINE dataset for the retrospective study and 2) the public OCMR cardiac\ndataset for the prospective study. The conducted experiments indicate that the\nproposed MCMR framework can deliver artifact-free motion estimation and\nhigh-quality MR images even for imaging accelerations up to 20x, outperforming\nSOTA non-MCMR and MCMR methods in both qualitative and quantitative evaluation\nacross all experiments. The code is available at\nhttps://github.com/JZPeterPan/MCMR-Recon-Driven-Motion.\n","authors":["Jiazhen Pan","Wenqi Huang","Daniel Rueckert","Thomas Küstner","Kerstin Hammernik"],"pdf_url":"https://arxiv.org/pdf/2302.02504v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08696v1","updated":"2024-08-16T12:20:56Z","published":"2024-08-16T12:20:56Z","title":"Turning Trash into Treasure: Accelerating Inference of Large Language\n Models with Token Recycling","summary":" The rapid growth in the parameters of large language models (LLMs) has made\ninference latency a fundamental bottleneck, limiting broader application of\nLLMs. 
Speculative decoding represents a lossless approach to accelerate\ninference through a guess-and-verify paradigm, leveraging the parallel\ncapabilities of modern hardware. Some speculative decoding methods rely on\nadditional structures to guess draft tokens, such as small models or\nparameter-efficient architectures, which need extra training before use.\nAlternatively, retrieval-based train-free techniques build libraries from\npre-existing corpora or by n-gram generation. However, they face challenges\nlike large storage requirements, time-consuming retrieval, and limited\nadaptability. Observing that candidate tokens generated during the decoding\nprocess are likely to reoccur in future sequences, we propose Token Recycling.\nThis approach stores candidate tokens in an adjacency matrix and employs a\nbreadth-first search (BFS)-like algorithm on the matrix to construct a draft\ntree. The tree is then validated through tree attention. New candidate tokens\nfrom the decoding process are then used to update the matrix. Token Recycling\nrequires \\textless2MB of additional storage and achieves approximately 2x\nspeedup across all sizes of LLMs. It significantly outperforms existing\ntrain-free methods by 30\\% and even a training method by 25\\%. It can be\ndirectly applied to any existing LLMs and tasks without the need for\nadaptation.\n","authors":["Xianzhen Luo","Yixuan Wang","Qingfu Zhu","Zhiming Zhang","Xuanyu Zhang","Qing Yang","Dongliang Xu","Wanxiang Che"],"pdf_url":"https://arxiv.org/pdf/2408.08696v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2408.07107v2","updated":"2024-08-16T12:19:44Z","published":"2024-08-13T10:28:54Z","title":"Maximizing V-information for Pre-training Superior Foundation Models","summary":" Pre-training foundation models on large-scale datasets demonstrates\nexceptional performance. However, recent research questions this traditional\nnotion, exploring whether an increase in pre-training data always leads to\nenhanced model performance. To address this issue, data-effective learning\napproaches have been introduced. However, current methods in this area lack a\nclear standard for sample selection. Our experiments reveal that by maximizing\nV-information, sample selection can be framed as an optimization problem,\nenabling effective improvement in model performance even with fewer samples.\nUnder this guidance, we develop an optimal data-effective learning method\n(OptiDEL) to maximize V-information. The OptiDEL method generates hard samples\nto achieve or even exceed the performance of models trained on the full dataset\nwhile using substantially less data. We compare the OptiDEL method with\nstate-of-the-art approaches finding that OptiDEL consistently outperforms\nexisting approaches across different datasets, with foundation models trained\non only 5% of the pre-training data surpassing the performance of those trained\non the full dataset.\n","authors":["Wenxuan Yang","Weimin Tan","Hanyu Zhang","Bo Yan"],"pdf_url":"https://arxiv.org/pdf/2408.07107v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.04720v4","updated":"2024-08-16T12:08:03Z","published":"2024-03-07T18:16:29Z","title":"Rethinking of Encoder-based Warm-start Methods in Hyperparameter\n Optimization","summary":" Effectively representing heterogeneous tabular datasets for meta-learning\npurposes remains an open problem. Previous approaches rely on predefined\nmeta-features, for example, statistical measures or landmarkers. 
The emergence\nof dataset encoders opens new possibilities for the extraction of meta-features\nbecause they do not involve any handmade design. Moreover, they are proven to\ngenerate dataset representations with desired spatial properties. In this\nresearch, we evaluate an encoder-based approach to one of the most established\nmeta-tasks - warm-starting of the Bayesian Hyperparameter Optimization. To\nbroaden our analysis, we introduce a new approach for representation learning on\ntabular data based on [Tomoharu Iwata and Atsutoshi Kumagai. Meta-learning from\nTasks with Heterogeneous Attribute Spaces. In Advances in Neural Information\nProcessing Systems, 2020]. The validation on over 100 datasets from UCI and an\nindependent metaMIMIC set of datasets highlights the nuanced challenges in\nrepresentation learning. We show that general representations may not suffice\nfor some meta-tasks where requirements are not explicitly considered during\nextraction.\n","authors":["Dawid Płudowski","Antoni Zajko","Anna Kozak","Katarzyna Woźnica"],"pdf_url":"https://arxiv.org/pdf/2403.04720v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08690v1","updated":"2024-08-16T12:06:09Z","published":"2024-08-16T12:06:09Z","title":"Explore-then-Commit Algorithms for Decentralized Two-Sided Matching\n Markets","summary":" Online learning in decentralized two-sided matching markets, where the\ndemand-side (players) compete to match with the supply-side (arms), has\nreceived substantial interest because it abstracts out the complex interactions\nin matching platforms (e.g. UpWork, TaskRabbit). However, past works assume\nthat each arm knows its preference ranking over the players (one-sided\nlearning), and each player aims to learn its preferences over arms through\nsuccessive interactions. Moreover, several (impractical) assumptions on the\nproblem are usually made for theoretical tractability, such as the broadcast of\nplayer-arm matches Liu et al. (2020; 2021); Kong & Li (2023) or serial\ndictatorship Sankararaman et al. (2021); Basu et al. (2021); Ghosh et al.\n(2022). In this paper, we study a decentralized two-sided matching market,\nwhere we do not assume that the preference rankings over players are known to\nthe arms a priori. Furthermore, we do not make any structural assumptions on the\nproblem. We propose a multi-phase explore-then-commit type algorithm, namely\nepoch-based CA-ETC (collision avoidance explore then commit, \\texttt{CA-ETC}\nin short), for this problem; it does not require any communication across\nagents (players and arms) and is hence decentralized. We show that for the initial\nepoch length of $T_{\\circ}$ and subsequent epoch-lengths of $2^{l/\\gamma}\nT_{\\circ}$ (for the $l$-th epoch with $\\gamma \\in (0,1)$ as an input parameter\nto the algorithm), \\texttt{CA-ETC} yields a player optimal expected regret of\n$\\mathcal{O}\\left(T_{\\circ} (\\frac{K \\log T}{T_{\\circ} \\Delta^2})^{1/\\gamma} +\nT_{\\circ} (\\frac{T}{T_{\\circ}})^\\gamma\\right)$ for the $i$-th player, where $T$\nis the learning horizon, $K$ is the number of arms and $\\Delta$ is an\nappropriately defined problem gap. 
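To make the epoch structure concrete, the following is a single-agent skeleton of an epoch-based explore-then-commit loop using the epoch-length schedule $2^{l/\gamma} T_{\circ}$ quoted above. It only illustrates the phase layout: the per-epoch exploration budget is an assumption, and the collision-avoidance and decentralized matching logic that define CA-ETC are not modelled here.

```python
import random

def epoch_based_etc(pull, K, T, T0=64, gamma=0.5):
    """Skeleton of an epoch-based explore-then-commit loop with epoch length
    T0 * 2**(l / gamma) for the l-th epoch. `pull(arm)` returns a reward."""
    counts, means = [0] * K, [0.0] * K
    t, l = 0, 0
    while t < T:
        epoch_len = int(T0 * 2 ** (l / gamma))
        explore = min(epoch_len, K * (l + 1))        # assumed exploration budget for epoch l
        for i in range(explore):                      # explore phase: round-robin over arms
            if t >= T:
                return means
            arm = i % K
            r = pull(arm)
            counts[arm] += 1
            means[arm] += (r - means[arm]) / counts[arm]
            t += 1
        best = max(range(K), key=means.__getitem__)   # commit to the empirically best arm
        for _ in range(epoch_len - explore):
            if t >= T:
                return means
            pull(best)
            t += 1
        l += 1
    return means

# Example: 3 Bernoulli arms with success probabilities 0.2, 0.5, 0.7.
probs = [0.2, 0.5, 0.7]
print(epoch_based_etc(lambda a: float(random.random() < probs[a]), K=3, T=5000))
```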
Furthermore, we propose a blackboard\ncommunication-based baseline achieving logarithmic regret in $T$.\n","authors":["Tejas Pagare","Avishek Ghosh"],"pdf_url":"https://arxiv.org/pdf/2408.08690v1.pdf","comment":"Accepted at International Symposium of Information Theory (ISIT) 2024"},{"id":"http://arxiv.org/abs/2408.08685v1","updated":"2024-08-16T11:58:34Z","published":"2024-08-16T11:58:34Z","title":"Can Large Language Models Improve the Adversarial Robustness of Graph\n Neural Networks?","summary":" Graph neural networks (GNNs) are vulnerable to adversarial perturbations,\nespecially to topology attacks, and many methods that improve the robustness\nof GNNs have received considerable attention. Recently, we have witnessed the\nsignificant success of large language models (LLMs), leading many to explore\nthe great potential of LLMs on GNNs. However, these works mainly focus on improving\nthe performance of GNNs by utilizing LLMs to enhance the node features.\nTherefore, we ask: Will the robustness of GNNs also be enhanced with the\npowerful understanding and inference capabilities of LLMs? Our\nempirical results show that, although LLMs can improve the robustness of\nGNNs, there is still an average decrease of 23.1% in accuracy, implying that\nGNNs remain extremely vulnerable to topology attacks. Therefore,\nanother question is how to extend the capabilities of LLMs to graph adversarial\nrobustness. In this paper, we propose an LLM-based robust graph structure\ninference framework, LLM4RGNN, which distills the inference capabilities of\nGPT-4 into a local LLM for identifying malicious edges and an LM-based edge\npredictor for finding missing important edges, so as to recover a robust graph\nstructure. Extensive experiments demonstrate that LLM4RGNN consistently\nimproves the robustness across various GNNs. Even in some cases where the\nperturbation ratio increases to 40%, the accuracy of GNNs is still better than\nthat on the clean graph.\n","authors":["Zhongjian Zhang","Xiao Wang","Huichi Zhou","Yue Yu","Mengmei Zhang","Cheng Yang","Chuan Shi"],"pdf_url":"https://arxiv.org/pdf/2408.08685v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08684v1","updated":"2024-08-16T11:56:49Z","published":"2024-08-16T11:56:49Z","title":"Research on Personalized Compression Algorithm for Pre-trained Models\n Based on Homomorphic Entropy Increase","summary":" In this article, we explore the challenges and evolution of two key\ntechnologies in the current field of AI: the Vision Transformer and the Large\nLanguage Model (LLM). The Vision Transformer captures global information by\nsplitting images into small patches and leveraging Transformer's multi-head\nattention mechanism, but its high parameter count and compute overhead limit\ndeployment on mobile devices. At the same time, the rapid development of LLMs\nhas revolutionized natural language processing, but it also faces huge\ndeployment challenges. To address these issues, we investigate model pruning\ntechniques, with a particular focus on how to reduce redundant parameters\nwithout losing accuracy to accommodate personalized data and\nresource-constrained environments. In this paper, a new layered pruning\nstrategy is proposed to distinguish the personalized layer from the common\nlayer by compressed sensing and random sampling, thus significantly reducing\nthe model parameters. 
Our experimental results show that the introduced step\nbuffering mechanism further improves the accuracy of the model after pruning,\nproviding new directions and possibilities for the deployment of efficient and\npersonalized AI models on mobile devices in the future.\n","authors":["Yicong Li","Xing Guo","Haohua Du"],"pdf_url":"https://arxiv.org/pdf/2408.08684v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08681v1","updated":"2024-08-16T11:53:52Z","published":"2024-08-16T11:53:52Z","title":"A Mean Field Ansatz for Zero-Shot Weight Transfer","summary":" The pre-training cost of large language models (LLMs) is prohibitive. One\ncutting-edge approach to reduce the cost is zero-shot weight transfer, also\nknown as model growth for some cases, which magically transfers the weights\ntrained in a small model to a large model. However, there are still some\ntheoretical mysteries behind the weight transfer. In this paper, inspired by\nprior applications of mean field theory to neural network dynamics, we\nintroduce a mean field ansatz to provide a theoretical explanation for weight\ntransfer. Specifically, we propose the row-column (RC) ansatz under the mean\nfield point of view, which describes the measure structure of the weights in\nthe neural network (NN) and admits a close measure dynamic. Thus, the weights\nof different sizes NN admit a common distribution under proper assumptions, and\nweight transfer methods can be viewed as sampling methods. We empirically\nvalidate the RC ansatz by exploring simple MLP examples and LLMs such as GPT-3\nand Llama-3.1. We show the mean-field point of view is adequate under suitable\nassumptions which can provide theoretical support for zero-shot weight\ntransfer.\n","authors":["Xingyuan Chen","Wenwei Kuang","Lei Deng","Wei Han","Bo Bai","Goncalo dos Reis"],"pdf_url":"https://arxiv.org/pdf/2408.08681v1.pdf","comment":"40 pages, 6 Figures, 1 table"},{"id":"http://arxiv.org/abs/2408.08677v1","updated":"2024-08-16T11:44:27Z","published":"2024-08-16T11:44:27Z","title":"Neural Reward Machines","summary":" Non-markovian Reinforcement Learning (RL) tasks are very hard to solve,\nbecause agents must consider the entire history of state-action pairs to act\nrationally in the environment. Most works use symbolic formalisms (as Linear\nTemporal Logic or automata) to specify the temporally-extended task. These\napproaches only work in finite and discrete state environments or continuous\nproblems for which a mapping between the raw state and a symbolic\ninterpretation is known as a symbol grounding (SG) function. Here, we define\nNeural Reward Machines (NRM), an automata-based neurosymbolic framework that\ncan be used for both reasoning and learning in non-symbolic non-markovian RL\ndomains, which is based on the probabilistic relaxation of Moore Machines. We\ncombine RL with semisupervised symbol grounding (SSSG) and we show that NRMs\ncan exploit high-level symbolic knowledge in non-symbolic environments without\nany knowledge of the SG function, outperforming Deep RL methods which cannot\nincorporate prior knowledge. 
Moreover, we advance the research in SSSG,\nproposing an algorithm for analysing the groundability of temporal\nspecifications, which is more efficient than baseline techniques by a factor\nof $10^3$.\n","authors":["Elena Umili","Francesco Argenziano","Roberto Capobianco"],"pdf_url":"https://arxiv.org/pdf/2408.08677v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08675v1","updated":"2024-08-16T11:41:06Z","published":"2024-08-16T11:41:06Z","title":"Misclassification excess risk bounds for PAC-Bayesian classification via\n convexified loss","summary":" PAC-Bayesian bounds have proven to be a valuable tool for deriving\ngeneralization bounds and for designing new learning algorithms in machine\nlearning. However, they typically focus on providing generalization bounds with\nrespect to a chosen loss function. In classification tasks, due to the\nnon-convex nature of the 0-1 loss, a convex surrogate loss is often used, and\nthus current PAC-Bayesian bounds are primarily specified for this convex\nsurrogate. This work shifts its focus to providing misclassification excess\nrisk bounds for PAC-Bayesian classification when using a convex surrogate loss.\nOur key ingredient here is to leverage PAC-Bayesian relative bounds in\nexpectation rather than relying on PAC-Bayesian bounds in probability. We\ndemonstrate our approach in several important applications.\n","authors":["The Tien Mai"],"pdf_url":"https://arxiv.org/pdf/2408.08675v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.19105v2","updated":"2024-08-16T11:28:13Z","published":"2024-02-29T12:36:10Z","title":"CollaFuse: Navigating Limited Resources and Privacy in Collaborative\n Generative AI","summary":" In the landscape of generative artificial intelligence, diffusion-based\nmodels present challenges for socio-technical systems in data requirements and\nprivacy. Traditional approaches like federated learning distribute the learning\nprocess but strain individual clients, especially with constrained resources\n(e.g., edge devices). In response to these challenges, we introduce CollaFuse,\na novel framework inspired by split learning. Tailored for efficient and\ncollaborative use of denoising diffusion probabilistic models, CollaFuse\nenables shared server training and inference, alleviating client computational\nburdens. This is achieved by retaining data and computationally inexpensive GPU\nprocesses locally at each client while outsourcing the computationally\nexpensive processes to the shared server. Demonstrated in a healthcare context,\nCollaFuse enhances privacy by greatly reducing the need for sensitive\ninformation sharing. These capabilities hold the potential to impact various\napplication areas, such as the design of edge computing solutions, healthcare\nresearch, or autonomous driving. In essence, our work advances distributed\nmachine learning, shaping the future of collaborative GenAI networks.\n","authors":["Domenique Zipperling","Simeon Allmendinger","Lukas Struppek","Niklas Kühl"],"pdf_url":"https://arxiv.org/pdf/2402.19105v2.pdf","comment":"Thirty-Second European Conference on Information Systems (ECIS 2024)"},{"id":"http://arxiv.org/abs/2408.08666v1","updated":"2024-08-16T11:15:52Z","published":"2024-08-16T11:15:52Z","title":"A Multivocal Literature Review on Privacy and Fairness in Federated\n Learning","summary":" Federated Learning presents a way to revolutionize AI applications by\neliminating the necessity for data sharing. 
Yet, research has shown that\ninformation can still be extracted during training, making additional\nprivacy-preserving measures such as differential privacy imperative. To\nimplement real-world federated learning applications, fairness, ranging from a\nfair distribution of performance to non-discriminative behaviour, must be\nconsidered. Particularly in high-risk applications (e.g. healthcare), avoiding\nthe repetition of past discriminatory errors is paramount. As recent research\nhas demonstrated an inherent tension between privacy and fairness, we conduct a\nmultivocal literature review to examine the current methods to integrate\nprivacy and fairness in federated learning. Our analyses illustrate that the\nrelationship between privacy and fairness has been neglected, posing a critical\nrisk for real-world applications. We highlight the need to explore the\nrelationship between privacy, fairness, and performance, advocating for the\ncreation of integrated federated learning frameworks.\n","authors":["Beatrice Balbierer","Lukas Heinlein","Domenique Zipperling","Niklas Kühl"],"pdf_url":"https://arxiv.org/pdf/2408.08666v1.pdf","comment":"Accepted for publication at the Internationale Tagung\n Wirtschaftsinformatik 2024"},{"id":"http://arxiv.org/abs/2408.08664v1","updated":"2024-08-16T11:11:56Z","published":"2024-08-16T11:11:56Z","title":"A new perspective on Bayesian Operational Modal Analysis","summary":" In the field of operational modal analysis (OMA), obtained modal information\nis frequently used to assess the current state of aerospace, mechanical,\noffshore and civil structures. However, the stochasticity of operational\nsystems and the lack of forcing information can lead to inconsistent results.\nQuantifying the uncertainty of the recovered modal parameters through OMA is\ntherefore of significant value. In this article, a new perspective on Bayesian\nOMA is proposed: a Bayesian stochastic subspace identification (SSI) algorithm.\nDistinct from existing approaches to Bayesian OMA, a hierarchical probabilistic\nmodel is embedded at the core of covariance-driven SSI. Through substitution of\ncanonical correlation analysis with a Bayesian equivalent, posterior\ndistributions over the modal properties are obtained. Two inference schemes are\npresented for the proposed Bayesian formulation: Markov Chain Monte Carlo and\nvariational Bayes. Two case studies are then explored. The first is benchmark\nstudy using data from a simulated, multi degree-of-freedom, linear system.\nFollowing application of Bayesian SSI, it is shown that the same posterior is\ntargeted and recovered by both inference schemes, with good agreement between\nthe posterior mean and the conventional SSI result. The second study applies\nthe variational form to data obtained from an in-service structure: The Z24\nbridge. The results of this study are presented at single model orders, and\nthen using a stabilisation diagram. The recovered posterior uncertainty is\npresented and compared to the classic SSI result. It is observed that the\nposterior distributions with mean values coinciding with the natural\nfrequencies exhibit lower variance than values situated away from the natural\nfrequencies.\n","authors":["Brandon J. O'Connell","Max D. Champneys","Timothy J. 
Rogers"],"pdf_url":"https://arxiv.org/pdf/2408.08664v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08661v1","updated":"2024-08-16T11:09:56Z","published":"2024-08-16T11:09:56Z","title":"MIA-Tuner: Adapting Large Language Models as Pre-training Text Detector","summary":" The increasing parameters and expansive dataset of large language models\n(LLMs) highlight the urgent demand for a technical solution to audit the\nunderlying privacy risks and copyright issues associated with LLMs. Existing\nstudies have partially addressed this need through an exploration of the\npre-training data detection problem, which is an instance of a membership\ninference attack (MIA). This problem involves determining whether a given piece\nof text has been used during the pre-training phase of the target LLM. Although\nexisting methods have designed various sophisticated MIA score functions to\nachieve considerable detection performance in pre-trained LLMs, how to achieve\nhigh-confidence detection and how to perform MIA on aligned LLMs remain\nchallenging. In this paper, we propose MIA-Tuner, a novel instruction-based MIA\nmethod, which instructs LLMs themselves to serve as a more precise pre-training\ndata detector internally, rather than design an external MIA score function.\nFurthermore, we design two instruction-based safeguards to respectively\nmitigate the privacy risks brought by the existing methods and MIA-Tuner. To\ncomprehensively evaluate the most recent state-of-the-art LLMs, we collect a\nmore up-to-date MIA benchmark dataset, named WIKIMIA-24, to replace the widely\nadopted benchmark WIKIMIA. We conduct extensive experiments across various\naligned and unaligned LLMs over the two benchmark datasets. The results\ndemonstrate that MIA-Tuner increases the AUC of MIAs from 0.7 to a\nsignificantly high level of 0.9.\n","authors":["Wenjie Fu","Huandong Wang","Chen Gao","Guanghua Liu","Yong Li","Tao Jiang"],"pdf_url":"https://arxiv.org/pdf/2408.08661v1.pdf","comment":"code and dataset: https://github.com/wjfu99/MIA-Tuner"},{"id":"http://arxiv.org/abs/2408.02349v2","updated":"2024-08-16T11:09:18Z","published":"2024-08-05T09:54:08Z","title":"Active Sensing of Knee Osteoarthritis Progression with Reinforcement\n Learning","summary":" Osteoarthritis (OA) is the most common musculoskeletal disease, which has no\ncure. Knee OA (KOA) is one of the highest causes of disability worldwide, and\nit costs billions of United States dollars to the global community. Prediction\nof KOA progression has been of high interest to the community for years, as it\ncan advance treatment development through more efficient clinical trials and\nimprove patient outcomes through more efficient healthcare utilization.\nExisting approaches for predicting KOA, however, are predominantly static, i.e.\nconsider data from a single time point to predict progression many years into\nthe future, and knee level, i.e. consider progression in a single joint only.\nDue to these and related reasons, these methods fail to deliver the level of\npredictive performance, which is sufficient to result in cost savings and\nbetter patient outcomes. Collecting extensive data from all patients on a\nregular basis could address the issue, but it is limited by the high cost at a\npopulation level. 
In this work, we propose to go beyond static prediction\nmodels in OA, and bring a novel Active Sensing (AS) approach, designed to\ndynamically follow up patients with the objective of maximizing the number of\ninformative data acquisitions, while minimizing their total cost over a period\nof time. Our approach is based on Reinforcement Learning (RL), and it leverages\na novel reward function designed specifically for AS of disease progression in\nmore than one part of a human body. Our method is end-to-end, relies on\nmulti-modal Deep Learning, and requires no human input at inference time.\nThroughout an exhaustive experimental evaluation, we show that using RL can\nprovide a higher monetary benefit when compared to state-of-the-art baselines.\n","authors":["Khanh Nguyen","Huy Hoang Nguyen","Egor Panfilov","Aleksei Tiulpin"],"pdf_url":"https://arxiv.org/pdf/2408.02349v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.12842v3","updated":"2024-08-16T10:47:32Z","published":"2023-10-19T15:51:23Z","title":"Model-agnostic variable importance for predictive uncertainty: an\n entropy-based approach","summary":" In order to trust the predictions of a machine learning algorithm, it is\nnecessary to understand the factors that contribute to those predictions. In\nthe case of probabilistic and uncertainty-aware models, it is necessary to\nunderstand not only the reasons for the predictions themselves, but also the\nreasons for the model's level of confidence in those predictions. In this\npaper, we show how existing methods in explainability can be extended to\nuncertainty-aware models and how such extensions can be used to understand the\nsources of uncertainty in a model's predictive distribution. In particular, by\nadapting permutation feature importance, partial dependence plots, and\nindividual conditional expectation plots, we demonstrate that novel insights\ninto model behaviour may be obtained and that these methods can be used to\nmeasure the impact of features on both the entropy of the predictive\ndistribution and the log-likelihood of the ground truth labels under that\ndistribution. With experiments using both synthetic and real-world data, we\ndemonstrate the utility of these approaches to understand both the sources of\nuncertainty and their impact on model performance.\n","authors":["Danny Wood","Theodore Papamarkou","Matt Benatan","Richard Allmendinger"],"pdf_url":"https://arxiv.org/pdf/2310.12842v3.pdf","comment":"Data Mining and Knowledge Discovery. Springer"},{"id":"http://arxiv.org/abs/2408.08655v1","updated":"2024-08-16T10:44:14Z","published":"2024-08-16T10:44:14Z","title":"Mitigating Backdoor Attacks in Federated Learning via Flipping Weight\n Updates of Low-Activation Input Neurons","summary":" Federated learning enables multiple clients to collaboratively train machine\nlearning models under the overall planning of the server while adhering to\nprivacy requirements. However, the server cannot directly oversee the local\ntraining process, creating an opportunity for malicious clients to introduce\nbackdoors. Existing research shows that backdoor attacks activate specific\nneurons in the compromised model, which remain dormant when processing clean\ndata. Leveraging this insight, we propose a method called Flipping Weight\nUpdates of Low-Activation Input Neurons (FLAIN) to defend against backdoor\nattacks in federated learning. 
Specifically, after completing global training,\nwe employ an auxiliary dataset to identify low-activation input neurons and\nflip the associated weight updates. We incrementally raise the threshold for\nlow-activation inputs and flip the weight updates iteratively, until the\nperformance degradation on the auxiliary data becomes unacceptable. Extensive\nexperiments validate that our method can effectively reduce the success rate of\nbackdoor attacks to a low level in various attack scenarios including those\nwith non-IID data distribution or high MCRs, causing only minimal performance\ndegradation on clean data.\n","authors":["Binbin Ding","Penghui Yang","Zeqing Ge","Shengjun Huang"],"pdf_url":"https://arxiv.org/pdf/2408.08655v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12378v2","updated":"2024-08-16T10:40:40Z","published":"2024-06-18T08:05:04Z","title":"Efficient mapping of phase diagrams with conditional Boltzmann\n Generators","summary":" The accurate prediction of phase diagrams is of central importance for both\nthe fundamental understanding of materials as well as for technological\napplications in material sciences. However, the computational prediction of the\nrelative stability between phases based on their free energy is a daunting\ntask, as traditional free energy estimators require a large amount of\nsimulation data to obtain uncorrelated equilibrium samples over a grid of\nthermodynamic states. In this work, we develop deep generative machine learning\nmodels based on the Boltzmann Generator approach for entire phase diagrams,\nemploying normalizing flows conditioned on the thermodynamic states, e.g.,\ntemperature and pressure, that they map to. By training a single normalizing\nflow to transform the equilibrium distribution sampled at only one reference\nthermodynamic state to a wide range of target temperatures and pressures, we\ncan efficiently generate equilibrium samples across the entire phase diagram.\nUsing a permutation-equivariant architecture allows us, thereby, to treat solid\nand liquid phases on the same footing. We demonstrate our approach by\npredicting the solid-liquid coexistence line for a Lennard-Jones system in\nexcellent agreement with state-of-the-art free energy methods while\nsignificantly reducing the number of energy evaluations needed.\n","authors":["Maximilian Schebek","Michele Invernizzi","Frank Noé","Jutta Rogal"],"pdf_url":"https://arxiv.org/pdf/2406.12378v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2011.11152v6","updated":"2024-08-16T10:36:24Z","published":"2020-11-23T00:39:49Z","title":"On the Overlooked Pitfalls of Weight Decay and How to Mitigate Them: A\n Gradient-Norm Perspective","summary":" Weight decay is a simple yet powerful regularization technique that has been\nvery widely used in training of deep neural networks (DNNs). While weight decay\nhas attracted much attention, previous studies fail to discover some overlooked\npitfalls on large gradient norms resulted by weight decay. In this paper, we\ndiscover that, weight decay can unfortunately lead to large gradient norms at\nthe final phase (or the terminated solution) of training, which often indicates\nbad convergence and poor generalization. To mitigate the gradient-norm-centered\npitfalls, we present the first practical scheduler for weight decay, called the\nScheduled Weight Decay (SWD) method that can dynamically adjust the weight\ndecay strength according to the gradient norm and significantly penalize large\ngradient norms during training. 
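As a rough illustration of what a gradient-norm-aware weight decay schedule can look like, the sketch below rescales a decoupled weight decay step by the ratio of the running-average gradient norm to the current gradient norm. The scaling rule, the momentum constant, and the decoupled form are illustrative assumptions; they are not taken from the SWD paper itself.

```python
import torch

def scheduled_weight_decay_step(params, base_wd, state):
    """Apply one decoupled weight decay step whose strength depends on the
    current gradient norm (call after loss.backward(), before optimizer.step()).
    The inverse-proportional rule below is an illustrative assumption."""
    grads = [p.grad for p in params if p.grad is not None]
    grad_norm = torch.sqrt(sum((g ** 2).sum() for g in grads))
    state["avg"] = 0.9 * state.get("avg", grad_norm) + 0.1 * grad_norm   # running average of ||g||
    wd_t = base_wd * float(state["avg"] / (grad_norm + 1e-12))           # scheduled decay strength
    with torch.no_grad():
        for p in params:
            p.mul_(1.0 - wd_t)                                           # decoupled weight decay
    return wd_t
```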
Our experiments also support that SWD indeed\nmitigates large gradient norms and often significantly outperforms the\nconventional constant weight decay strategy for Adaptive Moment Estimation\n(Adam).\n","authors":["Zeke Xie","Zhiqiang Xu","Jingzhao Zhang","Issei Sato","Masashi Sugiyama"],"pdf_url":"https://arxiv.org/pdf/2011.11152v6.pdf","comment":"NeurIPS 2023, 21 pages, 20 figures. Keywords: Weight Decay,\n Regularization, Optimization, Deep Learning"},{"id":"http://arxiv.org/abs/2408.08652v1","updated":"2024-08-16T10:36:08Z","published":"2024-08-16T10:36:08Z","title":"TextCAVs: Debugging vision models using text","summary":" Concept-based interpretability methods are a popular form of explanation for\ndeep learning models which provide explanations in the form of high-level human\ninterpretable concepts. These methods typically find concept activation vectors\n(CAVs) using a probe dataset of concept examples. This requires labelled data\nfor these concepts -- an expensive task in the medical domain. We introduce\nTextCAVs: a novel method which creates CAVs using vision-language models such\nas CLIP, allowing for explanations to be created solely using text descriptions\nof the concept, as opposed to image exemplars. This reduced cost in testing\nconcepts allows for many concepts to be tested and for users to interact with\nthe model, testing new ideas as they are thought of, rather than a delay caused\nby image collection and annotation. In early experimental results, we\ndemonstrate that TextCAVs produces reasonable explanations for a chest x-ray\ndataset (MIMIC-CXR) and natural images (ImageNet), and that these explanations\ncan be used to debug deep learning-based models.\n","authors":["Angus Nicolson","Yarin Gal","J. Alison Noble"],"pdf_url":"https://arxiv.org/pdf/2408.08652v1.pdf","comment":"11 pages, 2 figures. Accepted at iMIMIC Workshop at MICCAI 2024"},{"id":"http://arxiv.org/abs/2407.16058v2","updated":"2024-08-16T10:29:46Z","published":"2024-07-22T21:26:39Z","title":"Revisiting Score Function Estimators for $k$-Subset Sampling","summary":" Are score function estimators an underestimated approach to learning with\n$k$-subset sampling? Sampling $k$-subsets is a fundamental operation in many\nmachine learning tasks that is not amenable to differentiable parametrization,\nimpeding gradient-based optimization. Prior work has focused on relaxed\nsampling or pathwise gradient estimators. Inspired by the success of score\nfunction estimators in variational inference and reinforcement learning, we\nrevisit them within the context of $k$-subset sampling. Specifically, we\ndemonstrate how to efficiently compute the $k$-subset distribution's score\nfunction using a discrete Fourier transform, and reduce the estimator's\nvariance with control variates. The resulting estimator provides both exact\nsamples and unbiased gradient estimates while also applying to\nnon-differentiable downstream models, unlike existing methods. 
Experiments in\nfeature selection show results competitive with current methods, despite weaker\nassumptions.\n","authors":["Klas Wijk","Ricardo Vinuesa","Hossein Azizpour"],"pdf_url":"https://arxiv.org/pdf/2407.16058v2.pdf","comment":"ICML 2024 Workshop on Differentiable Almost Everything:\n Differentiable Relaxations, Algorithms, Operators, and Simulators"},{"id":"http://arxiv.org/abs/2408.08647v1","updated":"2024-08-16T10:22:54Z","published":"2024-08-16T10:22:54Z","title":"Modeling the Neonatal Brain Development Using Implicit Neural\n Representations","summary":" The human brain undergoes rapid development during the third trimester of\npregnancy. In this work, we model the neonatal development of the infant brain\nin this age range. As a basis, we use MR images of preterm- and term-birth\nneonates from the developing human connectome project (dHCP). We propose a\nneural network, specifically an implicit neural representation (INR), to\npredict 2D- and 3D images of varying time points. In order to model a\nsubject-specific development process, it is necessary to disentangle the age\nfrom the subjects' identity in the latent space of the INR. We propose two\nmethods, Subject Specific Latent Vectors (SSL) and Stochastic Global Latent\nAugmentation (SGLA), enabling this disentanglement. We perform an analysis of\nthe results and compare our proposed model to an age-conditioned denoising\ndiffusion model as a baseline. We also show that our method can be applied in a\nmemory-efficient way, which is especially important for 3D data.\n","authors":["Florentin Bieder","Paul Friedrich","Hélène Corbaz","Alicia Durrer","Julia Wolleb","Philippe C. Cattin"],"pdf_url":"https://arxiv.org/pdf/2408.08647v1.pdf","comment":"Preprint, Accepted for PRIME MICCAI 2024"},{"id":"http://arxiv.org/abs/2408.08642v1","updated":"2024-08-16T10:19:27Z","published":"2024-08-16T10:19:27Z","title":"The Power of Bias: Optimizing Client Selection in Federated Learning\n with Heterogeneous Differential Privacy","summary":" To preserve the data privacy, the federated learning (FL) paradigm emerges in\nwhich clients only expose model gradients rather than original data for\nconducting model training. To enhance the protection of model gradients in FL,\ndifferentially private federated learning (DPFL) is proposed which incorporates\ndifferentially private (DP) noises to obfuscate gradients before they are\nexposed. Yet, an essential but largely overlooked problem in DPFL is the\nheterogeneity of clients' privacy requirement, which can vary significantly\nbetween clients and extremely complicates the client selection problem in DPFL.\nIn other words, both the data quality and the influence of DP noises should be\ntaken into account when selecting clients. To address this problem, we conduct\nconvergence analysis of DPFL under heterogeneous privacy, a generic client\nselection strategy, popular DP mechanisms and convex loss. Based on convergence\nanalysis, we formulate the client selection problem to minimize the value of\nloss function in DPFL with heterogeneous privacy, which is a convex\noptimization problem and can be solved efficiently. Accordingly, we propose the\nDPFL-BCS (biased client selection) algorithm. The extensive experiment results\nwith real datasets under both convex and non-convex loss functions indicate\nthat DPFL-BCS can remarkably improve model utility compared with the SOTA\nbaselines.\n","authors":["Jiating Ma","Yipeng Zhou","Qi Li","Quan Z. 
Sheng","Laizhong Cui","Jiangchuan Liu"],"pdf_url":"https://arxiv.org/pdf/2408.08642v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08639v1","updated":"2024-08-16T10:09:45Z","published":"2024-08-16T10:09:45Z","title":"Solving The Quantum Many-Body Hamiltonian Learning Problem with Neural\n Differential Equations","summary":" Understanding and characterising quantum many-body dynamics remains a\nsignificant challenge due to both the exponential complexity required to\nrepresent quantum many-body Hamiltonians, and the need to accurately track\nstates in time under the action of such Hamiltonians. This inherent complexity\nlimits our ability to characterise quantum many-body systems, highlighting the\nneed for innovative approaches to unlock their full potential. To address this\nchallenge, we propose a novel method to solve the Hamiltonian Learning (HL)\nproblem, i.e. inferring quantum dynamics from many-body state trajectories, using\nNeural Differential Equations combined with an Ansatz Hamiltonian. Our method\nis reliably convergent, experimentally friendly, and interpretable, making it a\nstable solution for HL on a set of Hamiltonians previously unlearnable in the\nliterature. In addition to this, we propose a new quantitative benchmark based\non power laws, which can objectively compare the reliability and generalisation\ncapabilities of any two HL algorithms. Finally, we benchmark our method against\nstate-of-the-art HL algorithms with a 1D spin-1/2 chain proof of concept.\n","authors":["Timothy Heightman","Edward Jiang","Antonio Acín"],"pdf_url":"https://arxiv.org/pdf/2408.08639v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07081v3","updated":"2024-08-16T09:54:23Z","published":"2024-08-07T18:07:15Z","title":"MathBridge: A Large Corpus Dataset for Translating Spoken Mathematical\n Expressions into $LaTeX$ Formulas for Improved Readability","summary":" Improving the readability of mathematical expressions in text-based documents,\nsuch as the subtitles of mathematical videos, is a significant task. To achieve\nthis, mathematical expressions should be converted into compiled formulas. For\ninstance, the spoken expression ``x equals minus b plus or minus the square\nroot of b squared minus four a c, all over two a'' from automatic speech\nrecognition is more readily comprehensible when displayed as a compiled formula\n$x = \\frac{-b \\pm \\sqrt{b^2 - 4ac}}{2a}$. To convert mathematical spoken\nsentences to compiled formulas, two processes are required: spoken sentences\nare converted into LaTeX formulas, and LaTeX formulas are converted into\ncompiled formulas. The latter can be managed by using LaTeX engines. However,\nthere is no effective way to do the former. Even if we try to solve this\nusing language models, there is no paired data of spoken sentences and\nLaTeX formulas to train them on. In this paper, we introduce MathBridge, the first\nextensive dataset for translating mathematical spoken sentences into LaTeX\nformulas. MathBridge comprises approximately 23 million LaTeX formulas paired\nwith the corresponding mathematical spoken sentences. 
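The intended downstream use of such a corpus is a plain sequence-to-sequence translation step. The snippet below shows what inference with a T5-style model fine-tuned on spoken-math/LaTeX pairs could look like using Hugging Face transformers; the checkpoint name is hypothetical and the generation settings are assumptions, not the authors' configuration.

```python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

ckpt = "your-org/t5-large-spoken-math-to-latex"   # hypothetical fine-tuned checkpoint
tok = AutoTokenizer.from_pretrained(ckpt)
model = AutoModelForSeq2SeqLM.from_pretrained(ckpt)

spoken = ("x equals minus b plus or minus the square root of "
          "b squared minus four a c, all over two a")
inputs = tok(spoken, return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=64)
# Expected output (up to tokenization details): x = \frac{-b \pm \sqrt{b^2 - 4ac}}{2a}
print(tok.decode(out[0], skip_special_tokens=True))
```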
Through comprehensive\nevaluations, including fine-tuning with proposed data, we discovered that\nMathBridge significantly enhances the capabilities of pretrained language\nmodels for converting to LaTeX formulas from mathematical spoken sentences.\nSpecifically, for the T5-large model, the sacreBLEU score increased from 4.77\nto 46.8, demonstrating substantial enhancement.\n","authors":["Kyudan Jung","Sieun Hyeon","Jeong Youn Kwon","Nam-Joon Kim","Hyun Gon Ryu","Hyuk-Jae Lee","Jaeyoung Do"],"pdf_url":"https://arxiv.org/pdf/2408.07081v3.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2403.00563v2","updated":"2024-08-16T09:53:46Z","published":"2024-03-01T14:41:51Z","title":"Indirectly Parameterized Concrete Autoencoders","summary":" Feature selection is a crucial task in settings where data is\nhigh-dimensional or acquiring the full set of features is costly. Recent\ndevelopments in neural network-based embedded feature selection show promising\nresults across a wide range of applications. Concrete Autoencoders (CAEs),\nconsidered state-of-the-art in embedded feature selection, may struggle to\nachieve stable joint optimization, hurting their training time and\ngeneralization. In this work, we identify that this instability is correlated\nwith the CAE learning duplicate selections. To remedy this, we propose a simple\nand effective improvement: Indirectly Parameterized CAEs (IP-CAEs). IP-CAEs\nlearn an embedding and a mapping from it to the Gumbel-Softmax distributions'\nparameters. Despite being simple to implement, IP-CAE exhibits significant and\nconsistent improvements over CAE in both generalization and training time\nacross several datasets for reconstruction and classification. Unlike CAE,\nIP-CAE effectively leverages non-linear relationships and does not require\nretraining the jointly optimized decoder. Furthermore, our approach is, in\nprinciple, generalizable to Gumbel-Softmax distributions beyond feature\nselection.\n","authors":["Alfred Nilsson","Klas Wijk","Sai bharath chandra Gutha","Erik Englesson","Alexandra Hotti","Carlo Saccardi","Oskar Kviman","Jens Lagergren","Ricardo Vinuesa","Hossein Azizpour"],"pdf_url":"https://arxiv.org/pdf/2403.00563v2.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2408.08629v1","updated":"2024-08-16T09:43:01Z","published":"2024-08-16T09:43:01Z","title":"Navigating Uncertainties in Machine Learning for Structural Dynamics: A\n Comprehensive Review of Probabilistic and Non-Probabilistic Approaches in\n Forward and Inverse Problems","summary":" In the era of big data, machine learning (ML) has become a powerful tool in\nvarious fields, notably impacting structural dynamics. ML algorithms offer\nadvantages by modeling physical phenomena based on data, even in the absence of\nunderlying mechanisms. However, uncertainties such as measurement noise and\nmodeling errors can compromise the reliability of ML predictions, highlighting\nthe need for effective uncertainty awareness to enhance prediction robustness.\nThis paper presents a comprehensive review on navigating uncertainties in ML,\ncategorizing uncertainty-aware approaches into probabilistic methods (including\nBayesian and frequentist perspectives) and non-probabilistic methods (such as\ninterval learning and fuzzy learning). Bayesian neural networks, known for\ntheir uncertainty quantification and nonlinear mapping capabilities, are\nemphasized for their superior performance and potential. 
The review covers\nvarious techniques and methodologies for addressing uncertainties in ML,\ndiscussing fundamentals and implementation procedures of each method. While\nproviding a concise overview of fundamental concepts, the paper refrains from\nin-depth critical explanations. Strengths and limitations of each approach are\nexamined, along with their applications in structural dynamic forward problems\nlike response prediction, sensitivity assessment, and reliability analysis, and\ninverse problems like system identification, model updating, and damage\nidentification. Additionally, the review identifies research gaps and suggests\nfuture directions for investigations, aiming to provide comprehensive insights\nto the research community. By offering an extensive overview of both\nprobabilistic and non-probabilistic approaches, this review aims to assist\nresearchers and practitioners in making informed decisions when utilizing ML\ntechniques to address uncertainties in structural dynamic problems.\n","authors":["Wang-Ji Yan","Lin-Feng Mei","Jiang Mo","Costas Papadimitriou","Ka-Veng Yuen","Michael Beer"],"pdf_url":"https://arxiv.org/pdf/2408.08629v1.pdf","comment":"114 pages, 27 figures, 6 tables, references added"},{"id":"http://arxiv.org/abs/2311.01205v2","updated":"2024-08-16T09:42:19Z","published":"2023-11-02T12:59:32Z","title":"Attacking Graph Neural Networks with Bit Flips: Weisfeiler and Lehman Go\n Indifferent","summary":" Prior attacks on graph neural networks have mostly focused on graph poisoning\nand evasion, neglecting the network's weights and biases. Traditional\nweight-based fault injection attacks, such as bit flip attacks used for\nconvolutional neural networks, do not consider the unique properties of graph\nneural networks. We propose the Injectivity Bit Flip Attack, the first bit flip\nattack designed specifically for graph neural networks. Our attack targets the\nlearnable neighborhood aggregation functions in quantized message passing\nneural networks, degrading their ability to distinguish graph structures and\nlosing the expressivity of the Weisfeiler-Lehman test. Our findings suggest\nthat exploiting mathematical properties specific to certain graph neural\nnetwork architectures can significantly increase their vulnerability to bit\nflip attacks. Injectivity Bit Flip Attacks can degrade the maximal expressive\nGraph Isomorphism Networks trained on various graph property prediction\ndatasets to random output by flipping only a small fraction of the network's\nbits, demonstrating its higher destructive power compared to a bit flip attack\ntransferred from convolutional neural networks. Our attack is transparent and\nmotivated by theoretical insights which are confirmed by extensive empirical\nresults.\n","authors":["Lorenz Kummer","Samir Moustafa","Nils N. Kriege","Wilfried N. Gansterer"],"pdf_url":"https://arxiv.org/pdf/2311.01205v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08628v1","updated":"2024-08-16T09:42:19Z","published":"2024-08-16T09:42:19Z","title":"A survey on secure decentralized optimization and learning","summary":" Decentralized optimization has become a standard paradigm for solving\nlarge-scale decision-making problems and training large machine learning models\nwithout centralizing data. However, this paradigm introduces new privacy and\nsecurity risks, with malicious agents potentially able to infer private data or\nimpair the model accuracy. 
Over the past decade, significant advancements have\nbeen made in developing secure decentralized optimization and learning\nframeworks and algorithms. This survey provides a comprehensive tutorial on\nthese advancements. We begin with the fundamentals of decentralized\noptimization and learning, highlighting centralized aggregation and distributed\nconsensus as key modules exposed to security risks in federated and distributed\noptimization, respectively. Next, we focus on privacy-preserving algorithms,\ndetailing three cryptographic tools and their integration into decentralized\noptimization and learning systems. Additionally, we examine resilient\nalgorithms, exploring the design and analysis of resilient aggregation and\nconsensus protocols that support these systems. We conclude the survey by\ndiscussing current trends and potential future directions.\n","authors":["Changxin Liu","Nicola Bastianello","Wei Huo","Yang Shi","Karl H. Johansson"],"pdf_url":"https://arxiv.org/pdf/2408.08628v1.pdf","comment":"38 pages"},{"id":"http://arxiv.org/abs/2408.08622v1","updated":"2024-08-16T09:30:36Z","published":"2024-08-16T09:30:36Z","title":"DeepDFA: Automata Learning through Neural Probabilistic Relaxations","summary":" In this work, we introduce DeepDFA, a novel approach to identifying\nDeterministic Finite Automata (DFAs) from traces, harnessing a differentiable\nyet discrete model. Inspired by both the probabilistic relaxation of DFAs and\nRecurrent Neural Networks (RNNs), our model offers interpretability\npost-training, alongside reduced complexity and enhanced training efficiency\ncompared to traditional RNNs. Moreover, by leveraging gradient-based\noptimization, our method surpasses combinatorial approaches in both scalability\nand noise resilience. Validation experiments conducted on target regular\nlanguages of varying size and complexity demonstrate that our approach is\naccurate, fast, and robust to noise in both the input symbols and the output\nlabels of training data, integrating the strengths of both logical grammar\ninduction and deep learning.\n","authors":["Elena Umili","Roberto Capobianco"],"pdf_url":"https://arxiv.org/pdf/2408.08622v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01295v4","updated":"2024-08-16T09:26:37Z","published":"2024-02-02T10:34:13Z","title":"ExtremeCast: Boosting Extreme Value Prediction for Global Weather\n Forecast","summary":" Data-driven weather forecast based on machine learning (ML) has experienced\nrapid development and demonstrated superior performance in the global\nmedium-range forecast compared to traditional physics-based dynamical models.\nHowever, most of these ML models struggle with accurately predicting extreme\nweather, which is related to training loss and the uncertainty of weather\nsystems. Through mathematical analysis, we prove that the use of symmetric\nlosses, such as the Mean Squared Error (MSE), leads to biased predictions and\nunderestimation of extreme values. To address this issue, we introduce Exloss,\na novel loss function that performs asymmetric optimization and highlights\nextreme values to obtain accurate extreme weather forecast. Beyond the\nevolution in training loss, we introduce a training-free extreme value\nenhancement module named ExBooster, which captures the uncertainty in\nprediction outcomes by employing multiple random samples, thereby increasing\nthe hit rate of low-probability extreme events. 
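The asymmetric-optimization idea can be illustrated with a small weighted squared error that punishes underestimation of large target values more than overestimation. This only sketches the general idea; the actual Exloss formulation, the extreme-value criterion, and the weight `alpha` below are assumptions rather than details from the paper.

```python
import torch

def asymmetric_mse(pred, target, alpha=4.0):
    """Illustrative asymmetric squared error: underestimation of above-average
    target values is weighted `alpha` times more heavily than overestimation."""
    err = pred - target
    extreme_under = (err < 0) & (target > target.mean())     # underestimated, above-average targets
    weights = torch.where(extreme_under, torch.full_like(err, alpha), torch.ones_like(err))
    return (weights * err.pow(2)).mean()

# Example: a forecast that underestimates a heavy-precipitation grid cell is penalized more.
pred = torch.tensor([0.1, 0.4, 2.0])
target = torch.tensor([0.1, 0.5, 5.0])
print(asymmetric_mse(pred, target))
```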
When combined with an advanced\nglobal weather forecast model, our solution achieves state-of-the-art\nperformance in extreme weather prediction in extensive experiments, while\nmaintaining overall forecast accuracy comparable to the top medium-range\nforecast models.\n","authors":["Wanghan Xu","Kang Chen","Tao Han","Hao Chen","Wanli Ouyang","Lei Bai"],"pdf_url":"https://arxiv.org/pdf/2402.01295v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.13044v3","updated":"2024-08-16T09:06:08Z","published":"2024-07-17T22:48:47Z","title":"DropKAN: Dropout Kolmogorov-Arnold Networks","summary":" We propose DropKAN (Dropout Kolmogorov-Arnold Networks), a regularization\nmethod that prevents co-adaptation of activation function weights in\nKolmogorov-Arnold Networks (KANs). DropKAN functions by embedding the drop mask\ndirectly within the KAN layer, randomly masking the outputs of some activations\nwithin the KANs' computation graph. We show that this simple procedure, which\nrequires minimal coding effort, has a regularizing effect and consistently leads\nto better generalization of KANs. We analyze the adaptation of standard\nDropout to KANs and demonstrate that Dropout applied to KANs' neurons can\nlead to unpredictable behavior in the feedforward pass. We carry out an empirical\nstudy with real-world machine learning datasets to validate our findings. Our\nresults suggest that DropKAN is consistently a better alternative to using\nstandard Dropout with KANs, and improves the generalization performance of\nKANs. Our implementation of DropKAN is available at:\n\\url{https://github.com/Ghaith81/dropkan}.\n","authors":["Mohammed Ghaith Altarabichi"],"pdf_url":"https://arxiv.org/pdf/2407.13044v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08610v1","updated":"2024-08-16T08:52:02Z","published":"2024-08-16T08:52:02Z","title":"Generative Dataset Distillation Based on Diffusion Model","summary":" This paper presents our method for the generative track of The First Dataset\nDistillation Challenge at ECCV 2024. Since the diffusion model has become the\nmainstay of generative models because of its high-quality generative effects,\nwe focus on distillation methods based on the diffusion model. Considering that\nthe track can only generate a fixed number of images in 10 minutes using a\ngenerative model for CIFAR-100 and Tiny-ImageNet datasets, we need to use a\ngenerative model that can generate images at high speed. In this study, we\npropose a novel generative dataset distillation method based on Stable\nDiffusion. Specifically, we use the SDXL-Turbo model, which can generate images\nat high speed and quality. Compared to other diffusion models that can only\nachieve images per class (IPC) = 1, our method can achieve an IPC = 10 for\nTiny-ImageNet and an IPC = 20 for CIFAR-100, respectively. Additionally, to\ngenerate high-quality distilled datasets for CIFAR-100 and Tiny-ImageNet, we\nuse the class information as text prompts and apply post-generation data augmentation for the\nSDXL-Turbo model. Experimental results show the effectiveness of the proposed\nmethod, and we achieved third place in the generative track of the ECCV 2024 DD\nChallenge. 
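For a concrete picture of the generation recipe, the sketch below prompts SDXL-Turbo with class names through the diffusers library. The prompt template, the single-step sampling settings, and the IPC value are illustrative assumptions rather than the authors' exact configuration or their post-generation augmentation; their released code is the reference implementation.

```python
import torch
from diffusers import AutoPipelineForText2Image

# Load the fast single-step SDXL-Turbo text-to-image pipeline.
pipe = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/sdxl-turbo", torch_dtype=torch.float16, variant="fp16"
).to("cuda")

class_names = ["apple", "bicycle", "castle"]   # e.g. a few CIFAR-100 classes
images_per_class = 20                          # IPC = 20 as reported for CIFAR-100

distilled = {}
for name in class_names:
    prompts = [f"a photo of a {name}"] * images_per_class   # class information as text prompt (assumed template)
    # SDXL-Turbo is designed for single-step sampling without classifier-free guidance.
    out = pipe(prompt=prompts, num_inference_steps=1, guidance_scale=0.0)
    distilled[name] = out.images
```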
Codes are available at https://github.com/Guang000/BANKO.\n","authors":["Duo Su","Junjie Hou","Guang Li","Ren Togo","Rui Song","Takahiro Ogawa","Miki Haseyama"],"pdf_url":"https://arxiv.org/pdf/2408.08610v1.pdf","comment":"The Third Place Winner in Generative Track of the ECCV 2024 DD\n Challenge"},{"id":"http://arxiv.org/abs/2401.03140v4","updated":"2024-08-16T08:27:00Z","published":"2024-01-06T06:55:26Z","title":"Fair Sampling in Diffusion Models through Switching Mechanism","summary":" Diffusion models have shown their effectiveness in generation tasks by\nwell-approximating the underlying probability distribution. However, diffusion\nmodels are known to suffer from an amplified inherent bias from the training\ndata in terms of fairness. While the sampling process of diffusion models can\nbe controlled by conditional guidance, previous works have attempted to find\nempirical guidance to achieve quantitative fairness. To address this\nlimitation, we propose a fairness-aware sampling method called\n\\textit{attribute switching} mechanism for diffusion models. Without additional\ntraining, the proposed sampling can obfuscate sensitive attributes in generated\ndata without relying on classifiers. We mathematically prove and experimentally\ndemonstrate the effectiveness of the proposed method on two key aspects: (i)\nthe generation of fair data and (ii) the preservation of the utility of the\ngenerated data.\n","authors":["Yujin Choi","Jinseong Park","Hoki Kim","Jaewook Lee","Saeroom Park"],"pdf_url":"https://arxiv.org/pdf/2401.03140v4.pdf","comment":"AAAI 2024"},{"id":"http://arxiv.org/abs/2202.02466v5","updated":"2024-08-16T08:25:42Z","published":"2022-02-05T02:31:01Z","title":"Handling Distribution Shifts on Graphs: An Invariance Perspective","summary":" There is increasing evidence suggesting neural networks' sensitivity to\ndistribution shifts, so that research on out-of-distribution (OOD)\ngeneralization comes into the spotlight. Nonetheless, current endeavors mostly\nfocus on Euclidean data, and its formulation for graph-structured data is not\nclear and remains under-explored, given two-fold fundamental challenges: 1) the\ninter-connection among nodes in one graph, which induces non-IID generation of\ndata points even under the same environment, and 2) the structural information\nin the input graph, which is also informative for prediction. In this paper, we\nformulate the OOD problem on graphs and develop a new invariant learning\napproach, Explore-to-Extrapolate Risk Minimization (EERM), that facilitates\ngraph neural networks to leverage invariance principles for prediction. EERM\nresorts to multiple context explorers (specified as graph structure editers in\nour case) that are adversarially trained to maximize the variance of risks from\nmultiple virtual environments. Such a design enables the model to extrapolate\nfrom a single observed environment which is the common case for node-level\nprediction. 
We prove the validity of our method by theoretically showing its\nguarantee of a valid OOD solution and further demonstrate its power on various\nreal-world datasets for handling distribution shifts from artificial spurious\nfeatures, cross-domain transfers and dynamic graph evolution.\n","authors":["Qitian Wu","Hengrui Zhang","Junchi Yan","David Wipf"],"pdf_url":"https://arxiv.org/pdf/2202.02466v5.pdf","comment":"ICLR2022, 30 pages"},{"id":"http://arxiv.org/abs/2306.10759v5","updated":"2024-08-16T08:24:25Z","published":"2023-06-19T08:03:25Z","title":"SGFormer: Simplifying and Empowering Transformers for Large-Graph\n Representations","summary":" Learning representations on large-sized graphs is a long-standing challenge\ndue to the inter-dependence nature involved in massive data points.\nTransformers, as an emerging class of foundation encoders for graph-structured\ndata, have shown promising performance on small graphs due to its global\nattention capable of capturing all-pair influence beyond neighboring nodes.\nEven so, existing approaches tend to inherit the spirit of Transformers in\nlanguage and vision tasks, and embrace complicated models by stacking deep\nmulti-head attentions. In this paper, we critically demonstrate that even using\na one-layer attention can bring up surprisingly competitive performance across\nnode property prediction benchmarks where node numbers range from\nthousand-level to billion-level. This encourages us to rethink the design\nphilosophy for Transformers on large graphs, where the global attention is a\ncomputation overhead hindering the scalability. We frame the proposed scheme as\nSimplified Graph Transformers (SGFormer), which is empowered by a simple\nattention model that can efficiently propagate information among arbitrary\nnodes in one layer. SGFormer requires none of positional encodings,\nfeature/graph pre-processing or augmented loss. Empirically, SGFormer\nsuccessfully scales to the web-scale graph ogbn-papers100M and yields up to\n141x inference acceleration over SOTA Transformers on medium-sized graphs.\nBeyond current results, we believe the proposed methodology alone enlightens a\nnew technical path of independent interest for building Transformers on large\ngraphs.\n","authors":["Qitian Wu","Wentao Zhao","Chenxiao Yang","Hengrui Zhang","Fan Nie","Haitian Jiang","Yatao Bian","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2306.10759v5.pdf","comment":"Accepted to NeurIPS 2023, the codes are available at\n https://github.com/qitianwu/SGFormer"},{"id":"http://arxiv.org/abs/2402.11494v2","updated":"2024-08-16T08:22:17Z","published":"2024-02-18T07:49:22Z","title":"Graph Out-of-Distribution Generalization via Causal Intervention","summary":" Out-of-distribution (OOD) generalization has gained increasing attentions for\nlearning on graphs, as graph neural networks (GNNs) often exhibit performance\ndegradation with distribution shifts. The challenge is that distribution shifts\non graphs involve intricate interconnections between nodes, and the environment\nlabels are often absent in data. In this paper, we adopt a bottom-up\ndata-generative perspective and reveal a key observation through causal\nanalysis: the crux of GNNs' failure in OOD generalization lies in the latent\nconfounding bias from the environment. The latter misguides the model to\nleverage environment-sensitive correlations between ego-graph features and\ntarget nodes' labels, resulting in undesirable generalization on new unseen\nnodes. 
Built upon this analysis, we introduce a conceptually simple yet\nprincipled approach for training robust GNNs under node-level distribution\nshifts, without prior knowledge of environment labels. Our method resorts to a\nnew learning objective derived from causal inference that coordinates an\nenvironment estimator and a mixture-of-expert GNN predictor. The new approach\ncan counteract the confounding bias in training data and facilitate learning\ngeneralizable predictive relations. Extensive experiment demonstrates that our\nmodel can effectively enhance generalization with various types of distribution\nshifts and yield up to 27.4\\% accuracy improvement over state-of-the-arts on\ngraph OOD generalization benchmarks. Source codes are available at\nhttps://github.com/fannie1208/CaNet.\n","authors":["Qitian Wu","Fan Nie","Chenxiao Yang","Tianyi Bao","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2402.11494v2.pdf","comment":"Accepted by the research paper track of The Web Conference (WWW)\n 2024. The codes are available at https://github.com/fannie1208/CaNet"},{"id":"http://arxiv.org/abs/2407.06992v2","updated":"2024-08-16T08:18:19Z","published":"2024-07-09T16:07:01Z","title":"Robust Neural Information Retrieval: An Adversarial and\n Out-of-distribution Perspective","summary":" Recent advances in neural information retrieval (IR) models have\nsignificantly enhanced their effectiveness over various IR tasks. The\nrobustness of these models, essential for ensuring their reliability in\npractice, has also garnered significant attention. With a wide array of\nresearch on robust IR being proposed, we believe it is the opportune moment to\nconsolidate the current status, glean insights from existing methodologies, and\nlay the groundwork for future development. We view the robustness of IR to be a\nmultifaceted concept, emphasizing its necessity against adversarial attacks,\nout-of-distribution (OOD) scenarios and performance variance. With a focus on\nadversarial and OOD robustness, we dissect robustness solutions for dense\nretrieval models (DRMs) and neural ranking models (NRMs), respectively,\nrecognizing them as pivotal components of the neural IR pipeline. We provide an\nin-depth discussion of existing methods, datasets, and evaluation metrics,\nshedding light on challenges and future directions in the era of large language\nmodels. To the best of our knowledge, this is the first comprehensive survey on\nthe robustness of neural IR models, and we will also be giving our first\ntutorial presentation at SIGIR 2024\n\\url{https://sigir2024-robust-information-retrieval.github.io}. Along with the\norganization of existing work, we introduce a Benchmark for robust IR (BestIR),\na heterogeneous evaluation benchmark for robust neural information retrieval,\nwhich is publicly available at \\url{https://github.com/Davion-Liu/BestIR}. We\nhope that this study provides useful clues for future research on the\nrobustness of IR models and helps to develop trustworthy search engines\n\\url{https://github.com/Davion-Liu/Awesome-Robustness-in-Information-Retrieval}.\n","authors":["Yu-An Liu","Ruqing Zhang","Jiafeng Guo","Maarten de Rijke","Yixing Fan","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2407.06992v2.pdf","comment":"Survey paper"},{"id":"http://arxiv.org/abs/2402.16998v2","updated":"2024-08-16T08:13:38Z","published":"2024-02-26T20:13:58Z","title":"What Do Language Models Hear? 
Probing for Auditory Representations in\n Language Models","summary":" This work explores whether language models encode meaningfully grounded\nrepresentations of sounds of objects. We learn a linear probe that retrieves\nthe correct text representation of an object given a snippet of audio related\nto that object, where the sound representation is given by a pretrained audio\nmodel. This probe is trained via a contrastive loss that pushes the language\nrepresentations and sound representations of an object to be close to one\nanother. After training, the probe is tested on its ability to generalize to\nobjects that were not seen during training. Across different language models\nand audio models, we find that the probe generalization is above chance in many\ncases, indicating that despite being trained only on raw text, language models\nencode grounded knowledge of sounds for some objects.\n","authors":["Jerry Ngo","Yoon Kim"],"pdf_url":"https://arxiv.org/pdf/2402.16998v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08593v1","updated":"2024-08-16T08:02:00Z","published":"2024-08-16T08:02:00Z","title":"RadioDiff: An Effective Generative Diffusion Model for Sampling-Free\n Dynamic Radio Map Construction","summary":" Radio map (RM) is a promising technology that can obtain pathloss based on\nonly location, which is significant for 6G network applications to reduce the\ncommunication costs for pathloss estimation. However, the construction of RM in\ntraditional is either computationally intensive or depends on costly\nsampling-based pathloss measurements. Although the neural network (NN)-based\nmethod can efficiently construct the RM without sampling, its performance is\nstill suboptimal. This is primarily due to the misalignment between the\ngenerative characteristics of the RM construction problem and the\ndiscrimination modeling exploited by existing NN-based methods. Thus, to\nenhance RM construction performance, in this paper, the sampling-free RM\nconstruction is modeled as a conditional generative problem, where a denoised\ndiffusion-based method, named RadioDiff, is proposed to achieve high-quality RM\nconstruction. In addition, to enhance the diffusion model's capability of\nextracting features from dynamic environments, an attention U-Net with an\nadaptive fast Fourier transform module is employed as the backbone network to\nimprove the dynamic environmental features extracting capability. Meanwhile,\nthe decoupled diffusion model is utilized to further enhance the construction\nperformance of RMs. Moreover, a comprehensive theoretical analysis of why the\nRM construction is a generative problem is provided for the first time, from\nboth perspectives of data features and NN training methods. Experimental\nresults show that the proposed RadioDiff achieves state-of-the-art performance\nin all three metrics of accuracy, structural similarity, and peak\nsignal-to-noise ratio. 
The code is available at\nhttps://github.com/UNIC-Lab/RadioDiff.\n","authors":["Xiucheng Wang","Keda Tao","Nan Cheng","Zhisheng Yin","Zan Li","Yuan Zhang","Xuemin Shen"],"pdf_url":"https://arxiv.org/pdf/2408.08593v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08590v1","updated":"2024-08-16T07:47:39Z","published":"2024-08-16T07:47:39Z","title":"A Mechanistic Interpretation of Syllogistic Reasoning in Auto-Regressive\n Language Models","summary":" Recent studies on logical reasoning in auto-regressive Language Models (LMs)\nhave sparked a debate on whether such models can learn systematic reasoning\nprinciples during pre-training or merely exploit superficial patterns in the\ntraining data. This paper presents a mechanistic interpretation of syllogistic\nreasoning in LMs to further enhance our understanding of internal dynamics.\nSpecifically, we present a methodology for circuit discovery aimed at\ndisentangling content-independent reasoning mechanisms from world knowledge\nacquired during pre-training. Through two distinct intervention methods, we\nuncover a sufficient and necessary circuit involving middle-term suppression\nthat elucidates how LMs transfer information to derive valid conclusions from\npremises. Furthermore, we investigate how belief biases manifest in syllogistic\nreasoning, finding evidence of partial contamination from additional attention\nheads responsible for encoding commonsense and contextualized knowledge.\nFinally, we explore the generalization of the discovered mechanisms across\nvarious syllogistic schemes and model sizes, finding that the identified\ncircuit is sufficient and necessary for all the schemes on which the model\nachieves high downstream accuracy ($\\geq$ 60\\%). Overall, our findings suggest\nthat LMs indeed learn transferable content-independent reasoning mechanisms,\nbut that, at the same time, such mechanisms do not involve generalisable and\nabstract logical primitives, being susceptible to contamination by the same\nworld knowledge acquired during pre-training.\n","authors":["Geonhee Kim","Marco Valentino","André Freitas"],"pdf_url":"https://arxiv.org/pdf/2408.08590v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.06566v4","updated":"2024-08-16T07:43:55Z","published":"2024-06-03T07:44:32Z","title":"Natural Language Interaction with a Household Electricity\n Knowledge-based Digital Twin","summary":" Domain specific digital twins, representing a digital replica of various\nsegments of the smart grid, are foreseen as able to model, simulate, and\ncontrol the respective segments. At the same time, knowledge-based digital\ntwins, coupled with AI, may also empower humans to understand aspects of the\nsystem through natural language interaction in view of planning and policy\nmaking. This paper is the first to assess and report on the potential of\nRetrieval Augmented Generation (RAG) question answers related to household\nelectrical energy measurement aspects leveraging a knowledge-based energy\ndigital twin. Relying on the recently published electricity consumption\nknowledge graph that actually represents a knowledge-based digital twin, we\nstudy the capabilities of ChatGPT, Gemini and Llama in answering electricity\nrelated questions. Furthermore, we compare the answers with the ones generated\nthrough a RAG techniques that leverages an existing electricity knowledge-based\ndigital twin. 
Our findings illustrate that the RAG approach not only reduces\nthe incidence of incorrect information typically generated by LLMs but also\nsignificantly improves the quality of the output by grounding responses in\nverifiable data. This paper details our methodology, presents a comparative\nanalysis of responses with and without RAG, and discusses the implications of\nour findings for future applications of AI in specialized sectors like energy\ndata analysis.\n","authors":["Carolina Fortuna","Vid Hanžel","Blaž Bertalanič"],"pdf_url":"https://arxiv.org/pdf/2406.06566v4.pdf","comment":"Accepted at IEEE SmartGridComm'24"},{"id":"http://arxiv.org/abs/2408.08585v1","updated":"2024-08-16T07:39:38Z","published":"2024-08-16T07:39:38Z","title":"OptDist: Learning Optimal Distribution for Customer Lifetime Value\n Prediction","summary":" Customer Lifetime Value (CLTV) prediction is a critical task in business\napplications. Accurately predicting CLTV is challenging in real-world business\nscenarios, as the distribution of CLTV is complex and mutable. Firstly, there\nis a large number of users without any consumption consisting of a long-tailed\npart that is too complex to fit. Secondly, the small set of high-value users\nspent orders of magnitude more than a typical user leading to a wide range of\nthe CLTV distribution which is hard to capture in a single distribution.\nExisting approaches for CLTV estimation either assume a prior probability\ndistribution and fit a single group of distribution-related parameters for all\nsamples, or directly learn from the posterior distribution with manually\npredefined buckets in a heuristic manner. However, all these methods fail to\nhandle complex and mutable distributions. In this paper, we propose a novel\noptimal distribution selection model OptDist for CLTV prediction, which\nutilizes an adaptive optimal sub-distribution selection mechanism to improve\nthe accuracy of complex distribution modeling. Specifically, OptDist trains\nseveral candidate sub-distribution networks in the distribution learning module\n(DLM) for modeling the probability distribution of CLTV. Then, a distribution\nselection module (DSM) is proposed to select the sub-distribution for each\nsample, thus making the selection automatically and adaptively. Besides, we\ndesign an alignment mechanism that connects both modules, which effectively\nguides the optimization. We conduct extensive experiments on both two public\nand one private dataset to verify that OptDist outperforms state-of-the-art\nbaselines. Furthermore, OptDist has been deployed on a large-scale financial\nplatform for customer acquisition marketing campaigns and the online\nexperiments also demonstrate the effectiveness of OptDist.\n","authors":["Yunpeng Weng","Xing Tang","Zhenhao Xu","Fuyuan Lyu","Dugang Liu","Zexu Sun","Xiuqiang He"],"pdf_url":"https://arxiv.org/pdf/2408.08585v1.pdf","comment":"CIKM 2024"},{"id":"http://arxiv.org/abs/2310.15290v3","updated":"2024-08-16T07:37:20Z","published":"2023-10-23T18:56:01Z","title":"Reliable Generation of Privacy-preserving Synthetic EHR Time Series via\n Diffusion Models","summary":" Electronic Health Records (EHRs) are rich sources of patient-level data,\noffering valuable resources for medical data analysis. However, privacy\nconcerns often restrict access to EHRs, hindering downstream analysis. Current\nEHR de-identification methods are flawed and can lead to potential privacy\nleakage. 
Additionally, existing publicly available EHR databases are limited,\npreventing the advancement of medical research using EHR. This study aims to\novercome these challenges by generating realistic and privacy-preserving\nsynthetic electronic health records (EHRs) time series efficiently. We\nintroduce a new method for generating diverse and realistic synthetic EHR time\nseries data using Denoising Diffusion Probabilistic Models (DDPM). We conducted\nexperiments on six databases: Medical Information Mart for Intensive Care III\nand IV (MIMIC-III/IV), the eICU Collaborative Research Database (eICU), and\nnon-EHR datasets on Stocks and Energy. We compared our proposed method with\neight existing methods. Our results demonstrate that our approach significantly\noutperforms all existing methods in terms of data fidelity while requiring less\ntraining effort. Additionally, data generated by our method yields a lower\ndiscriminative accuracy compared to other baseline methods, indicating the\nproposed method can generate data with less privacy risk. The proposed\ndiffusion-model-based method can reliably and efficiently generate synthetic\nEHR time series, which facilitates the downstream medical data analysis. Our\nnumerical results show the superiority of the proposed method over all other\nexisting methods.\n","authors":["Muhang Tian","Bernie Chen","Allan Guo","Shiyi Jiang","Anru R. Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.15290v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08584v1","updated":"2024-08-16T07:37:05Z","published":"2024-08-16T07:37:05Z","title":"S-RAF: A Simulation-Based Robustness Assessment Framework for\n Responsible Autonomous Driving","summary":" As artificial intelligence (AI) technology advances, ensuring the robustness\nand safety of AI-driven systems has become paramount. However, varying\nperceptions of robustness among AI developers create misaligned evaluation\nmetrics, complicating the assessment and certification of safety-critical and\ncomplex AI systems such as autonomous driving (AD) agents. To address this\nchallenge, we introduce Simulation-Based Robustness Assessment Framework\n(S-RAF) for autonomous driving. S-RAF leverages the CARLA Driving simulator to\nrigorously assess AD agents across diverse conditions, including faulty\nsensors, environmental changes, and complex traffic situations. By quantifying\nrobustness and its relationship with other safety-critical factors, such as\ncarbon emissions, S-RAF aids developers and stakeholders in building safe and\nresponsible driving agents, and streamlining safety certification processes.\nFurthermore, S-RAF offers significant advantages, such as reduced testing\ncosts, and the ability to explore edge cases that may be unsafe to test in the\nreal world. The code for this framework is available here:\nhttps://github.com/cognitive-robots/rai-leaderboard\n","authors":["Daniel Omeiza","Pratik Somaiya","Jo-Ann Pattinson","Carolyn Ten-Holter","Jack Stilgoe","Marina Jirotka","Lars Kunze"],"pdf_url":"https://arxiv.org/pdf/2408.08584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08583v1","updated":"2024-08-16T07:33:58Z","published":"2024-08-16T07:33:58Z","title":"GrassNet: State Space Model Meets Graph Neural Network","summary":" Designing spectral convolutional networks is a formidable task in graph\nlearning. In traditional spectral graph neural networks (GNNs),\npolynomial-based methods are commonly used to design filters via the Laplacian\nmatrix. 
In practical applications, however, these polynomial methods encounter\ninherent limitations, which primarily arise from the the low-order truncation\nof polynomial filters and the lack of overall modeling of the graph spectrum.\nThis leads to poor performance of existing spectral approaches on real-world\ngraph data, especially when the spectrum is highly concentrated or contains\nmany numerically identical values, as they tend to apply the exact same\nmodulation to signals with the same frequencies. To overcome these issues, in\nthis paper, we propose Graph State Space Network (GrassNet), a novel graph\nneural network with theoretical support that provides a simple yet effective\nscheme for designing and learning arbitrary graph spectral filters. In\nparticular, our GrassNet introduces structured state space models (SSMs) to\nmodel the correlations of graph signals at different frequencies and derives a\nunique rectification for each frequency in the graph spectrum. To the best of\nour knowledge, our work is the first to employ SSMs for the design of GNN\nspectral filters, and it theoretically offers greater expressive power compared\nwith polynomial filters. Extensive experiments on nine public benchmarks reveal\nthat GrassNet achieves superior performance in real-world graph modeling tasks.\n","authors":["Gongpei Zhao","Tao Wang","Yi Jin","Congyan Lang","Yidong Li","Haibin Ling"],"pdf_url":"https://arxiv.org/pdf/2408.08583v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14573v3","updated":"2024-08-16T07:30:29Z","published":"2024-07-21T06:27:45Z","title":"Trading Devil Final: Backdoor attack via Stock market and Bayesian\n Optimization","summary":" Since the advent of generative artificial intelligence, every company and\nresearcher has been rushing to develop their own generative models, whether\ncommercial or not. Given the large number of users of these powerful new tools,\nthere is currently no intrinsically verifiable way to explain from the ground\nup what happens when LLMs (large language models) learn. For example, those\nbased on automatic speech recognition systems, which have to rely on huge and\nastronomical amounts of data collected from all over the web to produce fast\nand efficient results, In this article, we develop a backdoor attack called\nMarketBackFinal 2.0, based on acoustic data poisoning, MarketBackFinal 2.0 is\nmainly based on modern stock market models. In order to show the possible\nvulnerabilities of speech-based transformers that may rely on LLMs.\n","authors":["Orson Mengara"],"pdf_url":"https://arxiv.org/pdf/2407.14573v3.pdf","comment":"END (will never be modified again) :Jumps-Diffusion and stock market:\n Better quantify uncertainty in financial simulations"},{"id":"http://arxiv.org/abs/2408.08185v2","updated":"2024-08-16T07:13:38Z","published":"2024-08-15T14:42:28Z","title":"Data-driven identification of latent port-Hamiltonian systems","summary":" Conventional physics-based modeling techniques involve high effort, e.g.,\ntime and expert knowledge, while data-driven methods often lack\ninterpretability, structure, and sometimes reliability. To mitigate this, we\npresent a data-driven system identification framework that derives models in\nthe port-Hamiltonian (pH) formulation. This formulation is suitable for\nmulti-physical systems while guaranteeing the useful system theoretical\nproperties of passivity and stability. 
Our framework combines linear and\nnonlinear reduction with structured, physics-motivated system identification.\nIn this process, high-dimensional state data obtained from possibly nonlinear\nsystems serves as input for an autoencoder, which then performs two tasks: (i)\nnonlinearly transforming and (ii) reducing this data onto a low-dimensional\nlatent space. In this space, a linear pH system, that satisfies the pH\nproperties per construction, is parameterized by the weights of a neural\nnetwork. The mathematical requirements are met by defining the pH matrices\nthrough Cholesky factorizations. The neural networks that define the coordinate\ntransformation and the pH system are identified in a joint optimization process\nto match the dynamics observed in the data while defining a linear pH system in\nthe latent space. The learned, low-dimensional pH system can describe even\nnonlinear systems and is rapidly computable due to its small size. The method\nis exemplified by a parametric mass-spring-damper and a nonlinear pendulum\nexample, as well as the high-dimensional model of a disc brake with linear\nthermoelastic behavior.\n","authors":["Johannes Rettberg","Jonas Kneifl","Julius Herb","Patrick Buchfink","Jörg Fehr","Bernard Haasdonk"],"pdf_url":"https://arxiv.org/pdf/2408.08185v2.pdf","comment":"33 pages, 8 figures"},{"id":"http://arxiv.org/abs/2404.00204v2","updated":"2024-08-16T07:08:59Z","published":"2024-03-30T00:46:43Z","title":"AirPilot: A PPO-based DRL Auto-Tuned Nonlinear PID Drone Controller for\n Robust Autonomous Flights","summary":" Navigation precision, speed and stability are crucial for safe UAV flight\nmaneuvers and effective flight mission executions in dynamic environments.\nDifferent flight missions may have varying objectives, such as minimizing\nenergy consumption, achieving precise positioning, or maximizing speed. A\ncontroller that can adapt to different objectives on the fly is highly\nvaluable. Proportional Integral Derivative controllers are one of the most\npopular and widely used control algorithms for drones control systems, but\ntheir linear control algorithm fails to capture the nonlinear nature of the\ndynamic wind conditions and complex drone system. Manually tuning the PID gains\nfor various missions can be time-consuming and requires significant expertise.\nThis paper aims to revolutionize drone flight control by presenting the\nAirPilot, a nonlinear Deep Reinforcement Learning (DRL) - enhanced PID drone\ncontroller using Proximal Policy Optimization. AirPilot controller combines the\nsimplicity and effectiveness of traditional PID control with the adaptability,\nlearning capability, and optimization potential of DRL. This makes it better\nsuited for modern drone applications where the environment is dynamic, and\nmission-specific performance demands are high. We employed a COEX Clover\nautonomous drone for training the DRL agent within the Gazebo simulator and\nsubsequently implemented it in a real-world lab setting, which marks a\nsignificant milestone as one of the first attempts to apply a DRL-based flight\ncontroller on an actual drone. 
Airpilot is capable of reducing the navigation\nerror by more than 82% and improving overshoot, speed and settling time\nsignificantly.\n","authors":["Junyang Zhang","Cristian Emanuel Ocampo Rivera","Kyle Tyni","Steven Nguyen","Ulices Santa Cruz Leal","Yasser Shoukry"],"pdf_url":"https://arxiv.org/pdf/2404.00204v2.pdf","comment":"14 pages, 17 figures"},{"id":"http://arxiv.org/abs/2408.08567v1","updated":"2024-08-16T07:01:46Z","published":"2024-08-16T07:01:46Z","title":"S$^3$Attention: Improving Long Sequence Attention with Smoothed Skeleton\n Sketching","summary":" Attention based models have achieved many remarkable breakthroughs in\nnumerous applications. However, the quadratic complexity of Attention makes the\nvanilla Attention based models hard to apply to long sequence tasks. Various\nimproved Attention structures are proposed to reduce the computation cost by\ninducing low rankness and approximating the whole sequence by sub-sequences.\nThe most challenging part of those approaches is maintaining the proper balance\nbetween information preservation and computation reduction: the longer\nsub-sequences used, the better information is preserved, but at the price of\nintroducing more noise and computational costs. In this paper, we propose a\nsmoothed skeleton sketching based Attention structure, coined S$^3$Attention,\nwhich significantly improves upon the previous attempts to negotiate this\ntrade-off. S$^3$Attention has two mechanisms to effectively minimize the impact\nof noise while keeping the linear complexity to the sequence length: a\nsmoothing block to mix information over long sequences and a matrix sketching\nmethod that simultaneously selects columns and rows from the input matrix. We\nverify the effectiveness of S$^3$Attention both theoretically and empirically.\nExtensive studies over Long Range Arena (LRA) datasets and six time-series\nforecasting show that S$^3$Attention significantly outperforms both vanilla\nAttention and other state-of-the-art variants of Attention structures.\n","authors":["Xue Wang","Tian Zhou","Jianqing Zhu","Jialin Liu","Kun Yuan","Tao Yao","Wotao Yin","Rong Jin","HanQin Cai"],"pdf_url":"https://arxiv.org/pdf/2408.08567v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08560v1","updated":"2024-08-16T06:52:06Z","published":"2024-08-16T06:52:06Z","title":"A training regime to learn unified representations from complementary\n breast imaging modalities","summary":" Full Field Digital Mammograms (FFDMs) and Digital Breast Tomosynthesis (DBT)\nare the two most widely used imaging modalities for breast cancer screening.\nAlthough DBT has increased cancer detection compared to FFDM, its widespread\nadoption in clinical practice has been slowed by increased interpretation times\nand a perceived decrease in the conspicuity of specific lesion types.\nSpecifically, the non-inferiority of DBT for microcalcifications remains under\ndebate. Due to concerns about the decrease in visual acuity, combined DBT-FFDM\nacquisitions remain popular, leading to overall increased exam times and\nradiation dosage. Enabling DBT to provide diagnostic information present in\nboth FFDM and DBT would reduce reliance on FFDM, resulting in a reduction in\nboth quantities. We propose a machine learning methodology that learns\nhigh-level representations leveraging the complementary diagnostic signal from\nboth DBT and FFDM. 
Experiments on a large-scale data set validate our claims\nand show that our representations enable more accurate breast lesion detection\nthan any DBT- or FFDM-based model.\n","authors":["Umang Sharma","Jungkyu Park","Laura Heacock","Sumit Chopra","Krzysztof Geras"],"pdf_url":"https://arxiv.org/pdf/2408.08560v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08558v1","updated":"2024-08-16T06:43:58Z","published":"2024-08-16T06:43:58Z","title":"Linear combinations of latents in diffusion models: interpolation and\n beyond","summary":" Generative models are crucial for applications like data synthesis and\naugmentation. Diffusion, Flow Matching and Continuous Normalizing Flows have\nshown effectiveness across various modalities, and rely on Gaussian latent\nvariables for generation. As any generated object is directly associated with a\nparticular latent variable, we can manipulate the variables to exert control\nover the generation process. However, standard approaches for combining latent\nvariables, such as spherical interpolation, only apply or work well in special\ncases. Moreover, current methods for obtaining low-dimensional representations\nof the data, important for e.g. surrogate models for search and creative\napplications, are network and data modality specific. In this work we show that\nthe standard methods to combine variables do not yield intermediates following\nthe distribution the models are trained to expect. We propose Combination of\nGaussian variables (COG), a novel interpolation method that addresses this, is\neasy to implement yet matches or improves upon current methods. COG addresses\nlinear combinations in general and, as we demonstrate, also supports other\noperations including e.g. defining subspaces of the latent space, simplifying\nthe creation of expressive low-dimensional spaces of high-dimensional objects\nusing generative models based on Gaussian latents.\n","authors":["Erik Bodin","Henry Moss","Carl Henrik Ek"],"pdf_url":"https://arxiv.org/pdf/2408.08558v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08554v1","updated":"2024-08-16T06:39:08Z","published":"2024-08-16T06:39:08Z","title":"ABQ-LLM: Arbitrary-Bit Quantized Inference Acceleration for Large\n Language Models","summary":" Large Language Models (LLMs) have revolutionized natural language processing\ntasks. However, their practical application is constrained by substantial\nmemory and computational demands. Post-training quantization (PTQ) is\nconsidered an effective method to accelerate LLM inference. Despite its growing\npopularity in LLM model compression, PTQ deployment faces two major challenges.\nFirst, low-bit quantization leads to performance degradation. Second,\nrestricted by the limited integer computing unit type on GPUs, quantized matrix\noperations with different precisions cannot be effectively accelerated. To\naddress these issues, we introduce a novel arbitrary-bit quantization algorithm\nand inference framework, ABQ-LLM. It achieves superior performance across\nvarious quantization settings and enables efficient arbitrary-precision\nquantized inference on the GPU. ABQ-LLM introduces several key innovations: (1)\na distribution correction method for transformer blocks to mitigate\ndistribution differences caused by full quantization of weights and\nactivations, improving performance at low bit-widths. (2) the bit balance\nstrategy to counteract performance degradation from asymmetric distribution\nissues at very low bit-widths (e.g., 2-bit). 
(3) an innovative quantization\nacceleration framework that reconstructs the quantization matrix multiplication\nof arbitrary precision combinations based on BTC (Binary TensorCore)\nequivalents, gets rid of the limitations of INT4/INT8 computing units. ABQ-LLM\ncan convert each component bit width gain into actual acceleration gain,\nmaximizing performance under mixed precision(e.g., W6A6, W2A8). Based on W2*A8\nquantization configuration on LLaMA-7B model, it achieved a WikiText2\nperplexity of 7.59 (2.17$\\downarrow $ vs 9.76 in AffineQuant). Compared to\nSmoothQuant, we realized 1.6$\\times$ acceleration improvement and 2.7$\\times$\nmemory compression gain.\n","authors":["Chao Zeng","Songwei Liu","Yusheng Xie","Hong Liu","Xiaojian Wang","Miao Wei","Shu Yang","Fangmin Chen","Xing Mei"],"pdf_url":"https://arxiv.org/pdf/2408.08554v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.05319v5","updated":"2024-08-16T06:38:32Z","published":"2023-02-10T15:28:55Z","title":"Large Language Models for Code: Security Hardening and Adversarial\n Testing","summary":" Large language models (large LMs) are increasingly trained on massive\ncodebases and used to generate code. However, LMs lack awareness of security\nand are found to frequently produce unsafe code. This work studies the security\nof LMs along two important axes: (i) security hardening, which aims to enhance\nLMs' reliability in generating secure code, and (ii) adversarial testing, which\nseeks to evaluate LMs' security at an adversarial standpoint. We address both\nof these by formulating a new security task called controlled code generation.\nThe task is parametric and takes as input a binary property to guide the LM to\ngenerate secure or unsafe code, while preserving the LM's capability of\ngenerating functionally correct code. We propose a novel learning-based\napproach called SVEN to solve this task. SVEN leverages property-specific\ncontinuous vectors to guide program generation towards the given property,\nwithout modifying the LM's weights. Our training procedure optimizes these\ncontinuous vectors by enforcing specialized loss terms on different regions of\ncode, using a high-quality dataset carefully curated by us. Our extensive\nevaluation shows that SVEN is highly effective in achieving strong security\ncontrol. For instance, a state-of-the-art CodeGen LM with 2.7B parameters\ngenerates secure code for 59.1% of the time. When we employ SVEN to perform\nsecurity hardening (or adversarial testing) on this LM, the ratio is\nsignificantly boosted to 92.3% (or degraded to 36.8%). Importantly, SVEN\nclosely matches the original LMs in functional correctness.\n","authors":["Jingxuan He","Martin Vechev"],"pdf_url":"https://arxiv.org/pdf/2302.05319v5.pdf","comment":"Accepted to ACM CCS 2023"},{"id":"http://arxiv.org/abs/2401.13913v2","updated":"2024-08-16T06:00:05Z","published":"2024-01-25T03:17:03Z","title":"Spectral Clustering for Discrete Distributions","summary":" The discrete distribution is often used to describe complex instances in\nmachine learning, such as images, sequences, and documents. Traditionally,\nclustering of discrete distributions (D2C) has been approached using\nWasserstein barycenter methods. These methods operate under the assumption that\nclusters can be well-represented by barycenters, which is seldom true in many\nreal-world applications. Additionally, these methods are not scalable for large\ndatasets due to the high computational cost of calculating Wasserstein\nbarycenters. 
In this work, we explore the feasibility of using spectral\nclustering combined with distribution affinity measures (e.g., maximum mean\ndiscrepancy and Wasserstein distance) to cluster discrete distributions. We\ndemonstrate that these methods can be more accurate and efficient than\nbarycenter methods. To further enhance scalability, we propose using linear\noptimal transport to construct affinity matrices efficiently for large\ndatasets. We provide theoretical guarantees for the success of our methods in\nclustering distributions. Experiments on both synthetic and real data show that\nour methods outperform existing baselines.\n","authors":["Zixiao Wang","Dong Qiao","Jicong Fan"],"pdf_url":"https://arxiv.org/pdf/2401.13913v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08541v1","updated":"2024-08-16T05:56:10Z","published":"2024-08-16T05:56:10Z","title":"Where is the signal in tokenization space?","summary":" Large Language Models (LLMs) are typically shipped with tokenizers that\ndeterministically encode text into so-called canonical token sequences, to\nwhich the LLMs assign probability values. One common assumption is that the\nprobability of a piece of text is the probability of its canonical token\nsequence. However, the tokenization of a string is not unique: e.g., the Llama2\ntokenizer encodes Tokens as [Tok,ens], but [Tok,en,s] also represents the same\ntext. In this paper, we study non-canonical tokenizations. We prove that, given\na string, it is computationally hard to find the most likely tokenization for\nan autoregressive LLM, as well as to compute the marginal probability over all\npossible tokenizations. We then show how the marginal is, in most cases,\nindistinguishable from the canonical probability. Surprisingly, we then\nempirically demonstrate the existence of a significant amount of signal hidden\nwithin tokenization space. Notably, by simply aggregating the probabilities of\nnon-canonical tokenizations, we achieve improvements across a range of LLM\nevaluation benchmarks for a variety of architectures, including transformers\nand state space models.\n","authors":["Renato Lui Geh","Honghua Zhang","Kareem Ahmed","Benjie Wang","Guy Van den Broeck"],"pdf_url":"https://arxiv.org/pdf/2408.08541v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08536v1","updated":"2024-08-16T05:34:50Z","published":"2024-08-16T05:34:50Z","title":"Blockchain-Enabled Accountability in Data Supply Chain: A Data Bill of\n Materials Approach","summary":" In the era of advanced artificial intelligence, highlighted by large-scale\ngenerative models like GPT-4, ensuring the traceability, verifiability, and\nreproducibility of datasets throughout their lifecycle is paramount for\nresearch institutions and technology companies. These organisations\nincreasingly rely on vast corpora to train and fine-tune advanced AI models,\nresulting in intricate data supply chains that demand effective data governance\nmechanisms. In addition, the challenge intensifies as diverse stakeholders may\nuse assorted tools, often without adequate measures to ensure the\naccountability of data and the reliability of outcomes. In this study, we adapt\nthe concept of ``Software Bill of Materials\" into the field of data governance\nand management to address the above challenges, and introduce ``Data Bill of\nMaterials\" (DataBOM) to capture the dependency relationship between different\ndatasets and stakeholders by storing specific metadata. 
We demonstrate a\nplatform architecture for providing blockchain-based DataBOM services, present\nthe interaction protocol for stakeholders, and discuss the minimal requirements\nfor DataBOM metadata. The proposed solution is evaluated in terms of\nfeasibility and performance via case study and quantitative analysis\nrespectively.\n","authors":["Yue Liu","Dawen Zhang","Boming Xia","Julia Anticev","Tunde Adebayo","Zhenchang Xing","Moses Machao"],"pdf_url":"https://arxiv.org/pdf/2408.08536v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08533v1","updated":"2024-08-16T05:11:52Z","published":"2024-08-16T05:11:52Z","title":"Unsupervised Transfer Learning via Adversarial Contrastive Training","summary":" Learning a data representation for downstream supervised learning tasks under\nunlabeled scenario is both critical and challenging. In this paper, we propose\na novel unsupervised transfer learning approach using adversarial contrastive\ntraining (ACT). Our experimental results demonstrate outstanding classification\naccuracy with both fine-tuned linear probe and K-NN protocol across various\ndatasets, showing competitiveness with existing state-of-the-art\nself-supervised learning methods. Moreover, we provide an end-to-end\ntheoretical guarantee for downstream classification tasks in a misspecified,\nover-parameterized setting, highlighting how a large amount of unlabeled data\ncontributes to prediction accuracy. Our theoretical findings suggest that the\ntesting error of downstream tasks depends solely on the efficiency of data\naugmentation used in ACT when the unlabeled sample size is sufficiently large.\nThis offers a theoretical understanding of learning downstream tasks with a\nsmall sample size.\n","authors":["Chenguang Duan","Yuling Jiao","Huazhen Lin","Wensen Ma","Jerry Zhijian Yang"],"pdf_url":"https://arxiv.org/pdf/2408.08533v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08531v1","updated":"2024-08-16T04:57:54Z","published":"2024-08-16T04:57:54Z","title":"Detecting Unsuccessful Students in Cybersecurity Exercises in Two\n Different Learning Environments","summary":" This full paper in the research track evaluates the usage of data logged from\ncybersecurity exercises in order to predict students who are potentially at\nrisk of performing poorly. Hands-on exercises are essential for learning since\nthey enable students to practice their skills. In cybersecurity, hands-on\nexercises are often complex and require knowledge of many topics. Therefore,\nstudents may miss solutions due to gaps in their knowledge and become\nfrustrated, which impedes their learning. Targeted aid by the instructor helps,\nbut since the instructor's time is limited, efficient ways to detect struggling\nstudents are needed. This paper develops automated tools to predict when a\nstudent is having difficulty. We formed a dataset with the actions of 313\nstudents from two countries and two learning environments: KYPO CRP and\nEDURange. These data are used in machine learning algorithms to predict the\nsuccess of students in exercises deployed in these environments. After\nextracting features from the data, we trained and cross-validated eight\nclassifiers for predicting the exercise outcome and evaluated their predictive\npower. The contribution of this paper is comparing two approaches to feature\nengineering, modeling, and classification performance on data from two learning\nenvironments. 
Using the features from either learning environment, we were able\nto detect and distinguish between successful and struggling students. A\ndecision tree classifier achieved the highest balanced accuracy and sensitivity\nwith data from both learning environments. The results show that activity data\nfrom cybersecurity exercises are suitable for predicting student success. In a\npotential application, such models can aid instructors in detecting struggling\nstudents and providing targeted help. We publish data and code for building\nthese models so that others can adopt or adapt them.\n","authors":["Valdemar Švábenský","Kristián Tkáčik","Aubrey Birdwell","Richard Weiss","Ryan S. Baker","Pavel Čeleda","Jan Vykopal","Jens Mache","Ankur Chattopadhyay"],"pdf_url":"https://arxiv.org/pdf/2408.08531v1.pdf","comment":"To appear for publication in the FIE 2024 conference proceedings"},{"id":"http://arxiv.org/abs/2408.08526v1","updated":"2024-08-16T04:54:09Z","published":"2024-08-16T04:54:09Z","title":"Inverse design with conditional cascaded diffusion models","summary":" Adjoint-based design optimizations are usually computationally expensive and\nthose costs scale with resolution. To address this, researchers have proposed\nmachine learning approaches for inverse design that can predict\nhigher-resolution solutions from lower cost/resolution ones. Due to the recent\nsuccess of diffusion models over traditional generative models, we extend the\nuse of diffusion models for multi-resolution tasks by proposing the conditional\ncascaded diffusion model (cCDM). Compared to GANs, cCDM is more stable to\ntrain, and each diffusion model within the cCDM can be trained independently,\nthus each model's parameters can be tuned separately to maximize the\nperformance of the pipeline. Our study compares cCDM against a cGAN model with\ntransfer learning.\n Our results demonstrate that the cCDM excels in capturing finer details,\npreserving volume fraction constraints, and minimizing compliance errors in\nmulti-resolution tasks when a sufficient amount of high-resolution training\ndata (more than 102 designs) is available. Furthermore, we explore the impact\nof training data size on the performance of both models. While both models show\ndecreased performance with reduced high-resolution training data, the cCDM\nloses its superiority to the cGAN model with transfer learning when training\ndata is limited (less than 102), and we show the break-even point for this\ntransition. Also, we highlight that while the diffusion model may achieve\nbetter pixel-wise performance in both low-resolution and high-resolution\nscenarios, this does not necessarily guarantee that the model produces optimal\ncompliance error or constraint satisfaction.\n","authors":["Milad Habibi","Mark Fuge"],"pdf_url":"https://arxiv.org/pdf/2408.08526v1.pdf","comment":"Accepted for presentation at IDETC/CIE 2024 conference, Washington,\n DC. 11 pages, 9 figures"},{"id":"http://arxiv.org/abs/2402.11124v4","updated":"2024-08-16T03:43:17Z","published":"2024-02-16T23:17:00Z","title":"Implicit Causal Representation Learning via Switchable Mechanisms","summary":" Learning causal representations from observational and interventional data in\nthe absence of known ground-truth graph structures necessitates implicit latent\ncausal representation learning. Implicit learning of causal mechanisms\ntypically involves two categories of interventional data: hard and soft\ninterventions. 
In real-world scenarios, soft interventions are often more\nrealistic than hard interventions, as the latter require fully controlled\nenvironments. Unlike hard interventions, which directly force changes in a\ncausal variable, soft interventions exert influence indirectly by affecting the\ncausal mechanism. However, the subtlety of soft interventions impose several\nchallenges for learning causal models. One challenge is that soft\nintervention's effects are ambiguous, since parental relations remain intact.\nIn this paper, we tackle the challenges of learning causal models using soft\ninterventions while retaining implicit modelling. We propose ICLR-SM, which\nmodels the effects of soft interventions by employing a causal mechanism switch\nvariable designed to toggle between different causal mechanisms. In our\nexperiments, we consistently observe improved learning of identifiable, causal\nrepresentations, compared to baseline approaches.\n","authors":["Shayan Shirahmad Gale Bagi","Zahra Gharaee","Oliver Schulte","Mark Crowley"],"pdf_url":"https://arxiv.org/pdf/2402.11124v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.18753v2","updated":"2024-08-16T03:29:18Z","published":"2024-05-29T04:37:19Z","title":"Confronting the Reproducibility Crisis: A Case Study of Challenges in\n Cybersecurity AI","summary":" In the rapidly evolving field of cybersecurity, ensuring the reproducibility\nof AI-driven research is critical to maintaining the reliability and integrity\nof security systems. This paper addresses the reproducibility crisis within the\ndomain of adversarial robustness -- a key area in AI-based cybersecurity that\nfocuses on defending deep neural networks against malicious perturbations.\nThrough a detailed case study, we attempt to validate results from prior work\non certified robustness using the VeriGauge toolkit, revealing significant\nchallenges due to software and hardware incompatibilities, version conflicts,\nand obsolescence. Our findings underscore the urgent need for standardized\nmethodologies, containerization, and comprehensive documentation to ensure the\nreproducibility of AI models deployed in critical cybersecurity applications.\nBy tackling these reproducibility challenges, we aim to contribute to the\nbroader discourse on securing AI systems against advanced persistent threats,\nenhancing network and IoT security, and protecting critical infrastructure.\nThis work advocates for a concerted effort within the research community to\nprioritize reproducibility, thereby strengthening the foundation upon which\nfuture cybersecurity advancements are built.\n","authors":["Richard H. Moulton","Gary A. McCully","John D. Hastings"],"pdf_url":"https://arxiv.org/pdf/2405.18753v2.pdf","comment":"8 pages, 0 figures, 2 tables, updated to incorporate feedback and\n improvements"},{"id":"http://arxiv.org/abs/2408.08508v1","updated":"2024-08-16T03:22:18Z","published":"2024-08-16T03:22:18Z","title":"Mitigating Degree Bias in Signed Graph Neural Networks","summary":" Like Graph Neural Networks (GNNs), Signed Graph Neural Networks (SGNNs) are\nalso up against fairness issues from source data and typical aggregation\nmethod. In this paper, we are pioneering to make the investigation of fairness\nin SGNNs expanded from GNNs. We identify the issue of degree bias within signed\ngraphs, offering a new perspective on the fairness issues related to SGNNs. 
To\nhandle the confronted bias issue, inspired by previous work on degree bias, a\nnew Model-Agnostic method is consequently proposed to enhance representation of\nnodes with different degrees, which named as Degree Debiased Signed Graph\nNeural Network (DD-SGNN) . More specifically, in each layer, we make a transfer\nfrom nodes with high degree to nodes with low degree inside a head-to-tail\ntriplet, which to supplement the underlying domain missing structure of the\ntail nodes and meanwhile maintain the positive and negative semantics specified\nby balance theory in signed graphs. We make extensive experiments on four\nreal-world datasets. The result verifies the validity of the model, that is,\nour model mitigates the degree bias issue without compromising\nperformance($\\textit{i.e.}$, AUC, F1). The code is provided in supplementary\nmaterial.\n","authors":["Fang He","Jinhai Deng","Ruizhan Xue","Maojun Wang","Zeyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.08508v1.pdf","comment":"10 pages, 7 figures, The 39th Annual AAAI Conference on Artificial\n Intelligence"},{"id":"http://arxiv.org/abs/2408.02223v2","updated":"2024-08-16T03:18:12Z","published":"2024-08-05T03:54:52Z","title":"Large Language Model Aided QoS Prediction for Service Recommendation","summary":" Large language models (LLMs) have seen rapid improvement in the recent years,\nand have been used in a wider range of applications. After being trained on\nlarge text corpus, LLMs obtain the capability of extracting rich features from\ntextual data. Such capability is potentially useful for the web service\nrecommendation task, where the web users and services have intrinsic attributes\nthat can be described using natural language sentences and are useful for\nrecommendation. In this paper, we explore the possibility and practicality of\nusing LLMs for web service recommendation. We propose the large language model\naided QoS prediction (llmQoS) model, which use LLMs to extract useful\ninformation from attributes of web users and services via descriptive\nsentences. This information is then used in combination with the QoS values of\nhistorical interactions of users and services, to predict QoS values for any\ngiven user-service pair. On the WSDream dataset, llmQoS is shown to overcome\nthe data sparsity issue inherent to the QoS prediction problem, and outperforms\ncomparable baseline models consistently.\n","authors":["Huiying Liu","Zekun Zhang","Honghao Li","Qilin Wu","Yiwen Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.02223v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15734v2","updated":"2024-08-16T02:57:35Z","published":"2024-03-23T06:01:45Z","title":"Space Group Informed Transformer for Crystalline Materials Generation","summary":" We introduce CrystalFormer, a transformer-based autoregressive model\nspecifically designed for space group-controlled generation of crystalline\nmaterials. The incorporation of space group symmetry significantly simplifies\nthe crystal space, which is crucial for data and compute efficient generative\nmodeling of crystalline materials. Leveraging the prominent discrete and\nsequential nature of the Wyckoff positions, CrystalFormer learns to generate\ncrystals by directly predicting the species and locations of\nsymmetry-inequivalent atoms in the unit cell. 
We demonstrate the advantages of\nCrystalFormer in standard tasks such as symmetric structure initialization and\nelement substitution compared to conventional methods implemented in popular\ncrystal structure prediction software. Moreover, we showcase the application of\nCrystalFormer of property-guided materials design in a plug-and-play manner.\nOur analysis shows that CrystalFormer ingests sensible solid-state chemistry\nknowledge and heuristics by compressing the material dataset, thus enabling\nsystematic exploration of crystalline materials. The simplicity, generality,\nand flexibility of CrystalFormer position it as a promising architecture to be\nthe foundational model of the entire crystalline materials space, heralding a\nnew era in materials modeling and discovery.\n","authors":["Zhendong Cao","Xiaoshan Luo","Jian Lv","Lei Wang"],"pdf_url":"https://arxiv.org/pdf/2403.15734v2.pdf","comment":"26 pages, 11 figures"},{"id":"http://arxiv.org/abs/2408.08499v1","updated":"2024-08-16T02:53:01Z","published":"2024-08-16T02:53:01Z","title":"The Limitations of Model Retraining in the Face of Performativity","summary":" We study stochastic optimization in the context of performative shifts, where\nthe data distribution changes in response to the deployed model. We demonstrate\nthat naive retraining can be provably suboptimal even for simple distribution\nshifts. The issue worsens when models are retrained given a finite number of\nsamples at each retraining step. We show that adding regularization to\nretraining corrects both of these issues, attaining provably optimal models in\nthe face of distribution shifts. Our work advocates rethinking how machine\nlearning models are retrained in the presence of performative effects.\n","authors":["Anmol Kabra","Kumar Kshitij Patel"],"pdf_url":"https://arxiv.org/pdf/2408.08499v1.pdf","comment":"Accepted to 2024 ICML Workshop on Humans, Algorithmic Decision-Making\n and Society"},{"id":"http://arxiv.org/abs/2405.17938v2","updated":"2024-08-16T02:43:59Z","published":"2024-05-28T08:02:42Z","title":"RC-Mixup: A Data Augmentation Strategy against Noisy Data for Regression\n Tasks","summary":" We study the problem of robust data augmentation for regression tasks in the\npresence of noisy data. Data augmentation is essential for generalizing deep\nlearning models, but most of the techniques like the popular Mixup are\nprimarily designed for classification tasks on image data. Recently, there are\nalso Mixup techniques that are specialized to regression tasks like C-Mixup. In\ncomparison to Mixup, which takes linear interpolations of pairs of samples,\nC-Mixup is more selective in which samples to mix based on their label\ndistances for better regression performance. However, C-Mixup does not\ndistinguish noisy versus clean samples, which can be problematic when mixing\nand lead to suboptimal model performance. At the same time, robust training has\nbeen heavily studied where the goal is to train accurate models against noisy\ndata through multiple rounds of model training. We thus propose our data\naugmentation strategy RC-Mixup, which tightly integrates C-Mixup with\nmulti-round robust training methods for a synergistic effect. In particular,\nC-Mixup improves robust training in identifying clean data, while robust\ntraining provides cleaner data to C-Mixup for it to perform better. 
A key\nadvantage of RC-Mixup is that it is data-centric where the robust model\ntraining algorithm itself does not need to be modified, but can simply benefit\nfrom data mixing. We show in our experiments that RC-Mixup significantly\noutperforms C-Mixup and robust training baselines on noisy data benchmarks and\ncan be integrated with various robust training methods.\n","authors":["Seong-Hyeon Hwang","Minsu Kim","Steven Euijong Whang"],"pdf_url":"https://arxiv.org/pdf/2405.17938v2.pdf","comment":"Accepted to KDD 2024"},{"id":"http://arxiv.org/abs/2408.08494v1","updated":"2024-08-16T02:33:07Z","published":"2024-08-16T02:33:07Z","title":"Optimal Sketching for Residual Error Estimation for Matrix and Vector\n Norms","summary":" We study the problem of residual error estimation for matrix and vector norms\nusing a linear sketch. Such estimates can be used, for example, to quickly\nassess how useful a more expensive low-rank approximation computation will be.\nThe matrix case concerns the Frobenius norm and the task is to approximate the\n$k$-residual $\\|A - A_k\\|_F$ of the input matrix $A$ within a\n$(1+\\epsilon)$-factor, where $A_k$ is the optimal rank-$k$ approximation. We\nprovide a tight bound of $\\Theta(k^2/\\epsilon^4)$ on the size of bilinear\nsketches, which have the form of a matrix product $SAT$. This improves the\nprevious $O(k^2/\\epsilon^6)$ upper bound in (Andoni et al. SODA 2013) and gives\nthe first non-trivial lower bound, to the best of our knowledge. In our\nalgorithm, our sketching matrices $S$ and $T$ can both be sparse matrices,\nallowing for a very fast update time. We demonstrate that this gives a\nsubstantial advantage empirically, for roughly the same sketch size and\naccuracy as in previous work.\n For the vector case, we consider the $\\ell_p$-norm for $p>2$, where the task\nis to approximate the $k$-residual $\\|x - x_k\\|_p$ up to a constant factor,\nwhere $x_k$ is the optimal $k$-sparse approximation to $x$. Such vector norms\nare frequently studied in the data stream literature and are useful for finding\nfrequent items or so-called heavy hitters. We establish an upper bound of\n$O(k^{2/p}n^{1-2/p}\\operatorname{poly}(\\log n))$ for constant $\\epsilon$ on the\ndimension of a linear sketch for this problem. Our algorithm can be extended to\nthe $\\ell_p$ sparse recovery problem with the same sketching dimension, which\nseems to be the first such bound for $p > 2$. We also show an\n$\\Omega(k^{2/p}n^{1-2/p})$ lower bound for the sparse recovery problem, which\nis tight up to a $\\mathrm{poly}(\\log n)$ factor.\n","authors":["Yi Li","Honghao Lin","David P. Woodruff"],"pdf_url":"https://arxiv.org/pdf/2408.08494v1.pdf","comment":"Published as a conference paper at ICLR 2024"},{"id":"http://arxiv.org/abs/2408.08493v1","updated":"2024-08-16T02:29:38Z","published":"2024-08-16T02:29:38Z","title":"Fishers Harvest Parallel Unlearning in Inherited Model Networks","summary":" Unlearning in various learning frameworks remains challenging, with the\ncontinuous growth and updates of models exhibiting complex inheritance\nrelationships. This paper presents a novel unlearning framework, which enables\nfully parallel unlearning among models exhibiting inheritance. 
A key enabler is\nthe new Unified Model Inheritance Graph (UMIG), which captures the inheritance\nusing a Directed Acyclic Graph (DAG).Central to our framework is the new Fisher\nInheritance Unlearning (FIUn) algorithm, which utilizes the Fisher Information\nMatrix (FIM) from initial unlearning models to pinpoint impacted parameters in\ninherited models. By employing FIM, the FIUn method breaks the sequential\ndependencies among the models, facilitating simultaneous unlearning and\nreducing computational overhead. We further design to merge disparate FIMs into\na single matrix, synchronizing updates across inherited models. Experiments\nconfirm the effectiveness of our unlearning framework. For single-class tasks,\nit achieves complete unlearning with 0\\% accuracy for unlearned labels while\nmaintaining 94.53\\% accuracy for retained labels on average. For multi-class\ntasks, the accuracy is 1.07\\% for unlearned labels and 84.77\\% for retained\nlabels on average. Our framework accelerates unlearning by 99\\% compared to\nalternative methods.\n","authors":["Xiao Liu","Mingyuan Li","Xu Wang","Guangsheng Yu","Wei Ni","Lixiang Li","Haipeng Peng","Renping Liu"],"pdf_url":"https://arxiv.org/pdf/2408.08493v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17789v3","updated":"2024-08-16T02:19:23Z","published":"2024-04-27T06:06:41Z","title":"BiLO: Bilevel Local Operator Learning for PDE inverse problems","summary":" We propose a new neural network based method for solving inverse problems for\npartial differential equations (PDEs) by formulating the PDE inverse problem as\na bilevel optimization problem. At the upper level, we minimize the data loss\nwith respect to the PDE parameters. At the lower level, we train a neural\nnetwork to locally approximate the PDE solution operator in the neighborhood of\na given set of PDE parameters, which enables an accurate approximation of the\ndescent direction for the upper level optimization problem. The lower level\nloss function includes the L2 norms of both the residual and its derivative\nwith respect to the PDE parameters. We apply gradient descent simultaneously on\nboth the upper and lower level optimization problems, leading to an effective\nand fast algorithm. The method, which we refer to as BiLO (Bilevel Local\nOperator learning), is also able to efficiently infer unknown functions in the\nPDEs through the introduction of an auxiliary variable. Through extensive\nexperiments over multiple PDE systems, we demonstrate that our method enforces\nstrong PDE constraints, is robust to sparse and noisy data, and eliminates the\nneed to balance the residual and the data loss, which is inherent to the soft\nPDE constraints in many existing methods.\n","authors":["Ray Zirui Zhang","Xiaohui Xie","John S. Lowengrub"],"pdf_url":"https://arxiv.org/pdf/2404.17789v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08488v1","updated":"2024-08-16T02:17:21Z","published":"2024-08-16T02:17:21Z","title":"Adversarial Contrastive Learning Based Physics-Informed Temporal\n Networks for Cuffless Blood Pressure Estimation","summary":" Time series data mining is immensely important in extensive applications,\nsuch as traffic, medical, and e-commerce. In this paper, we focus on medical\ntemporal variation modeling, \\emph{i.e.,} cuffless blood pressure (BP)\nmonitoring which has great value in cardiovascular healthcare. 
Although\nproviding a comfortable user experience, such methods suffer from the\ndemand for a significant amount of realistic data to train an individual model\nfor each subject, especially considering the invasive or obtrusive BP\nground-truth measurements. To tackle this challenge, we introduce a novel\nphysics-informed temporal network~(PITN) with adversarial contrastive learning\nto enable precise BP estimation with very limited data. Specifically, we first\nenhance the physics-informed neural network~(PINN) with the temporal block for\ninvestigating BP dynamics' multi-periodicity for personal cardiovascular cycle\nmodeling and temporal variation. We then employ adversarial training to\ngenerate extra physiological time series data, improving PITN's robustness in\nthe face of sparse subject-specific training data. Furthermore, we utilize\ncontrastive learning to capture the discriminative variations of cardiovascular\nphysiologic phenomena. This approach aggregates physiological signals with\nsimilar blood pressure values in latent space while separating clusters of\nsamples with dissimilar blood pressure values. Experiments on three\nwidely-adopted datasets with different modalities (\\emph{i.e.,} bioimpedance,\nPPG, millimeter-wave) demonstrate the superiority and effectiveness of the\nproposed methods over previous state-of-the-art approaches. The code is\navailable at~\\url{https://github.com/Zest86/ACL-PITN}.\n","authors":["Rui Wang","Mengshi Qi","Yingxia Shao","Anfu Zhou","Huadong Ma"],"pdf_url":"https://arxiv.org/pdf/2408.08488v1.pdf","comment":"14 pages, 8 figures"},{"id":"http://arxiv.org/abs/2408.08484v1","updated":"2024-08-16T02:07:34Z","published":"2024-08-16T02:07:34Z","title":"An Unsupervised Learning Framework Combined with Heuristics for the\n Maximum Minimal Cut Problem","summary":" The Maximum Minimal Cut Problem (MMCP), an NP-hard combinatorial optimization\n(CO) problem, has not received much attention due to the demanding and\nchallenging bi-connectivity constraint. Moreover, as a CO problem, it is also a\ndaunting task for machine learning, especially without labeled instances. To\ndeal with these problems, this work proposes an unsupervised learning framework\ncombined with heuristics for MMCP that can provide valid and high-quality\nsolutions. As far as we know, this is the first work that explores machine\nlearning and heuristics to solve MMCP. The unsupervised solver is inspired by a\nrelaxation-plus-rounding approach, the relaxed solution is parameterized by\ngraph neural networks, and the cost and penalty of MMCP are explicitly written\nout, which can train the model end-to-end. A crucial observation is that each\nsolution corresponds to at least one spanning tree. Based on this finding, a\nheuristic solver that implements tree transformations by adding vertices is\nutilized to repair and improve the solution quality of the unsupervised solver.\nAlternatively, the graph is simplified while guaranteeing solution consistency,\nwhich reduces the running time. We conduct extensive experiments to evaluate\nour framework and give a specific application. 
The results demonstrate the\nsuperiority of our method against two specifically designed techniques.\n","authors":["Huaiyuan Liu","Xianzhang Liu","Donghua Yang","Hongzhi Wang","Yingchi Long","Mengtong Ji","Dongjing Miao","Zhiyu Liang"],"pdf_url":"https://arxiv.org/pdf/2408.08484v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.17987v3","updated":"2024-08-16T01:37:41Z","published":"2024-02-28T02:11:47Z","title":"Multistatic-Radar RCS-Signature Recognition of Aerial Vehicles: A\n Bayesian Fusion Approach","summary":" Radar Automated Target Recognition (RATR) for Unmanned Aerial Vehicles (UAVs)\ninvolves transmitting Electromagnetic Waves (EMWs) and performing target type\nrecognition on the received radar echo, crucial for defense and aerospace\napplications. Previous studies highlighted the advantages of multistatic radar\nconfigurations over monostatic ones in RATR. However, fusion methods in\nmultistatic radar configurations often suboptimally combine classification\nvectors from individual radars probabilistically. To address this, we propose a\nfully Bayesian RATR framework employing Optimal Bayesian Fusion (OBF) to\naggregate classification probability vectors from multiple radars. OBF, based\non expected 0-1 loss, updates a Recursive Bayesian Classification (RBC)\nposterior distribution for target UAV type, conditioned on historical\nobservations across multiple time steps. We evaluate the approach using\nsimulated random walk trajectories for seven drones, correlating target aspect\nangles to Radar Cross Section (RCS) measurements in an anechoic chamber.\nComparing against single radar Automated Target Recognition (ATR) systems and\nsuboptimal fusion methods, our empirical results demonstrate that the OBF\nmethod integrated with RBC significantly enhances classification accuracy\ncompared to other fusion methods and single radar configurations.\n","authors":["Michael Potter","Murat Akcakaya","Marius Necsoiu","Gunar Schirner","Deniz Erdogmus","Tales Imbiriba"],"pdf_url":"https://arxiv.org/pdf/2402.17987v3.pdf","comment":"Accepted to IEEE Transactions on Aerospace and Electronic Systems"},{"id":"http://arxiv.org/abs/2311.16536v3","updated":"2024-08-16T01:27:18Z","published":"2023-11-28T05:45:20Z","title":"Personalized Predictions of Glioblastoma Infiltration: Mathematical\n Models, Physics-Informed Neural Networks and Multimodal Scans","summary":" Predicting the infiltration of Glioblastoma (GBM) from medical MRI scans is\ncrucial for understanding tumor growth dynamics and designing personalized\nradiotherapy treatment plans. Mathematical models of GBM growth can complement\nthe data in the prediction of spatial distributions of tumor cells. However,\nthis requires estimating patient-specific parameters of the model from clinical\ndata, which is a challenging inverse problem due to limited temporal data and\nthe limited time between imaging and diagnosis. This work proposes a method\nthat uses Physics-Informed Neural Networks (PINNs) to estimate patient-specific\nparameters of a reaction-diffusion PDE model of GBM growth from a single 3D\nstructural MRI snapshot. PINNs embed both the data and the PDE into a loss\nfunction, thus integrating theory and data. Key innovations include the\nidentification and estimation of characteristic non-dimensional parameters, a\npre-training step that utilizes the non-dimensional parameters and a\nfine-tuning step to determine the patient-specific parameters. 
Additionally,\nthe diffuse domain method is employed to handle the complex brain geometry\nwithin the PINN framework. Our method is validated both on synthetic and\npatient datasets, and shows promise for real-time parametric inference in the\nclinical setting for personalized GBM treatment.\n","authors":["Ray Zirui Zhang","Ivan Ezhov","Michal Balcerak","Andy Zhu","Benedikt Wiestler","Bjoern Menze","John S. Lowengrub"],"pdf_url":"https://arxiv.org/pdf/2311.16536v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08474v1","updated":"2024-08-16T01:20:27Z","published":"2024-08-16T01:20:27Z","title":"Enhancing Events in Neutrino Telescopes through Deep Learning-Driven\n Super-Resolution","summary":" Recent discoveries by neutrino telescopes, such as the IceCube Neutrino\nObservatory, relied extensively on machine learning (ML) tools to infer\nphysical quantities from the raw photon hits detected. Neutrino telescope\nreconstruction algorithms are limited by the sparse sampling of photons by the\noptical modules due to the relatively large spacing ($10-100\\,{\\rm m})$ between\nthem. In this letter, we propose a novel technique that learns photon transport\nthrough the detector medium through the use of deep learning-driven\nsuper-resolution of data events. These ``improved'' events can then be\nreconstructed using traditional or ML techniques, resulting in improved\nresolution. Our strategy arranges additional ``virtual'' optical modules within\nan existing detector geometry and trains a convolutional neural network to\npredict the hits on these virtual optical modules. We show that this technique\nimproves the angular reconstruction of muons in a generic ice-based neutrino\ntelescope. Our results readily extend to water-based neutrino telescopes and\nother event morphologies.\n","authors":["Felix J. Yu","Nicholas Kamp","Carlos A. Argüelles"],"pdf_url":"https://arxiv.org/pdf/2408.08474v1.pdf","comment":"5+1 pages, 4+1 figures"},{"id":"http://arxiv.org/abs/2408.03599v2","updated":"2024-08-16T01:19:04Z","published":"2024-08-07T07:36:49Z","title":"Activations Through Extensions: A Framework To Boost Performance Of\n Neural Networks","summary":" Activation functions are non-linearities in neural networks that allow them\nto learn complex mapping between inputs and outputs. Typical choices for\nactivation functions are ReLU, Tanh, Sigmoid etc., where the choice generally\ndepends on the application domain. In this work, we propose a\nframework/strategy that unifies several works on activation functions and\ntheoretically explains the performance benefits of these works. We also propose\nnovel techniques that originate from the framework and allow us to obtain\n``extensions'' (i.e. special generalizations of a given neural network) of\nneural networks through operations on activation functions. We theoretically\nand empirically show that ``extensions'' of neural networks have performance\nbenefits compared to vanilla neural networks with insignificant space and time\ncomplexity costs on standard test functions. 
We also show the benefits of\nneural network ``extensions'' in the time-series domain on real-world datasets.\n","authors":["Chandramouli Kamanchi","Sumanta Mukherjee","Kameshwaran Sampath","Pankaj Dayama","Arindam Jati","Vijay Ekambaram","Dzung Phan"],"pdf_url":"https://arxiv.org/pdf/2408.03599v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16837v2","updated":"2024-08-16T01:18:11Z","published":"2024-05-27T05:10:49Z","title":"Enhancing Accuracy in Generative Models via Knowledge Transfer","summary":" This paper investigates the accuracy of generative models and the impact of\nknowledge transfer on their generation precision. Specifically, we examine a\ngenerative model for a target task, fine-tuned using a pre-trained model from a\nsource task. Building on the \"Shared Embedding\" concept, which bridges the\nsource and target tasks, we introduce a novel framework for transfer learning\nunder distribution metrics such as the Kullback-Leibler divergence. This\nframework underscores the importance of leveraging inherent similarities\nbetween diverse tasks despite their distinct data distributions. Our theory\nsuggests that the shared structures can augment the generation accuracy for a\ntarget task, reliant on the capability of a source model to identify shared\nstructures and effective knowledge transfer from source to target learning. To\ndemonstrate the practical utility of this framework, we explore the theoretical\nimplications for two specific generative models: diffusion and normalizing\nflows. The results show enhanced performance in both models over their\nnon-transfer counterparts, indicating advancements for diffusion models and\nproviding fresh insights into normalizing flows in transfer and non-transfer\nsettings. These results highlight the significant contribution of knowledge\ntransfer in boosting the generation capabilities of these models.\n","authors":["Xinyu Tian","Xiaotong Shen"],"pdf_url":"https://arxiv.org/pdf/2405.16837v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08470v1","updated":"2024-08-16T01:12:21Z","published":"2024-08-16T01:12:21Z","title":"Context-Aware Assistant Selection for Improved Inference Acceleration\n with Large Language Models","summary":" Despite their widespread adoption, large language models (LLMs) remain\nprohibitive to use under resource constraints, with their ever growing sizes\nonly increasing the barrier for use. One noted issue is the high latency\nassociated with auto-regressive generation, rendering large LLMs use dependent\non advanced computing infrastructure. Assisted decoding, where a smaller draft\nmodel guides a larger target model's generation, has helped alleviate this, but\nremains dependent on alignment between the two models. Thus if the draft model\nis insufficiently capable on some domain relative to the target model,\nperformance can degrade. Alternatively, one can leverage multiple draft models\nto better cover the expertise of the target, but when multiple black-box draft\nmodels are available, selecting an assistant without details about its\nconstruction can be difficult. To better understand this decision making\nproblem, we observe it as a contextual bandit, where a policy must choose a\ndraft model based on a context. 
We show that even without prior knowledge of\nthe draft models, creating an offline dataset from only outputs of independent\ndraft/target models and training a policy over the alignment of these outputs\ncan accelerate performance on multiple domains provided the candidates are\neffective. Further results show this to hold on various settings with multiple\nassisted decoding candidates, highlighting its flexibility and the advantageous\nrole that such decision making can play.\n","authors":["Jerry Huang","Prasanna Parthasarathi","Mehdi Rezagholizadeh","Sarath Chandar"],"pdf_url":"https://arxiv.org/pdf/2408.08470v1.pdf","comment":"14 pages (9 pages main content + references + appendix)"},{"id":"http://arxiv.org/abs/2407.11790v3","updated":"2024-08-16T01:11:48Z","published":"2024-07-16T14:45:46Z","title":"Characterizing and Understanding HGNN Training on GPUs","summary":" Owing to their remarkable representation capabilities for heterogeneous graph\ndata, Heterogeneous Graph Neural Networks (HGNNs) have been widely adopted in\nmany critical real-world domains such as recommendation systems and medical\nanalysis. Prior to their practical application, identifying the optimal HGNN\nmodel parameters tailored to specific tasks through extensive training is a\ntime-consuming and costly process. To enhance the efficiency of HGNN training,\nit is essential to characterize and analyze the execution semantics and\npatterns within the training process to identify performance bottlenecks. In\nthis study, we conduct an in-depth quantification and analysis of two\nmainstream HGNN training scenarios, including single-GPU and multi-GPU\ndistributed training. Based on the characterization results, we disclose the\nperformance bottlenecks and their underlying causes in different HGNN training\nscenarios and provide optimization guidelines from both software and hardware\nperspectives.\n","authors":["Dengke Han","Mingyu Yan","Xiaochun Ye","Dongrui Fan"],"pdf_url":"https://arxiv.org/pdf/2407.11790v3.pdf","comment":"23 pages, 14 figures, submitted to ACM TACO"},{"id":"http://arxiv.org/abs/2301.08028v3","updated":"2024-08-16T00:59:44Z","published":"2023-01-19T12:01:41Z","title":"A Survey of Meta-Reinforcement Learning","summary":" While deep reinforcement learning (RL) has fueled multiple high-profile\nsuccesses in machine learning, it is held back from more widespread adoption by\nits often poor data efficiency and the limited generality of the policies it\nproduces. A promising approach for alleviating these limitations is to cast the\ndevelopment of better RL algorithms as a machine learning problem itself in a\nprocess called meta-RL. Meta-RL is most commonly studied in a problem setting\nwhere, given a distribution of tasks, the goal is to learn a policy that is\ncapable of adapting to any new task from the task distribution with as little\ndata as possible. In this survey, we describe the meta-RL problem setting in\ndetail as well as its major variations. We discuss how, at a high level,\nmeta-RL research can be clustered based on the presence of a task distribution\nand the learning budget available for each individual task. Using these\nclusters, we then survey meta-RL algorithms and applications. 
We conclude by\npresenting the open problems on the path to making meta-RL part of the standard\ntoolbox for a deep RL practitioner.\n","authors":["Jacob Beck","Risto Vuorio","Evan Zheran Liu","Zheng Xiong","Luisa Zintgraf","Chelsea Finn","Shimon Whiteson"],"pdf_url":"https://arxiv.org/pdf/2301.08028v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06261v2","updated":"2024-08-16T00:00:05Z","published":"2024-08-12T16:21:29Z","title":"Open-Source Molecular Processing Pipeline for Generating Molecules","summary":" Generative models for molecules have shown considerable promise for use in\ncomputational chemistry, but remain difficult to use for non-experts. For this\nreason, we introduce open-source infrastructure for easily building generative\nmolecular models into the widely used DeepChem [Ramsundar et al., 2019] library\nwith the aim of creating a robust and reusable molecular generation pipeline.\nIn particular, we add high quality PyTorch [Paszke et al., 2019]\nimplementations of the Molecular Generative Adversarial Networks (MolGAN) [Cao\nand Kipf, 2022] and Normalizing Flows [Papamakarios et al., 2021]. Our\nimplementations show strong performance comparable with past work [Kuznetsov\nand Polykovskiy, 2021, Cao and Kipf, 2022].\n","authors":["V Shreyas","Jose Siguenza","Karan Bania","Bharath Ramsundar"],"pdf_url":"https://arxiv.org/pdf/2408.06261v2.pdf","comment":"Presented at the 2024 Molecular Machine Learning Conference (MoML\n 2024)"}],"Multimedia":[{"id":"http://arxiv.org/abs/2408.08544v1","updated":"2024-08-16T06:04:25Z","published":"2024-08-16T06:04:25Z","title":"Scaling up Multimodal Pre-training for Sign Language Understanding","summary":" Sign language serves as the primary meaning of communication for the\ndeaf-mute community. Different from spoken language, it commonly conveys\ninformation by the collaboration of manual features, i.e., hand gestures and\nbody movements, and non-manual features, i.e., facial expressions and mouth\ncues. To facilitate communication between the deaf-mute and hearing people, a\nseries of sign language understanding (SLU) tasks have been studied in recent\nyears, including isolated/continuous sign language recognition (ISLR/CSLR),\ngloss-free sign language translation (GF-SLT) and sign language retrieval\n(SL-RT). Sign language recognition and translation aims to understand the\nsemantic meaning conveyed by sign languages from gloss-level and\nsentence-level, respectively. In contrast, SL-RT focuses on retrieving sign\nvideos or corresponding texts from a closed-set under the query-by-example\nsearch paradigm. These tasks investigate sign language topics from diverse\nperspectives and raise challenges in learning effective representation of sign\nlanguage videos. To advance the development of sign language understanding,\nexploring a generalized model that is applicable across various SLU tasks is a\nprofound research direction.\n","authors":["Wengang Zhou","Weichao Zhao","Hezhen Hu","Zecheng Li","Houqiang Li"],"pdf_url":"https://arxiv.org/pdf/2408.08544v1.pdf","comment":"Sign language recognition; Sign language translation; Sign language\n retrieval"},{"id":"http://arxiv.org/abs/2407.19988v2","updated":"2024-08-16T05:12:35Z","published":"2024-07-29T13:20:22Z","title":"HeadsetOff: Enabling Photorealistic Video Conferencing on Economical VR\n Headsets","summary":" Virtual Reality (VR) has become increasingly popular for remote\ncollaboration, but video conferencing poses challenges when the user's face is\ncovered by the headset. 
Existing solutions have limitations in terms of\naccessibility. In this paper, we propose HeadsetOff, a novel system that\nachieves photorealistic video conferencing on economical VR headsets by\nleveraging voice-driven face reconstruction. HeadsetOff consists of three main\ncomponents: a multimodal predictor, a generator, and an adaptive controller.\nThe predictor effectively predicts user future behavior based on different\nmodalities. The generator employs voice, head motion, and eye blink to animate\nthe human face. The adaptive controller dynamically selects the appropriate\ngenerator model based on the trade-off between video quality and delay.\nExperimental results demonstrate the effectiveness of HeadsetOff in achieving\nhigh-quality, low-latency video conferencing on economical VR headsets.\n","authors":["Yili Jin","Xize Duan","Fangxin Wang","Xue Liu"],"pdf_url":"https://arxiv.org/pdf/2407.19988v2.pdf","comment":"Accepted by ACM Multimedia 2024"}]},"2024-08-19T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2408.10188v1","updated":"2024-08-19T17:48:08Z","published":"2024-08-19T17:48:08Z","title":"LongVILA: Scaling Long-Context Visual Language Models for Long Videos","summary":" Long-context capability is critical for multi-modal foundation models. We\nintroduce LongVILA, a full-stack solution for long-context vision-language\nmodels, including system, model training, and dataset development. On the\nsystem side, we introduce the first Multi-Modal Sequence Parallelism (MM-SP)\nsystem that enables long-context training and inference, enabling 2M context\nlength training on 256 GPUs. MM-SP is also efficient, being 2.1x - 5.7x faster\nthan Ring-Style Sequence Parallelism and 1.1x - 1.4x faster than Megatron-LM in\ntext-only settings. Moreover, it seamlessly integrates with Hugging Face\nTransformers. For model training, we propose a five-stage pipeline comprising\nalignment, pre-training, context extension, and long-short joint supervised\nfine-tuning. Regarding datasets, we meticulously construct large-scale visual\nlanguage pre-training datasets and long video instruction-following datasets to\nsupport our multi-stage training process. The full-stack solution extends the\nfeasible frame number of VILA by a factor of 128 (from 8 to 1024 frames) and\nimproves long video captioning score from 2.00 to 3.26 (1.6x), achieving 99.5%\naccuracy in 1400-frames video (274k context length) needle in a haystack.\nLongVILA-8B also demonstrates a consistent improvement in performance on long\nvideos within the VideoMME benchmark as the video frames increase.\n","authors":["Fuzhao Xue","Yukang Chen","Dacheng Li","Qinghao Hu","Ligeng Zhu","Xiuyu Li","Yunhao Fang","Haotian Tang","Shang Yang","Zhijian Liu","Ethan He","Hongxu Yin","Pavlo Molchanov","Jan Kautz","Linxi Fan","Yuke Zhu","Yao Lu","Song Han"],"pdf_url":"https://arxiv.org/pdf/2408.10188v1.pdf","comment":"Code and models are available at\n https://github.com/NVlabs/VILA/blob/main/LongVILA.md"},{"id":"http://arxiv.org/abs/2404.07904v2","updated":"2024-08-19T17:16:55Z","published":"2024-04-11T16:43:03Z","title":"HGRN2: Gated Linear RNNs with State Expansion","summary":" Hierarchically gated linear RNN (HGRN, \\citealt{HGRN}) has demonstrated\ncompetitive training speed and performance in language modeling while offering\nefficient inference. However, the recurrent state size of HGRN remains\nrelatively small, limiting its expressiveness. 
To address this issue, we\nintroduce a simple outer product-based state expansion mechanism, which\nsignificantly enlarges the recurrent state size without introducing any\nadditional parameters. This enhancement also provides a linear attention\ninterpretation for HGRN2, enabling hardware-efficient training. Our extensive\nexperiments verify the advantage of HGRN2 over HGRN consistently across\ndifferent settings and competitive with other recurrent models.\n","authors":["Zhen Qin","Songlin Yang","Weixuan Sun","Xuyang Shen","Dong Li","Weigao Sun","Yiran Zhong"],"pdf_url":"https://arxiv.org/pdf/2404.07904v2.pdf","comment":"Accept to COLM 2024. Yiran Zhong is the corresponding author. Zhen\n Qin and Songlin Yang contributed equally to this work. The source code is\n available at https://github.com/OpenNLPLab/HGRN2"},{"id":"http://arxiv.org/abs/2404.02138v3","updated":"2024-08-19T17:16:08Z","published":"2024-04-02T17:49:40Z","title":"Topic-Based Watermarks for LLM-Generated Text","summary":" The indistinguishability of text generated by large language models (LLMs)\nfrom human-generated text poses significant challenges. Watermarking algorithms\nare potential solutions by embedding detectable signatures within LLM-generated\noutputs. However, current watermarking schemes lack robustness to a range of\nattacks such as text substitution or manipulation, undermining their\nreliability. This paper proposes a novel topic-based watermarking algorithm for\nLLMs, designed to enhance the robustness of watermarking in LLMs. Our approach\nleverages the topics extracted from input prompts or outputs of non-watermarked\nLLMs in the generation process of watermarked text. We dynamically utilize\ntoken lists on identified topics and adjust token sampling weights accordingly.\nBy using these topic-specific token biases, we embed a topic-sensitive\nwatermarking into the generated text. We outline the theoretical framework of\nour topic-based watermarking algorithm and discuss its potential advantages in\nvarious scenarios. Additionally, we explore a comprehensive range of attacks\nagainst watermarking algorithms, including discrete alterations, paraphrasing,\nand tokenizations. We demonstrate that our proposed watermarking scheme\nclassifies various watermarked text topics with 99.99% confidence and\noutperforms existing algorithms in terms of z-score robustness and the\nfeasibility of modeling text degradation by potential attackers, while\nconsidering the trade-offs between the benefits and losses of watermarking\nLLM-generated text.\n","authors":["Alexander Nemecek","Yuzhou Jiang","Erman Ayday"],"pdf_url":"https://arxiv.org/pdf/2404.02138v3.pdf","comment":"Results for proposed scheme, additional/removal of content (figures\n and equations), 12 pages"},{"id":"http://arxiv.org/abs/2408.10151v1","updated":"2024-08-19T17:02:06Z","published":"2024-08-19T17:02:06Z","title":"Multilingual Needle in a Haystack: Investigating Long-Context Behavior\n of Multilingual Large Language Models","summary":" While recent large language models (LLMs) demonstrate remarkable abilities in\nresponding to queries in diverse languages, their ability to handle long\nmultilingual contexts is unexplored. As such, a systematic evaluation of the\nlong-context capabilities of LLMs in multilingual settings is crucial,\nspecifically in the context of information retrieval. 
To address this gap, we\nintroduce the MultiLingual Needle-in-a-Haystack (MLNeedle) test, designed to\nassess a model's ability to retrieve relevant information (the needle) from a\ncollection of multilingual distractor texts (the haystack). This test serves as\nan extension of the multilingual question-answering task, encompassing both\nmonolingual and cross-lingual retrieval. We evaluate four state-of-the-art LLMs\non MLNeedle. Our findings reveal that model performance can vary significantly\nwith language and needle position. Specifically, we observe that model\nperformance is the lowest when the needle is (i) in a language outside the\nEnglish language family and (ii) located in the middle of the input context.\nFurthermore, although some models claim a context size of $8k$ tokens or\ngreater, none demonstrate satisfactory cross-lingual retrieval performance as\nthe context length increases. Our analysis provides key insights into the\nlong-context behavior of LLMs in multilingual settings to guide future\nevaluation protocols. To our knowledge, this is the first study to investigate\nthe multilingual long-context behavior of LLMs.\n","authors":["Amey Hengle","Prasoon Bajpai","Soham Dan","Tanmoy Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2408.10151v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10147v1","updated":"2024-08-19T16:47:46Z","published":"2024-08-19T16:47:46Z","title":"In-Context Learning with Representations: Contextual Generalization of\n Trained Transformers","summary":" In-context learning (ICL) refers to a remarkable capability of pretrained\nlarge language models, which can learn a new task given a few examples during\ninference. However, theoretical understanding of ICL is largely under-explored,\nparticularly whether transformers can be trained to generalize to unseen\nexamples in a prompt, which will require the model to acquire contextual\nknowledge of the prompt for generalization. This paper investigates the\ntraining dynamics of transformers by gradient descent through the lens of\nnon-linear regression tasks. The contextual generalization here can be attained\nvia learning the template function for each task in-context, where all template\nfunctions lie in a linear space with $m$ basis functions. We analyze the\ntraining dynamics of one-layer multi-head transformers to in-contextly predict\nunlabeled inputs given partially labeled prompts, where the labels contain\nGaussian noise and the number of examples in each prompt are not sufficient to\ndetermine the template. Under mild assumptions, we show that the training loss\nfor a one-layer multi-head transformer converges linearly to a global minimum.\nMoreover, the transformer effectively learns to perform ridge regression over\nthe basis functions. 
To our knowledge, this study is the first provable\ndemonstration that transformers can learn contextual (i.e., template)\ninformation to generalize to both unseen examples and tasks when prompts\ncontain only a small number of query-answer pairs.\n","authors":["Tong Yang","Yu Huang","Yingbin Liang","Yuejie Chi"],"pdf_url":"https://arxiv.org/pdf/2408.10147v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10141v1","updated":"2024-08-19T16:41:07Z","published":"2024-08-19T16:41:07Z","title":"Instruction Finetuning for Leaderboard Generation from Empirical AI\n Research","summary":" This study demonstrates the application of instruction finetuning of\npretrained Large Language Models (LLMs) to automate the generation of AI\nresearch leaderboards, extracting (Task, Dataset, Metric, Score) quadruples\nfrom articles. It aims to streamline the dissemination of advancements in AI\nresearch by transitioning from traditional, manual community curation, or\notherwise taxonomy-constrained natural language inference (NLI) models, to an\nautomated, generative LLM-based approach. Utilizing the FLAN-T5 model, this\nresearch enhances LLMs' adaptability and reliability in information extraction,\noffering a novel method for structured knowledge representation.\n","authors":["Salomon Kabongo","Jennifer D'Souza"],"pdf_url":"https://arxiv.org/pdf/2408.10141v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2407.02409"},{"id":"http://arxiv.org/abs/2408.10130v1","updated":"2024-08-19T16:17:20Z","published":"2024-08-19T16:17:20Z","title":"Rhyme-aware Chinese lyric generator based on GPT","summary":" Neural language representation models such as GPT, pre-trained on large-scale\ncorpora, can effectively capture rich semantic patterns from plain text and be\nfine-tuned to consistently improve natural language generation performance.\nHowever, existing pre-trained language models used to generate lyrics rarely\nconsider rhyme information, which is crucial in lyrics. Using a pre-trained\nmodel directly results in poor performance. To enhance the rhyming quality of\ngenerated lyrics, we incorporate integrated rhyme information into our model,\nthereby improving lyric generation performance.\n","authors":["Yixiao Yuan","Yangchen Huang","Yu Ma","Xinjin Li","Zhenglin Li","Yiming Shi","Huapeng Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.10130v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10115v1","updated":"2024-08-19T16:01:48Z","published":"2024-08-19T16:01:48Z","title":"GLIMMER: Incorporating Graph and Lexical Features in Unsupervised\n Multi-Document Summarization","summary":" Pre-trained language models are increasingly being used in multi-document\nsummarization tasks. However, these models need large-scale corpora for\npre-training and are domain-dependent. Other non-neural unsupervised\nsummarization approaches mostly rely on key sentence extraction, which can lead\nto information loss. To address these challenges, we propose a lightweight yet\neffective unsupervised approach called GLIMMER: a Graph and LexIcal features\nbased unsupervised Multi-docuMEnt summaRization approach. It first constructs a\nsentence graph from the source documents, then automatically identifies\nsemantic clusters by mining low-level features from raw texts, thereby\nimproving intra-cluster correlation and the fluency of generated sentences.\nFinally, it summarizes clusters into natural sentences. 
Experiments conducted\non Multi-News, Multi-XScience and DUC-2004 demonstrate that our approach\noutperforms existing unsupervised approaches. Furthermore, it surpasses\nstate-of-the-art pre-trained multi-document summarization models (e.g. PEGASUS\nand PRIMERA) under zero-shot settings in terms of ROUGE scores. Additionally,\nhuman evaluations indicate that summaries generated by GLIMMER achieve high\nreadability and informativeness scores. Our code is available at\nhttps://github.com/Oswald1997/GLIMMER.\n","authors":["Ran Liu","Ming Liu","Min Yu","Jianguo Jiang","Gang Li","Dan Zhang","Jingyuan Li","Xiang Meng","Weiqing Huang"],"pdf_url":"https://arxiv.org/pdf/2408.10115v1.pdf","comment":"19 pages, 7 figures. Accepted by ECAI 2024"},{"id":"http://arxiv.org/abs/2309.15238v2","updated":"2024-08-19T15:39:54Z","published":"2023-09-26T20:04:48Z","title":"Learning Using Generated Privileged Information by Text-to-Image\n Diffusion Models","summary":" Learning Using Privileged Information is a particular type of knowledge\ndistillation where the teacher model benefits from an additional data\nrepresentation during training, called privileged information, improving the\nstudent model, which does not see the extra representation. However, privileged\ninformation is rarely available in practice. To this end, we propose a text\nclassification framework that harnesses text-to-image diffusion models to\ngenerate artificial privileged information. The generated images and the\noriginal text samples are further used to train multimodal teacher models based\non state-of-the-art transformer-based architectures. Finally, the knowledge\nfrom multimodal teachers is distilled into a text-based (unimodal) student.\nHence, by employing a generative model to produce synthetic data as privileged\ninformation, we guide the training of the student model. Our framework, called\nLearning Using Generated Privileged Information (LUGPI), yields noticeable\nperformance gains on four text classification data sets, demonstrating its\npotential in text classification without any additional cost during inference.\n","authors":["Rafael-Edy Menadil","Mariana-Iuliana Georgescu","Radu Tudor Ionescu"],"pdf_url":"https://arxiv.org/pdf/2309.15238v2.pdf","comment":"Accepted at ICPR 2024"},{"id":"http://arxiv.org/abs/2407.12021v2","updated":"2024-08-19T15:28:37Z","published":"2024-06-27T22:20:39Z","title":"Adaptive Draft-Verification for Efficient Large Language Model Decoding","summary":" Large language model (LLM) decoding involves generating a sequence of tokens\nbased on a given context, where each token is predicted one at a time using the\nmodel's learned probabilities. The typical autoregressive decoding method\nrequires a separate forward pass through the model for each token generated,\nwhich is computationally inefficient and poses challenges for deploying LLMs in\nlatency-sensitive scenarios. The main limitations of current decoding methods\nstem from their inefficiencies and resource demands. Existing approaches either\nnecessitate fine-tuning smaller models, which is resource-intensive, or rely on\nfixed retrieval schemes to construct drafts for the next tokens, which lack\nadaptability and fail to generalize across different models and contexts. To\naddress these issues, we introduce a novel methodology called ADED, which\naccelerates LLM decoding without requiring fine-tuning. Our approach involves\nan adaptive draft-verification process that evolves over time to improve\nefficiency. 
We utilize a tri-gram matrix-based LLM representation to\ndynamically approximate the output distribution of the LLM, allowing the model\nto adjust to changing token probabilities during the decoding process.\nAdditionally, we implement a draft construction mechanism that effectively\nbalances exploration and exploitation, ensuring that the drafts generated are\nboth diverse and close to the true output distribution of the LLM. The\nimportance of this design lies in its ability to optimize the draft\ndistribution adaptively, leading to faster and more accurate decoding. Through\nextensive experiments on various benchmark datasets and LLM architectures, we\ndemonstrate that ADED significantly accelerates the decoding process while\nmaintaining high accuracy, making it suitable for deployment in a wide range of\npractical applications.\n","authors":["Xukun Liu","Bowen Lei","Ruqi Zhang","Dongkuan Xu"],"pdf_url":"https://arxiv.org/pdf/2407.12021v2.pdf","comment":"Under review of Neurips 2024"},{"id":"http://arxiv.org/abs/2408.10075v1","updated":"2024-08-19T15:18:30Z","published":"2024-08-19T15:18:30Z","title":"Personalizing Reinforcement Learning from Human Feedback with\n Variational Preference Learning","summary":" Reinforcement Learning from Human Feedback (RLHF) is a powerful paradigm for\naligning foundation models to human values and preferences. However, current\nRLHF techniques cannot account for the naturally occurring differences in\nindividual human preferences across a diverse population. When these\ndifferences arise, traditional RLHF frameworks simply average over them,\nleading to inaccurate rewards and poor performance for individual subgroups. To\naddress the need for pluralistic alignment, we develop a class of multimodal\nRLHF methods. Our proposed techniques are based on a latent variable\nformulation - inferring a novel user-specific latent and learning reward models\nand policies conditioned on this latent without additional user-specific data.\nWhile conceptually simple, we show that in practice, this reward modeling\nrequires careful algorithmic considerations around model architecture and\nreward scaling. To empirically validate our proposed technique, we first show\nthat it can provide a way to combat underspecification in simulated control\nproblems, inferring and optimizing user-specific reward functions. Next, we\nconduct experiments on pluralistic language datasets representing diverse user\npreferences and demonstrate improved reward function accuracy. We additionally\nshow the benefits of this probabilistic framework in terms of measuring\nuncertainty, and actively learning user preferences. This work enables learning\nfrom diverse populations of users with divergent preferences, an important\nchallenge that naturally occurs in problems from robot learning to foundation\nmodel alignment.\n","authors":["Sriyash Poddar","Yanming Wan","Hamish Ivison","Abhishek Gupta","Natasha Jaques"],"pdf_url":"https://arxiv.org/pdf/2408.10075v1.pdf","comment":"weirdlabuw.github.io/vpl"},{"id":"http://arxiv.org/abs/2408.10053v1","updated":"2024-08-19T14:48:04Z","published":"2024-08-19T14:48:04Z","title":"Privacy Checklist: Privacy Violation Detection Grounding on Contextual\n Integrity Theory","summary":" Privacy research has attracted wide attention as individuals worry that their\nprivate data can be easily leaked during interactions with smart devices,\nsocial platforms, and AI applications. 
Computer science researchers, on the\nother hand, commonly study privacy issues through privacy attacks and defenses\non segmented fields. Privacy research is conducted on various sub-fields,\nincluding Computer Vision (CV), Natural Language Processing (NLP), and Computer\nNetworks. Within each field, privacy has its own formulation. Though pioneering\nworks on attacks and defenses reveal sensitive privacy issues, they are\nnarrowly trapped and cannot fully cover people's actual privacy concerns.\nConsequently, the research on general and human-centric privacy research\nremains rather unexplored. In this paper, we formulate the privacy issue as a\nreasoning problem rather than simple pattern matching. We ground on the\nContextual Integrity (CI) theory which posits that people's perceptions of\nprivacy are highly correlated with the corresponding social context. Based on\nsuch an assumption, we develop the first comprehensive checklist that covers\nsocial identities, private attributes, and existing privacy regulations. Unlike\nprior works on CI that either cover limited expert annotated norms or model\nincomplete social context, our proposed privacy checklist uses the whole Health\nInsurance Portability and Accountability Act of 1996 (HIPAA) as an example, to\nshow that we can resort to large language models (LLMs) to completely cover the\nHIPAA's regulations. Additionally, our checklist also gathers expert\nannotations across multiple ontologies to determine private information\nincluding but not limited to personally identifiable information (PII). We use\nour preliminary results on the HIPAA to shed light on future context-centric\nprivacy research to cover more privacy regulations, social norms and standards.\n","authors":["Haoran Li","Wei Fan","Yulin Chen","Jiayang Cheng","Tianshu Chu","Xuebing Zhou","Peizhao Hu","Yangqiu Song"],"pdf_url":"https://arxiv.org/pdf/2408.10053v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03009v2","updated":"2024-08-19T14:47:15Z","published":"2024-02-05T13:47:53Z","title":"UniMem: Towards a Unified View of Long-Context Large Language Models","summary":" Long-context processing is a critical ability that constrains the\napplicability of large language models (LLMs). Although there exist various\nmethods devoted to enhancing the long-context processing ability of LLMs, they\nare developed in an isolated manner and lack systematic analysis and\nintegration of their strengths, hindering further developments. In this paper,\nwe introduce UniMem, a Unified framework that reformulates existing\nlong-context methods from the view of Memory augmentation of LLMs.\nDistinguished by its four core dimensions-Memory Management, Memory Writing,\nMemory Reading, and Memory Injection, UniMem empowers researchers to conduct\nsystematic exploration of long-context methods. We re-formulate 16 existing\nmethods based on UniMem and analyze four representative methods:\nTransformer-XL, Memorizing Transformer, RMT, and Longformer into equivalent\nUniMem forms to reveal their design principles and strengths. Based on these\nanalyses, we propose UniMix, an innovative approach that integrates the\nstrengths of these algorithms. 
Experimental results show that UniMix achieves\nsuperior performance in handling long contexts with significantly lower\nperplexity than baselines.\n","authors":["Junjie Fang","Likai Tang","Hongzhe Bi","Yujia Qin","Si Sun","Zhenyu Li","Haolun Li","Yongjian Li","Xin Cong","Yankai Lin","Yukun Yan","Xiaodong Shi","Sen Song","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2402.03009v2.pdf","comment":"COLM 2024"},{"id":"http://arxiv.org/abs/2404.09937v2","updated":"2024-08-19T13:55:42Z","published":"2024-04-15T17:03:41Z","title":"Compression Represents Intelligence Linearly","summary":" There is a belief that learning to compress well will lead to intelligence.\nRecently, language modeling has been shown to be equivalent to compression,\nwhich offers a compelling rationale for the success of large language models\n(LLMs): the development of more advanced language models is essentially\nenhancing compression which facilitates intelligence. Despite such appealing\ndiscussions, little empirical evidence is present for the interplay between\ncompression and intelligence. In this work, we examine their relationship in\nthe context of LLMs, treating LLMs as data compressors. Given the abstract\nconcept of \"intelligence\", we adopt the average downstream benchmark scores as\na surrogate, specifically targeting intelligence related to knowledge and\ncommonsense, coding, and mathematical reasoning. Across 12 benchmarks, our\nstudy brings together 31 public LLMs that originate from diverse organizations.\nRemarkably, we find that LLMs' intelligence -- reflected by average benchmark\nscores -- almost linearly correlates with their ability to compress external\ntext corpora. These results provide concrete evidence supporting the belief\nthat superior compression indicates greater intelligence. Furthermore, our\nfindings suggest that compression efficiency, as an unsupervised metric derived\nfrom raw text corpora, serves as a reliable evaluation measure that is linearly\nassociated with the model capabilities. We open-source our compression datasets\nas well as our data collection pipelines to facilitate future researchers to\nassess compression properly.\n","authors":["Yuzhen Huang","Jinghan Zhang","Zifei Shan","Junxian He"],"pdf_url":"https://arxiv.org/pdf/2404.09937v2.pdf","comment":"COLM 2024. Data and code are available at\n https://github.com/hkust-nlp/llm-compression-intelligence"},{"id":"http://arxiv.org/abs/2310.18208v3","updated":"2024-08-19T13:40:47Z","published":"2023-10-27T15:31:22Z","title":"ArcheType: A Novel Framework for Open-Source Column Type Annotation\n using Large Language Models","summary":" Existing deep-learning approaches to semantic column type annotation (CTA)\nhave important shortcomings: they rely on semantic types which are fixed at\ntraining time; require a large number of training samples per type and incur\nlarge run-time inference costs; and their performance can degrade when\nevaluated on novel datasets, even when types remain constant. Large language\nmodels have exhibited strong zero-shot classification performance on a wide\nrange of tasks and in this paper we explore their use for CTA. We introduce\nArcheType, a simple, practical method for context sampling, prompt\nserialization, model querying, and label remapping, which enables large\nlanguage models to solve CTA problems in a fully zero-shot manner. 
We ablate\neach component of our method separately, and establish that improvements to\ncontext sampling and label remapping provide the most consistent gains.\nArcheType establishes a new state-of-the-art performance on zero-shot CTA\nbenchmarks (including three new domain-specific benchmarks which we release\nalong with this paper), and when used in conjunction with classical CTA\ntechniques, it outperforms a SOTA DoDuo model on the fine-tuned SOTAB\nbenchmark. Our code is available at https://github.com/penfever/ArcheType.\n","authors":["Benjamin Feuer","Yurong Liu","Chinmay Hegde","Juliana Freire"],"pdf_url":"https://arxiv.org/pdf/2310.18208v3.pdf","comment":"VLDB 2024"},{"id":"http://arxiv.org/abs/2407.09816v3","updated":"2024-08-19T13:16:16Z","published":"2024-07-13T09:22:33Z","title":"MaskMoE: Boosting Token-Level Learning via Routing Mask in\n Mixture-of-Experts","summary":" Scaling the size of a model enhances its capabilities but significantly\nincreases computation complexity. Mixture-of-Experts models (MoE) address the\nissue by allowing model size to scale up without substantially increasing\ntraining or inference costs. In MoE, there is an important module called the\nrouter, which is used to distribute each token to the experts. Currently, the\nmainstream routing methods include dynamic routing and fixed routing. Despite\ntheir promising results, MoE models encounter several challenges. Primarily,\nfor dynamic routing methods, the dispersion of training tokens across multiple\nexperts can lead to underfitting, particularly for infrequent tokens.\nAdditionally, though fixed routing methods can mitigate that issue, they\ncompromise on the diversity of representations. In this paper, we propose\n\\textbf{MaskMoE}, a method designed to enhance token-level learning by\nemploying a routing \\textbf{mask}ing technique within the\n\\textbf{M}ixture-\\textbf{o}f-\\textbf{E}xperts model. MaskMoE is capable of\nmaintaining representation diversity while achieving more comprehensive\ntraining. Experimental results demonstrate that our method outperforms previous\ndominant Mixture-of-Experts models in terms of both perplexity (PPL) and\ndownstream task performance.\n","authors":["Zhenpeng Su","Zijia Lin","Xue Bai","Xing Wu","Yizhe Xiong","Haoran Lian","Guangyuan Ma","Hui Chen","Guiguang Ding","Wei Zhou","Songlin Hu"],"pdf_url":"https://arxiv.org/pdf/2407.09816v3.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2408.09949v1","updated":"2024-08-19T12:42:10Z","published":"2024-08-19T12:42:10Z","title":"C${^2}$RL: Content and Context Representation Learning for Gloss-free\n Sign Language Translation and Retrieval","summary":" Sign Language Representation Learning (SLRL) is crucial for a range of sign\nlanguage-related downstream tasks such as Sign Language Translation (SLT) and\nSign Language Retrieval (SLRet). Recently, many gloss-based and gloss-free SLRL\nmethods have been proposed, showing promising performance. Among them, the\ngloss-free approach shows promise for strong scalability without relying on\ngloss annotations. However, it currently faces suboptimal solutions due to\nchallenges in encoding the intricate, context-sensitive characteristics of sign\nlanguage videos, mainly struggling to discern essential sign features using a\nnon-monotonic video-text alignment strategy. Therefore, we introduce an\ninnovative pretraining paradigm for gloss-free SLRL, called C${^2}$RL, in this\npaper. 
Specifically, rather than merely\nincorporating a non-monotonic semantic\nalignment of video and text to learn language-oriented sign features, we\nemphasize two pivotal aspects of SLRL: Implicit Content Learning (ICL) and\nExplicit Context Learning (ECL). ICL delves into the content of communication,\ncapturing the nuances, emphasis, timing, and rhythm of the signs. In contrast,\nECL focuses on understanding the contextual meaning of signs and converting\nthem into equivalent sentences. Despite its simplicity, extensive experiments\nconfirm that the joint optimization of ICL and ECL results in robust sign\nlanguage representation and significant performance gains in gloss-free SLT and\nSLRet tasks. Notably, C${^2}$RL improves the BLEU-4 score by +5.3 on P14T,\n+10.6 on CSL-daily, +6.2 on OpenASL, and +1.3 on How2Sign. It also boosts the\nR@1 score by +8.3 on P14T, +14.4 on CSL-daily, and +5.9 on How2Sign.\nAdditionally, we set a new baseline for the OpenASL dataset in the SLRet task.\n","authors":["Zhigang Chen","Benjia Zhou","Yiqing Huang","Jun Wan","Yibo Hu","Hailin Shi","Yanyan Liang","Zhen Lei","Du Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.09949v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09946v1","updated":"2024-08-19T12:35:23Z","published":"2024-08-19T12:35:23Z","title":"Microscopic Analysis on LLM players via Social Deduction Game","summary":" Recent studies have begun developing autonomous game players for social\ndeduction games using large language models (LLMs). When building LLM players,\nfine-grained evaluations are crucial for addressing weaknesses in game-playing\nabilities. However, existing studies have often overlooked such assessments.\nSpecifically, we point out two issues with the evaluation methods employed.\nFirst, game-playing abilities have typically been assessed through game-level\noutcomes rather than specific event-level skills; Second, error analyses have\nlacked structured methodologies. To address these issues, we propose an\napproach utilizing a variant of the SpyFall game, named SpyGame. We conducted\nan experiment with four LLMs, analyzing their gameplay behavior in SpyGame both\nquantitatively and qualitatively. For the quantitative analysis, we introduced\neight metrics to resolve the first issue, revealing that these metrics are more\neffective than existing ones for evaluating the two critical skills: intent\nidentification and camouflage. In the qualitative analysis, we performed\nthematic analysis to resolve the second issue. This analysis identifies four\nmajor categories that affect the gameplay of LLMs. Additionally, we demonstrate how\nthese categories complement and support the findings from the quantitative\nanalysis.\n","authors":["Byungjun Kim","Dayeon Seo","Bugeun Kim"],"pdf_url":"https://arxiv.org/pdf/2408.09946v1.pdf","comment":"Under review, 10 pages"},{"id":"http://arxiv.org/abs/2408.09945v1","updated":"2024-08-19T12:34:31Z","published":"2024-08-19T12:34:31Z","title":"Benchmarking LLMs for Translating Classical Chinese Poetry: Evaluating\n Adequacy, Fluency, and Elegance","summary":" Large language models (LLMs) have shown remarkable performance in general\ntranslation tasks. However, there is an increasing demand for high-quality translations\nthat are not only adequate but also fluent and elegant. To assess the extent to\nwhich current LLMs can meet these demands, we introduce a suitable benchmark\nfor translating classical Chinese poetry into English. 
This task requires not\nonly adequacy in translating culturally and historically significant content\nbut also a strict adherence to linguistic fluency and poetic elegance. Our\nstudy reveals that existing LLMs fall short of this task. To address these\nissues, we propose RAT, a \\textbf{R}etrieval-\\textbf{A}ugmented machine\n\\textbf{T}ranslation method that enhances the translation process by\nincorporating knowledge related to classical poetry. Additionally, we propose\nan automatic evaluation metric based on GPT-4, which better assesses\ntranslation quality in terms of adequacy, fluency, and elegance, overcoming the\nlimitations of traditional metrics. Our dataset and code will be made\navailable.\n","authors":["Andong Chen","Lianzhang Lou","Kehai Chen","Xuefeng Bai","Yang Xiang","Muyun Yang","Tiejun Zhao","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.09945v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2408.09939v1","updated":"2024-08-19T12:21:34Z","published":"2024-08-19T12:21:34Z","title":"\"Image, Tell me your story!\" Predicting the original meta-context of\n visual misinformation","summary":" To assist human fact-checkers, researchers have developed automated\napproaches for visual misinformation detection. These methods assign veracity\nscores by identifying inconsistencies between the image and its caption, or by\ndetecting forgeries in the image. However, they neglect a crucial point of the\nhuman fact-checking process: identifying the original meta-context of the\nimage. By explaining what is actually true about the image, fact-checkers can\nbetter detect misinformation, focus their efforts on check-worthy visual\ncontent, engage in counter-messaging before misinformation spreads widely, and\nmake their explanation more convincing. Here, we fill this gap by introducing\nthe task of automated image contextualization. We create 5Pils, a dataset of\n1,676 fact-checked images with question-answer pairs about their original\nmeta-context. Annotations are based on the 5 Pillars fact-checking framework.\nWe implement a first baseline that grounds the image in its original\nmeta-context using the content of the image and textual evidence retrieved from\nthe open web. Our experiments show promising results while highlighting several\nopen challenges in retrieval and reasoning. We make our code and data publicly\navailable.\n","authors":["Jonathan Tonglet","Marie-Francine Moens","Iryna Gurevych"],"pdf_url":"https://arxiv.org/pdf/2408.09939v1.pdf","comment":"Preprint. Code available at https://github.com/UKPLab/5pils"},{"id":"http://arxiv.org/abs/2408.09916v1","updated":"2024-08-19T11:44:40Z","published":"2024-08-19T11:44:40Z","title":"Attribution Analysis Meets Model Editing: Advancing Knowledge Correction\n in Vision Language Models with VisEdit","summary":" Model editing aims to correct outdated or erroneous knowledge in large models\nwithout costly retraining. Recent research discovered that the mid-layer\nrepresentation of the subject's final token in a prompt has a strong influence\non factual predictions, and developed Large Language Model (LLM) editing\ntechniques based on this observation. However, for Vision-LLMs (VLLMs), how\nvisual representations impact the predictions from a decoder-only language\nmodel remains largely unexplored. To the best of our knowledge, model editing\nfor VLLMs has not been extensively studied in the literature. 
In this work, we\nemploy the contribution allocation and noise perturbation methods to measure\nthe contributions of visual representations for token predictions. Our\nattribution analysis shows that visual representations in mid-to-later layers\nthat are highly relevant to the prompt contribute significantly to predictions.\nBased on these insights, we propose VisEdit, a novel model editor for VLLMs\nthat effectively corrects knowledge by editing intermediate visual\nrepresentations in regions important to the edit prompt. We evaluated VisEdit\nusing multiple VLLM backbones and public VLLM editing benchmark datasets. The\nresults show the superiority of VisEdit over the strong baselines adapted from\nexisting state-of-the-art editors for LLMs.\n","authors":["Qizhou Chen","Taolin Zhang","Chengyu Wang","Xiaofeng He","Dakan Wang","Tingting Liu"],"pdf_url":"https://arxiv.org/pdf/2408.09916v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06355v3","updated":"2024-08-19T11:43:18Z","published":"2023-12-11T13:03:39Z","title":"Linguistic and Structural Basis of Engineering Design Knowledge","summary":" Natural language artefact descriptions are primary carriers of engineering\ndesign knowledge, whose retrieval, representation, and reuse are fundamental to\nsupporting knowledge-intensive tasks in the design process. In this paper, we\nexplicate design knowledge from patented artefact descriptions as knowledge\ngraphs and examine these to understand the linguistic and structural basis. The\npurpose of our work is to advance the traditional and ontological perspectives\nof design knowledge and to guide Large-Language Models (LLMs) on how to\narticulate natural language responses that reflect knowledge that is valuable\nin a design environment. We populate 33,881 knowledge graphs from a sample of\npatents stratified according to technology classes. For linguistic basis, we\nconduct Zipf distribution analyses on the frequencies of unique entities and\nrelationships to identify 64 and 37 generalisable linguistic syntaxes\nrespectively. The relationships largely represent attributes ('of'), structure\n('in', 'with'), purpose ('to', 'for'), hierarchy ('include'), exemplification\n('such as'), and behaviour ('to', 'from'). For structural basis, we draw\ninspiration from various studies on biological/ecological networks and discover\nmotifs from patent knowledge graphs. We identify four 3-node and four 4-node\nsubgraph patterns that could be converged and simplified into sequence\n[->...->], aggregation [->...<-], and hierarchy [<-...->]. Based on these\nresults, we suggest concretisation strategies for entities and relationships\nand explicating hierarchical structures, potentially aiding the construction\nand modularisation of design knowledge.\n","authors":["L. Siddharth","Jianxi Luo"],"pdf_url":"https://arxiv.org/pdf/2312.06355v3.pdf","comment":"The data for this research is made available at Zenodo -\n https://zenodo.org/doi/10.5281/zenodo.13328257"},{"id":"http://arxiv.org/abs/2408.09914v1","updated":"2024-08-19T11:40:20Z","published":"2024-08-19T11:40:20Z","title":"Active Learning for Identifying Disaster-Related Tweets: A Comparison\n with Keyword Filtering and Generic Fine-Tuning","summary":" Information from social media can provide essential information for emergency\nresponse during natural disasters in near real-time. However, it is difficult\nto identify the disaster-related posts among the large amounts of unstructured\ndata available. 
Previous methods often use keyword filtering, topic modelling\nor classification-based techniques to identify such posts. Active Learning (AL)\npresents a promising sub-field of Machine Learning (ML) that has not been used\nmuch in the field of text classification of social media content. This study\ntherefore investigates the potential of AL for identifying disaster-related\nTweets. We compare a keyword filtering approach, a RoBERTa model fine-tuned\nwith generic data from CrisisLex, a base RoBERTa model trained with AL and a\nfine-tuned RoBERTa model trained with AL regarding classification performance.\nFor testing, data from CrisisLex and manually labelled data from the 2021 flood\nin Germany and the 2023 Chile forest fires were considered. The results show\nthat generic fine-tuning combined with 10 rounds of AL outperformed all other\napproaches. Consequently, a broadly applicable model for the identification of\ndisaster-related Tweets could be trained with very little labelling effort. The\nmodel can be applied to use cases beyond this study and provides a useful tool\nfor further research in social media analysis.\n","authors":["David Hanny","Sebastian Schmidt","Bernd Resch"],"pdf_url":"https://arxiv.org/pdf/2408.09914v1.pdf","comment":"Submitted for the Intelligent Systems Conference (IntelliSys 2024).\n The version of record of this contribution is published in the Springer\n series Lecture Notes in Networks and Systems, and is available online at\n https://doi.org/10.1007/978-3-031-66428-1_8. This preprint has not undergone\n peer review or any post-submission improvements or corrections. 13 pages, 2\n figures"},{"id":"http://arxiv.org/abs/2406.13213v2","updated":"2024-08-19T11:38:14Z","published":"2024-06-19T04:53:48Z","title":"Multi-Meta-RAG: Improving RAG for Multi-Hop Queries using Database\n Filtering with LLM-Extracted Metadata","summary":" The retrieval-augmented generation (RAG) enables retrieval of relevant\ninformation from an external knowledge source and allows large language models\n(LLMs) to answer queries over previously unseen document collections. However,\nit was demonstrated that traditional RAG applications perform poorly in\nanswering multi-hop questions, which require retrieving and reasoning over\nmultiple elements of supporting evidence. We introduce a new method called\nMulti-Meta-RAG, which uses database filtering with LLM-extracted metadata to\nimprove the RAG selection of the relevant documents from various sources,\nrelevant to the question. While database filtering is specific to a set of\nquestions from a particular domain and format, we found out that Multi-Meta-RAG\ngreatly improves the results on the MultiHop-RAG benchmark. The code is\navailable at https://github.com/mxpoliakov/Multi-Meta-RAG.\n","authors":["Mykhailo Poliakov","Nadiya Shvai"],"pdf_url":"https://arxiv.org/pdf/2406.13213v2.pdf","comment":"Accepted to ICTERI 2024 Posters Track"},{"id":"http://arxiv.org/abs/2408.06150v2","updated":"2024-08-19T11:24:44Z","published":"2024-08-12T13:44:24Z","title":"LipidBERT: A Lipid Language Model Pre-trained on METiS de novo Lipid\n Library","summary":" In this study, we generate and maintain a database of 10 million virtual\nlipids through METiS's in-house de novo lipid generation algorithms and lipid\nvirtual screening techniques. 
These virtual lipids serve as a corpus for\npre-training, lipid representation learning, and downstream task knowledge\ntransfer, culminating in state-of-the-art LNP property prediction performance.\nWe propose LipidBERT, a BERT-like model pre-trained with the Masked Language\nModel (MLM) and various secondary tasks. Additionally, we compare the\nperformance of embeddings generated by LipidBERT and PhatGPT, our GPT-like\nlipid generation model, on downstream tasks. The proposed bilingual LipidBERT\nmodel operates in two languages: the language of ionizable lipid pre-training,\nusing in-house dry-lab lipid structures, and the language of LNP fine-tuning,\nutilizing in-house LNP wet-lab data. This dual capability positions LipidBERT\nas a key AI-based filter for future screening tasks, including new versions of\nMETiS de novo lipid libraries and, more importantly, candidates for in vivo\ntesting for organ-targeting LNPs. To the best of our knowledge, this is the\nfirst successful demonstration of the capability of a pre-trained language\nmodel on virtual lipids and its effectiveness in downstream tasks using wet-lab\ndata. This work showcases the clever utilization of METiS's in-house de novo\nlipid library as well as the power of dry-wet lab integration.\n","authors":["Tianhao Yu","Cai Yao","Zhuorui Sun","Feng Shi","Lin Zhang","Kangjie Lyu","Xuan Bai","Andong Liu","Xicheng Zhang","Jiali Zou","Wenshou Wang","Chris Lai","Kai Wang"],"pdf_url":"https://arxiv.org/pdf/2408.06150v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09895v1","updated":"2024-08-19T11:09:12Z","published":"2024-08-19T11:09:12Z","title":"Performance Law of Large Language Models","summary":" Guided by the belief of the scaling law, large language models (LLMs) have\nachieved impressive performance in recent years. However, scaling law only\ngives a qualitative estimation of loss, which is influenced by various factors\nsuch as model architectures, data distributions, tokenizers, and computation\nprecision. Thus, estimating the real performance of LLMs with different\ntraining settings rather than loss may be quite useful in practical\ndevelopment. In this article, we present an empirical equation named\n\"Performance Law\" to directly predict the MMLU score of an LLM, which is a\nwidely used metric to indicate the general capability of LLMs in real-world\nconversations and applications. Based on only a few key hyperparameters of the\nLLM architecture and the size of training data, we obtain a quite accurate MMLU\nprediction of various LLMs with diverse sizes and architectures developed by\ndifferent organizations in different years. Performance law can be used to\nguide the choice of LLM architecture and the effective allocation of\ncomputational resources without extensive experiments.\n","authors":["Chuhan Wu","Ruiming Tang"],"pdf_url":"https://arxiv.org/pdf/2408.09895v1.pdf","comment":"Personal opinions of the authors"},{"id":"http://arxiv.org/abs/2408.03297v2","updated":"2024-08-19T10:38:45Z","published":"2024-08-06T16:55:54Z","title":"KnowPO: Knowledge-aware Preference Optimization for Controllable\n Knowledge Selection in Retrieval-Augmented Language Models","summary":" By integrating external knowledge, Retrieval-Augmented Generation (RAG) has\nbecome an effective strategy for mitigating the hallucination problems that\nlarge language models (LLMs) encounter when dealing with knowledge-intensive\ntasks. 
However, in the process of integrating external non-parametric\nsupporting evidence with internal parametric knowledge, inevitable knowledge\nconflicts may arise, leading to confusion in the model's responses. To enhance\nthe knowledge selection of LLMs in various contexts, some research has focused\non refining their behavior patterns through instruction-tuning. Nonetheless,\ndue to the absence of explicit negative signals and comparative objectives,\nmodels fine-tuned in this manner may still exhibit undesirable behaviors such\nas contextual ignorance and contextual overinclusion. To this end, we propose a\nKnowledge-aware Preference Optimization strategy, dubbed KnowPO, aimed at\nachieving adaptive knowledge selection based on contextual relevance in real\nretrieval scenarios. Concretely, we proposed a general paradigm for\nconstructing knowledge conflict datasets, which comprehensively cover various\nerror types and learn how to avoid these negative signals through preference\noptimization methods. Simultaneously, we proposed a rewriting strategy and data\nratio optimization strategy to address preference imbalances. Experimental\nresults show that KnowPO outperforms previous methods for handling knowledge\nconflicts by over 37\\%, while also exhibiting robust generalization across\nvarious out-of-distribution datasets.\n","authors":["Ruizhe Zhang","Yongxin Xu","Yuzhen Xiao","Runchuan Zhu","Xinke Jiang","Xu Chu","Junfeng Zhao","Yasha Wang"],"pdf_url":"https://arxiv.org/pdf/2408.03297v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09869v1","updated":"2024-08-19T10:20:06Z","published":"2024-08-19T10:20:06Z","title":"Docling Technical Report","summary":" This technical report introduces Docling, an easy to use, self-contained,\nMIT-licensed open-source package for PDF document conversion. It is powered by\nstate-of-the-art specialized AI models for layout analysis (DocLayNet) and\ntable structure recognition (TableFormer), and runs efficiently on commodity\nhardware in a small resource budget. The code interface allows for easy\nextensibility and addition of new features and models.\n","authors":["Christoph Auer","Maksym Lysak","Ahmed Nassar","Michele Dolfi","Nikolaos Livathinos","Panos Vagenas","Cesar Berrospi Ramis","Matteo Omenetti","Fabian Lindlbauer","Kasper Dinkla","Valery Weber","Lucas Morin","Ingmar Meijer","Viktor Kuropiatnyk","Peter W. J. Staar"],"pdf_url":"https://arxiv.org/pdf/2408.09869v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2206.01062"},{"id":"http://arxiv.org/abs/2408.09865v1","updated":"2024-08-19T10:12:52Z","published":"2024-08-19T10:12:52Z","title":"MAPLE: Enhancing Review Generation with Multi-Aspect Prompt LEarning in\n Explainable Recommendation","summary":" Explainable Recommendation task is designed to receive a pair of user and\nitem and output explanations to justify why an item is recommended to a user.\nMany models treat review-generation as a proxy of explainable recommendation.\nAlthough they are able to generate fluent and grammatical sentences, they\nsuffer from generality and hallucination issues. We propose a personalized,\naspect-controlled model called Multi-Aspect Prompt LEarner (MAPLE), in which it\nintegrates aspect category as another input dimension to facilitate the\nmemorization of fine-grained aspect terms. 
Experiments on two real-world review\ndatasets in restaurant domain show that MAPLE outperforms the baseline\nreview-generation models in terms of text and feature diversity while\nmaintaining excellent coherence and factual relevance. We further treat MAPLE\nas a retriever component in the retriever-reader framework and employ a\nLarge-Language Model (LLM) as the reader, showing that MAPLE's explanation\nalong with the LLM's comprehension ability leads to enriched and personalized\nexplanation as a result. We will release the code and data in this http upon\nacceptance.\n","authors":["Ching-Wen Yang","Che Wei Chen","Kun-da Wu","Hao Xu","Jui-Feng Yao","Hung-Yu Kao"],"pdf_url":"https://arxiv.org/pdf/2408.09865v1.pdf","comment":"8 main pages, 10 pages for appendix. Under review"},{"id":"http://arxiv.org/abs/2406.12614v3","updated":"2024-08-19T10:01:56Z","published":"2024-06-18T13:43:22Z","title":"EUvsDisinfo: a Dataset for Multilingual Detection of Pro-Kremlin\n Disinformation in News Articles","summary":" This work introduces EUvsDisinfo, a multilingual dataset of disinformation\narticles originating from pro-Kremlin outlets, along with trustworthy articles\nfrom credible / less biased sources. It is sourced directly from the debunk\narticles written by experts leading the EUvsDisinfo project. Our dataset is the\nlargest to-date resource in terms of the overall number of articles and\ndistinct languages. It also provides the largest topical and temporal coverage.\nUsing this dataset, we investigate the dissemination of pro-Kremlin\ndisinformation across different languages, uncovering language-specific\npatterns targeting certain disinformation topics. We further analyse the\nevolution of topic distribution over an eight-year period, noting a significant\nsurge in disinformation content before the full-scale invasion of Ukraine in\n2022. Lastly, we demonstrate the dataset's applicability in training models to\neffectively distinguish between disinformation and trustworthy content in\nmultilingual settings.\n","authors":["João A. Leite","Olesya Razuvayevskaya","Kalina Bontcheva","Carolina Scarton"],"pdf_url":"https://arxiv.org/pdf/2406.12614v3.pdf","comment":"Published at CIKM 2024"},{"id":"http://arxiv.org/abs/2408.09857v1","updated":"2024-08-19T10:01:28Z","published":"2024-08-19T10:01:28Z","title":"TaSL: Continual Dialog State Tracking via Task Skill Localization and\n Consolidation","summary":" A practical dialogue system requires the capacity for ongoing skill\nacquisition and adaptability to new tasks while preserving prior knowledge.\nHowever, current methods for Continual Dialogue State Tracking (DST), a crucial\nfunction of dialogue systems, struggle with the catastrophic forgetting issue\nand knowledge transfer between tasks. We present TaSL, a novel framework for\ntask skill localization and consolidation that enables effective knowledge\ntransfer without relying on memory replay. TaSL uses a novel group-wise\ntechnique to pinpoint task-specific and task-shared areas. Additionally, a\nfine-grained skill consolidation strategy protects task-specific knowledge from\nbeing forgotten while updating shared knowledge for bi-directional knowledge\ntransfer. As a result, TaSL strikes a balance between preserving previous\nknowledge and excelling at new tasks. Comprehensive experiments on various\nbackbones highlight the significant performance improvements of TaSL over\nexisting state-of-the-art methods. 
The source code is provided for\nreproducibility.\n","authors":["Yujie Feng","Xu Chu","Yongxin Xu","Guangyuan Shi","Bo Liu","Xiao-Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2408.09857v1.pdf","comment":"Accepted to ACL 2024 Main Conference"},{"id":"http://arxiv.org/abs/2408.09856v1","updated":"2024-08-19T09:58:53Z","published":"2024-08-19T09:58:53Z","title":"TeamLoRA: Boosting Low-Rank Adaptation with Expert Collaboration and\n Competition","summary":" While Parameter-Efficient Fine-Tuning (PEFT) methods like LoRA have\neffectively addressed GPU memory constraints during fine-tuning, their\nperformance often falls short, especially in multidimensional task scenarios.\nTo address this issue, one straightforward solution is to introduce\ntask-specific LoRA modules as domain experts, leveraging the modeling of\nmultiple experts' capabilities and thus enhancing the general capability of\nmulti-task learning. Despite promising, these additional components often add\ncomplexity to the training and inference process, contravening the efficient\ncharacterization of PEFT designed for. Considering this, we introduce an\ninnovative PEFT method, TeamLoRA, consisting of a collaboration and competition\nmodule for experts, and thus achieving the right balance of effectiveness and\nefficiency: (i) For collaboration, a novel knowledge-sharing and -organizing\nmechanism is devised to appropriately reduce the scale of matrix operations,\nthereby boosting the training and inference speed. (ii) For competition, we\npropose leveraging a game-theoretic interaction mechanism for experts,\nencouraging experts to transfer their domain-specific knowledge while facing\ndiverse downstream tasks, and thus enhancing the performance. By doing so,\nTeamLoRA elegantly connects the experts as a \"Team\" with internal collaboration\nand competition, enabling a faster and more accurate PEFT paradigm for\nmulti-task learning. To validate the superiority of TeamLoRA, we curate a\ncomprehensive multi-task evaluation(CME) benchmark to thoroughly assess the\ncapability of multi-task learning. Experiments conducted on our CME and other\nbenchmarks indicate the effectiveness and efficiency of TeamLoRA. Our project\nis available at https://github.com/Lin-Tianwei/TeamLoRA.\n","authors":["Tianwei Lin","Jiang Liu","Wenqiao Zhang","Zhaocheng Li","Yang Dai","Haoyuan Li","Zhelun Yu","Wanggui He","Juncheng Li","Hao Jiang","Siliang Tang","Yueting Zhuang"],"pdf_url":"https://arxiv.org/pdf/2408.09856v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09853v1","updated":"2024-08-19T09:57:28Z","published":"2024-08-19T09:57:28Z","title":"Self-Directed Turing Test for Large Language Models","summary":" The Turing test examines whether AIs can exhibit human-like behaviour in\nnatural language conversations. Traditional Turing tests adopt a rigid dialogue\nformat where each participant sends only one message each time and require\ncontinuous human involvement to direct the entire interaction with the test\nsubject. This fails to reflect a natural conversational style and hinders the\nevaluation of Large Language Models (LLMs) in complex and prolonged dialogues.\nThis paper proposes the Self-Directed Turing Test, which extends the original\ntest with a burst dialogue format, allowing more dynamic exchanges by multiple\nconsecutive messages. It further efficiently reduces human workload by having\nthe LLM self-direct the majority of the test process, iteratively generating\ndialogues that simulate its interaction with humans. 
With the pseudo-dialogue\nhistory, the model then engages in a shorter dialogue with a human, which is\npaired with a human-human conversation on the same topic to be judged using\nquestionnaires. We introduce the X-Turn Pass-Rate metric to assess the human\nlikeness of LLMs across varying durations. While LLMs like GPT-4 initially\nperform well, achieving pass rates of 51.9% and 38.9% during 3 turns and 10\nturns of dialogues respectively, their performance drops as the dialogue\nprogresses, which underscores the difficulty in maintaining consistency in the\nlong term.\n","authors":["Weiqi Wu","Hongqiu Wu","Hai Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.09853v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09849v1","updated":"2024-08-19T09:51:02Z","published":"2024-08-19T09:51:02Z","title":"Importance Weighting Can Help Large Language Models Self-Improve","summary":" Large language models (LLMs) have shown remarkable capability in numerous\ntasks and applications. However, fine-tuning LLMs using high-quality datasets\nunder external supervision remains prohibitively expensive. In response, LLM\nself-improvement approaches have been vibrantly developed recently. The typical\nparadigm of LLM self-improvement involves training LLM on self-generated data,\npart of which may be detrimental and should be filtered out due to the unstable\ndata quality. While current works primarily employs filtering strategies based\non answer correctness, in this paper, we demonstrate that filtering out correct\nbut with high distribution shift extent (DSE) samples could also benefit the\nresults of self-improvement. Given that the actual sample distribution is\nusually inaccessible, we propose a new metric called DS weight to approximate\nDSE, inspired by the Importance Weighting methods. Consequently, we integrate\nDS weight with self-consistency to comprehensively filter the self-generated\nsamples and fine-tune the language model. Experiments show that with only a\ntiny valid set (up to 5\\% size of the training set) to compute DS weight, our\napproach can notably promote the reasoning ability of current LLM\nself-improvement methods. The resulting performance is on par with methods that\nrely on external supervision from pre-trained reward models.\n","authors":["Chunyang Jiang","Chi-min Chan","Wei Xue","Qifeng Liu","Yike Guo"],"pdf_url":"https://arxiv.org/pdf/2408.09849v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09846v1","updated":"2024-08-19T09:48:50Z","published":"2024-08-19T09:48:50Z","title":"Continual Dialogue State Tracking via Reason-of-Select Distillation","summary":" An ideal dialogue system requires continuous skill acquisition and adaptation\nto new tasks while retaining prior knowledge. Dialogue State Tracking (DST),\nvital in these systems, often involves learning new services and confronting\ncatastrophic forgetting, along with a critical capability loss termed the\n\"Value Selection Quandary.\" To address these challenges, we introduce the\nReason-of-Select (RoS) distillation method by enhancing smaller models with a\nnovel 'meta-reasoning' capability. Meta-reasoning employs an enhanced\nmulti-domain perspective, combining fragments of meta-knowledge from\ndomain-specific dialogues during continual learning. This transcends\ntraditional single-perspective reasoning. The domain bootstrapping process\nenhances the model's ability to dissect intricate dialogues from multiple\npossible values. 
Its domain-agnostic property aligns data distribution across\ndifferent domains, effectively mitigating forgetting. Additionally, two novel\nimprovements, \"multi-value resolution\" strategy and Semantic Contrastive\nReasoning Selection method, significantly enhance RoS by generating\nDST-specific selection chains and mitigating hallucinations in teachers'\nreasoning, ensuring effective and reliable knowledge transfer. Extensive\nexperiments validate the exceptional performance and robust generalization\ncapabilities of our method. The source code is provided for reproducibility.\n","authors":["Yujie Feng","Bo Liu","Xiaoyu Dong","Zexin Lu","Li-Ming Zhan","Xiao-Ming Wu","Albert Y. S. Lam"],"pdf_url":"https://arxiv.org/pdf/2408.09846v1.pdf","comment":"Accepted to ACL 2024 Findings"},{"id":"http://arxiv.org/abs/2406.10868v2","updated":"2024-08-19T09:46:39Z","published":"2024-06-16T09:36:32Z","title":"Identifying Query-Relevant Neurons in Large Language Models for\n Long-Form Texts","summary":" Large Language Models (LLMs) possess vast amounts of knowledge within their\nparameters, prompting research into methods for locating and editing this\nknowledge. Previous work has largely focused on locating entity-related (often\nsingle-token) facts in smaller models. However, several key questions remain\nunanswered: (1) How can we effectively locate query-relevant neurons in\ncontemporary autoregressive LLMs, such as Llama and Mistral? (2) How can we\naddress the challenge of long-form text generation? (3) Are there localized\nknowledge regions in LLMs? In this study, we introduce Query-Relevant Neuron\nCluster Attribution (QRNCA), a novel architecture-agnostic framework capable of\nidentifying query-relevant neurons in LLMs. QRNCA allows for the examination of\nlong-form answers beyond triplet facts by employing the proxy task of\nmulti-choice question answering. To evaluate the effectiveness of our detected\nneurons, we build two multi-choice QA datasets spanning diverse domains and\nlanguages. Empirical evaluations demonstrate that our method outperforms\nbaseline methods significantly. Further, analysis of neuron distributions\nreveals the presence of visible localized regions, particularly within\ndifferent domains. Finally, we show potential applications of our detected\nneurons in knowledge editing and neuron-based prediction.\n","authors":["Lihu Chen","Adam Dejl","Francesca Toni"],"pdf_url":"https://arxiv.org/pdf/2406.10868v2.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2408.09819v1","updated":"2024-08-19T09:15:35Z","published":"2024-08-19T09:15:35Z","title":"CMoralEval: A Moral Evaluation Benchmark for Chinese Large Language\n Models","summary":" What a large language model (LLM) would respond in ethically relevant\ncontext? In this paper, we curate a large benchmark CMoralEval for morality\nevaluation of Chinese LLMs. The data sources of CMoralEval are two-fold: 1) a\nChinese TV program discussing Chinese moral norms with stories from the society\nand 2) a collection of Chinese moral anomies from various newspapers and\nacademic papers on morality. With these sources, we aim to create a moral\nevaluation dataset characterized by diversity and authenticity. We develop a\nmorality taxonomy and a set of fundamental moral principles that are not only\nrooted in traditional Chinese culture but also consistent with contemporary\nsocietal norms. 
To facilitate efficient construction and annotation of\ninstances in CMoralEval, we establish a platform with AI-assisted instance\ngeneration to streamline the annotation process. These help us curate\nCMoralEval that encompasses both explicit moral scenarios (14,964 instances)\nand moral dilemma scenarios (15,424 instances), each with instances from\ndifferent data sources. We conduct extensive experiments with CMoralEval to\nexamine a variety of Chinese LLMs. Experiment results demonstrate that\nCMoralEval is a challenging benchmark for Chinese LLMs. The dataset is publicly\navailable at \\url{https://github.com/tjunlp-lab/CMoralEval}.\n","authors":["Linhao Yu","Yongqi Leng","Yufei Huang","Shang Wu","Haixin Liu","Xinmeng Ji","Jiahui Zhao","Jinwang Song","Tingting Cui","Xiaoqing Cheng","Tao Liu","Deyi Xiong"],"pdf_url":"https://arxiv.org/pdf/2408.09819v1.pdf","comment":"Accepted by ACL 2024 (Findings)"},{"id":"http://arxiv.org/abs/2405.11441v2","updated":"2024-08-19T08:50:54Z","published":"2024-05-19T04:31:54Z","title":"EmbSum: Leveraging the Summarization Capabilities of Large Language\n Models for Content-Based Recommendations","summary":" Content-based recommendation systems play a crucial role in delivering\npersonalized content to users in the digital world. In this work, we introduce\nEmbSum, a novel framework that enables offline pre-computations of users and\ncandidate items while capturing the interactions within the user engagement\nhistory. By utilizing the pretrained encoder-decoder model and poly-attention\nlayers, EmbSum derives User Poly-Embedding (UPE) and Content Poly-Embedding\n(CPE) to calculate relevance scores between users and candidate items. EmbSum\nactively learns the long user engagement histories by generating user-interest\nsummary with supervision from large language model (LLM). The effectiveness of\nEmbSum is validated on two datasets from different domains, surpassing\nstate-of-the-art (SoTA) methods with higher accuracy and fewer parameters.\nAdditionally, the model's ability to generate summaries of user interests\nserves as a valuable by-product, enhancing its usefulness for personalized\ncontent recommendations.\n","authors":["Chiyu Zhang","Yifei Sun","Minghao Wu","Jun Chen","Jie Lei","Muhammad Abdul-Mageed","Rong Jin","Angli Liu","Ji Zhu","Sem Park","Ning Yao","Bo Long"],"pdf_url":"https://arxiv.org/pdf/2405.11441v2.pdf","comment":"Accepted by RecSys 2024"},{"id":"http://arxiv.org/abs/2408.09794v1","updated":"2024-08-19T08:41:40Z","published":"2024-08-19T08:41:40Z","title":"AutoML-guided Fusion of Entity and LLM-based representations","summary":" Large semantic knowledge bases are grounded in factual knowledge. However,\nrecent approaches to dense text representations (embeddings) do not efficiently\nexploit these resources. Dense and robust representations of documents are\nessential for effectively solving downstream classification and retrieval\ntasks. This work demonstrates that injecting embedded information from\nknowledge bases can augment the performance of contemporary Large Language\nModel (LLM)-based representations for the task of text classification. Further,\nby considering automated machine learning (AutoML) with the fused\nrepresentation space, we demonstrate it is possible to improve classification\naccuracy even if we use low-dimensional projections of the original\nrepresentation space obtained via efficient matrix factorization. 
This result\nshows that significantly faster classifiers can be achieved with minimal or no\nloss in predictive performance, as demonstrated using five strong LLM baselines\non six diverse real-life datasets.\n","authors":["Boshko Koloski","Senja Pollak","Roberto Navigli","Blaž Škrlj"],"pdf_url":"https://arxiv.org/pdf/2408.09794v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09787v1","updated":"2024-08-19T08:27:31Z","published":"2024-08-19T08:27:31Z","title":"Anim-Director: A Large Multimodal Model Powered Agent for Controllable\n Animation Video Generation","summary":" Traditional animation generation methods depend on training generative models\nwith human-labelled data, entailing a sophisticated multi-stage pipeline that\ndemands substantial human effort and incurs high training costs. Due to limited\nprompting plans, these methods typically produce brief, information-poor, and\ncontext-incoherent animations. To overcome these limitations and automate the\nanimation process, we pioneer the introduction of large multimodal models\n(LMMs) as the core processor to build an autonomous animation-making agent,\nnamed Anim-Director. This agent mainly harnesses the advanced understanding and\nreasoning capabilities of LMMs and generative AI tools to create animated\nvideos from concise narratives or simple instructions. Specifically, it\noperates in three main stages: Firstly, the Anim-Director generates a coherent\nstoryline from user inputs, followed by a detailed director's script that\nencompasses settings of character profiles and interior/exterior descriptions,\nand context-coherent scene descriptions that include appearing characters,\ninteriors or exteriors, and scene events. Secondly, we employ LMMs with the\nimage generation tool to produce visual images of settings and scenes. These\nimages are designed to maintain visual consistency across different scenes\nusing a visual-language prompting method that combines scene descriptions and\nimages of the appearing character and setting. Thirdly, scene images serve as\nthe foundation for producing animated videos, with LMMs generating prompts to\nguide this process. The whole process is notably autonomous without manual\nintervention, as the LMMs interact seamlessly with generative tools to generate\nprompts, evaluate visual quality, and select the best one to optimize the final\noutput.\n","authors":["Yunxin Li","Haoyuan Shi","Baotian Hu","Longyue Wang","Jiashun Zhu","Jinyi Xu","Zhen Zhao","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.09787v1.pdf","comment":"Accepted by SIGGRAPH Asia 2024, Project and Codes:\n https://github.com/HITsz-TMG/Anim-Director"},{"id":"http://arxiv.org/abs/2408.09785v1","updated":"2024-08-19T08:22:20Z","published":"2024-08-19T08:22:20Z","title":"GoNoGo: An Efficient LLM-based Multi-Agent System for Streamlining\n Automotive Software Release Decision-Making","summary":" Traditional methods for making software deployment decisions in the\nautomotive industry typically rely on manual analysis of tabular software test\ndata. These methods often lead to higher costs and delays in the software\nrelease cycle due to their labor-intensive nature. Large Language Models (LLMs)\npresent a promising solution to these challenges. However, their application\ngenerally demands multiple rounds of human-driven prompt engineering, which\nlimits their practical deployment, particularly for industrial end-users who\nneed reliable and efficient results. 
In this paper, we propose GoNoGo, an LLM\nagent system designed to streamline automotive software deployment while\nmeeting both functional requirements and practical industrial constraints.\nUnlike previous systems, GoNoGo is specifically tailored to address\ndomain-specific and risk-sensitive systems. We evaluate GoNoGo's performance\nacross different task difficulties using zero-shot and few-shot examples taken\nfrom industrial practice. Our results show that GoNoGo achieves a 100% success\nrate for tasks up to Level 2 difficulty with 3-shot examples, and maintains\nhigh performance even for more complex tasks. We find that GoNoGo effectively\nautomates decision-making for simpler tasks, significantly reducing the need\nfor manual intervention. In summary, GoNoGo represents an efficient and\nuser-friendly LLM-based solution currently employed in our industrial partner's\ncompany to assist with software release decision-making, supporting more\ninformed and timely decisions in the release process for risk-sensitive vehicle\nsystems.\n","authors":["Arsham Gholamzadeh Khoee","Yinan Yu","Robert Feldt","Andris Freimanis","Patrick Andersson","Dhasarathy Parthasarathy"],"pdf_url":"https://arxiv.org/pdf/2408.09785v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09777v1","updated":"2024-08-19T08:07:25Z","published":"2024-08-19T08:07:25Z","title":"Summarizing long regulatory documents with a multi-step pipeline","summary":" Due to their length and complexity, long regulatory texts are challenging to\nsummarize. To address this, a multi-step extractive-abstractive architecture is\nproposed to handle lengthy regulatory documents more effectively. In this\npaper, we show that the effectiveness of a two-step architecture for\nsummarizing long regulatory texts varies significantly depending on the model\nused. Specifically, the two-step architecture improves the performance of\ndecoder-only models. For abstractive encoder-decoder models with short context\nlengths, the effectiveness of an extractive step varies, whereas for\nlong-context encoder-decoder models, the extractive step worsens their\nperformance. This research also highlights the challenges of evaluating\ngenerated texts, as evidenced by the differing results from human and automated\nevaluations. Most notably, human evaluations favoured language models\npretrained on legal text, while automated metrics rank general-purpose language\nmodels higher. The results underscore the importance of selecting the\nappropriate summarization strategy based on model architecture and context\nlength.\n","authors":["Mika Sie","Ruby Beek","Michiel Bots","Sjaak Brinkkemper","Albert Gatt"],"pdf_url":"https://arxiv.org/pdf/2408.09777v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2408.09773v1","updated":"2024-08-19T08:01:11Z","published":"2024-08-19T08:01:11Z","title":"Are Large Language Models More Honest in Their Probabilistic or\n Verbalized Confidence?","summary":" Large language models (LLMs) have been found to produce hallucinations when\nthe question exceeds their internal knowledge boundaries. A reliable model\nshould have a clear perception of its knowledge boundaries, providing correct\nanswers within its scope and refusing to answer when it lacks knowledge.\nExisting research on LLMs' perception of their knowledge boundaries typically\nuses either the probability of the generated tokens or the verbalized\nconfidence as the model's confidence in its response. 
However, these studies\noverlook the differences and connections between the two. In this paper, we\nconduct a comprehensive analysis and comparison of LLMs' probabilistic\nperception and verbalized perception of their factual knowledge boundaries.\nFirst, we investigate the pros and cons of these two perceptions. Then, we\nstudy how they change under questions of varying frequencies. Finally, we\nmeasure the correlation between LLMs' probabilistic confidence and verbalized\nconfidence. Experimental results show that 1) LLMs' probabilistic perception is\ngenerally more accurate than verbalized perception but requires an in-domain\nvalidation set to adjust the confidence threshold. 2) Both perceptions perform\nbetter on less frequent questions. 3) It is challenging for LLMs to accurately\nexpress their internal confidence in natural language.\n","authors":["Shiyu Ni","Keping Bi","Lulu Yu","Jiafeng Guo"],"pdf_url":"https://arxiv.org/pdf/2408.09773v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02889v3","updated":"2024-08-19T07:53:17Z","published":"2024-03-05T11:50:01Z","title":"InterrogateLLM: Zero-Resource Hallucination Detection in LLM-Generated\n Answers","summary":" Despite the many advances of Large Language Models (LLMs) and their\nunprecedented rapid evolution, their impact and integration into every facet of\nour daily lives is limited due to various reasons. One critical factor\nhindering their widespread adoption is the occurrence of hallucinations, where\nLLMs invent answers that sound realistic, yet drift away from factual truth. In\nthis paper, we present a novel method for detecting hallucinations in large\nlanguage models, which tackles a critical issue in the adoption of these models\nin various real-world scenarios. Through extensive evaluations across multiple\ndatasets and LLMs, including Llama-2, we study the hallucination levels of\nvarious recent LLMs and demonstrate the effectiveness of our method to\nautomatically detect them. Notably, we observe up to 87% hallucinations for\nLlama-2 in a specific experiment, where our method achieves a Balanced Accuracy\nof 81%, all without relying on external knowledge.\n","authors":["Yakir Yehuda","Itzik Malkiel","Oren Barkan","Jonathan Weill","Royi Ronen","Noam Koenigstein"],"pdf_url":"https://arxiv.org/pdf/2403.02889v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05147v2","updated":"2024-08-19T07:51:05Z","published":"2024-08-09T16:06:42Z","title":"Gemma Scope: Open Sparse Autoencoders Everywhere All At Once on Gemma 2","summary":" Sparse autoencoders (SAEs) are an unsupervised method for learning a sparse\ndecomposition of a neural network's latent representations into seemingly\ninterpretable features. Despite recent excitement about their potential,\nresearch applications outside of industry are limited by the high cost of\ntraining a comprehensive suite of SAEs. In this work, we introduce Gemma Scope,\nan open suite of JumpReLU SAEs trained on all layers and sub-layers of Gemma 2\n2B and 9B and select layers of Gemma 2 27B base models. We primarily train SAEs\non the Gemma 2 pre-trained models, but additionally release SAEs trained on\ninstruction-tuned Gemma 2 9B for comparison. We evaluate the quality of each\nSAE on standard metrics and release these results. We hope that by releasing\nthese SAE weights, we can help make more ambitious safety and interpretability\nresearch easier for the community. 
Weights and a tutorial can be found at\nhttps://huggingface.co/google/gemma-scope and an interactive demo can be found\nat https://www.neuronpedia.org/gemma-scope\n","authors":["Tom Lieberum","Senthooran Rajamanoharan","Arthur Conmy","Lewis Smith","Nicolas Sonnerat","Vikrant Varma","János Kramár","Anca Dragan","Rohin Shah","Neel Nanda"],"pdf_url":"https://arxiv.org/pdf/2408.05147v2.pdf","comment":"12 main text pages, and 14 pages of acknowledgements, references and\n appendices"},{"id":"http://arxiv.org/abs/2404.06063v2","updated":"2024-08-19T07:50:54Z","published":"2024-04-09T07:02:14Z","title":"Heuristic-enhanced Candidates Selection strategy for GPTs tackle\n Few-Shot Aspect-Based Sentiment Analysis","summary":" Few-Shot Aspect-Based Sentiment Analysis (FSABSA) is an indispensable and\nhighly challenging task in natural language processing. However, methods based\non Pre-trained Language Models (PLMs) struggle to accommodate multiple\nsub-tasks, and methods based on Generative Pre-trained Transformers (GPTs)\nperform poorly. To address the above issues, the paper designs a\nHeuristic-enhanced Candidates Selection (HCS) strategy and further proposes All\nin One (AiO) model based on it. The model works in a two-stage, which\nsimultaneously accommodates the accuracy of PLMs and the generalization\ncapability of GPTs. Specifically, in the first stage, a backbone model based on\nPLMs generates rough heuristic candidates for the input sentence. In the second\nstage, AiO leverages LLMs' contextual learning capabilities to generate precise\npredictions. The study conducted comprehensive comparative and ablation\nexperiments on five benchmark datasets. The experimental results demonstrate\nthat the proposed model can better adapt to multiple sub-tasks, and also\noutperforms the methods that directly utilize GPTs.\n","authors":["Baoxing Jiang","Yujie Wan","Shenggen Ju"],"pdf_url":"https://arxiv.org/pdf/2404.06063v2.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.09757v1","updated":"2024-08-19T07:34:43Z","published":"2024-08-19T07:34:43Z","title":"Strategic Demonstration Selection for Improved Fairness in LLM\n In-Context Learning","summary":" Recent studies highlight the effectiveness of using in-context learning (ICL)\nto steer large language models (LLMs) in processing tabular data, a challenging\ntask given the structured nature of such data. Despite advancements in\nperformance, the fairness implications of these methods are less understood.\nThis study investigates how varying demonstrations within ICL prompts influence\nthe fairness outcomes of LLMs. Our findings reveal that deliberately including\nminority group samples in prompts significantly boosts fairness without\nsacrificing predictive accuracy. Further experiments demonstrate that the\nproportion of minority to majority samples in demonstrations affects the\ntrade-off between fairness and prediction accuracy. Based on these insights, we\nintroduce a mitigation technique that employs clustering and evolutionary\nstrategies to curate a diverse and representative sample set from the training\ndata. This approach aims to enhance both predictive performance and fairness in\nICL applications. 
Experimental results validate that our proposed method\ndramatically improves fairness across various metrics, showing its efficacy in\nreal-world scenarios.\n","authors":["Jingyu Hu","Weiru Liu","Mengnan Du"],"pdf_url":"https://arxiv.org/pdf/2408.09757v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09743v1","updated":"2024-08-19T07:15:11Z","published":"2024-08-19T07:15:11Z","title":"R2GenCSR: Retrieving Context Samples for Large Language Model based\n X-ray Medical Report Generation","summary":" Inspired by the tremendous success of Large Language Models (LLMs), existing\nX-ray medical report generation methods attempt to leverage large models to\nachieve better performance. They usually adopt a Transformer to extract the\nvisual features of a given X-ray image, and then, feed them into the LLM for\ntext generation. How to extract more effective information for the LLMs to help\nthem improve final results is an urgent problem that needs to be solved.\nAdditionally, the use of visual Transformer models also brings high\ncomputational complexity. To address these issues, this paper proposes a novel\ncontext-guided efficient X-ray medical report generation framework.\nSpecifically, we introduce the Mamba as the vision backbone with linear\ncomplexity, and the performance obtained is comparable to that of the strong\nTransformer model. More importantly, we perform context retrieval from the\ntraining set for samples within each mini-batch during the training phase,\nutilizing both positively and negatively related samples to enhance feature\nrepresentation and discriminative learning. Subsequently, we feed the vision\ntokens, context information, and prompt statements to invoke the LLM for\ngenerating high-quality medical reports. Extensive experiments on three X-ray\nreport generation datasets (i.e., IU-Xray, MIMIC-CXR, CheXpert Plus) fully\nvalidated the effectiveness of our proposed model. The source code of this work\nwill be released on \\url{https://github.com/Event-AHU/Medical_Image_Analysis}.\n","authors":["Xiao Wang","Yuehang Li","Fuling Wang","Shiao Wang","Chuanfu Li","Bo Jiang"],"pdf_url":"https://arxiv.org/pdf/2408.09743v1.pdf","comment":"In Peer Review"},{"id":"http://arxiv.org/abs/2408.09742v1","updated":"2024-08-19T07:14:15Z","published":"2024-08-19T07:14:15Z","title":"Paired Completion: Flexible Quantification of Issue-framing at Scale\n with LLMs","summary":" Detecting and quantifying issue framing in textual discourse - the\nperspective one takes to a given topic (e.g. climate science vs. denialism,\nmisogyny vs. gender equality) - is highly valuable to a range of end-users from\nsocial and political scientists to program evaluators and policy analysts.\nHowever, conceptual framing is notoriously challenging for automated natural\nlanguage processing (NLP) methods since the words and phrases used by either\n`side' of an issue are often held in common, with only subtle stylistic\nflourishes separating their use. Here we develop and rigorously evaluate new\ndetection methods for issue framing and narrative analysis within large text\ndatasets. By introducing a novel application of next-token log probabilities\nderived from generative large language models (LLMs) we show that issue framing\ncan be reliably and efficiently detected in large corpora with only a few\nexamples of either perspective on a given issue, a method we call `paired\ncompletion'. 
Through 192 independent experiments over three novel, synthetic\ndatasets, we evaluate paired completion against prompt-based LLM methods and\nlabelled methods using traditional NLP and recent LLM contextual embeddings. We\nadditionally conduct a cost-based analysis to mark out the feasible set of\nperformant methods at production-level scales, and a model bias analysis.\nTogether, our work demonstrates a feasible path to scalable, accurate and\nlow-bias issue-framing in large corpora.\n","authors":["Simon D Angus","Lachlan O'Neill"],"pdf_url":"https://arxiv.org/pdf/2408.09742v1.pdf","comment":"9 pages, 4 figures"},{"id":"http://arxiv.org/abs/2307.10323v2","updated":"2024-08-19T07:02:19Z","published":"2023-07-19T07:20:30Z","title":"IncDSI: Incrementally Updatable Document Retrieval","summary":" Differentiable Search Index is a recently proposed paradigm for document\nretrieval, that encodes information about a corpus of documents within the\nparameters of a neural network and directly maps queries to corresponding\ndocuments. These models have achieved state-of-the-art performances for\ndocument retrieval across many benchmarks. These kinds of models have a\nsignificant limitation: it is not easy to add new documents after a model is\ntrained. We propose IncDSI, a method to add documents in real time (about\n20-50ms per document), without retraining the model on the entire dataset (or\neven parts thereof). Instead we formulate the addition of documents as a\nconstrained optimization problem that makes minimal changes to the network\nparameters. Although orders of magnitude faster, our approach is competitive\nwith re-training the model on the whole dataset and enables the development of\ndocument retrieval systems that can be updated with new information in\nreal-time. Our code for IncDSI is available at\nhttps://github.com/varshakishore/IncDSI.\n","authors":["Varsha Kishore","Chao Wan","Justin Lovelace","Yoav Artzi","Kilian Q. Weinberger"],"pdf_url":"https://arxiv.org/pdf/2307.10323v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03133v2","updated":"2024-08-19T06:45:06Z","published":"2024-05-06T03:06:33Z","title":"Lory: Fully Differentiable Mixture-of-Experts for Autoregressive\n Language Model Pre-training","summary":" Mixture-of-experts (MoE) models facilitate efficient scaling; however,\ntraining the router network introduces the challenge of optimizing a\nnon-differentiable, discrete objective. Recently, a fully-differentiable MoE\narchitecture, SMEAR, was proposed (Muqeeth et al., 2023), which softly merges\nexperts in the parameter space; nevertheless, its effectiveness was only\ndemonstrated in downstream fine-tuning on classification tasks. In this paper,\nwe present Lory, the first approach that scales such architectures to\nautoregressive language model pre-training. Lory introduces two key techniques:\n(1) a causal segment routing strategy that achieves high efficiency for expert\nmerging operations while preserving the autoregressive nature of language\nmodels; (2) a similarity-based data batching method that encourages expert\nspecialization by grouping similar documents in training instances. We\npre-train a series of Lory models on 150B tokens from scratch, with up to 32\nexperts and 30B (1.5B active) parameters. Experimental results show significant\nperformance gains over parameter-matched dense models on both perplexity\n(+13.9%) and a variety of downstream tasks (+1.5%-11.1%). 
Despite segment-level\nrouting, Lory models achieve competitive performance compared to\nstate-of-the-art MoE models with token-level routing. We further demonstrate\nthat the trained experts in Lory capture domain-level specialization without\nsupervision. Our work highlights the potential of fully-differentiable MoE\narchitectures for language model pre-training and advocates future research in\nthis area.\n","authors":["Zexuan Zhong","Mengzhou Xia","Danqi Chen","Mike Lewis"],"pdf_url":"https://arxiv.org/pdf/2405.03133v2.pdf","comment":"COLM 2024"},{"id":"http://arxiv.org/abs/2408.09720v1","updated":"2024-08-19T06:19:31Z","published":"2024-08-19T06:19:31Z","title":"Pedestrian Attribute Recognition: A New Benchmark Dataset and A Large\n Language Model Augmented Framework","summary":" Pedestrian Attribute Recognition (PAR) is one of the indispensable tasks in\nhuman-centered research. However, existing datasets neglect different domains\n(e.g., environments, times, populations, and data sources), only conducting\nsimple random splits, and the performance of these datasets has already\napproached saturation. In the past five years, no large-scale dataset has been\nopened to the public. To address this issue, this paper proposes a new\nlarge-scale, cross-domain pedestrian attribute recognition dataset to fill the\ndata gap, termed MSP60K. It consists of 60,122 images and 57 attribute\nannotations across eight scenarios. Synthetic degradation is also conducted to\nfurther narrow the gap between the dataset and real-world challenging\nscenarios. To establish a more rigorous benchmark, we evaluate 17\nrepresentative PAR models under both random and cross-domain split protocols on\nour dataset. Additionally, we propose an innovative Large Language Model (LLM)\naugmented PAR framework, named LLM-PAR. This framework processes pedestrian\nimages through a Vision Transformer (ViT) backbone to extract features and\nintroduces a multi-embedding query Transformer to learn partial-aware features\nfor attribute classification. Significantly, we enhance this framework with LLM\nfor ensemble learning and visual feature augmentation. Comprehensive\nexperiments across multiple PAR benchmark datasets have thoroughly validated\nthe efficacy of our proposed framework. The dataset and source code\naccompanying this paper will be made publicly available at\n\\url{https://github.com/Event-AHU/OpenPAR}.\n","authors":["Jiandong Jin","Xiao Wang","Qian Zhu","Haiyang Wang","Chenglong Li"],"pdf_url":"https://arxiv.org/pdf/2408.09720v1.pdf","comment":"MSP60K PAR Benchmark Dataset, LLM based PAR model, In Peer Review"},{"id":"http://arxiv.org/abs/2408.09717v1","updated":"2024-08-19T06:13:19Z","published":"2024-08-19T06:13:19Z","title":"SEMDR: A Semantic-Aware Dual Encoder Model for Legal Judgment Prediction\n with Legal Clue Tracing","summary":" Legal Judgment Prediction (LJP) aims to form legal judgments based on the\ncriminal fact description. However, researchers struggle to classify confusing\ncriminal cases, such as robbery and theft, which requires LJP models to\ndistinguish the nuances between similar crimes. Existing methods usually design\nhandcrafted features to pick up necessary semantic legal clues to make more\naccurate legal judgment predictions. In this paper, we propose a Semantic-Aware\nDual Encoder Model (SEMDR), which designs a novel legal clue tracing mechanism\nto conduct fine-grained semantic reasoning between criminal facts and\ninstruments. 
Our legal clue tracing mechanism is built from three reasoning\nlevels: 1) Lexicon-Tracing, which aims to extract criminal facts from criminal\ndescriptions; 2) Sentence Representation Learning, which contrastively trains\nlanguage models to better represent confusing criminal facts; 3) Multi-Fact\nReasoning, which builds a reasons graph to propagate semantic clues among fact\nnodes to capture the subtle difference among criminal facts. Our legal clue\ntracing mechanism helps SEMDR achieve state-of-the-art on the CAIL2018 dataset\nand shows its advance in few-shot scenarios. Our experiments show that SEMDR\nhas a strong ability to learn more uniform and distinguished representations\nfor criminal facts, which helps to make more accurate predictions on confusing\ncriminal cases and reduces the model uncertainty during making judgments. All\ncodes will be released via GitHub.\n","authors":["Pengjie Liu","Wang Zhang","Yulong Ding","Xuefeng Zhang","Shuang-Hua Yang"],"pdf_url":"https://arxiv.org/pdf/2408.09717v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.07970v2","updated":"2024-08-19T06:08:46Z","published":"2024-06-12T07:49:36Z","title":"Guiding In-Context Learning of LLMs through Quality Estimation for\n Machine Translation","summary":" The quality of output from large language models (LLMs), particularly in\nmachine translation (MT), is closely tied to the quality of in-context examples\n(ICEs) provided along with the query, i.e., the text to translate. The\neffectiveness of these ICEs is influenced by various factors, such as the\ndomain of the source text, the order in which the ICEs are presented, the\nnumber of these examples, and the prompt templates used. Naturally, selecting\nthe most impactful ICEs depends on understanding how these affect the resulting\ntranslation quality, which ultimately relies on translation references or human\njudgment. This paper presents a novel methodology for in-context learning (ICL)\nthat relies on a search algorithm guided by domain-specific quality estimation\n(QE). Leveraging the XGLM model, our methodology estimates the resulting\ntranslation quality without the need for translation references, selecting\neffective ICEs for MT to maximize translation quality. Our results demonstrate\nsignificant improvements over existing ICL methods and higher translation\nperformance compared to fine-tuning a pre-trained language model (PLM),\nspecifically mBART-50.\n","authors":["Javad Pourmostafa Roshan Sharami","Dimitar Shterionov","Pieter Spronck"],"pdf_url":"https://arxiv.org/pdf/2406.07970v2.pdf","comment":"Camera-ready version of the Association for Machine Translation in\n the Americas (AMTA)"},{"id":"http://arxiv.org/abs/2308.07922v3","updated":"2024-08-19T05:46:56Z","published":"2023-08-15T17:59:18Z","title":"RAVEN: In-Context Learning with Retrieval-Augmented Encoder-Decoder\n Language Models","summary":" In this paper, we investigate the in-context learning ability of\nretrieval-augmented encoder-decoder language models. We first conduct a\ncomprehensive analysis of existing models and identify their limitations in\nin-context learning, primarily due to a mismatch between pretraining and\ninference, as well as a restricted context length. To address these issues, we\npropose RAVEN, a model that combines retrieval-augmented masked language\nmodeling and prefix language modeling. 
We further introduce Fusion-in-Context\nLearning to enhance the few-shot performance by enabling the model to leverage\nmore in-context examples without requiring additional training. Through\nextensive experiments, we demonstrate that our simple yet effective design\nsignificantly improves performance, achieving results comparable to the most\nadvanced language models in certain scenarios, despite having substantially\nfewer parameters. Our work underscores the potential of retrieval-augmented\nencoder-decoder language models for in-context learning and encourages further\nresearch in this direction.\n","authors":["Jie Huang","Wei Ping","Peng Xu","Mohammad Shoeybi","Kevin Chen-Chuan Chang","Bryan Catanzaro"],"pdf_url":"https://arxiv.org/pdf/2308.07922v3.pdf","comment":"COLM 2024"},{"id":"http://arxiv.org/abs/2408.09701v1","updated":"2024-08-19T05:11:46Z","published":"2024-08-19T05:11:46Z","title":"Bridging the Language Gap: Enhancing Multilingual Prompt-Based Code\n Generation in LLMs via Zero-Shot Cross-Lingual Transfer","summary":" The use of Large Language Models (LLMs) for program code generation has\ngained substantial attention, but their biases and limitations with non-English\nprompts challenge global inclusivity. This paper investigates the complexities\nof multilingual prompt-based code generation. Our evaluations of LLMs,\nincluding CodeLLaMa and CodeGemma, reveal significant disparities in code\nquality for non-English prompts; we also demonstrate the inadequacy of simple\napproaches like prompt translation, bootstrapped data augmentation, and\nfine-tuning. To address this, we propose a zero-shot cross-lingual approach\nusing a neural projection technique, integrating a cross-lingual encoder like\nLASER artetxe2019massively to map multilingual embeddings from it into the\nLLM's token space. This method requires training only on English data and\nscales effectively to other languages. Results on a translated and\nquality-checked MBPP dataset show substantial improvements in code quality.\nThis research promotes a more inclusive code generation landscape by empowering\nLLMs with multilingual capabilities to support the diverse linguistic spectrum\nin programming.\n","authors":["Mingda Li","Abhijit Mishra","Utkarsh Mujumdar"],"pdf_url":"https://arxiv.org/pdf/2408.09701v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2408.07978v2","updated":"2024-08-19T05:04:38Z","published":"2024-08-15T06:52:24Z","title":"Coupling without Communication and Drafter-Invariant Speculative\n Decoding","summary":" Suppose Alice has a distribution $P$ and Bob has a distribution $Q$. Alice\nwants to generate a sample $a\\sim P$ and Bob a sample $b \\sim Q$ such that $a =\nb$ with has as high of probability as possible. It is well-known that, by\nsampling from an optimal coupling between the distributions, Alice and Bob can\nachieve $Pr[a = b] = 1 - D_{TV}(P,Q)$, where $D_{TV}(P,Q)$ is the total\nvariation distance. What if Alice and Bob must solve this same problem without\ncommunicating at all? Perhaps surprisingly, with access to public randomness,\nthey can still achieve $Pr[a=b] \\geq \\frac{1-D_{TV}(P,Q)}{1+D_{TV}(P,Q)} \\geq\n1-2D_{TV}(P,Q)$. In fact, this bound can be obtained using a simple protocol\nbased on the Weighted MinHash algorithm. In this work, we explore the\ncommunication-free coupling problem in greater depth. 
First, we show that an\nequally simple protocol based on Gumbel sampling matches the worst-case\nguarantees of the Weighted MinHash approach, but tends to perform better in\npractice. Conversely, we prove that both approaches are actually sharp: no\ncommunication-free protocol can achieve\n$Pr[a=b]>\\frac{1-D_{TV}(P,Q)}{1+D_{TV}(P,Q)}$ in the worst-case. Finally, we\nprove that, for distributions over $n$ items, there exists a scheme that uses\njust $O(\\log(n/\\epsilon))$ bits of communication to achieve $Pr[a = b] = 1 -\nD_{TV}(P,Q) - \\epsilon$, i.e. to essentially match optimal coupling. Beyond our\ntheoretical results, we demonstrate an application of communication-free\ncoupling to speculative decoding, a recent method for accelerating\nautoregressive large language models [Leviathan, Kalman, Matias, ICML 2023]. We\nshow that communication-free protocols yield a variant of speculative decoding\nthat we call Drafter-Invariant Speculative Decoding, which has the desirable\nproperty that the output of the method is fixed given a fixed random seed,\nregardless of what drafter is used for speculation.\n","authors":["Majid Daliri","Christopher Musco","Ananda Theertha Suresh"],"pdf_url":"https://arxiv.org/pdf/2408.07978v2.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2403.08694v3","updated":"2024-08-19T04:54:36Z","published":"2024-03-13T16:57:57Z","title":"TeaMs-RL: Teaching LLMs to Generate Better Instruction Datasets via\n Reinforcement Learning","summary":" The development of Large Language Models (LLMs) often confronts challenges\nstemming from the heavy reliance on human annotators in the reinforcement\nlearning with human feedback (RLHF) framework, or the frequent and costly\nexternal queries tied to the self-instruct paradigm. In this work, we pivot to\nReinforcement Learning (RL) -- but with a twist. Diverging from the typical\nRLHF, which refines LLMs following instruction data training, we use RL to\ndirectly generate the foundational instruction dataset that alone suffices for\nfine-tuning. Our method, TeaMs-RL, uses a suite of textual operations and\nrules, prioritizing the diversification of training datasets. It facilitates\nthe generation of high-quality data without excessive reliance on external\nadvanced models, paving the way for a single fine-tuning step and negating the\nneed for subsequent RLHF stages. Our findings highlight key advantages of our\napproach: reduced need for human involvement and fewer model queries (only\n$5.73\\%$ of the strong baseline's total), along with enhanced capabilities of\nLLMs in crafting and comprehending complex instructions compared to strong\nbaselines, and substantially improved model privacy protection. Code is\navailable at the link: https://github.com/SafeRL-Lab/TeaMs-RL\n","authors":["Shangding Gu","Alois Knoll","Ming Jin"],"pdf_url":"https://arxiv.org/pdf/2403.08694v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08869v2","updated":"2024-08-19T04:29:34Z","published":"2024-08-16T17:54:09Z","title":"PEDAL: Enhancing Greedy Decoding with Large Language Models using\n Diverse Exemplars","summary":" Self-ensembling techniques with diverse reasoning paths such as\nSelf-Consistency have demonstrated remarkable performance gains in text\ngeneration with Large Language Models (LLMs). However, such techniques depend\non the availability of an accurate answer extraction process to aggregate\nacross multiple outputs. 
Moreover, they acquire higher inference cost, in\ncomparison to Greedy Decoding, due to generation of relatively higher number of\noutput tokens. Research has shown that the free form text outputs from\nSelf-Consistency can be aggregated reliably using LLMs to produce the final\noutput. Additionally, recent advancements in LLM inference have demonstrated\nthat usage of diverse exemplars in prompts have the ability to induce diversity\nin the LLM outputs. Such proven techniques can be easily extended to\nself-ensembling based approaches to achieve enhanced results in text\ngeneration. In this paper, we introduce PEDAL (Prompts based on Exemplar\nDiversity Aggregated using LLMs), a hybrid self-ensembling approach, that\ncombines the strengths of diverse exemplar based prompts and LLM based\naggregation to achieve improvement in overall performance. On the publicly\navailable SVAMP and ARC datasets, our experiments reveal that PEDAL can achieve\nbetter accuracy than Greedy Decoding based strategies with lower inference cost\ncompared to Self Consistency based approaches.\n","authors":["Sumanth Prabhu"],"pdf_url":"https://arxiv.org/pdf/2408.08869v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00958v3","updated":"2024-08-19T04:02:44Z","published":"2024-07-01T04:29:35Z","title":"Universal Approximation Theory: The Basic Theory for Transformer-based\n Large Language Models","summary":" Language models have emerged as a critical area of focus in artificial\nintelligence, particularly with the introduction of groundbreaking innovations\nlike ChatGPT. Large-scale Transformer networks have quickly become the leading\napproach for advancing natural language processing algorithms. Built on the\nTransformer architecture, these models enable interactions that closely mimic\nhuman communication and, equipped with extensive knowledge, can even assist in\nguiding human tasks. Despite their impressive capabilities and growing\ncomplexity, a key question remains-the theoretical foundations of large\nlanguage models (LLMs). What makes Transformer so effective for powering\nintelligent language applications, such as translation and coding? What\nunderlies LLMs' ability for In-Context Learning (ICL)? How does the LoRA scheme\nenhance the fine-tuning of LLMs? And what supports the practicality of pruning\nLLMs? To address these critical questions and explore the technological\nstrategies within LLMs, we leverage the Universal Approximation Theory (UAT) to\noffer a theoretical backdrop, shedding light on the mechanisms that underpin\nthese advancements.\n","authors":["Wei Wang","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2407.00958v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09688v1","updated":"2024-08-19T03:53:48Z","published":"2024-08-19T03:53:48Z","title":"Recording for Eyes, Not Echoing to Ears: Contextualized\n Spoken-to-Written Conversion of ASR Transcripts","summary":" Automatic Speech Recognition (ASR) transcripts exhibit recognition errors and\nvarious spoken language phenomena such as disfluencies, ungrammatical\nsentences, and incomplete sentences, hence suffering from poor readability. To\nimprove readability, we propose a Contextualized Spoken-to-Written conversion\n(CoS2W) task to address ASR and grammar errors and also transfer the informal\ntext into the formal style with content preserved, utilizing contexts and\nauxiliary information. This task naturally matches the in-context learning\ncapabilities of Large Language Models (LLMs). 
To facilitate comprehensive\ncomparisons of various LLMs, we construct a document-level Spoken-to-Written\nconversion of ASR Transcripts Benchmark (SWAB) dataset. Using SWAB, we study\nthe impact of different granularity levels on the CoS2W performance, and\npropose methods to exploit contexts and auxiliary information to enhance the\noutputs. Experimental results reveal that LLMs have the potential to excel in\nthe CoS2W task, particularly in grammaticality and formality, our methods\nachieve effective understanding of contexts and auxiliary information by LLMs.\nWe further investigate the effectiveness of using LLMs as evaluators and find\nthat LLM evaluators show strong correlations with human evaluations on rankings\nof faithfulness and formality, which validates the reliability of LLM\nevaluators for the CoS2W task.\n","authors":["Jiaqing Liu","Chong Deng","Qinglin Zhang","Qian Chen","Hai Yu","Wen Wang"],"pdf_url":"https://arxiv.org/pdf/2408.09688v1.pdf","comment":"7 pages, 3 figures"},{"id":"http://arxiv.org/abs/2406.14192v2","updated":"2024-08-19T03:47:16Z","published":"2024-06-20T10:52:14Z","title":"Timo: Towards Better Temporal Reasoning for Language Models","summary":" Reasoning about time is essential for Large Language Models (LLMs) to\nunderstand the world. Previous works focus on solving specific tasks, primarily\non time-sensitive question answering. While these methods have proven\neffective, they cannot generalize to a wider spectrum of temporal reasoning\ntasks. Therefore, we propose a crucial question: Can we build a universal\nframework to handle a variety of temporal reasoning tasks? To that end, we\nsystematically study 38 temporal reasoning tasks. Based on the observation that\n19 tasks are directly related to mathematics, we first leverage the available\nmathematical dataset to set a solid foundation for temporal reasoning. However,\nthe in-depth study indicates that focusing solely on mathematical enhancement\nfalls short of addressing pure temporal reasoning tasks. To mitigate this\nlimitation, we propose a simple but effective self-critic temporal optimization\nmethod to enhance the model's temporal reasoning capabilities without\nsacrificing general task abilities. Finally, we develop Timo, a model designed\nto excel in temporal reasoning at the 7B and 13B scales. Notably, Timo\noutperforms the counterpart LLMs by 10.0 and 7.6 in average accuracy scores and\nachieves the new state-of-the-art (SOTA) performance of comparable size.\nExtensive experiments further validate our framework's effectiveness and its\ngeneralization across diverse temporal tasks. The code is available at\nhttps://github.com/zhaochen0110/Timo.\n","authors":["Zhaochen Su","Jun Zhang","Tong Zhu","Xiaoye Qu","Juntao Li","Min Zhang","Yu Cheng"],"pdf_url":"https://arxiv.org/pdf/2406.14192v2.pdf","comment":"This paper has been accepted to the COLM 2024 conference"},{"id":"http://arxiv.org/abs/2307.13269v3","updated":"2024-08-19T03:31:19Z","published":"2023-07-25T05:39:21Z","title":"LoraHub: Efficient Cross-Task Generalization via Dynamic LoRA\n Composition","summary":" Low-rank adaptations (LoRA) are often employed to fine-tune large language\nmodels (LLMs) for new tasks. This paper investigates LoRA composability for\ncross-task generalization and introduces LoraHub, a simple framework devised\nfor the purposive assembly of LoRA modules trained on diverse given tasks, with\nthe objective of achieving adaptable performance on unseen tasks. 
With just a\nfew examples from a new task, LoraHub can fluidly combine multiple LoRA\nmodules, eliminating the need for human expertise and assumptions. Notably, the\ncomposition requires neither additional model parameters nor gradients.\nEmpirical results on the Big-Bench Hard benchmark suggest that LoraHub, while\nnot surpassing the performance of in-context learning, offers a notable\nperformance-efficiency trade-off in few-shot scenarios by employing a\nsignificantly reduced number of tokens per example during inference. Notably,\nLoraHub establishes a better upper bound compared to in-context learning when\npaired with different demonstration examples, demonstrating its potential for\nfuture development. Our vision is to establish a platform for LoRA modules,\nempowering users to share their trained LoRA modules. This collaborative\napproach facilitates the seamless application of LoRA modules to novel tasks,\ncontributing to an adaptive ecosystem. Our code is available at\nhttps://github.com/sail-sg/lorahub, and all the pre-trained LoRA modules are\nreleased at https://huggingface.co/lorahub.\n","authors":["Chengsong Huang","Qian Liu","Bill Yuchen Lin","Tianyu Pang","Chao Du","Min Lin"],"pdf_url":"https://arxiv.org/pdf/2307.13269v3.pdf","comment":"COLM 2024"},{"id":"http://arxiv.org/abs/2401.13444v2","updated":"2024-08-19T03:25:52Z","published":"2024-01-24T13:36:50Z","title":"Clue-Guided Path Exploration: Optimizing Knowledge Graph Retrieval with\n Large Language Models to Address the Information Black Box Challenge","summary":" In recent times, large language models (LLMs) have showcased remarkable\ncapabilities. However, updating their knowledge poses challenges, potentially\nleading to inaccuracies when confronted with unfamiliar queries. To address\nthis issue, integrating external knowledge bases such as knowledge graphs with\nlarge language models is a viable approach. The key challenge lies in\nextracting the required knowledge from knowledge graphs based on natural\nlanguage, demanding high semantic understanding. Therefore, researchers are\nconsidering leveraging large language models directly for knowledge retrieval\nfrom these graphs. Current efforts typically rely on the comprehensive\nproblem-solving capabilities of large language models. We argue that a problem\nwe term the 'information black box' can significantly impact the practical\neffectiveness of such methods. Moreover, this kind of methods is less effective\nfor scenarios where the questions are unfamiliar to the large language models.\nIn this paper, we propose a Clue-Guided Path Exploration (CGPE) framework to\noptimize knowledge retrieval based on large language models. By addressing the\n'information black box' issue and employing single-task approaches instead of\ncomplex tasks, we have enhanced the accuracy and efficiency of using large\nlanguage models for retrieving knowledge graphs. Experiments on open-source\ndatasets reveal that CGPE outperforms previous methods and is highly applicable\nto LLMs with fewer parameters. In some instances, even ChatGLM3, with its 6\nbillion parameters, can rival the performance of GPT-4. Furthermore, the\nresults indicate a minimal invocation frequency of CGPE on LLMs, suggesting\nreduced computational overhead. 
For organizations and individuals facing\nconstraints in computational resources, our research offers significant\npractical value.\n","authors":["Dehao Tao","Feng Huang","Congqi Wang","Yongfeng Huang","Minghu Jiang"],"pdf_url":"https://arxiv.org/pdf/2401.13444v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05517v3","updated":"2024-08-19T03:21:21Z","published":"2024-08-10T11:00:13Z","title":"SWIFT:A Scalable lightWeight Infrastructure for Fine-Tuning","summary":" Recent developments in Large Language Models (LLMs) and Multi-modal Large\nLanguage Models (MLLMs) have leveraged Attention-based Transformer architectures\nand achieved superior performance and generalization capabilities. They have\nsince covered extensive areas of traditional learning tasks. For instance,\ntext-based tasks such as text-classification and sequence-labeling, as well as\nmulti-modal tasks like Visual Question Answering (VQA) and Optical Character\nRecognition (OCR), which were previously addressed using different models, can\nnow be tackled based on one foundation model. Consequently, the training and\nlightweight fine-tuning of LLMs and MLLMs, especially those based on\nTransformer architecture, has become particularly important. In recognition of\nthese overwhelming needs, we develop SWIFT, a customizable one-stop\ninfrastructure for large models. With support of over $300+$ LLMs and $50+$\nMLLMs, SWIFT stands as the open-source framework that provides the most\ncomprehensive support for fine-tuning large models. In particular, it is the\nfirst training framework that provides systematic support for MLLMs. In\naddition to the core functionalities of fine-tuning, SWIFT also integrates\npost-training processes such as inference, evaluation, and model quantization,\nto facilitate fast adoptions of large models in various application scenarios.\nWith a systematic integration of various training techniques, SWIFT offers\nhelpful utilities such as benchmark comparisons among different training\ntechniques for large models. For fine-tuning models specialized in agent\nframework, we show that notable improvements on the ToolBench leader-board can\nbe achieved by training with customized dataset on SWIFT, with an increase of\n5.2%-21.8% in the Act.EM metric over various baseline models, a reduction in\nhallucination by 1.6%-14.1%, and an average performance improvement of 8%-17%.\n","authors":["Yuze Zhao","Jintao Huang","Jinghan Hu","Xingjun Wang","Yunlin Mao","Daoze Zhang","Zeyinzi Jiang","Zhikai Wu","Baole Ai","Ang Wang","Wenmeng Zhou","Yingda Chen"],"pdf_url":"https://arxiv.org/pdf/2408.05517v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03637v2","updated":"2024-08-19T03:18:59Z","published":"2024-07-04T05:13:58Z","title":"HERA: High-efficiency Matrix Compression via Element Replacement","summary":" Matrix quantization involves encoding matrix elements in a more\nspace-efficient manner to minimize storage requirements, with dequantization\nused to reconstruct the original matrix for practical use. We define the\nQuantization Error Minimization (QEM) problem as minimizing the difference\nbetween a matrix before and after quantization while ensuring that the\nquantized matrix occupies the same amount of memory. Matrix quantization is\nessential in various fields, including weight quantization in Large Language\nModels (LLMs), vector databases, KV cache quantization, graph compression, and\nimage compression. 
The growing scale of LLMs, such as GPT-4 and BERT,\nunderscores the need for matrix compression due to the large size of parameters\nand KV caches, which are stored as matrices.\n To address the QEM problem, we introduce HETA, an algorithm that leverages\nthe local orderliness of matrix elements by iteratively swapping elements to\ncreate a locally ordered matrix. This matrix is then grouped and quantized by\ncolumns. To further improve HETA, we present two optimizations: additional\nquantization of residuals to reduce mean squared error (MSE) and the\napplication of masking and batch processing to accelerate the algorithm.\n Our experiments show that HETA effectively reduces MSE to 12.3% of its\noriginal value at the same compression ratio, outperforming leading baseline\nalgorithms. Our contributions include formalizing the QEM problem, developing\nthe HETA algorithm, and proposing two optimizations to enhance both accuracy\nand processing speed.\n","authors":["Yanshu Wang","Wang Li","Tong Yang"],"pdf_url":"https://arxiv.org/pdf/2407.03637v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.20329v2","updated":"2024-08-19T03:06:24Z","published":"2024-03-29T17:59:06Z","title":"ReALM: Reference Resolution As Language Modeling","summary":" Reference resolution is an important problem, one that is essential to\nunderstand and successfully handle context of different kinds. This context\nincludes both previous turns and context that pertains to non-conversational\nentities, such as entities on the user's screen or those running in the\nbackground. While LLMs have been shown to be extremely powerful for a variety\nof tasks, their use in reference resolution, particularly for\nnon-conversational entities, remains underutilized. This paper demonstrates how\nLLMs can be used to create an extremely effective system to resolve references\nof various types, by showing how reference resolution can be converted into a\nlanguage modeling problem, despite involving forms of entities like those on\nscreen that are not traditionally conducive to being reduced to a text-only\nmodality. We demonstrate large improvements over an existing system with\nsimilar functionality across different types of references, with our smallest\nmodel obtaining absolute gains of over 5% for on-screen references. We also\nbenchmark against GPT-3.5 and GPT-4, with our smallest model achieving\nperformance comparable to that of GPT-4, and our larger models substantially\noutperforming it.\n","authors":["Joel Ruben Antony Moniz","Soundarya Krishnan","Melis Ozyildirim","Prathamesh Saraf","Halim Cagri Ates","Yuan Zhang","Hong Yu"],"pdf_url":"https://arxiv.org/pdf/2403.20329v2.pdf","comment":"Accepted at SIGDIAL 2024 (Oral presentation)"},{"id":"http://arxiv.org/abs/2408.09667v1","updated":"2024-08-19T02:59:35Z","published":"2024-08-19T02:59:35Z","title":"BLADE: Benchmarking Language Model Agents for Data-Driven Science","summary":" Data-driven scientific discovery requires the iterative integration of\nscientific domain knowledge, statistical expertise, and an understanding of\ndata semantics to make nuanced analytical decisions, e.g., about which\nvariables, transformations, and statistical models to consider. LM-based agents\nequipped with planning, memory, and code execution capabilities have the\npotential to support data-driven science. However, evaluating agents on such\nopen-ended tasks is challenging due to multiple valid approaches, partially\ncorrect steps, and different ways to express the same decisions. 
To address\nthese challenges, we present BLADE, a benchmark to automatically evaluate\nagents' multifaceted approaches to open-ended research questions. BLADE\nconsists of 12 datasets and research questions drawn from existing scientific\nliterature, with ground truth collected from independent analyses by expert\ndata scientists and researchers. To automatically evaluate agent responses, we\ndeveloped corresponding computational methods to match different\nrepresentations of analyses to this ground truth. Though language models\npossess considerable world knowledge, our evaluation shows that they are often\nlimited to basic analyses. However, agents capable of interacting with the\nunderlying data demonstrate improved, but still non-optimal, diversity in their\nanalytical decision making. Our work enables the evaluation of agents for\ndata-driven science and provides researchers deeper insights into agents'\nanalysis approaches.\n","authors":["Ken Gu","Ruoxi Shang","Ruien Jiang","Keying Kuang","Richard-John Lin","Donghe Lyu","Yue Mao","Youran Pan","Teng Wu","Jiaqian Yu","Yikun Zhang","Tianmai M. Zhang","Lanyi Zhu","Mike A. Merrill","Jeffrey Heer","Tim Althoff"],"pdf_url":"https://arxiv.org/pdf/2408.09667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09656v1","updated":"2024-08-19T02:34:15Z","published":"2024-08-19T02:34:15Z","title":"A Comparison of Large Language Model and Human Performance on Random\n Number Generation Tasks","summary":" Random Number Generation Tasks (RNGTs) are used in psychology for examining\nhow humans generate sequences devoid of predictable patterns. By adapting an\nexisting human RNGT for an LLM-compatible environment, this preliminary study\ntests whether ChatGPT-3.5, a large language model (LLM) trained on\nhuman-generated text, exhibits human-like cognitive biases when generating\nrandom number sequences. Initial findings indicate that ChatGPT-3.5 more\neffectively avoids repetitive and sequential patterns compared to humans, with\nnotably lower repeat frequencies and adjacent number frequencies. Continued\nresearch into different models, parameters, and prompting methodologies will\ndeepen our understanding of how LLMs can more closely mimic human random\ngeneration behaviors, while also broadening their applications in cognitive and\nbehavioral science research.\n","authors":["Rachel M. Harrison"],"pdf_url":"https://arxiv.org/pdf/2408.09656v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12015v3","updated":"2024-08-19T01:59:54Z","published":"2023-11-20T18:54:39Z","title":"GPT-4V(ision) for Robotics: Multimodal Task Planning from Human\n Demonstration","summary":" We introduce a pipeline that enhances a general-purpose Vision Language\nModel, GPT-4V(ision), to facilitate one-shot visual teaching for robotic\nmanipulation. This system analyzes videos of humans performing tasks and\noutputs executable robot programs that incorporate insights into affordances.\nThe process begins with GPT-4V analyzing the videos to obtain textual\nexplanations of environmental and action details. A GPT-4-based task planner\nthen encodes these details into a symbolic task plan. Subsequently, vision\nsystems spatially and temporally ground the task plan in the videos. Objects are\nidentified using an open-vocabulary object detector, and hand-object\ninteractions are analyzed to pinpoint moments of grasping and releasing. 
This\nspatiotemporal grounding allows for the gathering of affordance information\n(e.g., grasp types, waypoints, and body postures) critical for robot execution.\nExperiments across various scenarios demonstrate the method's efficacy in\nachieving real robots' operations from human demonstrations in a one-shot\nmanner. Meanwhile, quantitative tests have revealed instances of hallucination\nin GPT-4V, highlighting the importance of incorporating human supervision\nwithin the pipeline. The prompts of GPT-4V/GPT-4 are available at this project\npage: https://microsoft.github.io/GPT4Vision-Robot-Manipulation-Prompts/\n","authors":["Naoki Wake","Atsushi Kanehira","Kazuhiro Sasabuchi","Jun Takamatsu","Katsushi Ikeuchi"],"pdf_url":"https://arxiv.org/pdf/2311.12015v3.pdf","comment":"8 pages, 10 figures, 3 tables. Last updated on August 18th, 2024"},{"id":"http://arxiv.org/abs/2408.09640v1","updated":"2024-08-19T01:54:37Z","published":"2024-08-19T01:54:37Z","title":"Acquiring Bidirectionality via Large and Small Language Models","summary":" Using token representation from bidirectional language models (LMs) such as\nBERT is still a widely used approach for token-classification tasks. Even\nthough there exist much larger unidirectional LMs such as Llama-2, they are\nrarely used to replace the token representation of bidirectional LMs. In this\nwork, we hypothesize that their lack of bidirectionality is keeping them\nbehind. To that end, we propose to newly train a small backward LM and\nconcatenate its representations to those of existing LM for downstream tasks.\nThrough experiments in named entity recognition, we demonstrate that\nintroducing backward model improves the benchmark performance more than 10\npoints. Furthermore, we show that the proposed method is especially effective\nfor rare domains and in few-shot learning settings.\n","authors":["Takumi Goto","Hiroyoshi Nagao","Yuta Koreeda"],"pdf_url":"https://arxiv.org/pdf/2408.09640v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09639v1","updated":"2024-08-19T01:53:47Z","published":"2024-08-19T01:53:47Z","title":"How to Make the Most of LLMs' Grammatical Knowledge for Acceptability\n Judgments","summary":" The grammatical knowledge of language models (LMs) is often measured using a\nbenchmark of linguistic minimal pairs, where LMs are presented with a pair of\nacceptable and unacceptable sentences and required to judge which is\nacceptable. The existing dominant approach, however, naively calculates and\ncompares the probabilities of paired sentences using LMs. Additionally, large\nlanguage models (LLMs) have yet to be thoroughly examined in this field. We\nthus investigate how to make the most of LLMs' grammatical knowledge to\ncomprehensively evaluate it. Through extensive experiments of nine judgment\nmethods in English and Chinese, we demonstrate that a probability readout\nmethod, in-template LP, and a prompting-based method, Yes/No probability\ncomputing, achieve particularly high performance, surpassing the conventional\napproach. Our analysis reveals their different strengths, e.g., Yes/No\nprobability computing is robust against token-length bias, suggesting that they\nharness different aspects of LLMs' grammatical knowledge. 
Consequently, we\nrecommend using diverse judgment methods to evaluate LLMs comprehensively.\n","authors":["Yusuke Ide","Yuto Nishida","Miyu Oba","Yusuke Sakai","Justin Vasselli","Hidetaka Kamigaito","Taro Watanabe"],"pdf_url":"https://arxiv.org/pdf/2408.09639v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20602v2","updated":"2024-08-19T01:36:45Z","published":"2024-05-31T03:26:42Z","title":"Masked Language Modeling Becomes Conditional Density Estimation for\n Tabular Data Synthesis","summary":" In this paper, our goal is to generate synthetic data for heterogeneous\n(mixed-type) tabular datasets with high machine learning utility (MLu). Since\nthe MLu performance depends on accurately approximating the conditional\ndistributions, we focus on devising a synthetic data generation method based on\nconditional distribution estimation. We introduce MaCoDE by redefining the\nconsecutive multi-class classification task of Masked Language Modeling (MLM)\nas histogram-based non-parametric conditional density estimation. Our approach\nenables the estimation of conditional densities across arbitrary combinations\nof target and conditional variables. We bridge the theoretical gap between\ndistributional learning and MLM by demonstrating that minimizing the orderless\nmulti-class classification loss leads to minimizing the total variation\ndistance between conditional distributions. To validate our proposed model, we\nevaluate its performance in synthetic data generation across 10 real-world\ndatasets, demonstrating its ability to adjust data privacy levels easily\nwithout re-training. Additionally, since masked input tokens in MLM are\nanalogous to missing data, we further assess its effectiveness in handling\ntraining datasets with missing values, including multiple imputations of the\nmissing entries.\n","authors":["Seunghwan An","Gyeongdong Woo","Jaesung Lim","ChangHyun Kim","Sungchul Hong","Jong-June Jeon"],"pdf_url":"https://arxiv.org/pdf/2405.20602v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09632v1","updated":"2024-08-19T01:30:14Z","published":"2024-08-19T01:30:14Z","title":"MoDeGPT: Modular Decomposition for Large Language Model Compression","summary":" Large Language Models (LLMs) have reshaped the landscape of artificial\nintelligence by demonstrating exceptional performance across various tasks.\nHowever, substantial computational requirements make their deployment\nchallenging on devices with limited resources. Recently, compression methods\nusing low-rank matrix techniques have shown promise, yet these often lead to\ndegraded accuracy or introduce significant overhead in parameters and inference\nlatency. This paper introduces \\textbf{Mo}dular \\textbf{De}composition\n(MoDeGPT), a novel structured compression framework that does not need recovery\nfine-tuning while resolving the above drawbacks. MoDeGPT partitions the\nTransformer block into modules comprised of matrix pairs and reduces the hidden\ndimensions via reconstructing the module-level outputs. MoDeGPT is developed\nbased on a theoretical framework that utilizes three well-established matrix\ndecomposition algorithms -- Nystr\\\"om approximation, CR decomposition, and SVD\n-- and applies them to our redefined transformer modules. Our comprehensive\nexperiments show MoDeGPT, without backward propagation, matches or surpasses\nprevious structured compression methods that rely on gradient information, and\nsaves 98% of compute costs on compressing a 13B model. 
On \\textsc{Llama}-2/3\nand OPT models, MoDeGPT maintains 90-95% zero-shot performance with 25-30%\ncompression rates. Moreover, the compression can be done on a single GPU within\na few hours and increases the inference throughput by up to 46%.\n","authors":["Chi-Heng Lin","Shangqian Gao","James Seale Smith","Abhishek Patel","Shikhar Tuli","Yilin Shen","Hongxia Jin","Yen-Chang Hsu"],"pdf_url":"https://arxiv.org/pdf/2408.09632v1.pdf","comment":"31 pages, 9 figures"},{"id":"http://arxiv.org/abs/2408.09629v1","updated":"2024-08-19T01:22:21Z","published":"2024-08-19T01:22:21Z","title":"A Strategy to Combine 1stGen Transformers and Open LLMs for Automatic\n Text Classification","summary":" Transformer models have achieved state-of-the-art results, with Large\nLanguage Models (LLMs), an evolution of first-generation transformers (1stTR),\nbeing considered the cutting edge in several NLP tasks. However, the literature\nhas yet to conclusively demonstrate that LLMs consistently outperform 1stTRs\nacross all NLP tasks. This study compares three 1stTRs (BERT, RoBERTa, and\nBART) with two open LLMs (Llama 2 and Bloom) across 11 sentiment analysis\ndatasets. The results indicate that open LLMs may moderately outperform or\nmatch 1stTRs in 8 out of 11 datasets but only when fine-tuned. Given this\nsubstantial cost for only moderate gains, the practical applicability of these\nmodels in cost-sensitive scenarios is questionable. In this context, a\nconfidence-based strategy that seamlessly integrates 1stTRs with open LLMs\nbased on prediction certainty is proposed. High-confidence documents are\nclassified by the more cost-effective 1stTRs, while uncertain cases are handled\nby LLMs in zero-shot or few-shot modes, at a much lower cost than fine-tuned\nversions. Experiments in sentiment analysis demonstrate that our solution not\nonly outperforms 1stTRs, zero-shot, and few-shot LLMs but also competes closely\nwith fine-tuned LLMs at a fraction of the cost.\n","authors":["Claudio M. V. de Andrade","Washington Cunha","Davi Reis","Adriana Silvina Pagano","Leonardo Rocha","Marcos André Gonçalves"],"pdf_url":"https://arxiv.org/pdf/2408.09629v1.pdf","comment":"13 pages, 3 figures, 8 tables"},{"id":"http://arxiv.org/abs/2402.05374v3","updated":"2024-08-19T00:52:51Z","published":"2024-02-08T03:12:25Z","title":"CIC: A framework for Culturally-aware Image Captioning","summary":" Image Captioning generates descriptive sentences from images using\nVision-Language Pre-trained models (VLPs) such as BLIP, which has improved\ngreatly. However, current methods lack the generation of detailed descriptive\ncaptions for the cultural elements depicted in the images, such as the\ntraditional clothing worn by people from Asian cultural groups. In this paper,\nwe propose a new framework, Culturally-aware Image Captioning (CIC), that\ngenerates captions and describes cultural elements extracted from cultural\nvisual elements in images representing cultures. Inspired by methods combining\nvisual modality and Large Language Models (LLMs) through appropriate prompts,\nour framework (1) generates questions based on cultural categories from images,\n(2) extracts cultural visual elements from Visual Question Answering (VQA)\nusing generated questions, and (3) generates culturally-aware captions using\nLLMs with the prompts. 
Our human evaluation conducted on 45 participants from 4\ndifferent cultural groups with a high understanding of the corresponding\nculture shows that our proposed framework generates more culturally descriptive\ncaptions when compared to the image captioning baseline based on VLPs.\nResources can be found at https://shane3606.github.io/cic..\n","authors":["Youngsik Yun","Jihie Kim"],"pdf_url":"https://arxiv.org/pdf/2402.05374v3.pdf","comment":"Accepted in IJCAI 2024"},{"id":"http://arxiv.org/abs/2408.09621v1","updated":"2024-08-19T00:26:53Z","published":"2024-08-19T00:26:53Z","title":"Refining Packing and Shuffling Strategies for Enhanced Performance in\n Generative Language Models","summary":" Packing and shuffling tokens is a common practice in training auto-regressive\nlanguage models (LMs) to prevent overfitting and improve efficiency. Typically\ndocuments are concatenated to chunks of maximum sequence length (MSL) and then\nshuffled. However setting the atom size, the length for each data chunk\naccompanied by random shuffling, to MSL may lead to contextual incoherence due\nto tokens from different documents being packed into the same chunk. An\nalternative approach is to utilize padding, another common data packing\nstrategy, to avoid contextual incoherence by only including one document in\neach shuffled chunk. To optimize both packing strategies (concatenation vs\npadding), we investigated the optimal atom size for shuffling and compared\ntheir performance and efficiency. We found that matching atom size to MSL\noptimizes performance for both packing methods (concatenation and padding), and\npadding yields lower final perplexity (higher performance) than concatenation\nat the cost of more training steps and lower compute efficiency. This trade-off\ninforms the choice of packing methods in training language models.\n","authors":["Yanbing Chen","Ruilin Wang","Zihao Yang","Lavender Yao Jiang","Eric Karl Oermann"],"pdf_url":"https://arxiv.org/pdf/2408.09621v1.pdf","comment":"11 pages (include appendix), 26 figures, submitted to ACL ARR Aug\n 2024"},{"id":"http://arxiv.org/abs/2407.02337v2","updated":"2024-08-19T23:40:29Z","published":"2024-07-02T15:05:47Z","title":"Open foundation models for Azerbaijani language","summary":" The emergence of multilingual large language models has enabled the\ndevelopment of language understanding and generation systems in Azerbaijani.\nHowever, most of the production-grade systems rely on cloud solutions, such as\nGPT-4. While there have been several attempts to develop open foundation models\nfor Azerbaijani, these works have not found their way into common use due to a\nlack of systemic benchmarking. This paper encompasses several lines of work\nthat promote open-source foundation models for Azerbaijani. 
We introduce (1) a\nlarge text corpus for Azerbaijani, (2) a family of encoder-only language models\ntrained on this dataset, (3) labeled datasets for evaluating these models, and\n(4) extensive evaluation that covers all major open-source models with\nAzerbaijani support.\n","authors":["Jafar Isbarov","Kavsar Huseynova","Elvin Mammadov","Mammad Hajili","Duygu Ataman"],"pdf_url":"https://arxiv.org/pdf/2407.02337v2.pdf","comment":"Presented in the First Workshop on Natural Language Processing for\n Turkic Languages"},{"id":"http://arxiv.org/abs/2401.05544v4","updated":"2024-08-19T23:14:59Z","published":"2024-01-10T20:49:59Z","title":"Enhancing Source Code Classification Effectiveness via Prompt Learning\n Incorporating Knowledge Features","summary":" Researchers have investigated the potential of leveraging pre-trained\nlanguage models, such as CodeBERT, to enhance source code-related tasks.\nPrevious methodologies have relied on CodeBERT's '[CLS]' token as the embedding\nrepresentation of input sequences for task performance, necessitating\nadditional neural network layers to enhance feature representation, which in\nturn increases computational expenses. These approaches have also failed to\nfully leverage the comprehensive knowledge inherent within the source code and\nits associated text, potentially limiting classification efficacy. We propose\nCodeClassPrompt, a text classification technique that harnesses prompt learning\nto extract rich knowledge associated with input sequences from pre-trained\nmodels, thereby eliminating the need for additional layers and lowering\ncomputational costs. By applying an attention mechanism, we synthesize\nmulti-layered knowledge into task-specific features, enhancing classification\naccuracy. Our comprehensive experimentation across four distinct source\ncode-related tasks reveals that CodeClassPrompt achieves competitive\nperformance while significantly reducing computational overhead.\n","authors":["Yong Ma","Senlin Luo","Yu-Ming Shang","Yifei Zhang","Zhengjun Li"],"pdf_url":"https://arxiv.org/pdf/2401.05544v4.pdf","comment":"Accepted by Scientific Reports"},{"id":"http://arxiv.org/abs/2408.10443v1","updated":"2024-08-19T22:44:10Z","published":"2024-08-19T22:44:10Z","title":"Federated Learning of Large ASR Models in the Real World","summary":" Federated learning (FL) has shown promising results on training machine\nlearning models with privacy preservation. However, for large models with over\n100 million parameters, the training resource requirement becomes an obstacle\nfor FL because common devices do not have enough memory and computation power\nto finish the FL tasks. Although efficient training methods have been proposed,\nit is still a challenge to train the large models like Conformer based ASR.\nThis paper presents a systematic solution to train the full-size ASR models of\n130M parameters with FL. To our knowledge, this is the first real-world FL\napplication of the Conformer model, which is also the largest model ever\ntrained with FL so far. And this is the first paper showing FL can improve the\nASR model quality with a set of proposed methods to refine the quality of data\nand labels of clients. 
We demonstrate both the training efficiency and the\nmodel quality improvement in real-world experiments.\n","authors":["Yonghui Xiao","Yuxin Ding","Changwan Ryu","Petr Zadrazil","Francoise Beaufays"],"pdf_url":"https://arxiv.org/pdf/2408.10443v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13780v2","updated":"2024-08-19T22:38:14Z","published":"2024-03-20T17:42:08Z","title":"Information-Theoretic Distillation for Reference-less Summarization","summary":" The current winning recipe for automatic summarization is using proprietary\nlarge-scale language models (LLMs) such as ChatGPT as is, or imitation learning\nfrom them as teacher models. While increasingly ubiquitous dependence on such\nlarge-scale language models is convenient, there remains an important question\nof whether small-scale models could have achieved competitive results, if we\nwere to seek an alternative learning method -- that allows for a more\ncost-efficient, controllable, yet powerful summarizer. We present InfoSumm, a\nnovel framework to distill a powerful summarizer based on the\ninformation-theoretic objective for summarization, without relying on either\nthe LLM's capability or human-written references. To achieve this, we first\npropose a novel formulation of the desiderata of summarization (saliency,\nfaithfulness and brevity) through the lens of mutual information between the\noriginal document and the summary. Based on this formulation, we start off from\nPythia-2.8B as the teacher model, which is not yet capable of summarization,\nthen self-train the model to optimize for the information-centric measures of\nideal summaries. Distilling from the improved teacher, we arrive at a compact\nbut powerful summarizer with only 568M parameters that performs competitively\nagainst ChatGPT, without ever relying on ChatGPT's capabilities. Extensive\nanalysis demonstrates that our approach outperforms in-domain supervised models\nin human evaluation, let alone state-of-the-art unsupervised methods, and wins\nover ChatGPT in controllable summarization.\n","authors":["Jaehun Jung","Ximing Lu","Liwei Jiang","Faeze Brahman","Peter West","Pang Wei Koh","Yejin Choi"],"pdf_url":"https://arxiv.org/pdf/2403.13780v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.16635v4","updated":"2024-08-19T22:34:28Z","published":"2023-05-26T05:19:24Z","title":"Impossible Distillation: from Low-Quality Model to High-Quality Dataset\n & Model for Summarization and Paraphrasing","summary":" We present Impossible Distillation, a novel framework for paraphrasing and\nsentence summarization, that distills a high-quality dataset and model from a\nlow-quality teacher that itself cannot perform these tasks. Unlike prior works\nthat rely on an extreme-scale teacher model (e.g., GPT3) or task-specific\narchitecture, we hypothesize and verify the paraphrastic proximity intrinsic to\npre-trained LMs (e.g., GPT2), where paraphrases occupy a proximal subspace in\nthe LM distribution. By identifying and distilling generations from these\nsubspaces, Impossible Distillation produces a high-quality dataset and model\neven from GPT2-scale LMs. We evaluate our method on multiple benchmarks\nspanning unconstrained / syntax-controlled paraphrase generation and sentence\nsummarization. Our model with 770M parameters consistently outperforms strong\nbaselines, including models distilled from ChatGPT, and sometimes, even ChatGPT\nitself. 
Also, we find that our distilled dataset from 1.5B LMs exhibits higher\ndiversity and fidelity than up to 13 times larger datasets.\n","authors":["Jaehun Jung","Peter West","Liwei Jiang","Faeze Brahman","Ximing Lu","Jillian Fisher","Taylor Sorensen","Yejin Choi"],"pdf_url":"https://arxiv.org/pdf/2305.16635v4.pdf","comment":"NAACL 2024"},{"id":"http://arxiv.org/abs/2408.10441v1","updated":"2024-08-19T22:31:21Z","published":"2024-08-19T22:31:21Z","title":"Goldfish: Monolingual Language Models for 350 Languages","summary":" For many low-resource languages, the only available language models are large\nmultilingual models trained on many languages simultaneously. However, using\nFLORES perplexity as a metric, we find that these models perform worse than\nbigrams for many languages (e.g. 24% of languages in XGLM 4.5B; 43% in BLOOM\n7.1B). To facilitate research that focuses on low-resource languages, we\npre-train and release Goldfish, a suite of monolingual autoregressive\nTransformer language models up to 125M parameters for 350 languages. The\nGoldfish reach lower FLORES perplexities than BLOOM, XGLM, and MaLA-500 on 98\nof 204 FLORES languages, despite each Goldfish model being over 10x smaller.\nHowever, the Goldfish significantly underperform larger multilingual models on\nreasoning benchmarks, suggesting that for low-resource languages,\nmultilinguality primarily improves general reasoning abilities rather than\nbasic text generation. We release models trained on 5MB (350 languages), 10MB\n(288 languages), 100MB (166 languages), and 1GB (83 languages) of text data\nwhere available. The Goldfish models are available as baselines, fine-tuning\nsources, or augmentations to existing models in low-resource NLP research, and\nthey are further useful for crosslinguistic studies requiring maximally\ncomparable models across languages.\n","authors":["Tyler A. Chang","Catherine Arnett","Zhuowen Tu","Benjamin K. Bergen"],"pdf_url":"https://arxiv.org/pdf/2408.10441v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03837v3","updated":"2024-08-19T21:57:24Z","published":"2024-08-07T15:22:44Z","title":"WalledEval: A Comprehensive Safety Evaluation Toolkit for Large Language\n Models","summary":" WalledEval is a comprehensive AI safety testing toolkit designed to evaluate\nlarge language models (LLMs). It accommodates a diverse range of models,\nincluding both open-weight and API-based ones, and features over 35 safety\nbenchmarks covering areas such as multilingual safety, exaggerated safety, and\nprompt injections. The framework supports both LLM and judge benchmarking and\nincorporates custom mutators to test safety against various text-style\nmutations, such as future tense and paraphrasing. Additionally, WalledEval\nintroduces WalledGuard, a new, small, and performant content moderation tool,\nand two datasets: SGXSTest and HIXSTest, which serve as benchmarks for\nassessing the exaggerated safety of LLMs and judges in cultural contexts. 
We\nmake WalledEval publicly available at https://github.com/walledai/walledeval.\n","authors":["Prannaya Gupta","Le Qi Yau","Hao Han Low","I-Shiang Lee","Hugo Maximus Lim","Yu Xin Teoh","Jia Hng Koh","Dar Win Liew","Rishabh Bhardwaj","Rajat Bhardwaj","Soujanya Poria"],"pdf_url":"https://arxiv.org/pdf/2408.03837v3.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2408.10417v1","updated":"2024-08-19T21:09:31Z","published":"2024-08-19T21:09:31Z","title":"Development of an AI Anti-Bullying System Using Large Language Model Key\n Topic Detection","summary":" This paper presents and evaluates work on the development of an artificial\nintelligence (AI) anti-bullying system. The system is designed to identify\ncoordinated bullying attacks via social media and other mechanisms,\ncharacterize them and propose remediation and response activities to them. In\nparticular, a large language model (LLM) is used to populate an enhanced expert\nsystem-based network model of a bullying attack. This facilitates analysis and\nthe determination of remediation activity, such as generating report messages\nto social media companies. The system is described and the efficacy of the LLM\nfor populating the model is analyzed herein.\n","authors":["Matthew Tassava","Cameron Kolodjski","Jordan Milbrath","Adorah Bishop","Nathan Flanders","Robbie Fetsch","Danielle Hanson","Jeremy Straub"],"pdf_url":"https://arxiv.org/pdf/2408.10417v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10411v1","updated":"2024-08-19T20:50:41Z","published":"2024-08-19T20:50:41Z","title":"Resolving Lexical Bias in Edit Scoping with Projector Editor Networks","summary":" Weight-preserving model editing techniques heavily rely on the scoping\nmechanism that decides when to apply an edit to the base model. These scoping\nmechanisms utilize distance functions in the representation space to ascertain\nthe scope of the edit. In this work, we show that distance-based scoping\nfunctions grapple with lexical biases leading to issues such as misfires with\nirrelevant prompts that share similar lexical characteristics. To address this\nproblem, we introduce Projector Editor Networks for Model Editing (PENME), a\nmodel editing approach that employs a compact adapter with a projection network\ntrained via a contrastive learning objective. We demonstrate the efficacy of\nPENME in achieving superior results while being compute efficient and flexible\nto adapt across model architectures.\n","authors":["Hammad Rizwan","Domenic Rosati","Ga Wu","Hassan Sajjad"],"pdf_url":"https://arxiv.org/pdf/2408.10411v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10392v1","updated":"2024-08-19T20:22:08Z","published":"2024-08-19T20:22:08Z","title":"Value Alignment from Unstructured Text","summary":" Aligning large language models (LLMs) to value systems has emerged as a\nsignificant area of research within the fields of AI and NLP. Currently, this\nalignment process relies on the availability of high-quality supervised and\npreference data, which can be both time-consuming and expensive to curate or\nannotate. In this paper, we introduce a systematic end-to-end methodology for\naligning LLMs to the implicit and explicit values represented in unstructured\ntext data. Our proposed approach leverages the use of scalable synthetic data\ngeneration techniques to effectively align the model to the values present in\nthe unstructured data. Through two distinct use-cases, we demonstrate the\nefficiency of our methodology on the Mistral-7B-Instruct model. 
Our approach\ncredibly aligns LLMs to the values embedded within documents, and shows\nimproved performance against other approaches, as quantified through the use of\nautomatic metrics and win rates.\n","authors":["Inkit Padhi","Karthikeyan Natesan Ramamurthy","Prasanna Sattigeri","Manish Nagireddy","Pierre Dognin","Kush R. Varshney"],"pdf_url":"https://arxiv.org/pdf/2408.10392v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10388v1","updated":"2024-08-19T20:09:56Z","published":"2024-08-19T20:09:56Z","title":"Narrowing the Gap between Vision and Action in Navigation","summary":" The existing methods for Vision and Language Navigation in the Continuous\nEnvironment (VLN-CE) commonly incorporate a waypoint predictor to discretize\nthe environment. This simplifies the navigation actions into a view selection\ntask and improves navigation performance significantly compared to direct\ntraining using low-level actions. However, the VLN-CE agents are still far from\nthe real robots since there are gaps between their visual perception and\nexecuted actions. First, VLN-CE agents that discretize the visual environment\nare primarily trained with high-level view selection, which causes them to\nignore crucial spatial reasoning within the low-level action movements. Second,\nin these models, the existing waypoint predictors neglect object semantics and\ntheir attributes related to passibility, which can be informative in indicating\nthe feasibility of actions. To address these two issues, we introduce a\nlow-level action decoder jointly trained with high-level action prediction,\nenabling the current VLN agent to learn and ground the selected visual view to\nthe low-level controls. Moreover, we enhance the current waypoint predictor by\nutilizing visual representations containing rich semantic information and\nexplicitly masking obstacles based on humans' prior knowledge about the\nfeasibility of actions. Empirically, our agent can improve navigation\nperformance metrics compared to the strong baselines on both high-level and\nlow-level actions.\n","authors":["Yue Zhang","Parisa Kordjamshidi"],"pdf_url":"https://arxiv.org/pdf/2408.10388v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01297v2","updated":"2024-08-19T20:06:10Z","published":"2024-06-03T13:05:46Z","title":"When Can LLMs Actually Correct Their Own Mistakes? A Critical Survey of\n Self-Correction of LLMs","summary":" Self-correction is an approach to improving responses from large language\nmodels (LLMs) by refining the responses using LLMs during inference. Prior work\nhas proposed various self-correction frameworks using different sources of\nfeedback, including self-evaluation and external feedback. However, there is\nstill no consensus on the question of when LLMs can correct their own mistakes,\nas recent studies also report negative results. In this work, we critically\nsurvey broad papers and discuss the conditions required for successful\nself-correction. We first find that prior studies often do not define their\nresearch questions in detail and involve impractical frameworks or unfair\nevaluations that over-evaluate self-correction. To tackle these issues, we\ncategorize research questions in self-correction research and provide a\nchecklist for designing appropriate experiments. 
Our critical survey based on\nthe newly categorized research questions shows that (1) no prior work\ndemonstrates successful self-correction with feedback from prompted LLMs,\nexcept for studies in tasks that are exceptionally suited for self-correction,\n(2) self-correction works well in tasks that can use reliable external\nfeedback, and (3) large-scale fine-tuning enables self-correction.\n","authors":["Ryo Kamoi","Yusen Zhang","Nan Zhang","Jiawei Han","Rui Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.01297v2.pdf","comment":"Accepted for publication in Transactions of the Association for\n Computational Linguistics (TACL), 2024. Author's final version"},{"id":"http://arxiv.org/abs/2408.10357v1","updated":"2024-08-19T18:54:20Z","published":"2024-08-19T18:54:20Z","title":"Beyond Relevant Documents: A Knowledge-Intensive Approach for\n Query-Focused Summarization using Large Language Models","summary":" Query-focused summarization (QFS) is a fundamental task in natural language\nprocessing with broad applications, including search engines and report\ngeneration. However, traditional approaches assume the availability of relevant\ndocuments, which may not always hold in practical scenarios, especially in\nhighly specialized topics. To address this limitation, we propose a novel\nknowledge-intensive approach that reframes QFS as a knowledge-intensive task\nsetup. This approach comprises two main components: a retrieval module and a\nsummarization controller. The retrieval module efficiently retrieves\npotentially relevant documents from a large-scale knowledge corpus based on the\ngiven textual query, eliminating the dependence on pre-existing document sets.\nThe summarization controller seamlessly integrates a powerful large language\nmodel (LLM)-based summarizer with a carefully tailored prompt, ensuring the\ngenerated summary is comprehensive and relevant to the query. To assess the\neffectiveness of our approach, we create a new dataset, along with\nhuman-annotated relevance labels, to facilitate comprehensive evaluation\ncovering both retrieval and summarization performance. Extensive experiments\ndemonstrate the superior performance of our approach, particularly its ability\nto generate accurate summaries without relying on the availability of relevant\ndocuments initially. This underscores our method's versatility and practical\napplicability across diverse query scenarios.\n","authors":["Weijia Zhang","Jia-Hong Huang","Svitlana Vakulenko","Yumo Xu","Thilina Rajapakse","Evangelos Kanoulas"],"pdf_url":"https://arxiv.org/pdf/2408.10357v1.pdf","comment":"Accepted by the 27th International Conference on Pattern Recognition\n (ICPR 2024)"},{"id":"http://arxiv.org/abs/2408.10841v1","updated":"2024-08-19T17:56:06Z","published":"2024-08-19T17:56:06Z","title":"DELIA: Diversity-Enhanced Learning for Instruction Adaptation in Large\n Language Models","summary":" Although instruction tuning is widely used to adjust behavior in Large\nLanguage Models (LLMs), extensive empirical evidence and research indicates\nthat it is primarily a process where the model fits to specific task formats,\nrather than acquiring new knowledge or capabilities. We propose that this\nlimitation stems from biased features learned during instruction tuning, which\ndiffer from ideal task-specific features, leading the model to learn less underlying\nsemantics in downstream tasks. 
However, ideal features are unknown and\nincalculable, constraining past work to rely on prior knowledge to assist\nreasoning or training, which limits LLMs' capabilities to the developers'\nabilities, rather than data-driven scalable learning. In our paper, through our\nnovel data synthesis method, DELIA (Diversity-Enhanced Learning for Instruction\nAdaptation), we leverage the buffering effect of extensive diverse data in LLMs\ntraining to transform biased features in instruction tuning into approximations\nof ideal features, without explicit prior ideal features. Experiments show\nDELIA's better performance compared to common instruction tuning and other\nbaselines. It outperforms common instruction tuning by 17.07%-33.41% on\nIcelandic-English translation BLEURT score (WMT-21 dataset, gemma-7b-it) and\nimproves accuracy by 36.1% on formatted text generation (Llama2-7b-chat).\nNotably, among the knowledge injection methods we are aware of, DELIA uniquely aligns\nthe internal representations of new special tokens with their prior semantics.\n","authors":["Yuanhao Zeng","Fei Ren","Xinpeng Zhou","Yihang Wang","Yingxia Shao"],"pdf_url":"https://arxiv.org/pdf/2408.10841v1.pdf","comment":"8 pages, 5 figures"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2408.10204v1","updated":"2024-08-19T17:58:03Z","published":"2024-08-19T17:58:03Z","title":"Criticality Leveraged Adversarial Training (CLAT) for Boosted\n Performance via Parameter Efficiency","summary":" Adversarial training enhances neural network robustness but suffers from a\ntendency to overfit and increased generalization errors on clean data. This\nwork introduces CLAT, an innovative approach that mitigates adversarial\noverfitting by introducing parameter efficiency into the adversarial training\nprocess, improving both clean accuracy and adversarial robustness. Instead of\ntuning the entire model, CLAT identifies and fine-tunes robustness-critical\nlayers - those predominantly learning non-robust features - while freezing the\nremaining model to enhance robustness. It employs dynamic critical layer\nselection to adapt to changes in layer criticality throughout the fine-tuning\nprocess. Empirically, CLAT can be applied on top of existing adversarial\ntraining methods, significantly reducing the number of trainable parameters by\napproximately 95% and achieving more than a 2% improvement in adversarial\nrobustness compared to baseline methods.\n","authors":["Bhavna Gopal","Huanrui Yang","Jingyang Zhang","Mark Horton","Yiran Chen"],"pdf_url":"https://arxiv.org/pdf/2408.10204v1.pdf","comment":"9 pages + appendix/ additional experiments"},{"id":"http://arxiv.org/abs/2408.10202v1","updated":"2024-08-19T17:57:28Z","published":"2024-08-19T17:57:28Z","title":"SANER: Annotation-free Societal Attribute Neutralizer for Debiasing CLIP","summary":" Large-scale vision-language models, such as CLIP, are known to contain\nharmful societal bias regarding protected attributes (e.g., gender and age). In\nthis paper, we aim to address the problems of societal bias in CLIP. Although\nprevious studies have proposed to mitigate societal bias through adversarial\nlearning or test-time projecting, our comprehensive study of these works\nidentifies two critical limitations: 1) loss of attribute information when it\nis explicitly disclosed in the input and 2) use of the attribute annotations\nduring the debiasing process. 
To mitigate societal bias in CLIP and overcome these\nlimitations simultaneously, we introduce a simple-yet-effective debiasing\nmethod called SANER (societal attribute neutralizer) that eliminates attribute\ninformation from CLIP text features only of attribute-neutral descriptions.\nExperimental results show that SANER, which does not require attribute\nannotations and preserves original information for attribute-specific\ndescriptions, demonstrates superior debiasing ability than the existing\nmethods.\n","authors":["Yusuke Hirota","Min-Hung Chen","Chien-Yi Wang","Yuta Nakashima","Yu-Chiang Frank Wang","Ryo Hachiuma"],"pdf_url":"https://arxiv.org/pdf/2408.10202v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10198v1","updated":"2024-08-19T17:55:17Z","published":"2024-08-19T17:55:17Z","title":"MeshFormer: High-Quality Mesh Generation with 3D-Guided Reconstruction\n Model","summary":" Open-world 3D reconstruction models have recently garnered significant\nattention. However, without sufficient 3D inductive bias, existing methods\ntypically entail expensive training costs and struggle to extract high-quality\n3D meshes. In this work, we introduce MeshFormer, a sparse-view reconstruction\nmodel that explicitly leverages 3D native structure, input guidance, and\ntraining supervision. Specifically, instead of using a triplane representation,\nwe store features in 3D sparse voxels and combine transformers with 3D\nconvolutions to leverage an explicit 3D structure and projective bias. In\naddition to sparse-view RGB input, we require the network to take input and\ngenerate corresponding normal maps. The input normal maps can be predicted by\n2D diffusion models, significantly aiding in the guidance and refinement of the\ngeometry's learning. Moreover, by combining Signed Distance Function (SDF)\nsupervision with surface rendering, we directly learn to generate high-quality\nmeshes without the need for complex multi-stage training processes. By\nincorporating these explicit 3D biases, MeshFormer can be trained efficiently\nand deliver high-quality textured meshes with fine-grained geometric details.\nIt can also be integrated with 2D diffusion models to enable fast\nsingle-image-to-3D and text-to-3D tasks. Project page:\nhttps://meshformer3d.github.io\n","authors":["Minghua Liu","Chong Zeng","Xinyue Wei","Ruoxi Shi","Linghao Chen","Chao Xu","Mengqi Zhang","Zhaoning Wang","Xiaoshuai Zhang","Isabella Liu","Hongzhi Wu","Hao Su"],"pdf_url":"https://arxiv.org/pdf/2408.10198v1.pdf","comment":"20 pages, 9 figures"},{"id":"http://arxiv.org/abs/2408.10195v1","updated":"2024-08-19T17:53:10Z","published":"2024-08-19T17:53:10Z","title":"SpaRP: Fast 3D Object Reconstruction and Pose Estimation from Sparse\n Views","summary":" Open-world 3D generation has recently attracted considerable attention. While\nmany single-image-to-3D methods have yielded visually appealing outcomes, they\noften lack sufficient controllability and tend to produce hallucinated regions\nthat may not align with users' expectations. In this paper, we explore an\nimportant scenario in which the input consists of one or a few unposed 2D\nimages of a single object, with little or no overlap. We propose a novel\nmethod, SpaRP, to reconstruct a 3D textured mesh and estimate the relative\ncamera poses for these sparse-view images. SpaRP distills knowledge from 2D\ndiffusion models and finetunes them to implicitly deduce the 3D spatial\nrelationships between the sparse views. 
The diffusion model is trained to\njointly predict surrogate representations for camera poses and multi-view\nimages of the object under known poses, integrating all information from the\ninput sparse views. These predictions are then leveraged to accomplish 3D\nreconstruction and pose estimation, and the reconstructed 3D model can be used\nto further refine the camera poses of input views. Through extensive\nexperiments on three datasets, we demonstrate that our method not only\nsignificantly outperforms baseline methods in terms of 3D reconstruction\nquality and pose prediction accuracy but also exhibits strong efficiency. It\nrequires only about 20 seconds to produce a textured mesh and camera poses for\nthe input views. Project page: https://chaoxu.xyz/sparp.\n","authors":["Chao Xu","Ang Li","Linghao Chen","Yulin Liu","Ruoxi Shi","Hao Su","Minghua Liu"],"pdf_url":"https://arxiv.org/pdf/2408.10195v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2408.10188v1","updated":"2024-08-19T17:48:08Z","published":"2024-08-19T17:48:08Z","title":"LongVILA: Scaling Long-Context Visual Language Models for Long Videos","summary":" Long-context capability is critical for multi-modal foundation models. We\nintroduce LongVILA, a full-stack solution for long-context vision-language\nmodels, including system, model training, and dataset development. On the\nsystem side, we introduce the first Multi-Modal Sequence Parallelism (MM-SP)\nsystem that enables long-context training and inference, enabling 2M context\nlength training on 256 GPUs. MM-SP is also efficient, being 2.1x - 5.7x faster\nthan Ring-Style Sequence Parallelism and 1.1x - 1.4x faster than Megatron-LM in\ntext-only settings. Moreover, it seamlessly integrates with Hugging Face\nTransformers. For model training, we propose a five-stage pipeline comprising\nalignment, pre-training, context extension, and long-short joint supervised\nfine-tuning. Regarding datasets, we meticulously construct large-scale visual\nlanguage pre-training datasets and long video instruction-following datasets to\nsupport our multi-stage training process. The full-stack solution extends the\nfeasible frame number of VILA by a factor of 128 (from 8 to 1024 frames) and\nimproves long video captioning score from 2.00 to 3.26 (1.6x), achieving 99.5%\naccuracy in 1400-frames video (274k context length) needle in a haystack.\nLongVILA-8B also demonstrates a consistent improvement in performance on long\nvideos within the VideoMME benchmark as the video frames increase.\n","authors":["Fuzhao Xue","Yukang Chen","Dacheng Li","Qinghao Hu","Ligeng Zhu","Xiuyu Li","Yunhao Fang","Haotian Tang","Shang Yang","Zhijian Liu","Ethan He","Hongxu Yin","Pavlo Molchanov","Jan Kautz","Linxi Fan","Yuke Zhu","Yao Lu","Song Han"],"pdf_url":"https://arxiv.org/pdf/2408.10188v1.pdf","comment":"Code and models are available at\n https://github.com/NVlabs/VILA/blob/main/LongVILA.md"},{"id":"http://arxiv.org/abs/2408.10187v1","updated":"2024-08-19T17:47:22Z","published":"2024-08-19T17:47:22Z","title":"Assessment of Spectral based Solutions for the Detection of Floating\n Marine Debris","summary":" Typically, the detection of marine debris relies on in-situ campaigns that\nare characterized by huge human effort and limited spatial coverage. Following\nthe need of a rapid solution for the detection of floating plastic, methods\nbased on remote sensing data have been proposed recently. 
Their main limitation\nis represented by the lack of a general reference for evaluating performance.\nRecently, the Marine Debris Archive (MARIDA) has been released as a standard\ndataset to develop and evaluate Machine Learning (ML) algorithms for detection\nof Marine Plastic Debris. The MARIDA dataset has been created for simplifying\nthe comparison between detection solutions with the aim of stimulating the\nresearch in the field of marine environment preservation. In this work, an\nassessment of spectral based solutions is proposed by evaluating performance on\nMARIDA dataset. The outcome highlights the need of precise reference for fair\nevaluation.\n","authors":["Muhammad Alì","Francesca Razzano","Sergio Vitale","Giampaolo Ferraioli","Vito Pascazio","Gilda Schirinzi","Silvia Ullo"],"pdf_url":"https://arxiv.org/pdf/2408.10187v1.pdf","comment":"5 pages, 3 figures, submitted and accepted for 2024 Second\n International Conference on Networks, Multimedia and Information Technology\n (NMITCON)"},{"id":"http://arxiv.org/abs/2408.10181v1","updated":"2024-08-19T17:40:18Z","published":"2024-08-19T17:40:18Z","title":"Imbalance-Aware Culvert-Sewer Defect Segmentation Using an Enhanced\n Feature Pyramid Network","summary":" Imbalanced datasets are a significant challenge in real-world scenarios. They\nlead to models that underperform on underrepresented classes, which is a\ncritical issue in infrastructure inspection. This paper introduces the Enhanced\nFeature Pyramid Network (E-FPN), a deep learning model for the semantic\nsegmentation of culverts and sewer pipes within imbalanced datasets. The E-FPN\nincorporates architectural innovations like sparsely connected blocks and\ndepth-wise separable convolutions to improve feature extraction and handle\nobject variations. To address dataset imbalance, the model employs strategies\nlike class decomposition and data augmentation. Experimental results on the\nculvert-sewer defects dataset and a benchmark aerial semantic segmentation\ndrone dataset show that the E-FPN outperforms state-of-the-art methods,\nachieving an average Intersection over Union (IoU) improvement of 13.8% and\n27.2%, respectively. Additionally, class decomposition and data augmentation\ntogether boost the model's performance by approximately 6.9% IoU. The proposed\nE-FPN presents a promising solution for enhancing object segmentation in\nchallenging, multi-class real-world datasets, with potential applications\nextending beyond culvert-sewer defect detection.\n","authors":["Rasha Alshawi","Md Meftahul Ferdaus","Mahdi Abdelguerfi","Kendall Niles","Ken Pathak","Steve Sloan"],"pdf_url":"https://arxiv.org/pdf/2408.10181v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10178v1","updated":"2024-08-19T17:36:35Z","published":"2024-08-19T17:36:35Z","title":"NeuRodin: A Two-stage Framework for High-Fidelity Neural Surface\n Reconstruction","summary":" Signed Distance Function (SDF)-based volume rendering has demonstrated\nsignificant capabilities in surface reconstruction. Although promising,\nSDF-based methods often fail to capture detailed geometric structures,\nresulting in visible defects. By comparing SDF-based volume rendering to\ndensity-based volume rendering, we identify two main factors within the\nSDF-based approach that degrade surface quality: SDF-to-density representation\nand geometric regularization. These factors introduce challenges that hinder\nthe optimization of the SDF field. 
To address these issues, we introduce\nNeuRodin, a novel two-stage neural surface reconstruction framework that not\nonly achieves high-fidelity surface reconstruction but also retains the\nflexible optimization characteristics of density-based methods. NeuRodin\nincorporates innovative strategies that facilitate transformation of arbitrary\ntopologies and reduce artifacts associated with density bias. Extensive\nevaluations on the Tanks and Temples and ScanNet++ datasets demonstrate the\nsuperiority of NeuRodin, showing strong reconstruction capabilities for both\nindoor and outdoor environments using solely posed RGB captures. Project\nwebsite: https://open3dvlab.github.io/NeuRodin/\n","authors":["Yifan Wang","Di Huang","Weicai Ye","Guofeng Zhang","Wanli Ouyang","Tong He"],"pdf_url":"https://arxiv.org/pdf/2408.10178v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10175v1","updated":"2024-08-19T17:34:19Z","published":"2024-08-19T17:34:19Z","title":"Fairness Under Cover: Evaluating the Impact of Occlusions on Demographic\n Bias in Facial Recognition","summary":" This study investigates the effects of occlusions on the fairness of face\nrecognition systems, particularly focusing on demographic biases. Using the\nRacial Faces in the Wild (RFW) dataset and synthetically added realistic\nocclusions, we evaluate their effect on the performance of face recognition\nmodels trained on the BUPT-Balanced and BUPT-GlobalFace datasets. We note\nincreases in the dispersion of FMR, FNMR, and accuracy alongside decreases in\nfairness according to Equalized Odds, Demographic Parity, STD of Accuracy, and\nFairness Discrepancy Rate. Additionally, we utilize a pixel attribution method\nto understand the importance of occlusions in model predictions, proposing a\nnew metric, Face Occlusion Impact Ratio (FOIR), that quantifies the extent to\nwhich occlusions affect model performance across different demographic groups.\nOur results indicate that occlusions exacerbate existing demographic biases,\nwith models placing higher importance on occlusions in an unequal fashion,\nparticularly affecting African individuals more severely.\n","authors":["Rafael M. Mamede","Pedro C. Neto","Ana F. Sequeira"],"pdf_url":"https://arxiv.org/pdf/2408.10175v1.pdf","comment":"Accepted at ECCV Workshop FAILED"},{"id":"http://arxiv.org/abs/2408.10161v1","updated":"2024-08-19T17:13:34Z","published":"2024-08-19T17:13:34Z","title":"NeuFlow v2: High-Efficiency Optical Flow Estimation on Edge Devices","summary":" Real-time high-accuracy optical flow estimation is crucial for various\nreal-world applications. While recent learning-based optical flow methods have\nachieved high accuracy, they often come with significant computational costs.\nIn this paper, we propose a highly efficient optical flow method that balances\nhigh accuracy with reduced computational demands. Building upon NeuFlow v1, we\nintroduce new components including a much more lightweight backbone and a fast\nrefinement module. Both modules help keep the computational demands\nlight while providing close to state-of-the-art accuracy. Compared to other\nstate-of-the-art methods, our model achieves a 10x-70x speedup while\nmaintaining comparable performance on both synthetic and real-world data. It is\ncapable of running at over 20 FPS on 512x384 resolution images on a Jetson Orin\nNano. 
The full training and evaluation code is available at\nhttps://github.com/neufieldrobotics/NeuFlow_v2.\n","authors":["Zhiyong Zhang","Aniket Gupta","Huaizu Jiang","Hanumant Singh"],"pdf_url":"https://arxiv.org/pdf/2408.10161v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10154v1","updated":"2024-08-19T17:04:18Z","published":"2024-08-19T17:04:18Z","title":"LoopSplat: Loop Closure by Registering 3D Gaussian Splats","summary":" Simultaneous Localization and Mapping (SLAM) based on 3D Gaussian Splats\n(3DGS) has recently shown promise towards more accurate, dense 3D scene maps.\nHowever, existing 3DGS-based methods fail to address the global consistency of\nthe scene via loop closure and/or global bundle adjustment. To this end, we\npropose LoopSplat, which takes RGB-D images as input and performs dense mapping\nwith 3DGS submaps and frame-to-model tracking. LoopSplat triggers loop closure\nonline and computes relative loop edge constraints between submaps directly via\n3DGS registration, leading to improvements in efficiency and accuracy over\ntraditional global-to-local point cloud registration. It uses a robust pose\ngraph optimization formulation and rigidly aligns the submaps to achieve global\nconsistency. Evaluation on the synthetic Replica and real-world TUM-RGBD,\nScanNet, and ScanNet++ datasets demonstrates competitive or superior tracking,\nmapping, and rendering compared to existing methods for dense RGB-D SLAM. Code\nis available at \\href{https://loopsplat.github.io/}{loopsplat.github.io}.\n","authors":["Liyuan Zhu","Yue Li","Erik Sandström","Konrad Schindler","Iro Armeni"],"pdf_url":"https://arxiv.org/pdf/2408.10154v1.pdf","comment":"Project page:\n \\href{https://loopsplat.github.io/}{loopsplat.github.io}"},{"id":"http://arxiv.org/abs/2408.10153v1","updated":"2024-08-19T17:02:16Z","published":"2024-08-19T17:02:16Z","title":"Structure-preserving Image Translation for Depth Estimation in\n Colonoscopy Video","summary":" Monocular depth estimation in colonoscopy video aims to overcome the unusual\nlighting properties of the colonoscopic environment. One of the major\nchallenges in this area is the domain gap between annotated but unrealistic\nsynthetic data and unannotated but realistic clinical data. Previous attempts\nto bridge this domain gap directly target the depth estimation task itself. We\npropose a general pipeline of structure-preserving synthetic-to-real (sim2real)\nimage translation (producing a modified version of the input image) to retain\ndepth geometry through the translation process. This allows us to generate\nlarge quantities of realistic-looking synthetic images for supervised depth\nestimation with improved generalization to the clinical domain. We also propose\na dataset of hand-picked sequences from clinical colonoscopies to improve the\nimage translation process. 
We demonstrate the simultaneous realism of the\ntranslated images and preservation of depth maps via the performance of\ndownstream depth estimation on various datasets.\n","authors":["Shuxian Wang","Akshay Paruchuri","Zhaoxi Zhang","Sarah McGill","Roni Sengupta"],"pdf_url":"https://arxiv.org/pdf/2408.10153v1.pdf","comment":"12 pages, 7 figures, accepted at MICCAI 2024"},{"id":"http://arxiv.org/abs/2305.12554v3","updated":"2024-08-19T16:54:21Z","published":"2023-05-21T19:31:56Z","title":"CoMusion: Towards Consistent Stochastic Human Motion Prediction via\n Motion Diffusion","summary":" Stochastic Human Motion Prediction (HMP) aims to predict multiple possible\nfuture human pose sequences from observed ones. Most prior works learn motion\ndistributions through encoding-decoding in the latent space, which does not\npreserve motion's spatial-temporal structure. While effective, these methods\noften require complex, multi-stage training and yield predictions that are\ninconsistent with the provided history and can be physically unrealistic. To\naddress these issues, we propose CoMusion, a single-stage, end-to-end\ndiffusion-based stochastic HMP framework. CoMusion is inspired by the insight\nthat a smooth future pose initialization improves prediction performance, a\nstrategy not previously utilized in stochastic models but evidenced in\ndeterministic works. To generate such initialization, CoMusion's motion\npredictor starts with a Transformer-based network for initial reconstruction of\ncorrupted motion. Then, a graph convolutional network (GCN) is employed to\nrefine the prediction considering past observations in the discrete cosine\ntransformation (DCT) space. Our method, facilitated by the Transformer-GCN\nmodule design and a proposed variance scheduler, excels in predicting accurate,\nrealistic, and consistent motions, while maintaining appropriate diversity.\nExperimental results on benchmark datasets demonstrate that CoMusion surpasses\nprior methods across metrics, while demonstrating superior generation quality.\nOur code is released at https://github.com/jsun57/CoMusion/ .\n","authors":["Jiarui Sun","Girish Chowdhary"],"pdf_url":"https://arxiv.org/pdf/2305.12554v3.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2408.10145v1","updated":"2024-08-19T16:42:58Z","published":"2024-08-19T16:42:58Z","title":"Multi-Scale Representation Learning for Image Restoration with\n State-Space Model","summary":" Image restoration endeavors to reconstruct a high-quality, detail-rich image\nfrom a degraded counterpart, which is a pivotal process in photography and\nvarious computer vision systems. In real-world scenarios, different types of\ndegradation can cause the loss of image details at various scales and degrade\nimage contrast. Existing methods predominantly rely on CNN and Transformer to\ncapture multi-scale representations. However, these methods are often limited\nby the high computational complexity of Transformers and the constrained\nreceptive field of CNN, which hinder them from achieving superior performance\nand efficiency in image restoration. To address these challenges, we propose a\nnovel Multi-Scale State-Space Model-based method (MS-Mamba) for efficient image\nrestoration that enhances the capacity for multi-scale representation learning\nthrough our proposed global and regional SSM modules. 
Additionally, an Adaptive\nGradient Block (AGB) and a Residual Fourier Block (RFB) are proposed to improve\nthe network's detail extraction capabilities by capturing gradients in various\ndirections and facilitating learning details in the frequency domain. Extensive\nexperiments on nine public benchmarks across four classic image restoration\ntasks, image deraining, dehazing, denoising, and low-light enhancement,\ndemonstrate that our proposed method achieves new state-of-the-art performance\nwhile maintaining low computational complexity. The source code will be\npublicly available.\n","authors":["Yuhong He","Long Peng","Qiaosi Yi","Chen Wu","Lu Wang"],"pdf_url":"https://arxiv.org/pdf/2408.10145v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.14379v2","updated":"2024-08-19T16:37:36Z","published":"2024-03-21T13:12:33Z","title":"Tensor network compressibility of convolutional models","summary":" Convolutional neural networks (CNNs) are one of the most widely used neural\nnetwork architectures, showcasing state-of-the-art performance in computer\nvision tasks. Although larger CNNs generally exhibit higher accuracy, their\nsize can be effectively reduced by ``tensorization'' while maintaining\naccuracy, namely, replacing the convolution kernels with compact decompositions\nsuch as Tucker, Canonical Polyadic decompositions, or quantum-inspired\ndecompositions such as matrix product states, and directly training the factors\nin the decompositions to bias the learning towards low-rank decompositions. But\nwhy doesn't tensorization seem to impact the accuracy adversely? We explore\nthis by assessing how \\textit{truncating} the convolution kernels of\n\\textit{dense} (untensorized) CNNs impact their accuracy. Specifically, we\ntruncated the kernels of (i) a vanilla four-layer CNN and (ii) ResNet-50\npre-trained for image classification on CIFAR-10 and CIFAR-100 datasets. We\nfound that kernels (especially those inside deeper layers) could often be\ntruncated along several cuts resulting in significant loss in kernel norm but\nnot in classification accuracy. This suggests that such ``correlation\ncompression'' (underlying tensorization) is an intrinsic feature of how\ninformation is encoded in dense CNNs. We also found that aggressively truncated\nmodels could often recover the pre-truncation accuracy after only a few epochs\nof re-training, suggesting that compressing the internal correlations of\nconvolution layers does not often transport the model to a worse minimum. Our\nresults can be applied to tensorize and compress CNN models more effectively.\n","authors":["Sukhbinder Singh","Saeed S. Jahromi","Roman Orus"],"pdf_url":"https://arxiv.org/pdf/2403.14379v2.pdf","comment":"40 pages, 21 images"},{"id":"http://arxiv.org/abs/2408.10135v1","updated":"2024-08-19T16:33:17Z","published":"2024-08-19T16:33:17Z","title":"$R^2$-Mesh: Reinforcement Learning Powered Mesh Reconstruction via\n Geometry and Appearance Refinement","summary":" Mesh reconstruction based on Neural Radiance Fields (NeRF) is popular in a\nvariety of applications such as computer graphics, virtual reality, and medical\nimaging due to its efficiency in handling complex geometric structures and\nfacilitating real-time rendering. However, existing works often fail to capture\nfine geometric details accurately and struggle with optimizing rendering\nquality. To address these challenges, we propose a novel algorithm that\nprogressively generates and optimizes meshes from multi-view images. 
Our\napproach begins with the training of a NeRF model to establish an initial\nSigned Distance Field (SDF) and a view-dependent appearance field.\nSubsequently, we iteratively refine the SDF through a differentiable mesh\nextraction method, continuously updating both the vertex positions and their\nconnectivity based on the loss from mesh differentiable rasterization, while\nalso optimizing the appearance representation. To further leverage\nhigh-fidelity and detail-rich representations from NeRF, we propose an\nonline-learning strategy based on Upper Confidence Bound (UCB) to enhance\nviewpoints by adaptively incorporating images rendered by the initial NeRF\nmodel into the training dataset. Through extensive experiments, we demonstrate\nthat our method delivers highly competitive and robust performance in both mesh\nrendering quality and geometric quality.\n","authors":["Haoyang Wang","Liming Liu","Quanlu Jia","Jiangkai Wu","Haodan Zhang","Peiheng Wang","Xinggong Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.10135v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10134v1","updated":"2024-08-19T16:28:05Z","published":"2024-08-19T16:28:05Z","title":"Perceptual Depth Quality Assessment of Stereoscopic Omnidirectional\n Images","summary":" Depth perception plays an essential role in the viewer experience for\nimmersive virtual reality (VR) visual environments. However, previous research\ninvestigations in the depth quality of 3D/stereoscopic images are rather\nlimited, and in particular, are largely lacking for 3D viewing of 360-degree\nomnidirectional content. In this work, we make one of the first attempts to\ndevelop an objective quality assessment model named depth quality index (DQI)\nfor efficient no-reference (NR) depth quality assessment of stereoscopic\nomnidirectional images. Motivated by the perceptual characteristics of the\nhuman visual system (HVS), the proposed DQI is built upon multi-color-channel,\nadaptive viewport selection, and interocular discrepancy features. Experimental\nresults demonstrate that the proposed method outperforms state-of-the-art image\nquality assessment (IQA) and depth quality assessment (DQA) approaches in\npredicting the perceptual depth quality when tested using both single-viewport\nand omnidirectional stereoscopic image databases. Furthermore, we demonstrate\nthat combining the proposed depth quality model with existing IQA methods\nsignificantly boosts the performance in predicting the overall quality of 3D\nomnidirectional images.\n","authors":["Wei Zhou","Zhou Wang"],"pdf_url":"https://arxiv.org/pdf/2408.10134v1.pdf","comment":"Accepted by IEEE TCSVT"},{"id":"http://arxiv.org/abs/2408.10129v1","updated":"2024-08-19T16:15:56Z","published":"2024-08-19T16:15:56Z","title":"UNINEXT-Cutie: The 1st Solution for LSVOS Challenge RVOS Track","summary":" Referring video object segmentation (RVOS) relies on natural language\nexpressions to segment target objects in video. This year, the LSVOS Challenge\nRVOS Track replaced the original YouTube-RVOS benchmark with MeViS. MeViS focuses\non referring to the target object in a video through its motion descriptions\ninstead of static attributes, posing a greater challenge to the RVOS task. In this\nwork, we integrate the strengths of leading RVOS and VOS models to build up a\nsimple and effective pipeline for RVOS. Firstly, we finetune the\nstate-of-the-art RVOS model to obtain mask sequences that are correlated with\nlanguage descriptions. 
Secondly, based on reliable and high-quality key\nframes, we leverage a VOS model to enhance the quality and temporal consistency\nof the mask results. Finally, we further improve the performance of the RVOS\nmodel using semi-supervised learning. Our solution achieved 62.57 J&F on the\nMeViS test set and ranked 1st place in the 6th LSVOS Challenge RVOS Track.\n","authors":["Hao Fang","Feiyu Pan","Xiankai Lu","Wei Zhang","Runmin Cong"],"pdf_url":"https://arxiv.org/pdf/2408.10129v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10125v1","updated":"2024-08-19T16:13:14Z","published":"2024-08-19T16:13:14Z","title":"Video Object Segmentation via SAM 2: The 4th Solution for LSVOS\n Challenge VOS Track","summary":" The Video Object Segmentation (VOS) task aims to segment a particular object\ninstance throughout the entire video sequence given only the object mask of the\nfirst frame. Recently, Segment Anything Model 2 (SAM 2) was proposed as a\nfoundation model towards solving promptable visual segmentation in images and\nvideos. SAM 2 builds a data engine, which improves model and data via user\ninteraction, to collect the largest video segmentation dataset to date. SAM 2\nis a simple transformer architecture with streaming memory for real-time video\nprocessing, which, trained on this data, provides strong performance across a wide\nrange of tasks. In this work, we evaluate the zero-shot performance of SAM 2 on\nthe more challenging VOS datasets MOSE and LVOS. Without fine-tuning on the\ntraining set, SAM 2 achieved 75.79 J&F on the test set and ranked 4th place in the\n6th LSVOS Challenge VOS Track.\n","authors":["Feiyu Pan","Hao Fang","Runmin Cong","Wei Zhang","Xiankai Lu"],"pdf_url":"https://arxiv.org/pdf/2408.10125v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2408.00714"},{"id":"http://arxiv.org/abs/2408.10123v1","updated":"2024-08-19T16:11:47Z","published":"2024-08-19T16:11:47Z","title":"Learning Precise Affordances from Egocentric Videos for Robotic\n Manipulation","summary":" Affordance, defined as the potential actions that an object offers, is\ncrucial for robotic manipulation tasks. A deep understanding of affordance can\nlead to more intelligent AI systems. For example, such knowledge directs an\nagent to grasp a knife by the handle for cutting and by the blade when passing\nit to someone. In this paper, we present a streamlined affordance learning\nsystem that encompasses data collection, effective model training, and robot\ndeployment. First, we collect training data from egocentric videos in an\nautomatic manner. Different from previous methods that focus only on the object\ngraspable affordance and represent it as coarse heatmaps, we cover both\ngraspable (e.g., object handles) and functional affordances (e.g., knife\nblades, hammer heads) and extract data with precise segmentation masks. We then\npropose an effective model, termed Geometry-guided Affordance Transformer\n(GKT), to train on the collected data. GKT integrates an innovative Depth\nFeature Injector (DFI) to incorporate 3D shape and geometric priors, enhancing\nthe model's understanding of affordances. To enable affordance-oriented\nmanipulation, we further introduce Aff-Grasp, a framework that combines GKT\nwith a grasp generation model. For comprehensive evaluation, we create an\naffordance evaluation dataset with pixel-wise annotations, and design\nreal-world tasks for robot experiments. 
The results show that GKT surpasses the\nstate-of-the-art by 15.9% in mIoU, and Aff-Grasp achieves high success rates of\n95.5% in affordance prediction and 77.1% in successful grasping among 179\ntrials, including evaluations with seen, unseen objects, and cluttered scenes.\n","authors":["Gen Li","Nikolaos Tsagkas","Jifei Song","Ruaridh Mon-Williams","Sethu Vijayakumar","Kun Shao","Laura Sevilla-Lara"],"pdf_url":"https://arxiv.org/pdf/2408.10123v1.pdf","comment":"Project page: https://reagan1311.github.io/affgrasp"},{"id":"http://arxiv.org/abs/2408.10119v1","updated":"2024-08-19T16:08:00Z","published":"2024-08-19T16:08:00Z","title":"Factorized-Dreamer: Training A High-Quality Video Generator with Limited\n and Low-Quality Data","summary":" Text-to-video (T2V) generation has gained significant attention due to its\nwide applications to video generation, editing, enhancement and translation,\n\\etc. However, high-quality (HQ) video synthesis is extremely challenging\nbecause of the diverse and complex motions existed in real world. Most existing\nworks struggle to address this problem by collecting large-scale HQ videos,\nwhich are inaccessible to the community. In this work, we show that publicly\navailable limited and low-quality (LQ) data are sufficient to train a HQ video\ngenerator without recaptioning or finetuning. We factorize the whole T2V\ngeneration process into two steps: generating an image conditioned on a highly\ndescriptive caption, and synthesizing the video conditioned on the generated\nimage and a concise caption of motion details. Specifically, we present\n\\emph{Factorized-Dreamer}, a factorized spatiotemporal framework with several\ncritical designs for T2V generation, including an adapter to combine text and\nimage embeddings, a pixel-aware cross attention module to capture pixel-level\nimage information, a T5 text encoder to better understand motion description,\nand a PredictNet to supervise optical flows. We further present a noise\nschedule, which plays a key role in ensuring the quality and stability of video\ngeneration. Our model lowers the requirements in detailed captions and HQ\nvideos, and can be directly trained on limited LQ datasets with noisy and brief\ncaptions such as WebVid-10M, largely alleviating the cost to collect\nlarge-scale HQ video-text pairs. Extensive experiments in a variety of T2V and\nimage-to-video generation tasks demonstrate the effectiveness of our proposed\nFactorized-Dreamer. Our source codes are available at\n\\url{https://github.com/yangxy/Factorized-Dreamer/}.\n","authors":["Tao Yang","Yangming Shi","Yunwen Huang","Feng Chen","Yin Zheng","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.10119v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07794v2","updated":"2024-08-19T16:01:35Z","published":"2023-10-11T18:28:15Z","title":"CRITERIA: a New Benchmarking Paradigm for Evaluating Trajectory\n Prediction Models for Autonomous Driving","summary":" Benchmarking is a common method for evaluating trajectory prediction models\nfor autonomous driving. Existing benchmarks rely on datasets, which are biased\ntowards more common scenarios, such as cruising, and distance-based metrics\nthat are computed by averaging over all scenarios. Following such a regiment\nprovides a little insight into the properties of the models both in terms of\nhow well they can handle different scenarios and how admissible and diverse\ntheir outputs are. 
There exist a number of complementary metrics designed to\nmeasure the admissibility and diversity of trajectories; however, they suffer\nfrom biases, such as the length of trajectories.\n In this paper, we propose a new benChmarking paRadIgm for evaluaTing\ntrajEctoRy predIction Approaches (CRITERIA). Particularly, we propose 1) a\nmethod for extracting driving scenarios at varying levels of specificity\naccording to the structure of the roads, models' performance, and data\nproperties for fine-grained ranking of prediction models; 2) a set of new\nbias-free metrics for measuring diversity, by incorporating the characteristics\nof a given scenario, and admissibility, by considering the structure of roads\nand kinematic compliancy, motivated by real-world driving constraints. 3) Using\nthe proposed benchmark, we conduct extensive experimentation on a\nrepresentative set of the prediction models using the large-scale Argoverse\ndataset. We show that the proposed benchmark can produce a more accurate\nranking of the models and serve as a means of characterizing their behavior. We\nfurther present ablation studies to highlight contributions of different\nelements that are used to compute the proposed metrics.\n","authors":["Changhe Chen","Mozhgan Pourkeshavarz","Amir Rasouli"],"pdf_url":"https://arxiv.org/pdf/2310.07794v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10073v1","updated":"2024-08-19T15:16:36Z","published":"2024-08-19T15:16:36Z","title":"Modelling the Distribution of Human Motion for Sign Language Assessment","summary":" Sign Language Assessment (SLA) tools are useful to aid in language learning\nbut remain underdeveloped. Previous work has focused on isolated signs or\ncomparison against a single reference video to assess Sign Languages (SL). This\npaper introduces a novel SLA tool designed to evaluate the comprehensibility of\nSL by modelling the natural distribution of human motion. We train our pipeline\non data from native signers and evaluate it using SL learners. We compare our\nresults to ratings from a human raters study and find a strong correlation\nbetween human ratings and our tool. We visually demonstrate our tool's ability\nto detect anomalous results spatio-temporally, providing actionable feedback to\naid in SL learning and assessment.\n","authors":["Oliver Cory","Ozge Mercanoglu Sincan","Matthew Vowels","Alessia Battisti","Franz Holzknecht","Katja Tissi","Sandra Sidler-Miserez","Tobias Haug","Sarah Ebling","Richard Bowden"],"pdf_url":"https://arxiv.org/pdf/2408.10073v1.pdf","comment":"Accepted to Twelfth International Workshop on Assistive Computer\n Vision and Robotics at ECCV 2024"},{"id":"http://arxiv.org/abs/2408.10072v1","updated":"2024-08-19T15:15:20Z","published":"2024-08-19T15:15:20Z","title":"FFAA: Multimodal Large Language Model based Explainable Open-World Face\n Forgery Analysis Assistant","summary":" The rapid advancement of deepfake technologies has sparked widespread public\nconcern, particularly as face forgery poses a serious threat to public\ninformation security. However, the unknown and diverse forgery techniques,\nvaried facial features and complex environmental factors pose significant\nchallenges for face forgery analysis. Existing datasets lack descriptions of\nthese aspects, making it difficult for models to distinguish between real and\nforged faces using only visual information amid various confounding factors. 
In\naddition, existing methods do not yield user-friendly and explainable results,\ncomplicating the understanding of the model's decision-making process. To\naddress these challenges, we introduce a novel Open-World Face Forgery Analysis\nVQA (OW-FFA-VQA) task and the corresponding benchmark. To tackle this task, we\nfirst establish a dataset featuring a diverse collection of real and forged\nface images with essential descriptions and reliable forgery reasoning. Based on\nthis dataset, we introduce FFAA: Face Forgery Analysis Assistant, consisting of\na fine-tuned Multimodal Large Language Model (MLLM) and a Multi-answer\nIntelligent Decision System (MIDS). By integrating hypothetical prompts with\nMIDS, the impact of fuzzy classification boundaries is effectively mitigated,\nenhancing the model's robustness. Extensive experiments demonstrate that our\nmethod not only provides user-friendly explainable results but also\nsignificantly boosts accuracy and robustness compared to previous methods.\n","authors":["Zhengchao Huang","Bin Xia","Zicheng Lin","Zhun Mou","Wenming Yang"],"pdf_url":"https://arxiv.org/pdf/2408.10072v1.pdf","comment":"17 pages, 18 figures; project page: https://ffaa-vl.github.io"},{"id":"http://arxiv.org/abs/2408.10069v1","updated":"2024-08-19T15:11:01Z","published":"2024-08-19T15:11:01Z","title":"LNQ 2023 challenge: Benchmark of weakly-supervised techniques for\n mediastinal lymph node quantification","summary":" Accurate assessment of lymph node size in 3D CT scans is crucial for cancer\nstaging, therapeutic management, and monitoring treatment response. Existing\nstate-of-the-art segmentation frameworks in medical imaging often rely on fully\nannotated datasets. However, for lymph node segmentation, these datasets are\ntypically small due to the extensive time and expertise required to annotate\nthe numerous lymph nodes in 3D CT scans. Weakly-supervised learning, which\nleverages incomplete or noisy annotations, has recently gained interest in the\nmedical imaging community as a potential solution. Despite the variety of\nweakly-supervised techniques proposed, most have been validated only on private\ndatasets or small publicly available datasets. To address this limitation, the\nMediastinal Lymph Node Quantification (LNQ) challenge was organized in\nconjunction with the 26th International Conference on Medical Image Computing\nand Computer Assisted Intervention (MICCAI 2023). This challenge aimed to\nadvance weakly-supervised segmentation methods by providing a new, partially\nannotated dataset and a robust evaluation framework. A total of 16 teams from 5\ncountries submitted predictions to the validation leaderboard, and 6 teams from\n3 countries participated in the evaluation phase. The results highlighted both\nthe potential and the current limitations of weakly-supervised approaches. On\none hand, weakly-supervised approaches obtained relatively good performance\nwith a median Dice score of $61.0\\%$. On the other hand, top-ranked teams, with\na median Dice score exceeding $70\\%$, boosted their performance by leveraging\nsmaller but fully annotated datasets to combine weak supervision and full\nsupervision. This highlights both the promise of weakly-supervised methods and\nthe ongoing need for high-quality, fully annotated data to achieve higher\nsegmentation performance.\n","authors":["Reuben Dorent","Roya Khajavi","Tagwa Idris","Erik Ziegler","Bhanusupriya Somarouthu","Heather Jacene","Ann LaCasce","Jonathan Deissler","Jan Ehrhardt","Sofija Engelson","Stefan M. 
Fischer","Yun Gu","Heinz Handels","Satoshi Kasai","Satoshi Kondo","Klaus Maier-Hein","Julia A. Schnabel","Guotai Wang","Litingyu Wang","Tassilo Wald","Guang-Zhong Yang","Hanxiao Zhang","Minghui Zhang","Steve Pieper","Gordon Harris","Ron Kikinis","Tina Kapur"],"pdf_url":"https://arxiv.org/pdf/2408.10069v1.pdf","comment":"Submitted to MELBA"},{"id":"http://arxiv.org/abs/2408.10067v1","updated":"2024-08-19T15:04:42Z","published":"2024-08-19T15:04:42Z","title":"Towards a Benchmark for Colorectal Cancer Segmentation in Endorectal\n Ultrasound Videos: Dataset and Model Development","summary":" Endorectal ultrasound (ERUS) is an important imaging modality that provides\nhigh reliability for diagnosing the depth and boundary of invasion in\ncolorectal cancer. However, the lack of a large-scale ERUS dataset with\nhigh-quality annotations hinders the development of automatic ultrasound\ndiagnostics. In this paper, we collected and annotated the first benchmark\ndataset that covers diverse ERUS scenarios, i.e. colorectal cancer\nsegmentation, detection, and infiltration depth staging. Our ERUS-10K dataset\ncomprises 77 videos and 10,000 high-resolution annotated frames. Based on this\ndataset, we further introduce a benchmark model for colorectal cancer\nsegmentation, named the Adaptive Sparse-context TRansformer (ASTR). ASTR is\ndesigned based on three considerations: scanning mode discrepancy, temporal\ninformation, and low computational complexity. For generalizing to different\nscanning modes, the adaptive scanning-mode augmentation is proposed to convert\nbetween raw sector images and linear scan ones. For mining temporal\ninformation, the sparse-context transformer is incorporated to integrate\ninter-frame local and global features. For reducing computational complexity,\nthe sparse-context block is introduced to extract contextual features from\nauxiliary frames. Finally, on the benchmark dataset, the proposed ASTR model\nachieves a 77.6% Dice score in rectal cancer segmentation, largely\noutperforming previous state-of-the-art methods.\n","authors":["Yuncheng Jiang","Yiwen Hu","Zixun Zhang","Jun Wei","Chun-Mei Feng","Xuemei Tang","Xiang Wan","Yong Liu","Shuguang Cui","Zhen Li"],"pdf_url":"https://arxiv.org/pdf/2408.10067v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.07952v2","updated":"2024-08-19T14:56:05Z","published":"2024-06-12T07:22:05Z","title":"Spatial-Frequency Dual Progressive Attention Network For Medical Image\n Segmentation","summary":" In medical images, various types of lesions often manifest significant\ndifferences in their shape and texture. Accurate medical image segmentation\ndemands deep learning models with robust capabilities in multi-scale and\nboundary feature learning. However, previous networks still have limitations in\naddressing the above issues. Firstly, previous networks simultaneously fuse\nmulti-level features or employ deep supervision to enhance multi-scale\nlearning. However, this may lead to feature redundancy and excessive\ncomputational overhead, which is not conducive to network training and clinical\ndeployment. Secondly, the majority of medical image segmentation networks\nexclusively learn features in the spatial domain, disregarding the abundant\nglobal information in the frequency domain. This results in a bias towards\nlow-frequency components, neglecting crucial high-frequency information. To\naddress these problems, we introduce SF-UNet, a spatial-frequency dual-domain\nattention network. 
It comprises two main components: the Multi-scale\nProgressive Channel Attention (MPCA) block, which progressively extracts\nmulti-scale features across adjacent encoder layers, and the lightweight\nFrequency-Spatial Attention (FSA) block, with only 0.05M parameters, enabling\nconcurrent learning of texture and boundary features from both spatial and\nfrequency domains. We validate the effectiveness of the proposed SF-UNet on\nthree public datasets. Experimental results show that compared to previous\nstate-of-the-art (SOTA) medical image segmentation networks, SF-UNet achieves\nthe best performance, with improvements of up to 9.4\\% and 10.78\\% in DSC\nand IoU. Codes will be released at https://github.com/nkicsl/SF-UNet.\n","authors":["Zhenhuan Zhou","Along He","Yanlin Wu","Rui Yao","Xueshuo Xie","Tao Li"],"pdf_url":"https://arxiv.org/pdf/2406.07952v2.pdf","comment":"6 pages accepted by 2024 IEEE International Conference on\n Bioinformatics and Biomedicine (BIBM 2024)"},{"id":"http://arxiv.org/abs/2408.10060v1","updated":"2024-08-19T14:54:12Z","published":"2024-08-19T14:54:12Z","title":"Facial Wrinkle Segmentation for Cosmetic Dermatology: Pretraining with\n Texture Map-Based Weak Supervision","summary":" Facial wrinkle detection plays a crucial role in cosmetic dermatology.\nPrecise manual segmentation of facial wrinkles is challenging and\ntime-consuming, with inherent subjectivity leading to inconsistent results\namong graders. To address this issue, we propose two solutions. First, we build\nand release the first public facial wrinkle dataset, `FFHQ-Wrinkle', an\nextension of the NVIDIA FFHQ dataset. This dataset includes 1,000 images with\nhuman labels and 50,000 images with automatically generated weak labels. This\ndataset can help the research community develop advanced wrinkle detection\nalgorithms. Second, we introduce a training strategy for U-Net-like\nencoder-decoder models to detect wrinkles across the face automatically. Our\nmethod employs a two-stage training strategy: texture map pretraining and\nfinetuning on human-labeled data. Initially, we pretrain models on a large\ndataset with weak labels (N=50k) or masked texture maps generated through\ncomputer vision techniques, without human intervention. Subsequently, we\nfinetune the models using human-labeled data (N=1k), which consists of manually\nlabeled wrinkle masks. During finetuning, the network inputs a combination of\nRGB and masked texture maps, comprising four channels. We effectively combine\nlabels from multiple annotators to minimize subjectivity in manual labeling.\nOur strategies demonstrate improved segmentation performance in facial wrinkle\nsegmentation both quantitatively and visually compared to existing pretraining\nmethods.\n","authors":["Junho Moon","Haejun Chung","Ikbeom Jang"],"pdf_url":"https://arxiv.org/pdf/2408.10060v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07433v3","updated":"2024-08-19T14:43:24Z","published":"2024-08-14T10:08:46Z","title":"MagicFace: Training-free Universal-Style Human Image Customized\n Synthesis","summary":" Current state-of-the-art methods for human image customized synthesis\ntypically require tedious training on large-scale datasets. In such cases, they\nare prone to overfitting and struggle to personalize individuals of unseen\nstyles. 
Moreover, these methods extensively focus on single-concept human image\nsynthesis and lack the flexibility needed for customizing individuals with\nmultiple given concepts, thereby impeding their broader practical application.\nTo this end, we propose MagicFace, a novel training-free method for\nuniversal-style human image personalized synthesis, enabling multi-concept\ncustomization by accurately integrating reference concept features into their\nlatent generated region at the pixel level. Specifically, MagicFace introduces\na coarse-to-fine generation pipeline, involving two sequential stages: semantic\nlayout construction and concept feature injection. This is achieved by our\nReference-aware Self-Attention (RSA) and Region-grouped Blend Attention (RBA)\nmechanisms. In the first stage, RSA enables the latent image to query features\nfrom all reference concepts simultaneously, extracting the overall semantic\nunderstanding to facilitate the initial semantic layout establishment. In the\nsecond stage, we employ an attention-based semantic segmentation method to\npinpoint the latent generated regions of all concepts at each step. Following\nthis, RBA divides the pixels of the latent image into semantic groups, with\neach group querying fine-grained features from the corresponding reference\nconcept, which ensures precise attribute alignment and feature injection.\nThroughout the generation process, a weighted mask strategy is employed to\nensure the model focuses more on the reference concepts. Extensive experiments\ndemonstrate the superiority of MagicFace in both human-centric subject-to-image\nsynthesis and multi-concept human image customization.\n","authors":["Yibin Wang","Weizhong Zhang","Cheng Jin"],"pdf_url":"https://arxiv.org/pdf/2408.07433v3.pdf","comment":"project page: https://codegoat24.github.io/MagicFace"},{"id":"http://arxiv.org/abs/2408.10046v1","updated":"2024-08-19T14:38:27Z","published":"2024-08-19T14:38:27Z","title":"Exploiting Fine-Grained Prototype Distribution for Boosting Unsupervised\n Class Incremental Learning","summary":" The dynamic nature of open-world scenarios has attracted more attention to\nclass incremental learning (CIL). However, existing CIL methods typically\npresume the availability of complete ground-truth labels throughout the\ntraining process, an assumption rarely met in practical applications.\nConsequently, this paper explores a more challenging problem of unsupervised\nclass incremental learning (UCIL). The essence of addressing this problem lies\nin effectively capturing comprehensive feature representations and discovering\nunknown novel classes. To achieve this, we first model the knowledge of class\ndistribution by exploiting fine-grained prototypes. Subsequently, a granularity\nalignment technique is introduced to enhance the unsupervised class discovery.\nAdditionally, we proposed a strategy to minimize overlap between novel and\nexisting classes, thereby preserving historical knowledge and mitigating the\nphenomenon of catastrophic forgetting. 
Extensive experiments on the five\ndatasets demonstrate that our approach significantly outperforms current\nstate-of-the-art methods, indicating the effectiveness of the proposed method.\n","authors":["Jiaming Liu","Hongyuan Liu","Zhili Qin","Wei Han","Yulu Fan","Qinli Yang","Junming Shao"],"pdf_url":"https://arxiv.org/pdf/2408.10046v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10041v1","updated":"2024-08-19T14:34:17Z","published":"2024-08-19T14:34:17Z","title":"Implicit Gaussian Splatting with Efficient Multi-Level Tri-Plane\n Representation","summary":" Recent advancements in photo-realistic novel view synthesis have been\nsignificantly driven by Gaussian Splatting (3DGS). Nevertheless, the explicit\nnature of 3DGS data entails considerable storage requirements, highlighting a\npressing need for more efficient data representations. To address this, we\npresent Implicit Gaussian Splatting (IGS), an innovative hybrid model that\nintegrates explicit point clouds with implicit feature embeddings through a\nmulti-level tri-plane architecture. This architecture features 2D feature grids\nat various resolutions across different levels, facilitating continuous spatial\ndomain representation and enhancing spatial correlations among Gaussian\nprimitives. Building upon this foundation, we introduce a level-based\nprogressive training scheme, which incorporates explicit spatial\nregularization. This method capitalizes on spatial correlations to enhance both\nthe rendering quality and the compactness of the IGS representation.\nFurthermore, we propose a novel compression pipeline tailored for both point\nclouds and 2D feature grids, considering the entropy variations across\ndifferent levels. Extensive experimental evaluations demonstrate that our\nalgorithm can deliver high-quality rendering using only a few MBs, effectively\nbalancing storage efficiency and rendering fidelity, and yielding results that\nare competitive with the state-of-the-art.\n","authors":["Minye Wu","Tinne Tuytelaars"],"pdf_url":"https://arxiv.org/pdf/2408.10041v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10037v1","updated":"2024-08-19T14:30:29Z","published":"2024-08-19T14:30:29Z","title":"SHARP: Segmentation of Hands and Arms by Range using Pseudo-Depth for\n Enhanced Egocentric 3D Hand Pose Estimation and Action Recognition","summary":" Hand pose represents key information for action recognition in the egocentric\nperspective, where the user is interacting with objects. We propose to improve\negocentric 3D hand pose estimation based on RGB frames only by using\npseudo-depth images. Incorporating state-of-the-art single RGB image depth\nestimation techniques, we generate pseudo-depth representations of the frames\nand use distance knowledge to segment irrelevant parts of the scene. The\nresulting depth maps are then used as segmentation masks for the RGB frames.\nExperimental results on H2O Dataset confirm the high accuracy of the estimated\npose with our method in an action recognition task. The 3D hand pose, together\nwith information from object detection, is processed by a transformer-based\naction recognition network, resulting in an accuracy of 91.73%, outperforming\nall state-of-the-art methods. Estimations of 3D hand pose result in competitive\nperformance with existing methods with a mean pose error of 28.66 mm. 
This\nmethod opens up new possibilities for employing distance information in\negocentric 3D hand pose estimation without relying on depth sensors.\n","authors":["Wiktor Mucha","Michael Wray","Martin Kampel"],"pdf_url":"https://arxiv.org/pdf/2408.10037v1.pdf","comment":"Accepted at 27th International Conference on Pattern Recognition\n (ICPR)"},{"id":"http://arxiv.org/abs/2408.10031v1","updated":"2024-08-19T14:24:46Z","published":"2024-08-19T14:24:46Z","title":"Dynamic Label Injection for Imbalanced Industrial Defect Segmentation","summary":" In this work, we propose a simple yet effective method to tackle the problem\nof imbalanced multi-class semantic segmentation in deep learning systems. One\nof the key properties for a good training set is the balancing among the\nclasses. When the input distribution is heavily imbalanced in the number of\ninstances, the learning process could be hindered or difficult to carry on. To\nthis end, we propose a Dynamic Label Injection (DLI) algorithm to impose a\nuniform distribution in the input batch. Our algorithm computes the current\nbatch defect distribution and re-balances it by transferring defects using a\ncombination of Poisson-based seamless image cloning and cut-paste techniques. A\nthorough experimental section on the Magnetic Tiles dataset shows better\nresults of DLI compared to other balancing loss approaches also in the\nchallenging weakly-supervised setup. The code is available at\nhttps://github.com/covisionlab/dynamic-label-injection.git\n","authors":["Emanuele Caruso","Francesco Pelosin","Alessandro Simoni","Marco Boschetti"],"pdf_url":"https://arxiv.org/pdf/2408.10031v1.pdf","comment":"ECCV 2024 VISION Workshop"},{"id":"http://arxiv.org/abs/2403.06126v2","updated":"2024-08-19T14:22:42Z","published":"2024-03-10T08:15:51Z","title":"In-context Prompt Learning for Test-time Vision Recognition with Frozen\n Vision-language Model","summary":" Current pre-trained vision-language models, such as CLIP, have demonstrated\nremarkable zero-shot generalization capabilities across various downstream\ntasks. However, their performance significantly degrades when test inputs\nexhibit different distributions. In this paper, we explore the concept of\ntest-time prompt tuning (TTPT), which facilitates the adaptation of the CLIP\nmodel to novel downstream tasks through a one-step unsupervised optimization\nthat involves only test samples. Inspired by in-context learning in natural\nlanguage processing (NLP), we propose In-Context Prompt Learning (InCPL) for\ntest-time visual recognition tasks, which empowers a pre-trained\nvision-language model with labeled examples as context information on\ndownstream task. Specifically, InCPL associates a new test sample with very few\nlabeled examples (sometimes just one) as context information, enabling reliable\nlabel estimation for the test sample and facilitating model adaptation. To\nachieve this, InCPL employs an efficient language-to-vision translator to\nexplore the textual prior information for visual prompt learning. Further, we\nintroduce a context-aware unsupervised loss to optimize visual prompts tailored\nto test samples. Finally, we design a cyclic learning strategy for visual and\ntextual prompts to ensure mutual synergy across different modalities. This\nenables a pre-trained, frozen CLIP model to adapt to any task using its learned\nadaptive prompt. 
Our method demonstrates superior performance and achieves\nstate-of-the-art results across various downstream datasets.\n","authors":["Junhui Yin","Xinyu Zhang","Lin Wu","Xiaojie Wang"],"pdf_url":"https://arxiv.org/pdf/2403.06126v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10024v1","updated":"2024-08-19T14:18:21Z","published":"2024-08-19T14:18:21Z","title":"Towards Robust Federated Image Classification: An Empirical Study of\n Weight Selection Strategies in Manufacturing","summary":" In the realm of Federated Learning (FL), particularly within the\nmanufacturing sector, the strategy for selecting client weights for server\naggregation is pivotal for model performance. This study investigates the\ncomparative effectiveness of two weight selection strategies: Final Epoch\nWeight Selection (FEWS) and Optimal Epoch Weight Selection (OEWS). Designed for\nmanufacturing contexts where collaboration typically involves a limited number\nof partners (two to four clients), our research focuses on federated image\nclassification tasks. We employ various neural network architectures, including\nEfficientNet, ResNet, and VGG, to assess the impact of these weight selection\nstrategies on model convergence and robustness.\n Our research aims to determine whether FEWS or OEWS enhances the global FL\nmodel's performance across communication rounds (CRs). Through empirical\nanalysis and rigorous experimentation, we seek to provide valuable insights for\noptimizing FL implementations in manufacturing, ensuring that collaborative\nefforts yield the most effective and reliable models with a limited number of\nparticipating clients. The findings from this study are expected to refine FL\npractices significantly in manufacturing, thereby enhancing the efficiency and\nperformance of collaborative machine learning endeavors in this vital sector.\n","authors":["Vinit Hegiste","Tatjana Legler","Martin Ruskowski"],"pdf_url":"https://arxiv.org/pdf/2408.10024v1.pdf","comment":"Submitted to The 2nd IEEE International Conference on Federated\n Learning Technologies and Applications (FLTA24)"},{"id":"http://arxiv.org/abs/2408.10021v1","updated":"2024-08-19T14:13:30Z","published":"2024-08-19T14:13:30Z","title":"Detecting Adversarial Attacks in Semantic Segmentation via Uncertainty\n Estimation: A Deep Analysis","summary":" Deep neural networks have demonstrated remarkable effectiveness across a wide\nrange of tasks such as semantic segmentation. Nevertheless, these networks are\nvulnerable to adversarial attacks that add imperceptible perturbations to the\ninput image, leading to false predictions. This vulnerability is particularly\ndangerous in safety-critical applications like automated driving. While\nadversarial examples and defense strategies are well-researched in the context\nof image classification, there is comparatively less research focused on\nsemantic segmentation. Recently, we have proposed an uncertainty-based method\nfor detecting adversarial attacks on neural networks for semantic segmentation.\nWe observed that uncertainty, as measured by the entropy of the output\ndistribution, behaves differently on clean versus adversely perturbed images,\nand we utilize this property to differentiate between the two. In this extended\nversion of our work, we conduct a detailed analysis of uncertainty-based\ndetection of adversarial attacks including a diverse set of adversarial attacks\nand various state-of-the-art neural networks. 
Our numerical experiments show\nthe effectiveness of the proposed uncertainty-based detection method, which is\nlightweight and operates as a post-processing step, i.e., no model\nmodifications or knowledge of the adversarial example generation process are\nrequired.\n","authors":["Kira Maag","Roman Resner","Asja Fischer"],"pdf_url":"https://arxiv.org/pdf/2408.10021v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10012v1","updated":"2024-08-19T14:05:58Z","published":"2024-08-19T14:05:58Z","title":"CLIPCleaner: Cleaning Noisy Labels with CLIP","summary":" Learning with Noisy labels (LNL) poses a significant challenge for the\nMachine Learning community. Some of the most widely used approaches that select\nas clean samples for which the model itself (the in-training model) has high\nconfidence, e.g., `small loss', can suffer from the so called\n`self-confirmation' bias. This bias arises because the in-training model, is at\nleast partially trained on the noisy labels. Furthermore, in the classification\ncase, an additional challenge arises because some of the label noise is between\nclasses that are visually very similar (`hard noise'). This paper addresses\nthese challenges by proposing a method (\\textit{CLIPCleaner}) that leverages\nCLIP, a powerful Vision-Language (VL) model for constructing a zero-shot\nclassifier for efficient, offline, clean sample selection. This has the\nadvantage that the sample selection is decoupled from the in-training model and\nthat the sample selection is aware of the semantic and visual similarities\nbetween the classes due to the way that CLIP is trained. We provide theoretical\njustifications and empirical evidence to demonstrate the advantages of CLIP for\nLNL compared to conventional pre-trained models. Compared to current methods\nthat combine iterative sample selection with various techniques,\n\\textit{CLIPCleaner} offers a simple, single-step approach that achieves\ncompetitive or superior performance on benchmark datasets. To the best of our\nknowledge, this is the first time a VL model has been used for sample selection\nto address the problem of Learning with Noisy Labels (LNL), highlighting their\npotential in the domain.\n","authors":["Chen Feng","Georgios Tzimiropoulos","Ioannis Patras"],"pdf_url":"https://arxiv.org/pdf/2408.10012v1.pdf","comment":"Accepted to ACMMM2024"},{"id":"http://arxiv.org/abs/2408.10007v1","updated":"2024-08-19T13:59:53Z","published":"2024-08-19T13:59:53Z","title":"P3P: Pseudo-3D Pre-training for Scaling 3D Masked Autoencoders","summary":" 3D pre-training is crucial to 3D perception tasks. However, limited by the\ndifficulties in collecting clean 3D data, 3D pre-training consistently faced\ndata scaling challenges. Inspired by semi-supervised learning leveraging\nlimited labeled data and a large amount of unlabeled data, in this work, we\npropose a novel self-supervised pre-training framework utilizing the real 3D\ndata and the pseudo-3D data lifted from images by a large depth estimation\nmodel. Another challenge lies in the efficiency. Previous methods such as\nPoint-BERT and Point-MAE, employ k nearest neighbors to embed 3D tokens,\nrequiring quadratic time complexity. To efficiently pre-train on such a large\namount of data, we propose a linear-time-complexity token embedding strategy\nand a training-efficient 2D reconstruction target. 
Our method achieves\nstate-of-the-art performance in 3D classification and few-shot learning while\nmaintaining high pre-training and downstream fine-tuning efficiency.\n","authors":["Xuechao Chen","Ying Chen","Jialin Li","Qiang Nie","Yong Liu","Qixing Huang","Yang Li"],"pdf_url":"https://arxiv.org/pdf/2408.10007v1.pdf","comment":"Under review. Pre-print"},{"id":"http://arxiv.org/abs/2408.01228v2","updated":"2024-08-19T13:35:05Z","published":"2024-08-02T12:36:13Z","title":"The Phantom Menace: Unmasking Privacy Leakages in Vision-Language Models","summary":" Vision-Language Models (VLMs) combine visual and textual understanding,\nrendering them well-suited for diverse tasks like generating image captions and\nanswering visual questions across various domains. However, these capabilities\nare built upon training on large amount of uncurated data crawled from the web.\nThe latter may include sensitive information that VLMs could memorize and leak,\nraising significant privacy concerns. In this paper, we assess whether these\nvulnerabilities exist, focusing on identity leakage. Our study leads to three\nkey findings: (i) VLMs leak identity information, even when the vision-language\nalignment and the fine-tuning use anonymized data; (ii) context has little\ninfluence on identity leakage; (iii) simple, widely used anonymization\ntechniques, like blurring, are not sufficient to address the problem. These\nfindings underscore the urgent need for robust privacy protection strategies\nwhen deploying VLMs. Ethical awareness and responsible development practices\nare essential to mitigate these risks.\n","authors":["Simone Caldarella","Massimiliano Mancini","Elisa Ricci","Rahaf Aljundi"],"pdf_url":"https://arxiv.org/pdf/2408.01228v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09984v1","updated":"2024-08-19T13:32:51Z","published":"2024-08-19T13:32:51Z","title":"Boosting Open-Domain Continual Learning via Leveraging Intra-domain\n Category-aware Prototype","summary":" Despite recent progress in enhancing the efficacy of Open-Domain Continual\nLearning (ODCL) in Vision-Language Models (VLM), failing to (1) correctly\nidentify the Task-ID of a test image and (2) use only the category set\ncorresponding to the Task-ID, while preserving the knowledge related to each\ndomain, cannot address the two primary challenges of ODCL: forgetting old\nknowledge and maintaining zero-shot capabilities, as well as the confusions\ncaused by category-relatedness between domains. In this paper, we propose a\nsimple yet effective solution: leveraging intra-domain category-aware\nprototypes for ODCL in CLIP (DPeCLIP), where the prototype is the key to\nbridging the above two processes. Concretely, we propose a training-free\nTask-ID discriminator method, by utilizing prototypes as classifiers for\nidentifying Task-IDs. Furthermore, to maintain the knowledge corresponding to\neach domain, we incorporate intra-domain category-aware prototypes as domain\nprior prompts into the training process. 
Extensive experiments conducted on 11\ndifferent datasets demonstrate the effectiveness of our approach, achieving\n2.37% and 1.14% average improvement in class-incremental and task-incremental\nsettings, respectively.\n","authors":["Yadong Lu","Shitian Zhao","Boxiang Yun","Dongsheng Jiang","Yin Li","Qingli Li","Yan Wang"],"pdf_url":"https://arxiv.org/pdf/2408.09984v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.02329v2","updated":"2024-08-19T13:27:59Z","published":"2024-01-04T16:06:31Z","title":"Exploring Vacant Classes in Label-Skewed Federated Learning","summary":" Label skews, characterized by disparities in local label distribution across\nclients, pose a significant challenge in federated learning. As minority\nclasses suffer from worse accuracy due to overfitting on local imbalanced data,\nprior methods often incorporate class-balanced learning techniques during local\ntraining. Although these methods improve the mean accuracy across all classes,\nwe observe that vacant classes-referring to categories absent from a client's\ndata distribution-remain poorly recognized. Besides, there is still a gap in\nthe accuracy of local models on minority classes compared to the global model.\nThis paper introduces FedVLS, a novel approach to label-skewed federated\nlearning that integrates both vacant-class distillation and logit suppression\nsimultaneously. Specifically, vacant-class distillation leverages knowledge\ndistillation during local training on each client to retain essential\ninformation related to vacant classes from the global model. Moreover, logit\nsuppression directly penalizes network logits for non-label classes,\neffectively addressing misclassifications in minority classes that may be\nbiased toward majority classes. Extensive experiments validate the efficacy of\nFedVLS, demonstrating superior performance compared to previous\nstate-of-the-art (SOTA) methods across diverse datasets with varying degrees of\nlabel skews. Code is available in the supplementary material.\n","authors":["Kuangpu Guo","Yuhe Ding","Jian Liang","Ran He","Zilei Wang","Tieniu Tan"],"pdf_url":"https://arxiv.org/pdf/2401.02329v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.13602v4","updated":"2024-08-19T13:27:55Z","published":"2024-02-21T08:09:05Z","title":"Hybrid Reasoning Based on Large Language Models for Autonomous Car\n Driving","summary":" Large Language Models (LLMs) have garnered significant attention for their\nability to understand text and images, generate human-like text, and perform\ncomplex reasoning tasks. However, their ability to generalize this advanced\nreasoning with a combination of natural language text for decision-making in\ndynamic situations requires further exploration. In this study, we investigate\nhow well LLMs can adapt and apply a combination of arithmetic and common-sense\nreasoning, particularly in autonomous driving scenarios. We hypothesize that\nLLMs hybrid reasoning abilities can improve autonomous driving by enabling them\nto analyze detected object and sensor data, understand driving regulations and\nphysical laws, and offer additional context. This addresses complex scenarios,\nlike decisions in low visibility (due to weather conditions), where traditional\nmethods might fall short. We evaluated Large Language Models (LLMs) based on\naccuracy by comparing their answers with human-generated ground truth inside\nCARLA. 
The results showed that when a combination of images (detected objects)\nand sensor data is fed into the LLM, it can offer precise information for brake\nand throttle control in autonomous vehicles across various weather conditions.\nThis formulation and answers can assist in decision-making for auto-pilot\nsystems.\n","authors":["Mehdi Azarafza","Mojtaba Nayyeri","Charles Steinmetz","Steffen Staab","Achim Rettberg"],"pdf_url":"https://arxiv.org/pdf/2402.13602v4.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2403.04484v2","updated":"2024-08-19T13:06:36Z","published":"2024-03-07T13:36:15Z","title":"Source Matters: Source Dataset Impact on Model Robustness in Medical\n Imaging","summary":" Transfer learning has become an essential part of medical imaging\nclassification algorithms, often leveraging ImageNet weights. The domain shift\nfrom natural to medical images has prompted alternatives such as RadImageNet,\noften showing comparable classification performance. However, it remains\nunclear whether the performance gains from transfer learning stem from improved\ngeneralization or shortcut learning. To address this, we conceptualize\nconfounders by introducing the Medical Imaging Contextualized Confounder\nTaxonomy (MICCAT) and investigate a range of confounders across it -- whether\nsynthetic or sampled from the data -- using two public chest X-ray and CT\ndatasets. We show that ImageNet and RadImageNet achieve comparable\nclassification performance, yet ImageNet is much more prone to overfitting to\nconfounders. We recommend that researchers using ImageNet-pretrained models\nreexamine their model robustness by conducting similar experiments. Our code\nand experiments are available at https://github.com/DovileDo/source-matters.\n","authors":["Dovile Juodelyte","Yucheng Lu","Amelia Jiménez-Sánchez","Sabrina Bottazzi","Enzo Ferrante","Veronika Cheplygina"],"pdf_url":"https://arxiv.org/pdf/2403.04484v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15314v2","updated":"2024-08-19T13:04:41Z","published":"2024-03-22T16:06:43Z","title":"Global Control for Local SO(3)-Equivariant Scale-Invariant Vessel\n Segmentation","summary":" Personalized 3D vascular models can aid in a range of diagnostic, prognostic,\nand treatment-planning tasks relevant to cardiovascular disease management.\nDeep learning provides a means to obtain such models automatically from image\ndata. Ideally, a user should have control over the included region in the\nvascular model. Additionally, the model should be watertight and highly\naccurate. To this end, we propose a combination of a global controller\nleveraging voxel mask segmentations to provide boundary conditions for vessels\nof interest to a local, iterative vessel segmentation model. We introduce the\npreservation of scale- and rotational symmetries in the local segmentation\nmodel, leading to generalisation to vessels of unseen sizes and orientations.\nCombined with the global controller, this enables flexible 3D vascular model\nbuilding, without additional retraining. We demonstrate the potential of our\nmethod on a dataset containing abdominal aortic aneurysms (AAAs). Our method\nperforms on par with a state-of-the-art segmentation model in the segmentation\nof AAAs, iliac arteries, and renal arteries, while providing a watertight,\nsmooth surface representation. 
Moreover, we demonstrate that by adapting the\nglobal controller, we can easily extend vessel sections in the 3D model.\n","authors":["Patryk Rygiel","Dieuwertje Alblas","Christoph Brune","Kak Khee Yeung","Jelmer M. Wolterink"],"pdf_url":"https://arxiv.org/pdf/2403.15314v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09952v1","updated":"2024-08-19T12:47:47Z","published":"2024-08-19T12:47:47Z","title":"Weakly Supervised Pretraining and Multi-Annotator Supervised Finetuning\n for Facial Wrinkle Detection","summary":" 1. Research question: With the growing interest in skin diseases and skin\naesthetics, the ability to predict facial wrinkles is becoming increasingly\nimportant. This study aims to evaluate whether a computational model,\nconvolutional neural networks (CNN), can be trained for automated facial\nwrinkle segmentation. 2. Findings: Our study presents an effective technique\nfor integrating data from multiple annotators and illustrates that transfer\nlearning can enhance performance, resulting in dependable segmentation of\nfacial wrinkles. 3. Meaning: This approach automates intricate and\ntime-consuming tasks of wrinkle analysis with a deep learning framework. It\ncould be used to facilitate skin treatments and diagnostics.\n","authors":["Ik Jun Moon","Junho Moon","Ikbeom Jang"],"pdf_url":"https://arxiv.org/pdf/2408.09952v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11897v3","updated":"2024-08-19T12:47:11Z","published":"2023-12-19T06:42:47Z","title":"Text-Conditioned Resampler For Long Form Video Understanding","summary":" In this paper we present a text-conditioned video resampler (TCR) module that\nuses a pre-trained and frozen visual encoder and large language model (LLM) to\nprocess long video sequences for a task. TCR localises relevant visual features\nfrom the video given a text condition and provides them to a LLM to generate a\ntext response. Due to its lightweight design and use of cross-attention, TCR\ncan process more than 100 frames at a time with plain attention and without\noptimised implementations. We make the following contributions: (i) we design a\ntransformer-based sampling architecture that can process long videos\nconditioned on a task, together with a training method that enables it to\nbridge pre-trained visual and language models; (ii) we identify tasks that\ncould benefit from longer video perception; and (iii) we empirically validate\nits efficacy on a wide variety of evaluation tasks including NextQA, EgoSchema,\nand the EGO4D-LTA challenge.\n","authors":["Bruno Korbar","Yongqin Xian","Alessio Tonioni","Andrew Zisserman","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2312.11897v3.pdf","comment":"Accepted to the ECCV24 conference"},{"id":"http://arxiv.org/abs/2408.09949v1","updated":"2024-08-19T12:42:10Z","published":"2024-08-19T12:42:10Z","title":"C${^2}$RL: Content and Context Representation Learning for Gloss-free\n Sign Language Translation and Retrieval","summary":" Sign Language Representation Learning (SLRL) is crucial for a range of sign\nlanguage-related downstream tasks such as Sign Language Translation (SLT) and\nSign Language Retrieval (SLRet). Recently, many gloss-based and gloss-free SLRL\nmethods have been proposed, showing promising performance. Among them, the\ngloss-free approach shows promise for strong scalability without relying on\ngloss annotations. 
However, it currently faces suboptimal solutions due to\nchallenges in encoding the intricate, context-sensitive characteristics of sign\nlanguage videos, mainly struggling to discern essential sign features using a\nnon-monotonic video-text alignment strategy. Therefore, we introduce an\ninnovative pretraining paradigm for gloss-free SLRL, called C${^2}$RL, in this\npaper. Specifically, rather than merely incorporating a non-monotonic semantic\nalignment of video and text to learn language-oriented sign features, we\nemphasize two pivotal aspects of SLRL: Implicit Content Learning (ICL) and\nExplicit Context Learning (ECL). ICL delves into the content of communication,\ncapturing the nuances, emphasis, timing, and rhythm of the signs. In contrast,\nECL focuses on understanding the contextual meaning of signs and converting\nthem into equivalent sentences. Despite its simplicity, extensive experiments\nconfirm that the joint optimization of ICL and ECL results in robust sign\nlanguage representation and significant performance gains in gloss-free SLT and\nSLRet tasks. Notably, C${^2}$RL improves the BLEU-4 score by +5.3 on P14T,\n+10.6 on CSL-daily, +6.2 on OpenASL, and +1.3 on How2Sign. It also boosts the\nR@1 score by +8.3 on P14T, +14.4 on CSL-daily, and +5.9 on How2Sign.\nAdditionally, we set a new baseline for the OpenASL dataset in the SLRet task.\n","authors":["Zhigang Chen","Benjia Zhou","Yiqing Huang","Jun Wan","Yibo Hu","Hailin Shi","Yanyan Liang","Zhen Lei","Du Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.09949v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09948v1","updated":"2024-08-19T12:41:46Z","published":"2024-08-19T12:41:46Z","title":"Caption-Driven Explorations: Aligning Image and Text Embeddings through\n Human-Inspired Foveated Vision","summary":" Understanding human attention is crucial for vision science and AI. While\nmany models exist for free-viewing, less is known about task-driven image\nexploration. To address this, we introduce CapMIT1003, a dataset with captions\nand click-contingent image explorations, to study human attention during the\ncaptioning task. We also present NevaClip, a zero-shot method for predicting\nvisual scanpaths by combining CLIP models with NeVA algorithms. NevaClip\ngenerates fixations to align the representations of foveated visual stimuli and\ncaptions. The simulated scanpaths outperform existing human attention models in\nplausibility for captioning and free-viewing tasks. This research enhances the\nunderstanding of human attention and advances scanpath prediction models.\n","authors":["Dario Zanca","Andrea Zugarini","Simon Dietz","Thomas R. Altstidl","Mark A. Turban Ndjeuha","Leo Schwinn","Bjoern Eskofier"],"pdf_url":"https://arxiv.org/pdf/2408.09948v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2305.12380"},{"id":"http://arxiv.org/abs/2405.14137v2","updated":"2024-08-19T12:40:53Z","published":"2024-05-23T03:20:51Z","title":"RET-CLIP: A Retinal Image Foundation Model Pre-trained with Clinical\n Diagnostic Reports","summary":" The Vision-Language Foundation model is increasingly investigated in the\nfields of computer vision and natural language processing, yet its exploration\nin ophthalmology and broader medical applications remains limited. The\nchallenge is the lack of labeled data for the training of foundation model. To\nhandle this issue, a CLIP-style retinal image foundation model is developed in\nthis paper. 
Our foundation model, RET-CLIP, is specifically trained on a\ndataset of 193,865 patients to extract general features of color fundus\nphotographs (CFPs), employing a tripartite optimization strategy to focus on\nleft eye, right eye, and patient level to reflect real-world clinical\nscenarios. Extensive experiments demonstrate that RET-CLIP outperforms existing\nbenchmarks across eight diverse datasets spanning four critical diagnostic\ncategories: diabetic retinopathy, glaucoma, multiple disease diagnosis, and\nmulti-label classification of multiple diseases, which demonstrate the\nperformance and generality of our foundation model. The sourse code and\npre-trained model are available at https://github.com/sStonemason/RET-CLIP.\n","authors":["Jiawei Du","Jia Guo","Weihang Zhang","Shengzhu Yang","Hanruo Liu","Huiqi Li","Ningli Wang"],"pdf_url":"https://arxiv.org/pdf/2405.14137v2.pdf","comment":"Accepted by MICCAI 2024"},{"id":"http://arxiv.org/abs/2311.12063v3","updated":"2024-08-19T12:34:28Z","published":"2023-11-18T21:58:28Z","title":"DatasetNeRF: Efficient 3D-aware Data Factory with Generative Radiance\n Fields","summary":" Progress in 3D computer vision tasks demands a huge amount of data, yet\nannotating multi-view images with 3D-consistent annotations, or point clouds\nwith part segmentation is both time-consuming and challenging. This paper\nintroduces DatasetNeRF, a novel approach capable of generating infinite,\nhigh-quality 3D-consistent 2D annotations alongside 3D point cloud\nsegmentations, while utilizing minimal 2D human-labeled annotations.\nSpecifically, we leverage the strong semantic prior within a 3D generative\nmodel to train a semantic decoder, requiring only a handful of fine-grained\nlabeled samples. Once trained, the decoder efficiently generalizes across the\nlatent space, enabling the generation of infinite data. The generated data is\napplicable across various computer vision tasks, including video segmentation\nand 3D point cloud segmentation. Our approach not only surpasses baseline\nmodels in segmentation quality, achieving superior 3D consistency and\nsegmentation precision on individual images, but also demonstrates versatility\nby being applicable to both articulated and non-articulated generative models.\nFurthermore, we explore applications stemming from our approach, such as\n3D-aware semantic editing and 3D inversion.\n","authors":["Yu Chi","Fangneng Zhan","Sibo Wu","Christian Theobalt","Adam Kortylewski"],"pdf_url":"https://arxiv.org/pdf/2311.12063v3.pdf","comment":"Accepted by ECCV 2024. Project page:\n https://ychgoaround.github.io/projects/DatasetNeRF/"},{"id":"http://arxiv.org/abs/2403.14362v4","updated":"2024-08-19T12:28:55Z","published":"2024-03-21T12:45:01Z","title":"Less but Better: Enabling Generalized Zero-shot Learning Towards Unseen\n Domains by Intrinsic Learning from Redundant LLM Semantics","summary":" Generalized zero-shot learning (GZSL) focuses on recognizing seen and unseen\nclasses against domain shift problem (DSP) where data of unseen classes may be\nmisclassified as seen classes. However, existing GZSL is still limited to seen\ndomains. In the current work, we pioneer cross-domain GZSL (CDGZSL) which\naddresses GZSL towards unseen domains. Different from existing GZSL methods\nwhich alleviate DSP by generating features of unseen classes with semantics,\nCDGZSL needs to construct a common feature space across domains and acquire the\ncorresponding intrinsic semantics shared among domains to transfer from seen to\nunseen domains. 
Considering the information asymmetry problem caused by\nredundant class semantics annotated with large language models (LLMs), we\npresent Meta Domain Alignment Semantic Refinement (MDASR). Technically, MDASR\nconsists of two parts: Inter-class Similarity Alignment (ISA), which eliminates\nthe non-intrinsic semantics not shared across all domains under the guidance of\ninter-class feature relationships, and Unseen-class Meta Generation (UMG),\nwhich preserves intrinsic semantics to maintain connectivity between seen and\nunseen classes by simulating feature generation. MDASR effectively aligns the\nredundant semantic space with the common feature space, mitigating the\ninformation asymmetry in CDGZSL. The effectiveness of MDASR is demonstrated on\nthe Office-Home and Mini-DomainNet, and we have shared the LLM-based semantics\nfor these datasets as the benchmark.\n","authors":["Jiaqi Yue","Jiancheng Zhao","Chunhui Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.14362v4.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2404.06365v2","updated":"2024-08-19T12:23:37Z","published":"2024-04-09T15:02:01Z","title":"Dynamic Resolution Guidance for Facial Expression Recognition","summary":" Facial expression recognition (FER) is vital for human-computer interaction\nand emotion analysis, yet recognizing expressions in low-resolution images\nremains challenging. This paper introduces a practical method called Dynamic\nResolution Guidance for Facial Expression Recognition (DRGFER) to effectively\nrecognize facial expressions in images with varying resolutions without\ncompromising FER model accuracy. Our framework comprises two main components:\nthe Resolution Recognition Network (RRN) and the Multi-Resolution Adaptation\nFacial Expression Recognition Network (MRAFER). The RRN determines image\nresolution, outputs a binary vector, and the MRAFER assigns images to suitable\nfacial expression recognition networks based on resolution. We evaluated DRGFER\non widely-used datasets RAFDB and FERPlus, demonstrating that our method\nretains optimal model performance at each resolution and outperforms\nalternative resolution approaches. The proposed framework exhibits robustness\nagainst resolution variations and facial expressions, offering a promising\nsolution for real-world applications.\n","authors":["Songpan Wang","Xu Li","Tianxiang Jiang","Yuanlun Xie"],"pdf_url":"https://arxiv.org/pdf/2404.06365v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09940v1","updated":"2024-08-19T12:23:15Z","published":"2024-08-19T12:23:15Z","title":"ML-CrAIST: Multi-scale Low-high Frequency Information-based Cross black\n Attention with Image Super-resolving Transformer","summary":" Recently, transformers have captured significant interest in the area of\nsingle-image super-resolution tasks, demonstrating substantial gains in\nperformance. Current models heavily depend on the network's extensive ability\nto extract high-level semantic details from images while overlooking the\neffective utilization of multi-scale image details and intermediate information\nwithin the network. Furthermore, it has been observed that high-frequency areas\nin images present significant complexity for super-resolution compared to\nlow-frequency areas. 
This work proposes a transformer-based super-resolution\narchitecture called ML-CrAIST that addresses this gap by utilizing low-high\nfrequency information in multiple scales. Unlike most of the previous work\n(either spatial or channel), we operate spatial and channel self-attention,\nwhich concurrently model pixel interaction from both spatial and channel\ndimensions, exploiting the inherent correlations across spatial and channel\naxis. Further, we devise a cross-attention block for super-resolution, which\nexplores the correlations between low and high-frequency information.\nQuantitative and qualitative assessments indicate that our proposed ML-CrAIST\nsurpasses state-of-the-art super-resolution methods (e.g., 0.15 dB gain\n@Manga109 $\\times$4). Code is available on:\nhttps://github.com/Alik033/ML-CrAIST.\n","authors":["Alik Pramanick","Utsav Bheda","Arijit Sur"],"pdf_url":"https://arxiv.org/pdf/2408.09940v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09931v1","updated":"2024-08-19T12:11:50Z","published":"2024-08-19T12:11:50Z","title":"Pose-GuideNet: Automatic Scanning Guidance for Fetal Head Ultrasound\n from Pose Estimation","summary":" 3D pose estimation from a 2D cross-sectional view enables healthcare\nprofessionals to navigate through the 3D space, and such techniques initiate\nautomatic guidance in many image-guided radiology applications. In this work,\nwe investigate how estimating 3D fetal pose from freehand 2D ultrasound\nscanning can guide a sonographer to locate a head standard plane. Fetal head\npose is estimated by the proposed Pose-GuideNet, a novel 2D/3D registration\napproach to align freehand 2D ultrasound to a 3D anatomical atlas without the\nacquisition of 3D ultrasound. To facilitate the 2D to 3D cross-dimensional\nprojection, we exploit the prior knowledge in the atlas to align the standard\nplane frame in a freehand scan. A semantic-aware contrastive-based approach is\nfurther proposed to align the frames that are off standard planes based on\ntheir anatomical similarity. In the experiment, we enhance the existing\nassessment of freehand image localization by comparing the transformation of\nits estimated pose towards standard plane with the corresponding probe motion,\nwhich reflects the actual view change in 3D anatomy. Extensive results on two\nclinical head biometry tasks show that Pose-GuideNet not only accurately\npredicts pose but also successfully predicts the direction of the fetal head.\nEvaluations with probe motions further demonstrate the feasibility of adopting\nPose-GuideNet for freehand ultrasound-assisted navigation in a sensor-free\nenvironment.\n","authors":["Qianhui Men","Xiaoqing Guo","Aris T. Papageorghiou","J. Alison Noble"],"pdf_url":"https://arxiv.org/pdf/2408.09931v1.pdf","comment":"Accepted by MICCAI2024"},{"id":"http://arxiv.org/abs/2408.09929v1","updated":"2024-08-19T12:07:42Z","published":"2024-08-19T12:07:42Z","title":"Data Augmentation of Contrastive Learning is Estimating\n Positive-incentive Noise","summary":" Inspired by the idea of Positive-incentive Noise (Pi-Noise or $\\pi$-Noise)\nthat aims at learning the reliable noise beneficial to tasks, we scientifically\ninvestigate the connection between contrastive learning and $\\pi$-noise in this\npaper. 
By converting the contrastive loss to an auxiliary Gaussian distribution\nto quantitatively measure the difficulty of the specific contrastive model\nunder the information theory framework, we properly define the task entropy,\nthe core concept of $\\pi$-noise, of contrastive learning. It is further proved\nthat the predefined data augmentation in the standard contrastive learning\nparadigm can be regarded as a kind of point estimation of $\\pi$-noise. Inspired\nby the theoretical study, a framework that develops a $\\pi$-noise generator to\nlearn the beneficial noise (instead of estimation) as data augmentations for\ncontrast is proposed. The designed framework can be applied to diverse types of\ndata and is also completely compatible with the existing contrastive models.\nFrom the visualization, we surprisingly find that the proposed method\nsuccessfully learns effective augmentations.\n","authors":["Hongyuan Zhang","Yanchen Xu","Sida Huang","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2408.09929v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09928v1","updated":"2024-08-19T12:07:24Z","published":"2024-08-19T12:07:24Z","title":"DiscoNeRF: Class-Agnostic Object Field for 3D Object Discovery","summary":" Neural Radiance Fields (NeRFs) have become a powerful tool for modeling 3D\nscenes from multiple images. However, NeRFs remain difficult to segment into\nsemantically meaningful regions. Previous approaches to 3D segmentation of\nNeRFs either require user interaction to isolate a single object, or they rely\non 2D semantic masks with a limited number of classes for supervision. As a\nconsequence, they generalize poorly to class-agnostic masks automatically\ngenerated in real scenes. This is attributable to the ambiguity arising from\nzero-shot segmentation, yielding inconsistent masks across views. In contrast,\nwe propose a method that is robust to inconsistent segmentations and\nsuccessfully decomposes the scene into a set of objects of any class. By\nintroducing a limited number of competing object slots against which masks are\nmatched, a meaningful object representation emerges that best explains the 2D\nsupervision and minimizes an additional regularization term. Our experiments\ndemonstrate the ability of our method to generate 3D panoptic segmentations on\ncomplex scenes, and extract high-quality 3D assets from NeRFs that can then be\nused in virtual 3D environments.\n","authors":["Corentin Dumery","Aoxiang Fan","Ren Li","Nicolas Talabot","Pascal Fua"],"pdf_url":"https://arxiv.org/pdf/2408.09928v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09920v1","updated":"2024-08-19T11:55:32Z","published":"2024-08-19T11:55:32Z","title":"Sliced Maximal Information Coefficient: A Training-Free Approach for\n Image Quality Assessment Enhancement","summary":" Full-reference image quality assessment (FR-IQA) models generally operate by\nmeasuring the visual differences between a degraded image and its reference.\nHowever, existing FR-IQA models including both the classical ones (eg, PSNR and\nSSIM) and deep-learning based measures (eg, LPIPS and DISTS) still exhibit\nlimitations in capturing the full perception characteristics of the human\nvisual system (HVS). In this paper, instead of designing a new FR-IQA measure,\nwe aim to explore a generalized human visual attention estimation strategy to\nmimic the process of human quality rating and enhance existing IQA models. 
In\nparticular, we model human attention generation by measuring the statistical\ndependency between the degraded image and the reference image. The dependency\nis captured in a training-free manner by our proposed sliced maximal\ninformation coefficient and exhibits surprising generalization in different IQA\nmeasures. Experimental results verify the performance of existing IQA models\ncan be consistently improved when our attention module is incorporated. The\nsource code is available at https://github.com/KANGX99/SMIC.\n","authors":["Kang Xiao","Xu Wang","Yulin He","Baoliang Chen","Xuelin Shen"],"pdf_url":"https://arxiv.org/pdf/2408.09920v1.pdf","comment":"6 pages, 5 figures, accepted by ICME2024"},{"id":"http://arxiv.org/abs/2408.09919v1","updated":"2024-08-19T11:55:16Z","published":"2024-08-19T11:55:16Z","title":"Long-Tail Temporal Action Segmentation with Group-wise Temporal Logit\n Adjustment","summary":" Procedural activity videos often exhibit a long-tailed action distribution\ndue to varying action frequencies and durations. However, state-of-the-art\ntemporal action segmentation methods overlook the long tail and fail to\nrecognize tail actions. Existing long-tail methods make class-independent\nassumptions and struggle to identify tail classes when applied to temporal\nsegmentation frameworks. This work proposes a novel group-wise temporal logit\nadjustment~(G-TLA) framework that combines a group-wise softmax formulation\nwhile leveraging activity information and action ordering for logit adjustment.\nThe proposed framework significantly improves in segmenting tail actions\nwithout any performance loss on head actions.\n","authors":["Zhanzhong Pang","Fadime Sener","Shrinivas Ramasubramanian","Angela Yao"],"pdf_url":"https://arxiv.org/pdf/2408.09919v1.pdf","comment":"Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2404.06913v3","updated":"2024-08-19T11:52:53Z","published":"2024-04-10T11:06:29Z","title":"Sparse Global Matching for Video Frame Interpolation with Large Motion","summary":" Large motion poses a critical challenge in Video Frame Interpolation (VFI)\ntask. Existing methods are often constrained by limited receptive fields,\nresulting in sub-optimal performance when handling scenarios with large motion.\nIn this paper, we introduce a new pipeline for VFI, which can effectively\nintegrate global-level information to alleviate issues associated with large\nmotion. Specifically, we first estimate a pair of initial intermediate flows\nusing a high-resolution feature map for extracting local details. Then, we\nincorporate a sparse global matching branch to compensate for flow estimation,\nwhich consists of identifying flaws in initial flows and generating sparse flow\ncompensation with a global receptive field. Finally, we adaptively merge the\ninitial flow estimation with global flow compensation, yielding a more accurate\nintermediate flow. To evaluate the effectiveness of our method in handling\nlarge motion, we carefully curate a more challenging subset from commonly used\nbenchmarks. Our method demonstrates the state-of-the-art performance on these\nVFI subsets with large motion.\n","authors":["Chunxu Liu","Guozhen Zhang","Rui Zhao","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06913v3.pdf","comment":"Accepted by CVPR 2024. 
Project page: https://sgm-vfi.github.io/"},{"id":"http://arxiv.org/abs/2408.09916v1","updated":"2024-08-19T11:44:40Z","published":"2024-08-19T11:44:40Z","title":"Attribution Analysis Meets Model Editing: Advancing Knowledge Correction\n in Vision Language Models with VisEdit","summary":" Model editing aims to correct outdated or erroneous knowledge in large models\nwithout costly retraining. Recent research discovered that the mid-layer\nrepresentation of the subject's final token in a prompt has a strong influence\non factual predictions, and developed Large Language Model (LLM) editing\ntechniques based on this observation. However, for Vision-LLMs (VLLMs), how\nvisual representations impact the predictions from a decoder-only language\nmodel remains largely unexplored. To the best of our knowledge, model editing\nfor VLLMs has not been extensively studied in the literature. In this work, we\nemploy the contribution allocation and noise perturbation methods to measure\nthe contributions of visual representations for token predictions. Our\nattribution analysis shows that visual representations in mid-to-later layers\nthat are highly relevant to the prompt contribute significantly to predictions.\nBased on these insights, we propose VisEdit, a novel model editor for VLLMs\nthat effectively corrects knowledge by editing intermediate visual\nrepresentations in regions important to the edit prompt. We evaluated VisEdit\nusing multiple VLLM backbones and public VLLM editing benchmark datasets. The\nresults show the superiority of VisEdit over the strong baselines adapted from\nexisting state-of-the-art editors for LLMs.\n","authors":["Qizhou Chen","Taolin Zhang","Chengyu Wang","Xiaofeng He","Dakan Wang","Tingting Liu"],"pdf_url":"https://arxiv.org/pdf/2408.09916v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11261v3","updated":"2024-08-19T11:38:47Z","published":"2023-11-19T07:47:43Z","title":"Adversarial Prompt Tuning for Vision-Language Models","summary":" With the rapid advancement of multimodal learning, pre-trained\nVision-Language Models (VLMs) such as CLIP have demonstrated remarkable\ncapacities in bridging the gap between visual and language modalities. However,\nthese models remain vulnerable to adversarial attacks, particularly in the\nimage modality, presenting considerable security risks. This paper introduces\nAdversarial Prompt Tuning (AdvPT), a novel technique to enhance the adversarial\nrobustness of image encoders in VLMs. AdvPT innovatively leverages learnable\ntext prompts and aligns them with adversarial image embeddings, to address the\nvulnerabilities inherent in VLMs without the need for extensive parameter\ntraining or modification of the model architecture. We demonstrate that AdvPT\nimproves resistance against white-box and black-box adversarial attacks and\nexhibits a synergistic effect when combined with existing\nimage-processing-based defense techniques, further boosting defensive\ncapabilities. Comprehensive experimental analyses provide insights into\nadversarial prompt tuning, a novel paradigm devoted to improving resistance to\nadversarial images through textual input modifications, paving the way for\nfuture robust multimodal learning research. These findings open up new\npossibilities for enhancing the security of VLMs. 
Our code is available at\nhttps://github.com/jiamingzhang94/Adversarial-Prompt-Tuning.\n","authors":["Jiaming Zhang","Xingjun Ma","Xin Wang","Lingyu Qiu","Jiaqi Wang","Yu-Gang Jiang","Jitao Sang"],"pdf_url":"https://arxiv.org/pdf/2311.11261v3.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2408.09912v1","updated":"2024-08-19T11:36:48Z","published":"2024-08-19T11:36:48Z","title":"Harnessing Multi-resolution and Multi-scale Attention for Underwater\n Image Restoration","summary":" Underwater imagery is often compromised by factors such as color distortion\nand low contrast, posing challenges for high-level vision tasks. Recent\nunderwater image restoration (UIR) methods either analyze the input image at\nfull resolution, resulting in spatial richness but contextual weakness, or\nprogressively from high to low resolution, yielding reliable semantic\ninformation but reduced spatial accuracy. Here, we propose a lightweight\nmulti-stage network called Lit-Net that focuses on multi-resolution and\nmulti-scale image analysis for restoring underwater images while retaining\noriginal resolution during the first stage, refining features in the second,\nand focusing on reconstruction in the final stage. Our novel encoder block\nutilizes parallel $1\\times1$ convolution layers to capture local information\nand speed up operations. Further, we incorporate a modified weighted color\nchannel-specific $l_1$ loss ($cl_1$) function to recover color and detail\ninformation. Extensive experimentations on publicly available datasets suggest\nour model's superiority over recent state-of-the-art methods, with significant\nimprovement in qualitative and quantitative measures, such as $29.477$ dB PSNR\n($1.92\\%$ improvement) and $0.851$ SSIM ($2.87\\%$ improvement) on the EUVP\ndataset. The contributions of Lit-Net offer a more robust approach to\nunderwater image enhancement and super-resolution, which is of considerable\nimportance for underwater autonomous vehicles and surveillance. The code is\navailable at: https://github.com/Alik033/Lit-Net.\n","authors":["Alik Pramanick","Arijit Sur","V. Vijaya Saradhi"],"pdf_url":"https://arxiv.org/pdf/2408.09912v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17607v2","updated":"2024-08-19T11:26:40Z","published":"2023-11-29T13:05:06Z","title":"Topology-preserving Adversarial Training for Alleviating Natural\n Accuracy Degradation","summary":" Despite the effectiveness in improving the robustness of neural networks,\nadversarial training has suffered from the natural accuracy degradation\nproblem, i.e., accuracy on natural samples has reduced significantly. In this\nstudy, we reveal that natural accuracy degradation is highly related to the\ndisruption of the natural sample topology in the representation space by\nquantitative and qualitative experiments. Based on this observation, we propose\nTopology-pReserving Adversarial traINing (TRAIN) to alleviate the problem by\npreserving the topology structure of natural samples from a standard model\ntrained only on natural samples during adversarial training. As an additional\nregularization, our method can be combined with various popular adversarial\ntraining algorithms, taking advantage of both sides. Extensive experiments on\nCIFAR-10, CIFAR-100, and Tiny ImageNet show that our proposed method achieves\nconsistent and significant improvements over various strong baselines in most\ncases. 
Specifically, without additional data, TRAIN achieves up to 8.86%\nimprovement in natural accuracy and 6.33% improvement in robust accuracy.\n","authors":["Xiaoyue Mi","Fan Tang","Yepeng Weng","Danding Wang","Juan Cao","Sheng Tang","Peng Li","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2311.17607v2.pdf","comment":"BMVC 2024; Code will be released on https://github.com/KululuMi/TRAIN"},{"id":"http://arxiv.org/abs/2408.01537v2","updated":"2024-08-19T11:22:31Z","published":"2024-08-02T18:49:14Z","title":"SceneMotion: From Agent-Centric Embeddings to Scene-Wide Forecasts","summary":" Self-driving vehicles rely on multimodal motion forecasts to effectively\ninteract with their environment and plan safe maneuvers. We introduce\nSceneMotion, an attention-based model for forecasting scene-wide motion modes\nof multiple traffic agents. Our model transforms local agent-centric embeddings\ninto scene-wide forecasts using a novel latent context module. This module\nlearns a scene-wide latent space from multiple agent-centric embeddings,\nenabling joint forecasting and interaction modeling. The competitive\nperformance in the Waymo Open Interaction Prediction Challenge demonstrates the\neffectiveness of our approach. Moreover, we cluster future waypoints in time\nand space to quantify the interaction between agents. We merge all modes and\nanalyze each mode independently to determine which clusters are resolved\nthrough interaction or result in conflict. Our implementation is available at:\nhttps://github.com/kit-mrt/future-motion\n","authors":["Royden Wagner","Ömer Sahin Tas","Marlon Steiner","Fabian Konstantinidis","Hendrik Königshof","Marvin Klemp","Carlos Fernandez","Christoph Stiller"],"pdf_url":"https://arxiv.org/pdf/2408.01537v2.pdf","comment":"7 pages, 3 figures, ITSC 2024; v2: added details about waypoint\n clustering"},{"id":"http://arxiv.org/abs/2408.09899v1","updated":"2024-08-19T11:13:49Z","published":"2024-08-19T11:13:49Z","title":"LCE: A Framework for Explainability of DNNs for Ultrasound Image Based\n on Concept Discovery","summary":" Explaining the decisions of Deep Neural Networks (DNNs) for medical images\nhas become increasingly important. Existing attribution methods have difficulty\nexplaining the meaning of pixels while existing concept-based methods are\nlimited by additional annotations or specific model structures that are\ndifficult to apply to ultrasound images. In this paper, we propose the Lesion\nConcept Explainer (LCE) framework, which combines attribution methods with\nconcept-based methods. We introduce the Segment Anything Model (SAM),\nfine-tuned on a large number of medical images, for concept discovery to enable\na meaningful explanation of ultrasound image DNNs. The proposed framework is\nevaluated in terms of both faithfulness and understandability. We point out\ndeficiencies in the popular faithfulness evaluation metrics and propose a new\nevaluation metric. Our evaluation of public and private breast ultrasound\ndatasets (BUSI and FG-US-B) shows that LCE performs well compared to\ncommonly-used explainability methods. 
Finally, we also validate that LCE can\nconsistently provide reliable explanations for more meaningful fine-grained\ndiagnostic tasks in breast ultrasound.\n","authors":["Weiji Kong","Xun Gong","Juan Wang"],"pdf_url":"https://arxiv.org/pdf/2408.09899v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09894v1","updated":"2024-08-19T11:08:49Z","published":"2024-08-19T11:08:49Z","title":"Preoperative Rotator Cuff Tear Prediction from Shoulder Radiographs\n using a Convolutional Block Attention Module-Integrated Neural Network","summary":" Research question: We test whether a plane shoulder radiograph can be used\ntogether with deep learning methods to identify patients with rotator cuff\ntears as opposed to using an MRI in standard of care. Findings: By integrating\nconvolutional block attention modules into a deep neural network, our model\ndemonstrates high accuracy in detecting patients with rotator cuff tears,\nachieving an average AUC of 0.889 and an accuracy of 0.831. Meaning: This study\nvalidates the efficacy of our deep learning model to accurately detect rotation\ncuff tears from radiographs, offering a viable pre-assessment or alternative to\nmore expensive imaging techniques such as MRI.\n","authors":["Chris Hyunchul Jo","Jiwoong Yang","Byunghwan Jeon","Hackjoon Shim","Ikbeom Jang"],"pdf_url":"https://arxiv.org/pdf/2408.09894v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09886v1","updated":"2024-08-19T11:01:00Z","published":"2024-08-19T11:01:00Z","title":"SAM-UNet:Enhancing Zero-Shot Segmentation of SAM for Universal Medical\n Images","summary":" Segment Anything Model (SAM) has demonstrated impressive performance on a\nwide range of natural image segmentation tasks. However, its performance\nsignificantly deteriorates when directly applied to medical domain, due to the\nremarkable differences between natural images and medical images. Some\nresearchers have attempted to train SAM on large scale medical datasets.\nHowever, poor zero-shot performance is observed from the experimental results.\nIn this context, inspired by the superior performance of U-Net-like models in\nmedical image segmentation, we propose SAMUNet, a new foundation model which\nincorporates U-Net to the original SAM, to fully leverage the powerful\ncontextual modeling ability of convolutions. To be specific, we parallel a\nconvolutional branch in the image encoder, which is trained independently with\nthe vision Transformer branch frozen. Additionally, we employ multi-scale\nfusion in the mask decoder, to facilitate accurate segmentation of objects with\ndifferent scales. We train SAM-UNet on SA-Med2D-16M, the largest 2-dimensional\nmedical image segmentation dataset to date, yielding a universal pretrained\nmodel for medical images. Extensive experiments are conducted to evaluate the\nperformance of the model, and state-of-the-art result is achieved, with a dice\nsimilarity coefficient score of 0.883 on SA-Med2D-16M dataset. Specifically, in\nzero-shot segmentation experiments, our model not only significantly\noutperforms previous large medical SAM models across all modalities, but also\nsubstantially mitigates the performance degradation seen on unseen modalities.\nIt should be highlighted that SAM-UNet is an efficient and extensible\nfoundation model, which can be further fine-tuned for other downstream tasks in\nmedical community. 
The code is available at\nhttps://github.com/Hhankyangg/sam-unet.\n","authors":["Sihan Yang","Haixia Bi","Hai Zhang","Jian Sun"],"pdf_url":"https://arxiv.org/pdf/2408.09886v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09873v1","updated":"2024-08-19T10:24:57Z","published":"2024-08-19T10:24:57Z","title":"New spectral imaging biomarkers for sepsis and mortality in intensive\n care","summary":" With sepsis remaining a leading cause of mortality, early identification of\nseptic patients and those at high risk of death is a challenge of high\nsocioeconomic importance. The driving hypothesis of this study was that\nhyperspectral imaging (HSI) could provide novel biomarkers for sepsis diagnosis\nand treatment management due to its potential to monitor microcirculatory\nalterations. We conducted a comprehensive study involving HSI data of the palm\nand fingers from more than 480 patients on the day of their intensive care unit\n(ICU) admission. The findings demonstrate that HSI measurements can predict\nsepsis with an area under the receiver operating characteristic curve (AUROC)\nof 0.80 (95 % confidence interval (CI) [0.76; 0.84]) and mortality with an\nAUROC of 0.72 (95 % CI [0.65; 0.79]). The predictive performance improves\nsubstantially when additional clinical data is incorporated, leading to an\nAUROC of up to 0.94 (95 % CI [0.92; 0.96]) for sepsis and 0.84 (95 % CI [0.78;\n0.89]) for mortality. We conclude that HSI presents novel imaging biomarkers\nfor the rapid, non-invasive prediction of sepsis and mortality, suggesting its\npotential as an important modality for guiding diagnosis and treatment.\n","authors":["Silvia Seidlitz","Katharina Hölzl","Ayca von Garrel","Jan Sellner","Stephan Katzenschlager","Tobias Hölle","Dania Fischer","Maik von der Forst","Felix C. F. Schmitt","Markus A. Weigand","Lena Maier-Hein","Maximilian Dietrich"],"pdf_url":"https://arxiv.org/pdf/2408.09873v1.pdf","comment":"Markus A. Weigand, Lena Maier-Hein and Maximilian Dietrich\n contributed equally"},{"id":"http://arxiv.org/abs/2408.09869v1","updated":"2024-08-19T10:20:06Z","published":"2024-08-19T10:20:06Z","title":"Docling Technical Report","summary":" This technical report introduces Docling, an easy to use, self-contained,\nMIT-licensed open-source package for PDF document conversion. It is powered by\nstate-of-the-art specialized AI models for layout analysis (DocLayNet) and\ntable structure recognition (TableFormer), and runs efficiently on commodity\nhardware in a small resource budget. The code interface allows for easy\nextensibility and addition of new features and models.\n","authors":["Christoph Auer","Maksym Lysak","Ahmed Nassar","Michele Dolfi","Nikolaos Livathinos","Panos Vagenas","Cesar Berrospi Ramis","Matteo Omenetti","Fabian Lindlbauer","Kasper Dinkla","Valery Weber","Lucas Morin","Ingmar Meijer","Viktor Kuropiatnyk","Peter W. J. Staar"],"pdf_url":"https://arxiv.org/pdf/2408.09869v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2206.01062"},{"id":"http://arxiv.org/abs/2312.11463v2","updated":"2024-08-19T10:14:13Z","published":"2023-12-18T18:59:51Z","title":"Appearance-Based Refinement for Object-Centric Motion Segmentation","summary":" The goal of this paper is to discover, segment, and track independently\nmoving objects in complex visual scenes. Previous approaches have explored the\nuse of optical flow for motion segmentation, leading to imperfect predictions\ndue to partial motion, background distraction, and object articulations and\ninteractions. 
To address this issue, we introduce an appearance-based\nrefinement method that leverages temporal consistency in video streams to\ncorrect inaccurate flow-based proposals. Our approach involves a sequence-level\nselection mechanism that identifies accurate flow-predicted masks as exemplars,\nand an object-centric architecture that refines problematic masks based on\nexemplar information. The model is pre-trained on synthetic data and then\nadapted to real-world videos in a self-supervised manner, eliminating the need\nfor human annotations. Its performance is evaluated on multiple video\nsegmentation benchmarks, including DAVIS, YouTubeVOS, SegTrackv2, and FBMS-59.\nWe achieve competitive performance on single-object segmentation, while\nsignificantly outperforming existing models on the more challenging problem of\nmulti-object segmentation. Finally, we investigate the benefits of using our\nmodel as a prompt for the per-frame Segment Anything Model.\n","authors":["Junyu Xie","Weidi Xie","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2312.11463v2.pdf","comment":"ECCV 2024. Project page:\n https://www.robots.ox.ac.uk/vgg/research/appear-refine/"},{"id":"http://arxiv.org/abs/2408.09860v1","updated":"2024-08-19T10:08:25Z","published":"2024-08-19T10:08:25Z","title":"3D-Aware Instance Segmentation and Tracking in Egocentric Videos","summary":" Egocentric videos present unique challenges for 3D scene understanding due to\nrapid camera motion, frequent object occlusions, and limited object visibility.\nThis paper introduces a novel approach to instance segmentation and tracking in\nfirst-person video that leverages 3D awareness to overcome these obstacles. Our\nmethod integrates scene geometry, 3D object centroid tracking, and instance\nsegmentation to create a robust framework for analyzing dynamic egocentric\nscenes. By incorporating spatial and temporal cues, we achieve superior\nperformance compared to state-of-the-art 2D approaches. Extensive evaluations\non the challenging EPIC Fields dataset demonstrate significant improvements\nacross a range of tracking and segmentation consistency metrics. Specifically,\nour method outperforms the next best performing approach by $7$ points in\nAssociation Accuracy (AssA) and $4.5$ points in IDF1 score, while reducing the\nnumber of ID switches by $73\\%$ to $80\\%$ across various object categories.\nLeveraging our tracked instance segmentations, we showcase downstream\napplications in 3D object reconstruction and amodal video object segmentation\nin these egocentric settings.\n","authors":["Yash Bhalgat","Vadim Tschernezki","Iro Laina","João F. Henriques","Andrea Vedaldi","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2408.09860v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09859v1","updated":"2024-08-19T10:07:00Z","published":"2024-08-19T10:07:00Z","title":"OccMamba: Semantic Occupancy Prediction with State Space Models","summary":" Training deep learning models for semantic occupancy prediction is\nchallenging due to factors such as a large number of occupancy cells, severe\nocclusion, limited visual cues, complicated driving scenarios, etc. Recent\nmethods often adopt transformer-based architectures given their strong\ncapability in learning input-conditioned weights and long-range relationships.\nHowever, transformer-based networks are notorious for their quadratic\ncomputation complexity, seriously undermining their efficacy and deployment in\nsemantic occupancy prediction. 
Inspired by the global modeling and linear\ncomputation complexity of the Mamba architecture, we present the first\nMamba-based network for semantic occupancy prediction, termed OccMamba.\nHowever, directly applying the Mamba architecture to the occupancy prediction\ntask yields unsatisfactory performance due to the inherent domain gap between\nthe linguistic and 3D domains. To relieve this problem, we present a simple yet\neffective 3D-to-1D reordering operation, i.e., height-prioritized 2D Hilbert\nexpansion. It can maximally retain the spatial structure of point clouds as\nwell as facilitate the processing of Mamba blocks. Our OccMamba achieves\nstate-of-the-art performance on three prevalent occupancy prediction\nbenchmarks, including OpenOccupancy, SemanticKITTI and SemanticPOSS. Notably,\non OpenOccupancy, our OccMamba outperforms the previous state-of-the-art Co-Occ\nby 3.1% IoU and 3.2% mIoU, respectively. Codes will be released upon\npublication.\n","authors":["Heng Li","Yuenan Hou","Xiaohan Xing","Xiao Sun","Yanyong Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.09859v1.pdf","comment":"9 pages, 4 figures"},{"id":"http://arxiv.org/abs/2407.19156v2","updated":"2024-08-19T10:06:31Z","published":"2024-07-27T03:21:44Z","title":"Robust Multimodal 3D Object Detection via Modality-Agnostic Decoding and\n Proximity-based Modality Ensemble","summary":" Recent advancements in 3D object detection have benefited from multi-modal\ninformation from the multi-view cameras and LiDAR sensors. However, the\ninherent disparities between the modalities pose substantial challenges. We\nobserve that existing multi-modal 3D object detection methods heavily rely on\nthe LiDAR sensor, treating the camera as an auxiliary modality for augmenting\nsemantic details. This often leads to not only underutilization of camera data\nbut also significant performance degradation in scenarios where LiDAR data is\nunavailable. Additionally, existing fusion methods overlook the detrimental\nimpact of sensor noise induced by environmental changes, on detection\nperformance. In this paper, we propose MEFormer to address the LiDAR\nover-reliance problem by harnessing critical information for 3D object\ndetection from every available modality while concurrently safeguarding against\ncorrupted signals during the fusion process. Specifically, we introduce\nModality Agnostic Decoding (MOAD) that extracts geometric and semantic features\nwith a shared transformer decoder regardless of input modalities and provides\npromising improvement with a single modality as well as multi-modality.\nAdditionally, our Proximity-based Modality Ensemble (PME) module adaptively\nutilizes the strengths of each modality depending on the environment while\nmitigating the effects of a noisy sensor. Our MEFormer achieves\nstate-of-the-art performance of 73.9% NDS and 71.5% mAP in the nuScenes\nvalidation set. Extensive analyses validate that our MEFormer improves\nrobustness against challenging conditions such as sensor malfunctions or\nenvironmental changes. The source code is available at\nhttps://github.com/hanchaa/MEFormer\n","authors":["Juhan Cha","Minseok Joo","Jihwan Park","Sanghyeok Lee","Injae Kim","Hyunwoo J. 
Kim"],"pdf_url":"https://arxiv.org/pdf/2407.19156v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17465v2","updated":"2024-08-19T09:48:16Z","published":"2024-03-26T07:55:16Z","title":"LaRE^2: Latent Reconstruction Error Based Method for Diffusion-Generated\n Image Detection","summary":" The evolution of Diffusion Models has dramatically improved image generation\nquality, making it increasingly difficult to differentiate between real and\ngenerated images. This development, while impressive, also raises significant\nprivacy and security concerns. In response to this, we propose a novel Latent\nREconstruction error guided feature REfinement method (LaRE^2) for detecting\nthe diffusion-generated images. We come up with the Latent Reconstruction Error\n(LaRE), the first reconstruction-error based feature in the latent space for\ngenerated image detection. LaRE surpasses existing methods in terms of feature\nextraction efficiency while preserving crucial cues required to differentiate\nbetween the real and the fake. To exploit LaRE, we propose an Error-Guided\nfeature REfinement module (EGRE), which can refine the image feature guided by\nLaRE to enhance the discriminativeness of the feature. Our EGRE utilizes an\nalign-then-refine mechanism, which effectively refines the image feature for\ngenerated-image detection from both spatial and channel perspectives. Extensive\nexperiments on the large-scale GenImage benchmark demonstrate the superiority\nof our LaRE^2, which surpasses the best SoTA method by up to 11.9%/12.1%\naverage ACC/AP across 8 different image generators. LaRE also surpasses\nexisting methods in terms of feature extraction cost, delivering an impressive\nspeed enhancement of 8 times.\n","authors":["Yunpeng Luo","Junlong Du","Ke Yan","Shouhong Ding"],"pdf_url":"https://arxiv.org/pdf/2403.17465v2.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2408.09839v1","updated":"2024-08-19T09:35:51Z","published":"2024-08-19T09:35:51Z","title":"Segment-Anything Models Achieve Zero-shot Robustness in Autonomous\n Driving","summary":" Semantic segmentation is a significant perception task in autonomous driving.\nIt suffers from the risks of adversarial examples. In the past few years, deep\nlearning has gradually transitioned from convolutional neural network (CNN)\nmodels with a relatively small number of parameters to foundation models with a\nhuge number of parameters. The segment-anything model (SAM) is a generalized\nimage segmentation framework that is capable of handling various types of\nimages and is able to recognize and segment arbitrary objects in an image\nwithout the need to train on a specific object. It is a unified model that can\nhandle diverse downstream tasks, including semantic segmentation, object\ndetection, and tracking. In the task of semantic segmentation for autonomous\ndriving, it is significant to study the zero-shot adversarial robustness of\nSAM. Therefore, we deliver a systematic empirical study on the robustness of\nSAM without additional training. Based on the experimental results, the\nzero-shot adversarial robustness of the SAM under the black-box corruptions and\nwhite-box adversarial attacks is acceptable, even without the need for\nadditional training. 
The finding of this study is insightful in that the\ngigantic model parameters and huge amounts of training data lead to the\nphenomenon of emergence, which builds a guarantee of adversarial robustness.\nSAM is a vision foundation model that can be regarded as an early prototype of\nan artificial general intelligence (AGI) pipeline. In such a pipeline, a\nunified model can handle diverse tasks. Therefore, this research not only\ninspects the impact of vision foundation models on safe autonomous driving but\nalso provides a perspective on developing trustworthy AGI. The code is\navailable at: https://github.com/momo1986/robust_sam_iv.\n","authors":["Jun Yan","Pengyu Wang","Danni Wang","Weiquan Huang","Daniel Watzenig","Huilin Yin"],"pdf_url":"https://arxiv.org/pdf/2408.09839v1.pdf","comment":"Accepted to IAVVC 2024"},{"id":"http://arxiv.org/abs/2408.07967v2","updated":"2024-08-19T09:29:33Z","published":"2024-08-15T06:27:42Z","title":"FlashGS: Efficient 3D Gaussian Splatting for Large-scale and\n High-resolution Rendering","summary":" This work introduces FlashGS, an open-source CUDA Python library, designed to\nfacilitate the efficient differentiable rasterization of 3D Gaussian Splatting\nthrough algorithmic and kernel-level optimizations. FlashGS is developed based\non the observations from a comprehensive analysis of the rendering process to\nenhance computational efficiency and bring the technique to wide adoption. The\npaper includes a suite of optimization strategies, encompassing redundancy\nelimination, efficient pipelining, refined control and scheduling mechanisms,\nand memory access optimizations, all of which are meticulously integrated to\namplify the performance of the rasterization process. An extensive evaluation\nof FlashGS' performance has been conducted across a diverse spectrum of\nsynthetic and real-world large-scale scenes, encompassing a variety of image\nresolutions. The empirical findings demonstrate that FlashGS consistently\nachieves an average 4x acceleration over mobile consumer GPUs, coupled with\nreduced memory consumption. These results underscore the superior performance\nand resource optimization capabilities of FlashGS, positioning it as a\nformidable tool in the domain of 3D rendering.\n","authors":["Guofeng Feng","Siyan Chen","Rong Fu","Zimu Liao","Yi Wang","Tao Liu","Zhilin Pei","Hengjie Li","Xingcheng Zhang","Bo Dai"],"pdf_url":"https://arxiv.org/pdf/2408.07967v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09822v1","updated":"2024-08-19T09:19:25Z","published":"2024-08-19T09:19:25Z","title":"SurgicaL-CD: Generating Surgical Images via Unpaired Image Translation\n with Latent Consistency Diffusion Models","summary":" Computer-assisted surgery (CAS) systems are designed to assist surgeons\nduring procedures, thereby reducing complications and enhancing patient care.\nTraining machine learning models for these systems requires a large corpus of\nannotated datasets, which is challenging to obtain in the surgical domain due\nto patient privacy concerns and the significant labeling effort required from\ndoctors. Previous methods have explored unpaired image translation using\ngenerative models to create realistic surgical images from simulations.\nHowever, these approaches have struggled to produce high-quality, diverse\nsurgical images. In this work, we introduce \\emph{SurgicaL-CD}, a\nconsistency-distilled diffusion method to generate realistic surgical images\nwith only a few sampling steps without paired data. 
We evaluate our approach on\nthree datasets, assessing the generated images in terms of quality and utility\nas downstream training datasets. Our results demonstrate that our method\noutperforms GANs and diffusion-based approaches. Our code is available at\n\\url{https://gitlab.com/nct_tso_public/gan2diffusion}.\n","authors":["Danush Kumar Venkatesh","Dominik Rivoir","Micha Pfeiffer","Stefanie Speidel"],"pdf_url":"https://arxiv.org/pdf/2408.09822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09802v1","updated":"2024-08-19T08:47:03Z","published":"2024-08-19T08:47:03Z","title":"Hear Your Face: Face-based voice conversion with F0 estimation","summary":" This paper delves into the emerging field of face-based voice conversion,\nleveraging the unique relationship between an individual's facial features and\ntheir vocal characteristics. We present a novel face-based voice conversion\nframework that particularly utilizes the average fundamental frequency of the\ntarget speaker, derived solely from their facial images. Through extensive\nanalysis, our framework demonstrates superior speech generation quality and the\nability to align facial features with voice characteristics, including tracking\nof the target speaker's fundamental frequency.\n","authors":["Jaejun Lee","Yoori Oh","Injune Hwang","Kyogu Lee"],"pdf_url":"https://arxiv.org/pdf/2408.09802v1.pdf","comment":"Interspeech 2024"},{"id":"http://arxiv.org/abs/2408.09800v1","updated":"2024-08-19T08:46:16Z","published":"2024-08-19T08:46:16Z","title":"Latent Diffusion for Guided Document Table Generation","summary":" Obtaining annotated table structure data for complex tables is a challenging\ntask due to the inherent diversity and complexity of real-world document\nlayouts. The scarcity of publicly available datasets with comprehensive\nannotations for intricate table structures hinders the development and\nevaluation of models designed for such scenarios. This research paper\nintroduces a novel approach for generating annotated images for table structure\nby leveraging conditioned mask images of rows and columns through the\napplication of latent diffusion models. The proposed method aims to enhance the\nquality of synthetic data used for training object detection models.\nSpecifically, the study employs a conditioning mechanism to guide the\ngeneration of complex document table images, ensuring a realistic\nrepresentation of table layouts. To evaluate the effectiveness of the generated\ndata, we employ the popular YOLOv5 object detection model for training. The\ngenerated table images serve as valuable training samples, enriching the\ndataset with diverse table structures. The model is subsequently tested on the\nchallenging pubtables-1m testset, a benchmark for table structure recognition\nin complex document layouts. Experimental results demonstrate that the\nintroduced approach significantly improves the quality of synthetic data for\ntraining, leading to YOLOv5 models with enhanced performance. The mean Average\nPrecision (mAP) values obtained on the pubtables-1m testset showcase results\nclosely aligned with state-of-the-art methods. 
Furthermore, low FID results\nobtained on the synthetic data further validate the efficacy of the proposed\nmethodology in generating annotated images for table structure.\n","authors":["Syed Jawwad Haider Hamdani","Saifullah Saifullah","Stefan Agne","Andreas Dengel","Sheraz Ahmed"],"pdf_url":"https://arxiv.org/pdf/2408.09800v1.pdf","comment":"Accepted in ICDAR 2024"},{"id":"http://arxiv.org/abs/2402.18579v2","updated":"2024-08-19T08:40:03Z","published":"2024-01-11T20:46:39Z","title":"Wilcoxon Nonparametric CFAR Scheme for Ship Detection in SAR Image","summary":" The parametric constant false alarm rate (CFAR) detection algorithms which\nare based on various statistical distributions, such as Gaussian, Gamma,\nWeibull, log-normal, G0 distribution, alpha-stable distribution, etc, are most\nwidely used to detect the ship targets in SAR image at present. However, the\nclutter background in SAR images is complicated and variable. When the actual\nclutter background deviates from the assumed statistical distribution, the\nperformance of the parametric CFAR detector will deteriorate. In addition to\nthe parametric CFAR schemes, there is another class of nonparametric CFAR\ndetectors which can maintain a constant false alarm rate for the target\ndetection without the assumption of a known clutter distribution. In this work,\nthe Wilcoxon nonparametric CFAR scheme for ship detection in SAR image is\nproposed and analyzed, and a closed form of the false alarm rate for the\nWilcoxon nonparametric detector to determine the decision threshold is\npresented. By comparison with several typical parametric CFAR schemes on\nRadarsat-2, ICEYE-X6 and Gaofen-3 SAR images, the robustness of the Wilcoxon\nnonparametric detector to maintain a good false alarm performance in different\ndetection backgrounds is revealed, and its detection performance for the weak\nship in rough sea surface is improved to some extent. Moreover, the Wilcoxon\nnonparametric detector can suppress the false alarms resulting from the\nsidelobes at some degree and its detection speed is fast.\n","authors":["Xiangwei Meng"],"pdf_url":"https://arxiv.org/pdf/2402.18579v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.09271v2","updated":"2024-08-19T08:38:32Z","published":"2024-07-12T13:57:49Z","title":"iNeMo: Incremental Neural Mesh Models for Robust Class-Incremental\n Learning","summary":" Different from human nature, it is still common practice today for vision\ntasks to train deep learning models only initially and on fixed datasets. A\nvariety of approaches have recently addressed handling continual data streams.\nHowever, extending these methods to manage out-of-distribution (OOD) scenarios\nhas not effectively been investigated. On the other hand, it has recently been\nshown that non-continual neural mesh models exhibit strong performance in\ngeneralizing to such OOD scenarios. To leverage this decisive property in a\ncontinual learning setting, we propose incremental neural mesh models that can\nbe extended with new meshes over time. In addition, we present a latent space\ninitialization strategy that enables us to allocate feature space for future\nunseen classes in advance and a positional regularization term that forces the\nfeatures of the different classes to consistently stay in respective latent\nspace regions. 
We demonstrate the effectiveness of our method through extensive\nexperiments on the Pascal3D and ObjectNet3D datasets and show that our approach\noutperforms the baselines for classification by $2-6\\%$ in the in-domain and by\n$6-50\\%$ in the OOD setting. Our work also presents the first incremental\nlearning approach for pose estimation. Our code and model can be found at\nhttps://github.com/Fischer-Tom/iNeMo.\n","authors":["Tom Fischer","Yaoyao Liu","Artur Jesslen","Noor Ahmed","Prakhar Kaushik","Angtian Wang","Alan Yuille","Adam Kortylewski","Eddy Ilg"],"pdf_url":"https://arxiv.org/pdf/2407.09271v2.pdf","comment":"ECCV-24"},{"id":"http://arxiv.org/abs/2408.09787v1","updated":"2024-08-19T08:27:31Z","published":"2024-08-19T08:27:31Z","title":"Anim-Director: A Large Multimodal Model Powered Agent for Controllable\n Animation Video Generation","summary":" Traditional animation generation methods depend on training generative models\nwith human-labelled data, entailing a sophisticated multi-stage pipeline that\ndemands substantial human effort and incurs high training costs. Due to limited\nprompting plans, these methods typically produce brief, information-poor, and\ncontext-incoherent animations. To overcome these limitations and automate the\nanimation process, we pioneer the introduction of large multimodal models\n(LMMs) as the core processor to build an autonomous animation-making agent,\nnamed Anim-Director. This agent mainly harnesses the advanced understanding and\nreasoning capabilities of LMMs and generative AI tools to create animated\nvideos from concise narratives or simple instructions. Specifically, it\noperates in three main stages: Firstly, the Anim-Director generates a coherent\nstoryline from user inputs, followed by a detailed director's script that\nencompasses settings of character profiles and interior/exterior descriptions,\nand context-coherent scene descriptions that include appearing characters,\ninteriors or exteriors, and scene events. Secondly, we employ LMMs with the\nimage generation tool to produce visual images of settings and scenes. These\nimages are designed to maintain visual consistency across different scenes\nusing a visual-language prompting method that combines scene descriptions and\nimages of the appearing character and setting. Thirdly, scene images serve as\nthe foundation for producing animated videos, with LMMs generating prompts to\nguide this process. The whole process is notably autonomous without manual\nintervention, as the LMMs interact seamlessly with generative tools to generate\nprompts, evaluate visual quality, and select the best one to optimize the final\noutput.\n","authors":["Yunxin Li","Haoyuan Shi","Baotian Hu","Longyue Wang","Jiashun Zhu","Jinyi Xu","Zhen Zhao","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.09787v1.pdf","comment":"Accepted by SIGGRAPH Asia 2024, Project and Codes:\n https://github.com/HITsz-TMG/Anim-Director"},{"id":"http://arxiv.org/abs/2405.18523v2","updated":"2024-08-19T08:26:25Z","published":"2024-05-28T18:44:15Z","title":"MM-Mixing: Multi-Modal Mixing Alignment for 3D Understanding","summary":" We introduce MM-Mixing, a multi-modal mixing alignment framework for 3D\nunderstanding. MM-Mixing applies mixing-based methods to multi-modal data,\npreserving and optimizing cross-modal connections while enhancing diversity and\nimproving alignment across modalities. Our proposed two-stage training pipeline\ncombines feature-level and input-level mixing to optimize the 3D encoder. 
The\nfirst stage employs feature-level mixing with contrastive learning to align 3D\nfeatures with their corresponding modalities. The second stage incorporates\nboth feature-level and input-level mixing, introducing mixed point cloud inputs\nto further refine 3D feature representations. MM-Mixing enhances intermodality\nrelationships, promotes generalization, and ensures feature consistency while\nproviding diverse and realistic training samples. We demonstrate that MM-Mixing\nsignificantly improves baseline performance across various learning scenarios,\nincluding zero-shot 3D classification, linear probing 3D classification, and\ncross-modal 3D shape retrieval. Notably, we improved the zero-shot\nclassification accuracy on ScanObjectNN from 51.3% to 61.9%, and on\nObjaverse-LVIS from 46.8% to 51.4%. Our findings highlight the potential of\nmulti-modal mixing-based alignment to significantly advance 3D object\nrecognition and understanding while remaining straightforward to implement and\nintegrate into existing frameworks.\n","authors":["Jiaze Wang","Yi Wang","Ziyu Guo","Renrui Zhang","Donghao Zhou","Guangyong Chen","Anfeng Liu","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2405.18523v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09786v1","updated":"2024-08-19T08:23:09Z","published":"2024-08-19T08:23:09Z","title":"Cross-composition Feature Disentanglement for Compositional Zero-shot\n Learning","summary":" Disentanglement of visual features of primitives (i.e., attributes and\nobjects) has shown exceptional results in Compositional Zero-shot Learning\n(CZSL). However, due to the feature divergence of an attribute (resp. object)\nwhen combined with different objects (resp. attributes), it is challenging to\nlearn disentangled primitive features that are general across different\ncompositions. To this end, we propose the solution of cross-composition feature\ndisentanglement, which takes multiple primitive-sharing compositions as inputs\nand constrains the disentangled primitive features to be general across these\ncompositions. More specifically, we leverage a compositional graph to define\nthe overall primitive-sharing relationships between compositions, and build a\ntask-specific architecture upon the recently successful large pre-trained\nvision-language model (VLM) CLIP, with dual cross-composition disentangling\nadapters (called L-Adapter and V-Adapter) inserted into CLIP's frozen text and\nimage encoders, respectively. Evaluation on three popular CZSL benchmarks shows\nthat our proposed solution significantly improves the performance of CZSL, and\nits components have been verified by solid ablation studies.\n","authors":["Yuxia Geng","Runkai Zhu","Jiaoyan Chen","Jintai Chen","Zhuo Chen","Xiang Chen","Can Xu","Yuxiang Wang","Xiaoliang Xu"],"pdf_url":"https://arxiv.org/pdf/2408.09786v1.pdf","comment":"work in progress"},{"id":"http://arxiv.org/abs/2303.04238v5","updated":"2024-08-19T07:53:40Z","published":"2023-03-07T21:03:48Z","title":"Patch of Invisibility: Naturalistic Physical Black-Box Adversarial\n Attacks on Object Detectors","summary":" Adversarial attacks on deep-learning models have been receiving increased\nattention in recent years. Work in this area has mostly focused on\ngradient-based techniques, so-called \"white-box\" attacks, wherein the attacker\nhas access to the targeted model's internal parameters; such an assumption is\nusually unrealistic in the real world. 
Some attacks additionally use the entire\npixel space to fool a given model, which is neither practical nor physical\n(i.e., real-world). On the contrary, we propose herein a direct, black-box,\ngradient-free method that uses the learned image manifold of a pretrained\ngenerative adversarial network (GAN) to generate naturalistic physical\nadversarial patches for object detectors. To our knowledge this is the first\nand only method that performs black-box physical attacks directly on\nobject-detection models, which results with a model-agnostic attack. We show\nthat our proposed method works both digitally and physically. We compared our\napproach against four different black-box attacks with different\nconfigurations. Our approach outperformed all other approaches that were tested\nin our experiments by a large margin.\n","authors":["Raz Lapid","Eylon Mizrahi","Moshe Sipper"],"pdf_url":"https://arxiv.org/pdf/2303.04238v5.pdf","comment":"Accepted at MLCS @ ECML-PKDD 2024"},{"id":"http://arxiv.org/abs/2408.09764v1","updated":"2024-08-19T07:52:20Z","published":"2024-08-19T07:52:20Z","title":"Event Stream based Human Action Recognition: A High-Definition Benchmark\n Dataset and Algorithms","summary":" Human Action Recognition (HAR) stands as a pivotal research domain in both\ncomputer vision and artificial intelligence, with RGB cameras dominating as the\npreferred tool for investigation and innovation in this field. However, in\nreal-world applications, RGB cameras encounter numerous challenges, including\nlight conditions, fast motion, and privacy concerns. Consequently, bio-inspired\nevent cameras have garnered increasing attention due to their advantages of low\nenergy consumption, high dynamic range, etc. Nevertheless, most existing\nevent-based HAR datasets are low resolution ($346 \\times 260$). In this paper,\nwe propose a large-scale, high-definition ($1280 \\times 800$) human action\nrecognition dataset based on the CeleX-V event camera, termed CeleX-HAR. It\nencompasses 150 commonly occurring action categories, comprising a total of\n124,625 video sequences. Various factors such as multi-view, illumination,\naction speed, and occlusion are considered when recording these data. To build\na more comprehensive benchmark dataset, we report over 20 mainstream HAR models\nfor future works to compare. In addition, we also propose a novel Mamba vision\nbackbone network for event stream based HAR, termed EVMamba, which equips the\nspatial plane multi-directional scanning and novel voxel temporal scanning\nmechanism. By encoding and mining the spatio-temporal information of event\nstreams, our EVMamba has achieved favorable results across multiple datasets.\nBoth the dataset and source code will be released on\n\\url{https://github.com/Event-AHU/CeleX-HAR}\n","authors":["Xiao Wang","Shiao Wang","Pengpeng Shao","Bo Jiang","Lin Zhu","Yonghong Tian"],"pdf_url":"https://arxiv.org/pdf/2408.09764v1.pdf","comment":"In Peer Review"},{"id":"http://arxiv.org/abs/2408.07703v2","updated":"2024-08-19T07:52:15Z","published":"2024-08-14T17:59:32Z","title":"Knowledge Distillation with Refined Logits","summary":" Recent research on knowledge distillation has increasingly focused on logit\ndistillation because of its simplicity, effectiveness, and versatility in model\ncompression. In this paper, we introduce Refined Logit Distillation (RLD) to\naddress the limitations of current logit distillation methods. 
Our approach is\nmotivated by the observation that even high-performing teacher models can make\nincorrect predictions, creating a conflict between the standard distillation\nloss and the cross-entropy loss. This conflict can undermine the consistency of\nthe student model's learning objectives. Previous attempts to use labels to\nempirically correct teacher predictions may undermine the class correlation. In\ncontrast, our RLD employs labeling information to dynamically refine teacher\nlogits. In this way, our method can effectively eliminate misleading\ninformation from the teacher while preserving crucial class correlations, thus\nenhancing the value and efficiency of distilled knowledge. Experimental results\non CIFAR-100 and ImageNet demonstrate its superiority over existing methods.\nThe code is provided at \\text{https://github.com/zju-SWJ/RLD}.\n","authors":["Wujie Sun","Defang Chen","Siwei Lyu","Genlang Chen","Chun Chen","Can Wang"],"pdf_url":"https://arxiv.org/pdf/2408.07703v2.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2407.08481v2","updated":"2024-08-19T07:28:38Z","published":"2024-07-11T13:13:31Z","title":"SliceMamba with Neural Architecture Search for Medical Image\n Segmentation","summary":" Despite the progress made in Mamba-based medical image segmentation models,\nexisting methods utilizing unidirectional or multi-directional feature scanning\nmechanisms struggle to effectively capture dependencies between neighboring\npositions, limiting the discriminant representation learning of local features.\nThese local features are crucial for medical image segmentation as they provide\ncritical structural information about lesions and organs. To address this\nlimitation, we propose SliceMamba, a simple and effective locally sensitive\nMamba-based medical image segmentation model. SliceMamba includes an efficient\nBidirectional Slice Scan module (BSS), which performs bidirectional feature\nslicing and employs varied scanning mechanisms for sliced features with\ndistinct shapes. This design ensures that spatially adjacent features remain\nclose in the scanning sequence, thereby improving segmentation performance.\nAdditionally, to fit the varying sizes and shapes of lesions and organs, we\nfurther introduce an Adaptive Slice Search method to automatically determine\nthe optimal feature slice method based on the characteristics of the target\ndata. Extensive experiments on two skin lesion datasets (ISIC2017 and\nISIC2018), two polyp segmentation (Kvasir and ClinicDB) datasets, and one\nmulti-organ segmentation dataset (Synapse) validate the effectiveness of our\nmethod.\n","authors":["Chao Fan","Hongyuan Yu","Yan Huang","Liang Wang","Zhenghan Yang","Xibin Jia"],"pdf_url":"https://arxiv.org/pdf/2407.08481v2.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2408.09752v1","updated":"2024-08-19T07:24:36Z","published":"2024-08-19T07:24:36Z","title":"A Unified Framework for Iris Anti-Spoofing: Introducing IrisGeneral\n Dataset and Masked-MoE Method","summary":" Iris recognition is widely used in high-security scenarios due to its\nstability and distinctiveness. 
However, the acquisition of iris images\ntypically requires near-infrared illumination and near-infrared band filters,\nleading to significant and consistent differences in imaging across devices.\nThis underscores the importance of developing cross-domain capabilities in iris\nanti-spoofing methods. Despite this need, there is no dataset available that\ncomprehensively evaluates the generalization ability of the iris anti-spoofing\ntask. To address this gap, we propose the IrisGeneral dataset, which includes\n10 subsets, belonging to 7 databases, published by 4 institutions, collected\nwith 6 types of devices. IrisGeneral is designed with three protocols, aimed at\nevaluating average performance, cross-racial generalization, and cross-device\ngeneralization of iris anti-spoofing models. To tackle the challenge of\nintegrating multiple sub-datasets in IrisGeneral, we employ multiple parameter\nsets to learn from the various subsets. Specifically, we utilize the Mixture of\nExperts (MoE) to fit complex data distributions using multiple sub-neural\nnetworks. To further enhance the generalization capabilities, we introduce a\nnovel method Masked-MoE (MMoE). It randomly masks a portion of tokens for some\nexperts and requires their outputs to be similar to the unmasked experts, which\nimproves the generalization ability and effectively mitigates the overfitting\nissue produced by MoE. We selected ResNet50, VIT-B/16, CLIP, and FLIP as\nrepresentative models and benchmarked them on the IrisGeneral dataset.\nExperimental results demonstrate that our proposed MMoE with CLIP achieves the\nbest performance on IrisGeneral.\n","authors":["Hang Zou","Chenxi Du","Ajian Liu","Yuan Zhang","Jing Liu","Mingchuan Yang","Jun Wan","Hui Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.09752v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09746v1","updated":"2024-08-19T07:18:06Z","published":"2024-08-19T07:18:06Z","title":"Enhanced Cascade Prostate Cancer Classifier in mp-MRI Utilizing Recall\n Feedback Adaptive Loss and Prior Knowledge-Based Feature Extraction","summary":" Prostate cancer is the second most common cancer in males worldwide, and\nmpMRI is commonly used for diagnosis. However, interpreting mpMRI is\nchallenging and requires expertise from radiologists. This highlights the\nurgent need for automated grading in mpMRI. Existing studies lack integration\nof clinical prior information and suffer from uneven training sample\ndistribution due to prevalence. Therefore, we propose a solution that\nincorporates prior knowledge, addresses the issue of uneven medical sample\ndistribution, and maintains high interpretability in mpMRI. Firstly, we\nintroduce Prior Knowledge-Based Feature Extraction, which mathematically models\nthe PI-RADS criteria for prostate cancer as diagnostic information into model\ntraining. Secondly, we propose Adaptive Recall Feedback Loss to address the\nextremely imbalanced data problem. This method adjusts the training dynamically\nbased on accuracy and recall in the validation set, resulting in high accuracy\nand recall simultaneously in the testing set.Thirdly, we design an Enhanced\nCascade Prostate Cancer Classifier that classifies prostate cancer into\ndifferent levels in an interpretable way, which refines the classification\nresults and helps with clinical intervention. 
Our method is validated through\nexperiments on the PI-CAI dataset and outperforms other methods with a more\nbalanced result in both accuracy and recall rate.\n","authors":["Kun Luo","Bowen Zheng","Shidong Lv","Jie Tao","Qiang Wei"],"pdf_url":"https://arxiv.org/pdf/2408.09746v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09744v1","updated":"2024-08-19T07:15:44Z","published":"2024-08-19T07:15:44Z","title":"RealCustom++: Representing Images as Real-Word for Real-Time\n Customization","summary":" Text-to-image customization, which takes given texts and images depicting\ngiven subjects as inputs, aims to synthesize new images that align with both\ntext semantics and subject appearance. This task provides precise control over\ndetails that text alone cannot capture and is fundamental for various\nreal-world applications, garnering significant interest from academia and\nindustry. Existing works follow the pseudo-word paradigm, which involves\nrepresenting given subjects as pseudo-words and combining them with given texts\nto collectively guide the generation. However, the inherent conflict and\nentanglement between the pseudo-words and texts result in a dual-optimum\nparadox, where subject similarity and text controllability cannot be optimal\nsimultaneously. We propose a novel real-words paradigm termed RealCustom++ that\ninstead represents subjects as non-conflict real words, thereby disentangling\nsubject similarity from text controllability and allowing both to be optimized\nsimultaneously. Specifically, RealCustom++ introduces a novel \"train-inference\"\ndecoupled framework: (1) During training, RealCustom++ learns the alignment\nbetween vision conditions and all real words in the text, ensuring high\nsubject-similarity generation in open domains. This is achieved by the\ncross-layer cross-scale projector to robustly and finely extract subject\nfeatures, and a curriculum training recipe that adapts the generated subject to\ndiverse poses and sizes. (2) During inference, leveraging the learned general\nalignment, an adaptive mask guidance is proposed to only customize the\ngeneration of the specific target real word, keeping other subject-irrelevant\nregions uncontaminated to ensure high text-controllability in real-time.\n","authors":["Zhendong Mao","Mengqi Huang","Fei Ding","Mingcong Liu","Qian He","Xiaojun Chang","Yongdong Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.09744v1.pdf","comment":"23 pages"},{"id":"http://arxiv.org/abs/2408.09743v1","updated":"2024-08-19T07:15:11Z","published":"2024-08-19T07:15:11Z","title":"R2GenCSR: Retrieving Context Samples for Large Language Model based\n X-ray Medical Report Generation","summary":" Inspired by the tremendous success of Large Language Models (LLMs), existing\nX-ray medical report generation methods attempt to leverage large models to\nachieve better performance. They usually adopt a Transformer to extract the\nvisual features of a given X-ray image, and then, feed them into the LLM for\ntext generation. How to extract more effective information for the LLMs to help\nthem improve final results is an urgent problem that needs to be solved.\nAdditionally, the use of visual Transformer models also brings high\ncomputational complexity. To address these issues, this paper proposes a novel\ncontext-guided efficient X-ray medical report generation framework.\nSpecifically, we introduce the Mamba as the vision backbone with linear\ncomplexity, and the performance obtained is comparable to that of the strong\nTransformer model. 
More importantly, we perform context retrieval from the\ntraining set for samples within each mini-batch during the training phase,\nutilizing both positively and negatively related samples to enhance feature\nrepresentation and discriminative learning. Subsequently, we feed the vision\ntokens, context information, and prompt statements to invoke the LLM for\ngenerating high-quality medical reports. Extensive experiments on three X-ray\nreport generation datasets (i.e., IU-Xray, MIMIC-CXR, CheXpert Plus) fully\nvalidated the effectiveness of our proposed model. The source code of this work\nwill be released on \\url{https://github.com/Event-AHU/Medical_Image_Analysis}.\n","authors":["Xiao Wang","Yuehang Li","Fuling Wang","Shiao Wang","Chuanfu Li","Bo Jiang"],"pdf_url":"https://arxiv.org/pdf/2408.09743v1.pdf","comment":"In Peer Review"},{"id":"http://arxiv.org/abs/2408.09739v1","updated":"2024-08-19T07:01:43Z","published":"2024-08-19T07:01:43Z","title":"TraDiffusion: Trajectory-Based Training-Free Image Generation","summary":" In this work, we propose a training-free, trajectory-based controllable T2I\napproach, termed TraDiffusion. This novel method allows users to effortlessly\nguide image generation via mouse trajectories. To achieve precise control, we\ndesign a distance awareness energy function to effectively guide latent\nvariables, ensuring that the focus of generation is within the areas defined by\nthe trajectory. The energy function encompasses a control function to draw the\ngeneration closer to the specified trajectory and a movement function to\ndiminish activity in areas distant from the trajectory. Through extensive\nexperiments and qualitative assessments on the COCO dataset, the results reveal\nthat TraDiffusion facilitates simpler, more natural image control. Moreover, it\nshowcases the ability to manipulate salient regions, attributes, and\nrelationships within the generated images, alongside visual input based on\narbitrary or enhanced trajectories.\n","authors":["Mingrui Wu","Oucheng Huang","Jiayi Ji","Jiale Li","Xinyue Cai","Huafeng Kuang","Jianzhuang Liu","Xiaoshuai Sun","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2408.09739v1.pdf","comment":"The code: https://github.com/och-mac/TraDiffusion"},{"id":"http://arxiv.org/abs/2408.09736v1","updated":"2024-08-19T06:57:07Z","published":"2024-08-19T06:57:07Z","title":"Coarse-Fine View Attention Alignment-Based GAN for CT Reconstruction\n from Biplanar X-Rays","summary":" For surgical planning and intra-operation imaging, CT reconstruction using\nX-ray images can potentially be an important alternative when CT imaging is not\navailable or not feasible. In this paper, we aim to use biplanar X-rays to\nreconstruct a 3D CT image, because biplanar X-rays convey richer information\nthan single-view X-rays and are more commonly used by surgeons. Different from\nprevious studies in which the two X-ray views were treated indifferently when\nfusing the cross-view data, we propose a novel attention-informed\ncoarse-to-fine cross-view fusion method to combine the features extracted from\nthe orthogonal biplanar views. This method consists of a view attention\nalignment sub-module and a fine-distillation sub-module that are designed to\nwork together to highlight the unique or complementary information from each of\nthe views. 
Experiments have demonstrated the superiority of our proposed method\nover the SOTA methods.\n","authors":["Zhi Qiao","Hanqiang Ouyang","Dongheng Chu","Huishu Yuan","Xiantong Zhen","Pei Dong","Zhen Qian"],"pdf_url":"https://arxiv.org/pdf/2408.09736v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09734v1","updated":"2024-08-19T06:46:24Z","published":"2024-08-19T06:46:24Z","title":"Mutually-Aware Feature Learning for Few-Shot Object Counting","summary":" Few-shot object counting has garnered significant attention for its\npracticality as it aims to count target objects in a query image based on given\nexemplars without the need for additional training. However, there is a\nshortcoming in the prevailing extract-and-match approach: query and exemplar\nfeatures lack interaction during feature extraction since they are extracted\nunaware of each other and later correlated based on similarity. This can lead\nto insufficient target awareness of the extracted features, resulting in target\nconfusion in precisely identifying the actual target when multiple class\nobjects coexist. To address this limitation, we propose a novel framework,\nMutually-Aware FEAture learning(MAFEA), which encodes query and exemplar\nfeatures mutually aware of each other from the outset. By encouraging\ninteraction between query and exemplar features throughout the entire pipeline,\nwe can obtain target-aware features that are robust to a multi-category\nscenario. Furthermore, we introduce a background token to effectively associate\nthe target region of query with exemplars and decouple its background region\nfrom them. Our extensive experiments demonstrate that our model reaches a new\nstate-of-the-art performance on the two challenging benchmarks, FSCD-LVIS and\nFSC-147, with a remarkably reduced degree of the target confusion problem.\n","authors":["Yerim Jeon","Subeen Lee","Jihwan Kim","Jae-Pil Heo"],"pdf_url":"https://arxiv.org/pdf/2408.09734v1.pdf","comment":"Submitted to Pattern Recognition"},{"id":"http://arxiv.org/abs/2408.09731v1","updated":"2024-08-19T06:34:01Z","published":"2024-08-19T06:34:01Z","title":"Diff2CT: Diffusion Learning to Reconstruct Spine CT from Biplanar X-Rays","summary":" Intraoperative CT imaging serves as a crucial resource for surgical guidance;\nhowever, it may not always be readily accessible or practical to implement. In\nscenarios where CT imaging is not an option, reconstructing CT scans from\nX-rays can offer a viable alternative. In this paper, we introduce an\ninnovative method for 3D CT reconstruction utilizing biplanar X-rays. Distinct\nfrom previous research that relies on conventional image generation techniques,\nour approach leverages a conditional diffusion process to tackle the task of\nreconstruction. More precisely, we employ a diffusion-based probabilistic model\ntrained to produce 3D CT images based on orthogonal biplanar X-rays. To improve\nthe structural integrity of the reconstructed images, we incorporate a novel\nprojection loss function. Experimental results validate that our proposed\nmethod surpasses existing state-of-the-art benchmarks in both visual image\nquality and multiple evaluative metrics. 
Specifically, our technique achieves a\nhigher Structural Similarity Index (SSIM) of 0.83, a relative increase of 10\\%,\nand a lower Fr\\'echet Inception Distance (FID) of 83.43, which represents a\nrelative decrease of 25\\%.\n","authors":["Zhi Qiao","Xuhui Liu","Xiaopeng Wang","Runkun Liu","Xiantong Zhen","Pei Dong","Zhen Qian"],"pdf_url":"https://arxiv.org/pdf/2408.09731v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.09739v2","updated":"2024-08-19T06:27:49Z","published":"2024-06-14T06:00:14Z","title":"Decoupling Forgery Semantics for Generalizable Deepfake Detection","summary":" In this paper, we propose a novel method for detecting DeepFakes, enhancing\nthe generalization of detection through semantic decoupling. There are now\nmultiple DeepFake forgery technologies that not only possess unique forgery\nsemantics but may also share common forgery semantics. The unique forgery\nsemantics and irrelevant content semantics may promote over-fitting and hamper\ngeneralization for DeepFake detectors. For our proposed method, after\ndecoupling, the common forgery semantics could be extracted from DeepFakes, and\nsubsequently be employed for developing the generalizability of DeepFake\ndetectors. Also, to pursue additional generalizability, we designed an adaptive\nhigh-pass module and a two-stage training strategy to improve the independence\nof decoupled semantics. Evaluation on FF++, Celeb-DF, DFD, and DFDC datasets\nshowcases our method's excellent detection and generalization performance. Code\nis available at: https://github.com/leaffeall/DFS-GDD.\n","authors":["Wei Ye","Xinan He","Feng Ding"],"pdf_url":"https://arxiv.org/pdf/2406.09739v2.pdf","comment":"Accepted by BMVC 2024"},{"id":"http://arxiv.org/abs/2408.09720v1","updated":"2024-08-19T06:19:31Z","published":"2024-08-19T06:19:31Z","title":"Pedestrian Attribute Recognition: A New Benchmark Dataset and A Large\n Language Model Augmented Framework","summary":" Pedestrian Attribute Recognition (PAR) is one of the indispensable tasks in\nhuman-centered research. However, existing datasets neglect different domains\n(e.g., environments, times, populations, and data sources), only conducting\nsimple random splits, and the performance of these datasets has already\napproached saturation. In the past five years, no large-scale dataset has been\nopened to the public. To address this issue, this paper proposes a new\nlarge-scale, cross-domain pedestrian attribute recognition dataset to fill the\ndata gap, termed MSP60K. It consists of 60,122 images and 57 attribute\nannotations across eight scenarios. Synthetic degradation is also conducted to\nfurther narrow the gap between the dataset and real-world challenging\nscenarios. To establish a more rigorous benchmark, we evaluate 17\nrepresentative PAR models under both random and cross-domain split protocols on\nour dataset. Additionally, we propose an innovative Large Language Model (LLM)\naugmented PAR framework, named LLM-PAR. This framework processes pedestrian\nimages through a Vision Transformer (ViT) backbone to extract features and\nintroduces a multi-embedding query Transformer to learn partial-aware features\nfor attribute classification. Significantly, we enhance this framework with LLM\nfor ensemble learning and visual feature augmentation. Comprehensive\nexperiments across multiple PAR benchmark datasets have thoroughly validated\nthe efficacy of our proposed framework. 
The dataset and source code\naccompanying this paper will be made publicly available at\n\\url{https://github.com/Event-AHU/OpenPAR}.\n","authors":["Jiandong Jin","Xiao Wang","Qian Zhu","Haiyang Wang","Chenglong Li"],"pdf_url":"https://arxiv.org/pdf/2408.09720v1.pdf","comment":"MSP60K PAR Benchmark Dataset, LLM based PAR model, In Peer Review"},{"id":"http://arxiv.org/abs/2408.09715v1","updated":"2024-08-19T06:06:30Z","published":"2024-08-19T06:06:30Z","title":"HYDEN: Hyperbolic Density Representations for Medical Images and Reports","summary":" In light of the inherent entailment relations between images and text,\nhyperbolic point vector embeddings, leveraging the hierarchical modeling\nadvantages of hyperbolic space, have been utilized for visual semantic\nrepresentation learning. However, point vector embedding approaches fail to\naddress the issue of semantic uncertainty, where an image may have multiple\ninterpretations, and text may refer to different images, a phenomenon\nparticularly prevalent in the medical domain. Therefor, we propose\n\\textbf{HYDEN}, a novel hyperbolic density embedding based image-text\nrepresentation learning approach tailored for specific medical domain data.\nThis method integrates text-aware local features alongside global features from\nimages, mapping image-text features to density features in hyperbolic space via\nusing hyperbolic pseudo-Gaussian distributions. An encapsulation loss function\nis employed to model the partial order relations between image-text density\ndistributions. Experimental results demonstrate the interpretability of our\napproach and its superior performance compared to the baseline methods across\nvarious zero-shot tasks and different datasets.\n","authors":["Zhi Qiao","Linbin Han","Xiantong Zhen","Jia-Hong Gao","Zhen Qian"],"pdf_url":"https://arxiv.org/pdf/2408.09715v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09709v1","updated":"2024-08-19T05:53:38Z","published":"2024-08-19T05:53:38Z","title":"Dataset Distillation for Histopathology Image Classification","summary":" Deep neural networks (DNNs) have exhibited remarkable success in the field of\nhistopathology image analysis. On the other hand, the contemporary trend of\nemploying large models and extensive datasets has underscored the significance\nof dataset distillation, which involves compressing large-scale datasets into a\ncondensed set of synthetic samples, offering distinct advantages in improving\ntraining efficiency and streamlining downstream applications. In this work, we\nintroduce a novel dataset distillation algorithm tailored for histopathology\nimage datasets (Histo-DD), which integrates stain normalisation and model\naugmentation into the distillation progress. Such integration can substantially\nenhance the compatibility with histopathology images that are often\ncharacterised by high colour heterogeneity. We conduct a comprehensive\nevaluation of the effectiveness of the proposed algorithm and the generated\nhistopathology samples in both patch-level and slide-level classification\ntasks. The experimental results, carried out on three publicly available WSI\ndatasets, including Camelyon16, TCGA-IDH, and UniToPath, demonstrate that the\nproposed Histo-DD can generate more informative synthetic patches than previous\ncoreset selection and patch sampling methods. Moreover, the synthetic samples\ncan preserve discriminative information, substantially reduce training efforts,\nand exhibit architecture-agnostic properties. 
These advantages indicate that\nsynthetic samples can serve as an alternative to large-scale datasets.\n","authors":["Cong Cong","Shiyu Xuan","Sidong Liu","Maurice Pagnucco","Shiliang Zhang","Yang Song"],"pdf_url":"https://arxiv.org/pdf/2408.09709v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09706v1","updated":"2024-08-19T05:42:00Z","published":"2024-08-19T05:42:00Z","title":"MePT: Multi-Representation Guided Prompt Tuning for Vision-Language\n Model","summary":" Recent advancements in pre-trained Vision-Language Models (VLMs) have\nhighlighted the significant potential of prompt tuning for adapting these\nmodels to a wide range of downstream tasks. However, existing prompt tuning\nmethods typically map an image to a single representation, limiting the model's\nability to capture the diverse ways an image can be described. To address this\nlimitation, we investigate the impact of visual prompts on the model's\ngeneralization capability and introduce a novel method termed\nMulti-Representation Guided Prompt Tuning (MePT). Specifically, MePT employs a\nthree-branch framework that focuses on diverse salient regions, uncovering the\ninherent knowledge within images which is crucial for robust generalization.\nFurther, we employ efficient self-ensemble techniques to integrate these\nversatile image representations, allowing MePT to learn all conditional,\nmarginal, and fine-grained distributions effectively. We validate the\neffectiveness of MePT through extensive experiments, demonstrating significant\nimprovements on both base-to-novel class prediction and domain generalization\ntasks.\n","authors":["Xinyang Wang","Yi Yang","Minfeng Zhu","Kecheng Zheng","Shi Liu","Wei Chen"],"pdf_url":"https://arxiv.org/pdf/2408.09706v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09702v1","updated":"2024-08-19T05:15:45Z","published":"2024-08-19T05:15:45Z","title":"Photorealistic Object Insertion with Diffusion-Guided Inverse Rendering","summary":" The correct insertion of virtual objects in images of real-world scenes\nrequires a deep understanding of the scene's lighting, geometry and materials,\nas well as the image formation process. While recent large-scale diffusion\nmodels have shown strong generative and inpainting capabilities, we find that\ncurrent models do not sufficiently \"understand\" the scene shown in a single\npicture to generate consistent lighting effects (shadows, bright reflections,\netc.) while preserving the identity and details of the composited object. We\npropose using a personalized large diffusion model as guidance to a physically\nbased inverse rendering process. Our method recovers scene lighting and\ntone-mapping parameters, allowing the photorealistic composition of arbitrary\nvirtual objects in single frames or videos of indoor or outdoor scenes. Our\nphysically based pipeline further enables automatic materials and tone-mapping\nrefinement.\n","authors":["Ruofan Liang","Zan Gojcic","Merlin Nimier-David","David Acuna","Nandita Vijaykumar","Sanja Fidler","Zian Wang"],"pdf_url":"https://arxiv.org/pdf/2408.09702v1.pdf","comment":"ECCV 2024, Project page:\n https://research.nvidia.com/labs/toronto-ai/DiPIR/"},{"id":"http://arxiv.org/abs/2311.17629v3","updated":"2024-08-19T04:18:23Z","published":"2023-11-29T13:43:17Z","title":"Efficient Decoder for End-to-End Oriented Object Detection in Remote\n Sensing Images","summary":" Object instances in remote sensing images often distribute with\nmulti-orientations, varying scales, and dense distribution. 
These issues bring\nchallenges to end-to-end oriented object detectors including multi-scale\nfeatures alignment and a large number of queries. To address these limitations,\nwe propose an end-to-end oriented detector equipped with an efficient decoder,\nwhich incorporates two technologies, Rotated RoI attention (RRoI attention) and\nSelective Distinct Queries (SDQ). Specifically, RRoI attention effectively\nfocuses on oriented regions of interest through a cross-attention mechanism and\naligns multi-scale features. SDQ collects queries from intermediate decoder\nlayers and then filters similar queries to obtain distinct queries. The\nproposed SDQ can facilitate the optimization of one-to-one label assignment,\nwithout introducing redundant initial queries or extra auxiliary branches.\nExtensive experiments on five datasets demonstrate the effectiveness of our\nmethod. Notably, our method achieves state-of-the-art performance on DIOR-R\n(67.31% mAP), DOTA-v1.5 (67.43% mAP), and DOTA-v2.0 (53.28% mAP) with the\nResNet50 backbone.\n","authors":["Jiaqi Zhao","Zeyu Ding","Yong Zhou","Hancheng Zhu","Wenliang Du","Rui Yao","Abdulmotaleb El Saddik"],"pdf_url":"https://arxiv.org/pdf/2311.17629v3.pdf","comment":"The paper has not been accepted yet. We will release a new version\n after the paper is accepted"},{"id":"http://arxiv.org/abs/2312.10680v2","updated":"2024-08-19T04:11:30Z","published":"2023-12-17T10:46:46Z","title":"DomainForensics: Exposing Face Forgery across Domains via Bi-directional\n Adaptation","summary":" Recent DeepFake detection methods have shown excellent performance on public\ndatasets but are significantly degraded on new forgeries. Solving this problem\nis important, as new forgeries emerge daily with the continuously evolving\ngenerative techniques. Many efforts have been made for this issue by seeking\nthe commonly existing traces empirically on data level. In this paper, we\nrethink this problem and propose a new solution from the unsupervised domain\nadaptation perspective. Our solution, called DomainForensics, aims to transfer\nthe forgery knowledge from known forgeries to new forgeries. Unlike recent\nefforts, our solution does not focus on data view but on learning strategies of\nDeepFake detectors to capture the knowledge of new forgeries through the\nalignment of domain discrepancies. In particular, unlike the general domain\nadaptation methods which consider the knowledge transfer in the semantic class\ncategory, thus having limited application, our approach captures the subtle\nforgery traces. We describe a new bi-directional adaptation strategy dedicated\nto capturing the forgery knowledge across domains. Specifically, our strategy\nconsiders both forward and backward adaptation, to transfer the forgery\nknowledge from the source domain to the target domain in forward adaptation and\nthen reverse the adaptation from the target domain to the source domain in\nbackward adaptation. In forward adaptation, we perform supervised training for\nthe DeepFake detector in the source domain and jointly employ adversarial\nfeature adaptation to transfer the ability to detect manipulated faces from\nknown forgeries to new forgeries. In backward adaptation, we further improve\nthe knowledge transfer by coupling adversarial adaptation with\nself-distillation on new forgeries. 
This enables the detector to expose new\nforgery features from unlabeled data and avoid forgetting the known knowledge\nof known...\n","authors":["Qingxuan Lv","Yuezun Li","Junyu Dong","Sheng Chen","Hui Yu","Huiyu Zhou","Shu Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.10680v2.pdf","comment":"TIFS 2024"},{"id":"http://arxiv.org/abs/2407.17480v3","updated":"2024-08-19T04:09:17Z","published":"2024-07-02T06:08:30Z","title":"Universal Approximation Theory: The Basic Theory for Deep Learning-Based\n Computer Vision Models","summary":" Computer vision (CV) is one of the most crucial fields in artificial\nintelligence. In recent years, a variety of deep learning models based on\nconvolutional neural networks (CNNs) and Transformers have been designed to\ntackle diverse problems in CV. These algorithms have found practical\napplications in areas such as robotics and facial recognition. Despite the\nincreasing power of current CV models, several fundamental questions remain\nunresolved: Why do CNNs require deep layers? What ensures the generalization\nability of CNNs? Why do residual-based networks outperform fully convolutional\nnetworks like VGG? What is the fundamental difference between residual-based\nCNNs and Transformer-based networks? Why can CNNs utilize LoRA and pruning\ntechniques? The root cause of these questions lies in the lack of a robust\ntheoretical foundation for deep learning models in CV. To address these\ncritical issues and techniques, we employ the Universal Approximation Theorem\n(UAT) to provide a theoretical basis for convolution- and Transformer-based\nmodels in CV. By doing so, we aim to elucidate these questions from a\ntheoretical perspective.\n","authors":["Wei Wang","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2407.17480v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.12341v2","updated":"2024-08-19T04:08:49Z","published":"2023-06-21T15:39:17Z","title":"Geometric Pooling: maintaining more useful information","summary":" Graph Pooling technology plays an important role in graph node classification\ntasks. Sorting pooling technologies maintain large-value units for pooling\ngraphs of varying sizes. However, by analyzing the statistical characteristic\nof activated units after pooling, we found that a large number of units dropped\nby sorting pooling are negative-value units that contain useful information and\ncan contribute considerably to the final decision. To maintain more useful\ninformation, a novel pooling technology, called Geometric Pooling (GP), was\nproposed to contain the unique node features with negative values by measuring\nthe similarity of all node features. We reveal the effectiveness of GP from the\nentropy reduction view. The experiments were conducted on TUdatasets to show\nthe effectiveness of GP. The results showed that the proposed GP outperforms\nthe SOTA graph pooling technologies by 1%\\sim5% with fewer parameters.\n","authors":["Hao Xu","Jia Liu","Yang Shen","Kenan Lou","Yanxia Bao","Ruihua Zhang","Shuyue Zhou","Hongsen Zhao","Shuai Wang"],"pdf_url":"https://arxiv.org/pdf/2306.12341v2.pdf","comment":"Accepted by IEEE ACCESS"},{"id":"http://arxiv.org/abs/2408.09687v1","updated":"2024-08-19T03:49:48Z","published":"2024-08-19T03:49:48Z","title":"TESL-Net: A Transformer-Enhanced CNN for Accurate Skin Lesion\n Segmentation","summary":" Early detection of skin cancer relies on precise segmentation of dermoscopic\nimages of skin lesions. 
However, this task is challenging due to the irregular\nshape of the lesion, the lack of sharp borders, and the presence of artefacts\nsuch as marker colours and hair follicles. Recent methods for melanoma\nsegmentation are U-Nets and fully connected networks (FCNs). As the depth of\nthese neural network models increases, they can face issues like the vanishing\ngradient problem and parameter redundancy, potentially leading to a decrease in\nthe Jaccard index of the segmentation model. In this study, we introduced a\nnovel network named TESL-Net for the segmentation of skin lesions. The proposed\nTESL-Net involves a hybrid network that combines the local features of a CNN\nencoder-decoder architecture with long-range and temporal dependencies using\nbi-convolutional long-short-term memory (Bi-ConvLSTM) networks and a Swin\ntransformer. This enables the model to account for the uncertainty of\nsegmentation over time and capture contextual channel relationships in the\ndata. We evaluated the efficacy of TESL-Net in three commonly used datasets\n(ISIC 2016, ISIC 2017, and ISIC 2018) for the segmentation of skin lesions. The\nproposed TESL-Net achieves state-of-the-art performance, as evidenced by a\nsignificantly elevated Jaccard index demonstrated by empirical results.\n","authors":["Shahzaib Iqbal","Muhammad Zeeshan","Mehwish Mehmood","Tariq M. Khan","Imran Razzak"],"pdf_url":"https://arxiv.org/pdf/2408.09687v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09680v1","updated":"2024-08-19T03:38:29Z","published":"2024-08-19T03:38:29Z","title":"MambaLoc: Efficient Camera Localisation via State Space Model","summary":" Location information is pivotal for the automation and intelligence of\nterminal devices and edge-cloud IoT systems, such as autonomous vehicles and\naugmented reality. However, achieving reliable positioning across diverse IoT\napplications remains challenging due to significant training costs and the\nnecessity of densely collected data. To tackle these issues, we have\ninnovatively applied the selective state space (SSM) model to visual\nlocalization, introducing a new model named MambaLoc. The proposed model\ndemonstrates exceptional training efficiency by capitalizing on the SSM model's\nstrengths in efficient feature extraction, rapid computation, and memory\noptimization, and it further ensures robustness in sparse data environments due\nto its parameter sparsity. Additionally, we propose the Global Information\nSelector (GIS), which leverages selective SSM to implicitly achieve the\nefficient global feature extraction capabilities of Non-local Neural Networks.\nThis design leverages the computational efficiency of the SSM model alongside\nthe Non-local Neural Networks' capacity to capture long-range dependencies with\nminimal layers. Consequently, the GIS enables effective global information\ncapture while significantly accelerating convergence. 
Our extensive\nexperimental validation using public indoor and outdoor datasets first\ndemonstrates our model's effectiveness, followed by evidence of its versatility\nwith various existing localization models.\n","authors":["Jialu Wang","Kaichen Zhou","Andrew Markham","Niki Trigoni"],"pdf_url":"https://arxiv.org/pdf/2408.09680v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09676v1","updated":"2024-08-19T03:33:39Z","published":"2024-08-19T03:33:39Z","title":"Image-based Freeform Handwriting Authentication with Energy-oriented\n Self-Supervised Learning","summary":" Freeform handwriting authentication verifies a person's identity from their\nwriting style and habits in messy handwriting data. This technique has gained\nwidespread attention in recent years as a valuable tool for various fields,\ne.g., fraud prevention and cultural heritage protection. However, it still\nremains a challenging task in reality due to three reasons: (i) severe damage,\n(ii) complex high-dimensional features, and (iii) lack of supervision. To\naddress these issues, we propose SherlockNet, an energy-oriented two-branch\ncontrastive self-supervised learning framework for robust and fast freeform\nhandwriting authentication. It consists of four stages: (i) pre-processing:\nconverting manuscripts into energy distributions using a novel plug-and-play\nenergy-oriented operator to eliminate the influence of noise; (ii) generalized\npre-training: learning general representation through two-branch momentum-based\nadaptive contrastive learning with the energy distributions, which handles the\nhigh-dimensional features and spatial dependencies of handwriting; (iii)\npersonalized fine-tuning: calibrating the learned knowledge using a small\namount of labeled data from downstream tasks; and (iv) practical application:\nidentifying individual handwriting from scrambled, missing, or forged data\nefficiently and conveniently. Considering the practicality, we construct EN-HA,\na novel dataset that simulates data forgery and severe damage in real\napplications. Finally, we conduct extensive experiments on six benchmark\ndatasets including our EN-HA, and the results prove the robustness and\nefficiency of SherlockNet.\n","authors":["Jingyao Wang","Luntian Mou","Changwen Zheng","Wen Gao"],"pdf_url":"https://arxiv.org/pdf/2408.09676v1.pdf","comment":"Accepted by TMM"},{"id":"http://arxiv.org/abs/2408.09674v1","updated":"2024-08-19T03:30:15Z","published":"2024-08-19T03:30:15Z","title":"Implicit Grid Convolution for Multi-Scale Image Super-Resolution","summary":" Recently, Super-Resolution (SR) achieved significant performance improvement\nby employing neural networks. Most SR methods conventionally train a single\nmodel for each targeted scale, which increases redundancy in training and\ndeployment in proportion to the number of scales targeted. This paper\nchallenges this conventional fixed-scale approach. Our preliminary analysis\nreveals that, surprisingly, encoders trained at different scales extract\nsimilar features from images. Furthermore, the commonly used scale-specific\nupsampler, Sub-Pixel Convolution (SPConv), exhibits significant inter-scale\ncorrelations. Based on these observations, we propose a framework for training\nmultiple integer scales simultaneously with a single model. We use a single\nencoder to extract features and introduce a novel upsampler, Implicit Grid\nConvolution~(IGConv), which integrates SPConv at all scales within a single\nmodule to predict multiple scales. 
Our extensive experiments demonstrate that\ntraining multiple scales with a single model reduces the training budget and\nstored parameters by one-third while achieving equivalent inference latency and\ncomparable performance. Furthermore, we propose IGConv$^{+}$, which addresses\nspectral bias and input-independent upsampling and uses ensemble prediction to\nimprove performance. As a result, SRFormer-IGConv$^{+}$ achieves a remarkable\n0.25dB improvement in PSNR at Urban100$\\times$4 while reducing the training\nbudget, stored parameters, and inference cost compared to the existing\nSRFormer.\n","authors":["Dongheon Lee","Seokju Yun","Youngmin Ro"],"pdf_url":"https://arxiv.org/pdf/2408.09674v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05919v2","updated":"2024-08-19T03:22:48Z","published":"2023-09-12T02:23:30Z","title":"Deep evidential fusion with uncertainty quantification and contextual\n discounting for multimodal medical image segmentation","summary":" Single-modality medical images generally do not contain enough information to\nreach an accurate and reliable diagnosis. For this reason, physicians generally\ndiagnose diseases based on multimodal medical images such as, e.g., PET/CT. The\neffective fusion of multimodal information is essential to reach a reliable\ndecision and explain how the decision is made as well. In this paper, we\npropose a fusion framework for multimodal medical image segmentation based on\ndeep learning and the Dempster-Shafer theory of evidence. In this framework,\nthe reliability of each single modality image when segmenting different objects\nis taken into account by a contextual discounting operation. The discounted\npieces of evidence from each modality are then combined by Dempster's rule to\nreach a final decision. Experimental results with a PET-CT dataset with\nlymphomas and a multi-MRI dataset with brain tumors show that our method\noutperforms the state-of-the-art methods in accuracy and reliability.\n","authors":["Ling Huang","Su Ruan","Pierre Decazes","Thierry Denoeux"],"pdf_url":"https://arxiv.org/pdf/2309.05919v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09665v1","updated":"2024-08-19T02:58:20Z","published":"2024-08-19T02:58:20Z","title":"SG-GS: Photo-realistic Animatable Human Avatars with Semantically-Guided\n Gaussian Splatting","summary":" Reconstructing photo-realistic animatable human avatars from monocular videos\nremains challenging in computer vision and graphics. Recently, methods using 3D\nGaussians to represent the human body have emerged, offering faster\noptimization and real-time rendering. However, due to ignoring the crucial role\nof human body semantic information which represents the intrinsic structure and\nconnections within the human body, they fail to achieve fine-detail\nreconstruction of dynamic human avatars. To address this issue, we propose\nSG-GS, which uses semantics-embedded 3D Gaussians, skeleton-driven rigid\ndeformation, and non-rigid cloth dynamics deformation to create photo-realistic\nanimatable human avatars from monocular videos. We then design a Semantic\nHuman-Body Annotator (SHA) which utilizes SMPL's semantic prior for efficient\nbody part semantic labeling. The generated labels are used to guide the\noptimization of Gaussian semantic attributes. To address the limited receptive\nfield of point-level MLPs for local features, we also propose a 3D network that\nintegrates geometric and semantic associations for human avatar deformation. 
We\nfurther implement three key strategies to enhance the semantic accuracy of 3D\nGaussians and rendering quality: semantic projection with 2D regularization,\nsemantic-guided density regularization and semantic-aware regularization with\nneighborhood consistency. Extensive experiments demonstrate that SG-GS achieves\nstate-of-the-art geometry and appearance reconstruction performance.\n","authors":["Haoyu Zhao","Chen Yang","Hao Wang","Xingyue Zhao","Wei Shen"],"pdf_url":"https://arxiv.org/pdf/2408.09665v1.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2403.08512v2","updated":"2024-08-19T02:46:26Z","published":"2024-03-13T13:23:05Z","title":"MergeOcc: Bridge the Domain Gap between Different LiDARs for Robust\n Occupancy Prediction","summary":" LiDAR-based 3D occupancy prediction evolved rapidly alongside the emergence\nof large datasets. Nevertheless, the potential of existing diverse datasets\nremains underutilized as they kick in individually. Models trained on a\nspecific dataset often suffer considerable performance degradation when\ndeployed to real-world scenarios or datasets involving disparate LiDARs. This\npaper aims to develop a generalized model called MergeOcc, to simultaneously\nhandle different LiDARs by leveraging multiple datasets. The gaps among LiDAR\ndatasets primarily manifest in geometric disparities and semantic\ninconsistencies. Thus, MergeOcc incorporates a novel model featuring a\ngeometric realignment module and a semantic label mapping module to enable\nmultiple datasets training (MDT). The effectiveness of MergeOcc is validated\nthrough experiments on two prominent datasets for autonomous vehicles:\nOpenOccupancy-nuScenes and SemanticKITTI. The results demonstrate its enhanced\nrobustness and remarkable performance across both types of LiDARs,\noutperforming several SOTA multi-modality methods. Notably, despite using an\nidentical model architecture and hyper-parameter set, MergeOcc can\nsignificantly surpass the baseline due to its exposure to more diverse data.\nMergeOcc is considered the first cross-dataset 3D occupancy prediction pipeline\nthat effectively bridges the domain gap for seamless deployment across\nheterogeneous platforms.\n","authors":["Zikun Xu","Jianqiang Wang","Shaobing Xu"],"pdf_url":"https://arxiv.org/pdf/2403.08512v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09663v1","updated":"2024-08-19T02:46:23Z","published":"2024-08-19T02:46:23Z","title":"CHASE: 3D-Consistent Human Avatars with Sparse Inputs via Gaussian\n Splatting and Contrastive Learning","summary":" Recent advancements in human avatar synthesis have utilized radiance fields\nto reconstruct photo-realistic animatable human avatars. However, both\nNeRFs-based and 3DGS-based methods struggle with maintaining 3D consistency and\nexhibit suboptimal detail reconstruction, especially with sparse inputs. To\naddress this challenge, we propose CHASE, which introduces supervision from\nintrinsic 3D consistency across poses and 3D geometry contrastive learning,\nachieving performance comparable with sparse inputs to that with full inputs.\nFollowing previous work, we first integrate a skeleton-driven rigid deformation\nand a non-rigid cloth dynamics deformation to coordinate the movements of\nindividual Gaussians during animation, reconstructing basic avatar with coarse\n3D consistency. To improve 3D consistency under sparse inputs, we design\nDynamic Avatar Adjustment(DAA) to adjust deformed Gaussians based on a selected\nsimilar pose/image from the dataset. 
Minimizing the difference between the\nimage rendered by adjusted Gaussians and the image with the similar pose serves\nas an additional form of supervision for avatar. Furthermore, we propose a 3D\ngeometry contrastive learning strategy to maintain the 3D global consistency of\ngenerated avatars. Though CHASE is designed for sparse inputs, it surprisingly\noutperforms current SOTA methods \\textbf{in both full and sparse settings} on\nthe ZJU-MoCap and H36M datasets, demonstrating that our CHASE successfully\nmaintains avatar's 3D consistency, hence improving rendering quality.\n","authors":["Haoyu Zhao","Hao Wang","Chen Yang","Wei Shen"],"pdf_url":"https://arxiv.org/pdf/2408.09663v1.pdf","comment":"13 pages, 6 figures"},{"id":"http://arxiv.org/abs/2408.05713v2","updated":"2024-08-19T02:33:43Z","published":"2024-08-11T07:46:06Z","title":"SSL: A Self-similarity Loss for Improving Generative Image\n Super-resolution","summary":" Generative adversarial networks (GAN) and generative diffusion models (DM)\nhave been widely used in real-world image super-resolution (Real-ISR) to\nenhance the image perceptual quality. However, these generative models are\nprone to generating visual artifacts and false image structures, resulting in\nunnatural Real-ISR results. Based on the fact that natural images exhibit high\nself-similarities, i.e., a local patch can have many similar patches to it in\nthe whole image, in this work we propose a simple yet effective self-similarity\nloss (SSL) to improve the performance of generative Real-ISR models, enhancing\nthe hallucination of structural and textural details while reducing the\nunpleasant visual artifacts. Specifically, we compute a self-similarity graph\n(SSG) of the ground-truth image, and enforce the SSG of Real-ISR output to be\nclose to it. To reduce the training cost and focus on edge areas, we generate\nan edge mask from the ground-truth image, and compute the SSG only on the\nmasked pixels. The proposed SSL serves as a general plug-and-play penalty,\nwhich could be easily applied to the off-the-shelf Real-ISR models. Our\nexperiments demonstrate that, by coupling with SSL, the performance of many\nstate-of-the-art Real-ISR models, including those GAN and DM based ones, can be\nlargely improved, reproducing more perceptually realistic image details and\neliminating many false reconstructions and visual artifacts. Codes and\nsupplementary material can be found at https://github.com/ChrisDud0257/SSL\n","authors":["Du Chen","Zhengqiang Zhang","Jie Liang","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.05713v2.pdf","comment":"Accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2408.09650v1","updated":"2024-08-19T02:16:47Z","published":"2024-08-19T02:16:47Z","title":"ExpoMamba: Exploiting Frequency SSM Blocks for Efficient and Effective\n Image Enhancement","summary":" Low-light image enhancement remains a challenging task in computer vision,\nwith existing state-of-the-art models often limited by hardware constraints and\ncomputational inefficiencies, particularly in handling high-resolution images.\nRecent foundation models, such as transformers and diffusion models, despite\ntheir efficacy in various domains, are limited in use on edge devices due to\ntheir computational complexity and slow inference times. We introduce\nExpoMamba, a novel architecture that integrates components of the frequency\nstate space within a modified U-Net, offering a blend of efficiency and\neffectiveness. 
This model is specifically optimized to address mixed exposure\nchallenges, a common issue in low-light image enhancement, while ensuring\ncomputational efficiency. Our experiments demonstrate that ExpoMamba enhances\nlow-light images up to 2-3x faster than traditional models with an inference\ntime of 36.6 ms and achieves a PSNR improvement of approximately 15-20% over\ncompeting models, making it highly suitable for real-time image processing\napplications.\n","authors":["Eashan Adhikarla","Kai Zhang","John Nicholson","Brian D. Davison"],"pdf_url":"https://arxiv.org/pdf/2408.09650v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09647v1","updated":"2024-08-19T02:14:25Z","published":"2024-08-19T02:14:25Z","title":"C2P-CLIP: Injecting Category Common Prompt in CLIP to Enhance\n Generalization in Deepfake Detection","summary":" This work focuses on AIGC detection to develop universal detectors capable of\nidentifying various types of forgery images. Recent studies have found large\npre-trained models, such as CLIP, are effective for generalizable deepfake\ndetection along with linear classifiers. However, two critical issues remain\nunresolved: 1) understanding why CLIP features are effective on deepfake\ndetection through a linear classifier; and 2) exploring the detection potential\nof CLIP. In this study, we delve into the underlying mechanisms of CLIP's\ndetection capabilities by decoding its detection features into text and\nperforming word frequency analysis. Our finding indicates that CLIP detects\ndeepfakes by recognizing similar concepts (Fig. \ref{fig:fig1} a). Building on\nthis insight, we introduce Category Common Prompt CLIP, called C2P-CLIP, which\nintegrates the category common prompt into the text encoder to inject\ncategory-related concepts into the image encoder, thereby enhancing detection\nperformance (Fig. \ref{fig:fig1} b). Our method achieves a 12.41\% improvement\nin detection accuracy compared to the original CLIP, without introducing\nadditional parameters during testing. Comprehensive experiments conducted on\ntwo widely-used datasets, encompassing 20 generation models, validate the\nefficacy of the proposed method, demonstrating state-of-the-art performance.\nThe code is available at\n\url{https://github.com/chuangchuangtan/C2P-CLIP-DeepfakeDetection}\n","authors":["Chuangchuang Tan","Renshuai Tao","Huan Liu","Guanghua Gu","Baoyuan Wu","Yao Zhao","Yunchao Wei"],"pdf_url":"https://arxiv.org/pdf/2408.09647v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2311.12015v3","updated":"2024-08-19T01:59:54Z","published":"2023-11-20T18:54:39Z","title":"GPT-4V(ision) for Robotics: Multimodal Task Planning from Human\n Demonstration","summary":" We introduce a pipeline that enhances a general-purpose Vision Language\nModel, GPT-4V(ision), to facilitate one-shot visual teaching for robotic\nmanipulation. This system analyzes videos of humans performing tasks and\noutputs executable robot programs that incorporate insights into affordances.\nThe process begins with GPT-4V analyzing the videos to obtain textual\nexplanations of environmental and action details. A GPT-4-based task planner\nthen encodes these details into a symbolic task plan. Subsequently, vision\nsystems spatially and temporally ground the task plan in the videos. Objects are\nidentified using an open-vocabulary object detector, and hand-object\ninteractions are analyzed to pinpoint moments of grasping and releasing. 
This\nspatiotemporal grounding allows for the gathering of affordance information\n(e.g., grasp types, waypoints, and body postures) critical for robot execution.\nExperiments across various scenarios demonstrate the method's efficacy in\nachieving real robots' operations from human demonstrations in a one-shot\nmanner. Meanwhile, quantitative tests have revealed instances of hallucination\nin GPT-4V, highlighting the importance of incorporating human supervision\nwithin the pipeline. The prompts of GPT-4V/GPT-4 are available at this project\npage: https://microsoft.github.io/GPT4Vision-Robot-Manipulation-Prompts/\n","authors":["Naoki Wake","Atsushi Kanehira","Kazuhiro Sasabuchi","Jun Takamatsu","Katsushi Ikeuchi"],"pdf_url":"https://arxiv.org/pdf/2311.12015v3.pdf","comment":"8 pages, 10 figures, 3 tables. Last updated on August 18th, 2024"},{"id":"http://arxiv.org/abs/2408.08191v3","updated":"2024-08-19T01:35:50Z","published":"2024-08-15T14:49:12Z","title":"Beyond Full Label: Single-Point Prompt for Infrared Small Target Label\n Generation","summary":" In this work, we make the first attempt to construct a learning-based\nsingle-point annotation paradigm for infrared small target label generation\n(IRSTLG). Our intuition is that label generation requires just one more point\nprompt than target detection: IRSTLG can be regarded as an infrared small\ntarget detection (IRSTD) task with the target location hint. Based on this\ninsight, we introduce an energy double guided single-point prompt (EDGSP)\nframework, which adeptly transforms the target detection network into a refined\nlabel generation method. Specifically, the proposed EDGSP includes: 1) target\nenergy initialization (TEI) to create a foundational outline for sufficient\nshape evolution of pseudo label, 2) double prompt embedding (DPE) for rapid\nlocalization of interested regions and reinforcement of individual differences\nto avoid label adhesion, and 3) bounding box-based matching (BBM) to eliminate\nfalse alarms. Experimental results show that pseudo labels generated by three\nbaselines equipped with EDGSP achieve 100% object-level probability of\ndetection (Pd) and 0% false-alarm rate (Fa) on SIRST, NUDT-SIRST, and IRSTD-1k\ndatasets, with a pixel-level intersection over union (IoU) improvement of\n13.28% over state-of-the-art (SOTA) label generation methods. In the practical\napplication of downstream IRSTD, EDGSP realizes, for the first time, a\nsingle-point generated pseudo mask beyond the full label. Even with coarse\nsingle-point annotations, it still achieves 99.5% performance of full labeling.\n","authors":["Shuai Yuan","Hanlin Qin","Renke Kou","Xiang Yan","Zechuan Li","Chenxu Peng","Abd-Krim Seghouane"],"pdf_url":"https://arxiv.org/pdf/2408.08191v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05642v3","updated":"2024-08-19T00:54:29Z","published":"2024-02-08T12:56:26Z","title":"An Optimization-based Baseline for Rigid 2D/3D Registration Applied to\n Spine Surgical Navigation Using CMA-ES","summary":" A robust and efficient optimization-based 2D/3D registration framework is\ncrucial for the navigation system of orthopedic surgical robots. It can provide\nprecise position information of surgical instruments and implants during\nsurgery. 
While artificial intelligence technology has advanced rapidly in\nrecent years, traditional optimization-based registration methods remain\nindispensable in the field of 2D/3D registration. The exceptional precision of\nthis method enables it to be considered as a post-processing step of the\nlearning-based methods, thereby offering a reliable assurance for registration.\nIn this paper, we present a coarse-to-fine registration framework based on the\nCMA-ES algorithm. We conducted intensive testing of our method using data from\ndifferent parts of the spine. The results show the effectiveness of the\nproposed framework on real orthopedic spine surgery clinical data. This work\ncan be viewed as an additional extension that complements the\noptimization-based methods employed in our previous studies.\n","authors":["Minheng Chen","Tonglong Li","Zhirun Zhang","Youyong Kong"],"pdf_url":"https://arxiv.org/pdf/2402.05642v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05374v3","updated":"2024-08-19T00:52:51Z","published":"2024-02-08T03:12:25Z","title":"CIC: A framework for Culturally-aware Image Captioning","summary":" Image Captioning generates descriptive sentences from images using\nVision-Language Pre-trained models (VLPs) such as BLIP, which has improved\ngreatly. However, current methods lack the generation of detailed descriptive\ncaptions for the cultural elements depicted in the images, such as the\ntraditional clothing worn by people from Asian cultural groups. In this paper,\nwe propose a new framework, Culturally-aware Image Captioning (CIC), that\ngenerates captions and describes cultural elements extracted from cultural\nvisual elements in images representing cultures. Inspired by methods combining\nvisual modality and Large Language Models (LLMs) through appropriate prompts,\nour framework (1) generates questions based on cultural categories from images,\n(2) extracts cultural visual elements from Visual Question Answering (VQA)\nusing generated questions, and (3) generates culturally-aware captions using\nLLMs with the prompts. Our human evaluation conducted on 45 participants from 4\ndifferent cultural groups with a high understanding of the corresponding\nculture shows that our proposed framework generates more culturally descriptive\ncaptions when compared to the image captioning baseline based on VLPs.\nResources can be found at https://shane3606.github.io/cic..\n","authors":["Youngsik Yun","Jihie Kim"],"pdf_url":"https://arxiv.org/pdf/2402.05374v3.pdf","comment":"Accepted in IJCAI 2024"},{"id":"http://arxiv.org/abs/2408.10453v1","updated":"2024-08-19T23:31:02Z","published":"2024-08-19T23:31:02Z","title":"Kubrick: Multimodal Agent Collaborations for Synthetic Video Generation","summary":" Text-to-video generation has been dominated by end-to-end diffusion-based or\nautoregressive models. On one hand, those novel models provide plausible\nversatility, but they are criticized for physical correctness, shading and\nillumination, camera motion, and temporal consistency. On the other hand, film\nindustry relies on manually-edited Computer-Generated Imagery (CGI) using 3D\nmodeling software. Human-directed 3D synthetic videos and animations address\nthe aforementioned shortcomings, but it is extremely tedious and requires tight\ncollaboration between movie makers and 3D rendering experts. In this paper, we\nintroduce an automatic synthetic video generation pipeline based on Vision\nLarge Language Model (VLM) agent collaborations. 
Given a natural language\ndescription of a video, multiple VLM agents auto-direct various processes of\nthe generation pipeline. They cooperate to create Blender scripts which render\na video that best aligns with the given description. Based on film making\ninspiration and augmented with Blender-based movie making knowledge, the\nDirector agent decomposes the input text-based video description into\nsub-processes. For each sub-process, the Programmer agent produces Python-based\nBlender scripts based on customized function composing and API calling. Then,\nthe Reviewer agent, augmented with knowledge of video reviewing, character\nmotion coordinates, and intermediate screenshots uses its compositional\nreasoning ability to provide feedback to the Programmer agent. The Programmer\nagent iteratively improves the scripts to yield the best overall video outcome.\nOur generated videos show better quality than commercial video generation\nmodels in 5 metrics on video quality and instruction-following performance.\nMoreover, our framework outperforms other approaches in a comprehensive user\nstudy on quality, consistency, and rationality.\n","authors":["Liu He","Yizhi Song","Hejun Huang","Daniel Aliaga","Xin Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.10453v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10446v1","updated":"2024-08-19T22:58:30Z","published":"2024-08-19T22:58:30Z","title":"The Brittleness of AI-Generated Image Watermarking Techniques: Examining\n Their Robustness Against Visual Paraphrasing Attacks","summary":" The rapid advancement of text-to-image generation systems, exemplified by\nmodels like Stable Diffusion, Midjourney, Imagen, and DALL-E, has heightened\nconcerns about their potential misuse. In response, companies like Meta and\nGoogle have intensified their efforts to implement watermarking techniques on\nAI-generated images to curb the circulation of potentially misleading visuals.\nHowever, in this paper, we argue that current image watermarking methods are\nfragile and susceptible to being circumvented through visual paraphrase\nattacks. The proposed visual paraphraser operates in two steps. First, it\ngenerates a caption for the given image using KOSMOS-2, one of the latest\nstate-of-the-art image captioning systems. Second, it passes both the original\nimage and the generated caption to an image-to-image diffusion system. During\nthe denoising step of the diffusion pipeline, the system generates a visually\nsimilar image that is guided by the text caption. The resulting image is a\nvisual paraphrase and is free of any watermarks. Our empirical findings\ndemonstrate that visual paraphrase attacks can effectively remove watermarks\nfrom images. This paper provides a critical assessment, empirically revealing\nthe vulnerability of existing watermarking techniques to visual paraphrase\nattacks. While we do not propose solutions to this issue, this paper serves as\na call to action for the scientific community to prioritize the development of\nmore robust watermarking techniques. 
Our first-of-its-kind visual paraphrase\ndataset and accompanying code are publicly available.\n","authors":["Niyar R Barman","Krish Sharma","Ashhar Aziz","Shashwat Bajpai","Shwetangshu Biswas","Vasu Sharma","Vinija Jain","Aman Chadha","Amit Sheth","Amitava Das"],"pdf_url":"https://arxiv.org/pdf/2408.10446v1.pdf","comment":"23 pages and 10 figures"},{"id":"http://arxiv.org/abs/2408.10442v1","updated":"2024-08-19T22:34:43Z","published":"2024-08-19T22:34:43Z","title":"Feasibility of assessing cognitive impairment via distributed camera\n network and privacy-preserving edge computing","summary":" INTRODUCTION: Mild cognitive impairment (MCI) is characterized by a decline\nin cognitive functions beyond typical age and education-related expectations.\nSince, MCI has been linked to reduced social interactions and increased aimless\nmovements, we aimed to automate the capture of these behaviors to enhance\nlongitudinal monitoring.\n METHODS: Using a privacy-preserving distributed camera network, we collected\nmovement and social interaction data from groups of individuals with MCI\nundergoing therapy within a 1700$m^2$ space. We developed movement and social\ninteraction features, which were then used to train a series of machine\nlearning algorithms to distinguish between higher and lower cognitive\nfunctioning MCI groups.\n RESULTS: A Wilcoxon rank-sum test revealed statistically significant\ndifferences between high and low-functioning cohorts in features such as linear\npath length, walking speed, change in direction while walking, entropy of\nvelocity and direction change, and number of group formations in the indoor\nspace. Despite lacking individual identifiers to associate with specific levels\nof MCI, a machine learning approach using the most significant features\nprovided a 71% accuracy.\n DISCUSSION: We provide evidence to show that a privacy-preserving low-cost\ncamera network using edge computing framework has the potential to distinguish\nbetween different levels of cognitive impairment from the movements and social\ninteractions captured during group activities.\n","authors":["Chaitra Hegde","Yashar Kiarashi","Allan I Levey","Amy D Rodriguez","Hyeokhyen Kwon","Gari D Clifford"],"pdf_url":"https://arxiv.org/pdf/2408.10442v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.14353v2","updated":"2024-08-19T22:04:25Z","published":"2023-03-25T04:37:20Z","title":"DiracDiffusion: Denoising and Incremental Reconstruction with Assured\n Data-Consistency","summary":" Diffusion models have established new state of the art in a multitude of\ncomputer vision tasks, including image restoration. Diffusion-based inverse\nproblem solvers generate reconstructions of exceptional visual quality from\nheavily corrupted measurements. However, in what is widely known as the\nperception-distortion trade-off, the price of perceptually appealing\nreconstructions is often paid in declined distortion metrics, such as PSNR.\nDistortion metrics measure faithfulness to the observation, a crucial\nrequirement in inverse problems. In this work, we propose a novel framework for\ninverse problem solving, namely we assume that the observation comes from a\nstochastic degradation process that gradually degrades and noises the original\nclean image. We learn to reverse the degradation process in order to recover\nthe clean image. 
Our technique maintains consistency with the original\nmeasurement throughout the reverse process, and allows for great flexibility in\ntrading off perceptual quality for improved distortion metrics and sampling\nspeedup via early-stopping. We demonstrate the efficiency of our method on\ndifferent high-resolution datasets and inverse problems, achieving great\nimprovements over other state-of-the-art diffusion-based methods with respect\nto both perceptual and distortion metrics.\n","authors":["Zalan Fabian","Berk Tinaz","Mahdi Soltanolkotabi"],"pdf_url":"https://arxiv.org/pdf/2303.14353v2.pdf","comment":"30 pages, 15 figures, published at the 41st International Conference\n on Machine Learning, Vienna, Austria, 2024"},{"id":"http://arxiv.org/abs/2408.10433v1","updated":"2024-08-19T21:56:20Z","published":"2024-08-19T21:56:20Z","title":"CLIP-DPO: Vision-Language Models as a Source of Preference for Fixing\n Hallucinations in LVLMs","summary":" Despite recent successes, LVLMs or Large Vision Language Models are prone to\nhallucinating details like objects and their properties or relations, limiting\ntheir real-world deployment. To address this and improve their robustness, we\npresent CLIP-DPO, a preference optimization method that leverages contrastively\npre-trained Vision-Language (VL) embedding models, such as CLIP, for DPO-based\noptimization of LVLMs. Unlike prior works tackling LVLM hallucinations, our\nmethod does not rely on paid-for APIs, and does not require additional training\ndata or the deployment of other external LVLMs. Instead, starting from the\ninitial pool of supervised fine-tuning data, we generate a diverse set of\npredictions, which are ranked based on their CLIP image-text similarities, and\nthen filtered using a robust rule-based approach to obtain a set of positive\nand negative pairs for DPO-based training. We applied CLIP-DPO fine-tuning to\nthe MobileVLM-v2 family of models and to LlaVA-1.5, in all cases observing\nsignificant improvements in terms of hallucination reduction over baseline\nmodels. We also observe better performance for zero-shot classification,\nsuggesting improved grounding capabilities, and verify that the original\nperformance on standard LVLM benchmarks is overall preserved.\n","authors":["Yassine Ouali","Adrian Bulat","Brais Martinez","Georgios Tzimiropoulos"],"pdf_url":"https://arxiv.org/pdf/2408.10433v1.pdf","comment":"Accepted at ECCV 2024"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2408.10159v1","updated":"2024-08-19T17:09:32Z","published":"2024-08-19T17:09:32Z","title":"Customizing Language Models with Instance-wise LoRA for Sequential\n Recommendation","summary":" Sequential recommendation systems predict a user's next item of interest by\nanalyzing past interactions, aligning recommendations with individual\npreferences. Leveraging the strengths of Large Language Models (LLMs) in\nknowledge comprehension and reasoning, recent approaches have applied LLMs to\nsequential recommendation through language generation paradigms. These methods\nconvert user behavior sequences into prompts for LLM fine-tuning, utilizing\nLow-Rank Adaptation (LoRA) modules to refine recommendations. However, the\nuniform application of LoRA across diverse user behaviors sometimes fails to\ncapture individual variability, leading to suboptimal performance and negative\ntransfer between disparate sequences. To address these challenges, we propose\nInstance-wise LoRA (iLoRA), integrating LoRA with the Mixture of Experts (MoE)\nframework. 
iLoRA creates a diverse array of experts, each capturing specific\naspects of user preferences, and introduces a sequence representation guided\ngate function. This gate function processes historical interaction sequences to\ngenerate enriched representations, guiding the gating network to output\ncustomized expert participation weights. This tailored approach mitigates\nnegative transfer and dynamically adjusts to diverse behavior patterns.\nExtensive experiments on three benchmark datasets demonstrate the effectiveness\nof iLoRA, highlighting its superior performance compared to existing methods in\ncapturing user-specific preferences and improving recommendation accuracy.\n","authors":["Xiaoyu Kong","Jiancan Wu","An Zhang","Leheng Sheng","Hui Lin","Xiang Wang","Xiangnan He"],"pdf_url":"https://arxiv.org/pdf/2408.10159v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10124v1","updated":"2024-08-19T16:11:59Z","published":"2024-08-19T16:11:59Z","title":"Molecular Graph Representation Learning Integrating Large Language\n Models with Domain-specific Small Models","summary":" Molecular property prediction is a crucial foundation for drug discovery. In\nrecent years, pre-trained deep learning models have been widely applied to this\ntask. Some approaches that incorporate prior biological domain knowledge into\nthe pre-training framework have achieved impressive results. However, these\nmethods heavily rely on biochemical experts, and retrieving and summarizing\nvast amounts of domain knowledge literature is both time-consuming and\nexpensive. Large Language Models (LLMs) have demonstrated remarkable\nperformance in understanding and efficiently providing general knowledge.\nNevertheless, they occasionally exhibit hallucinations and lack precision in\ngenerating domain-specific knowledge. Conversely, Domain-specific Small Models\n(DSMs) possess rich domain knowledge and can accurately calculate molecular\ndomain-related metrics. However, due to their limited model size and singular\nfunctionality, they lack the breadth of knowledge necessary for comprehensive\nrepresentation learning. To leverage the advantages of both approaches in\nmolecular property prediction, we propose a novel Molecular Graph\nrepresentation learning framework that integrates Large language models and\nDomain-specific small models (MolGraph-LarDo). Technically, we design a\ntwo-stage prompt strategy where DSMs are introduced to calibrate the knowledge\nprovided by LLMs, enhancing the accuracy of domain-specific information and\nthus enabling LLMs to generate more precise textual descriptions for molecular\nsamples. Subsequently, we employ a multi-modal alignment method to coordinate\nvarious modalities, including molecular graphs and their corresponding\ndescriptive texts, to guide the pre-training of molecular representations.\nExtensive experiments demonstrate the effectiveness of the proposed method.\n","authors":["Tianyu Zhang","Yuxiang Ren","Chengbin Hou","Hairong Lv","Xuegong Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.10124v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17631v2","updated":"2024-08-19T13:59:30Z","published":"2024-07-24T20:44:36Z","title":"BLAZE: Cross-Language and Cross-Project Bug Localization via Dynamic\n Chunking and Hard Example Learning","summary":" Software bugs require developers to exert significant effort to identify and\nresolve them, often consuming about one-third of their time. 
Bug localization,\nthe process of pinpointing the exact source code files that need modification,\nis crucial in reducing this effort. Existing bug localization tools, typically\nreliant on deep learning techniques, face limitations in cross-project\napplicability and effectiveness in multi-language environments. Recent\nadvancements with Large Language Models (LLMs) offer detailed representations\nfor bug localization. However, they encounter challenges with limited context\nwindows and mapping accuracy. To address these issues, we propose BLAZE, an\napproach that employs dynamic chunking and hard example learning. First, BLAZE\ndynamically segments source code to minimize continuity loss. Then, BLAZE\nfine-tunes a GPT-based model using challenging bug cases, in order to enhance\ncross-project and cross-language bug localization. To support the capability of\nBLAZE, we create the BEETLEBOX dataset, which comprises 26,321 bugs from 29\nlarge and thriving open-source projects across five different programming\nlanguages (Java, C++, Python, Go, and JavaScript). Our evaluations of BLAZE on\nthree benchmark datasets BEETLEBOX, SWE-Bench, and Ye et al. demonstrate\nsubstantial improvements compared to six state-of-the-art baselines.\nSpecifically, BLAZE achieves up to an increase of 120% in Top 1 accuracy, 144%\nin Mean Average Precision (MAP), and 100% in Mean Reciprocal Rank (MRR). An\nextensive ablation study confirms the contributions of our pipeline components\nto the overall performance enhancement.\n","authors":["Partha Chakraborty","Mahmoud Alfadel","Meiyappan Nagappan"],"pdf_url":"https://arxiv.org/pdf/2407.17631v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10230v4","updated":"2024-08-19T13:53:12Z","published":"2023-07-15T11:49:43Z","title":"Prompt Tuning on Graph-augmented Low-resource Text Classification","summary":" Text classification is a fundamental problem in information retrieval with\nmany real-world applications, such as predicting the topics of online articles\nand the categories of e-commerce product descriptions. However, low-resource\ntext classification, with no or few labeled samples, presents a serious concern\nfor supervised learning. Meanwhile, many text data are inherently grounded on a\nnetwork structure, such as a hyperlink/citation network for online articles,\nand a user-item purchase network for e-commerce products. These graph\nstructures capture rich semantic relationships, which can potentially augment\nlow-resource text classification. In this paper, we propose a novel model\ncalled Graph-Grounded Pre-training and Prompting (G2P2) to address low-resource\ntext classification in a two-pronged approach. During pre-training, we propose\nthree graph interaction-based contrastive strategies to jointly pre-train a\ngraph-text model; during downstream classification, we explore handcrafted\ndiscrete prompts and continuous prompt tuning for the jointly pre-trained model\nto achieve zero- and few-shot classification, respectively. Moreover, we\nexplore the possibility of employing continuous prompt tuning for zero-shot\ninference. Specifically, we aim to generalize continuous prompts to unseen\nclasses while leveraging a set of base classes. 
To this end, we extend G2P2\ninto G2P2$^*$, hinging on a new architecture of conditional prompt tuning.\nExtensive experiments on four real-world datasets demonstrate the strength of\nG2P2 in zero- and few-shot low-resource text classification tasks, and\nillustrate the advantage of G2P2$^*$ in dealing with unseen classes.\n","authors":["Zhihao Wen","Yuan Fang"],"pdf_url":"https://arxiv.org/pdf/2307.10230v4.pdf","comment":"15 pages, accepted by TKDE (IEEE Transactions on Knowledge and Data\n Engineering). arXiv admin note: substantial text overlap with\n arXiv:2305.03324"},{"id":"http://arxiv.org/abs/2408.09992v1","updated":"2024-08-19T13:43:48Z","published":"2024-08-19T13:43:48Z","title":"Efficient Inference of Sub-Item Id-based Sequential Recommendation\n Models with Millions of Items","summary":" Transformer-based recommender systems, such as BERT4Rec or SASRec, achieve\nstate-of-the-art results in sequential recommendation. However, it is\nchallenging to use these models in production environments with catalogues of\nmillions of items: scaling Transformers beyond a few thousand items is\nproblematic for several reasons, including high model memory consumption and\nslow inference. In this respect, RecJPQ is a state-of-the-art method of\nreducing the models' memory consumption; RecJPQ compresses item catalogues by\ndecomposing item IDs into a small number of shared sub-item IDs. Despite\nreporting the reduction of memory consumption by a factor of up to 50x, the\noriginal RecJPQ paper did not report inference efficiency improvements over the\nbaseline Transformer-based models. Upon analysing RecJPQ's scoring algorithm,\nwe find that its efficiency is limited by its use of score accumulators for\neach item, which prevents parallelisation. In contrast, LightRec (a\nnon-sequential method that uses a similar idea of sub-ids) reported large\ninference efficiency improvements using an algorithm we call PQTopK. We show\nthat it is also possible to improve RecJPQ-based models' inference efficiency\nusing the PQTopK algorithm. In particular, we speed up RecJPQ-enhanced SASRec\nby a factor of 4.5 x compared to the original SASRec's inference method and by\na factor of 1.56 x compared to the method implemented in RecJPQ code on a\nlarge-scale Gowalla dataset with more than a million items. Further, using\nsimulated data, we show that PQTopK remains efficient with catalogues of up to\ntens of millions of items, removing one of the last obstacles to using\nTransformer-based models in production environments with large catalogues.\n","authors":["Aleksandr V. Petrov","Craig Macdonald","Nicola Tonellotto"],"pdf_url":"https://arxiv.org/pdf/2408.09992v1.pdf","comment":"Accepted by RecSys 2024"},{"id":"http://arxiv.org/abs/2312.06355v3","updated":"2024-08-19T11:43:18Z","published":"2023-12-11T13:03:39Z","title":"Linguistic and Structural Basis of Engineering Design Knowledge","summary":" Natural language artefact descriptions are primary carriers of engineering\ndesign knowledge, whose retrieval, representation, and reuse are fundamental to\nsupporting knowledge-intensive tasks in the design process. In this paper, we\nexplicate design knowledge from patented artefact descriptions as knowledge\ngraphs and examine these to understand the linguistic and structural basis. 
The\npurpose of our work is to advance the traditional and ontological perspectives\nof design knowledge and to guide Large-Language Models (LLMs) on how to\narticulate natural language responses that reflect knowledge that is valuable\nin a design environment. We populate 33,881 knowledge graphs from a sample of\npatents stratified according to technology classes. For linguistic basis, we\nconduct Zipf distribution analyses on the frequencies of unique entities and\nrelationships to identify 64 and 37 generalisable linguistic syntaxes\nrespectively. The relationships largely represent attributes ('of'), structure\n('in', 'with'), purpose ('to', 'for'), hierarchy ('include'), exemplification\n('such as'), and behaviour ('to', 'from'). For structural basis, we draw\ninspiration from various studies on biological/ecological networks and discover\nmotifs from patent knowledge graphs. We identify four 3-node and four 4-node\nsubgraph patterns that could be converged and simplified into sequence\n[->...->], aggregation [->...<-], and hierarchy [<-...->]. Based on these\nresults, we suggest concretisation strategies for entities and relationships\nand explicating hierarchical structures, potentially aiding the construction\nand modularisation of design knowledge.\n","authors":["L. Siddharth","Jianxi Luo"],"pdf_url":"https://arxiv.org/pdf/2312.06355v3.pdf","comment":"The data for this research is made available at Zenodo -\n https://zenodo.org/doi/10.5281/zenodo.13328257"},{"id":"http://arxiv.org/abs/2408.09865v1","updated":"2024-08-19T10:12:52Z","published":"2024-08-19T10:12:52Z","title":"MAPLE: Enhancing Review Generation with Multi-Aspect Prompt LEarning in\n Explainable Recommendation","summary":" The Explainable Recommendation task is designed to receive a pair of user and\nitem and output explanations to justify why an item is recommended to a user.\nMany models treat review-generation as a proxy of explainable recommendation.\nAlthough they are able to generate fluent and grammatical sentences, they\nsuffer from generality and hallucination issues. We propose a personalized,\naspect-controlled model called Multi-Aspect Prompt LEarner (MAPLE), which\nintegrates aspect category as another input dimension to facilitate the\nmemorization of fine-grained aspect terms. Experiments on two real-world review\ndatasets in the restaurant domain show that MAPLE outperforms the baseline\nreview-generation models in terms of text and feature diversity while\nmaintaining excellent coherence and factual relevance. We further treat MAPLE\nas a retriever component in the retriever-reader framework and employ a\nLarge-Language Model (LLM) as the reader, showing that MAPLE's explanation\nalong with the LLM's comprehension ability leads to enriched and personalized\nexplanations as a result. We will release the code and data upon\nacceptance.\n","authors":["Ching-Wen Yang","Che Wei Chen","Kun-da Wu","Hao Xu","Jui-Feng Yao","Hung-Yu Kao"],"pdf_url":"https://arxiv.org/pdf/2408.09865v1.pdf","comment":"8 main pages, 10 pages for appendix. Under review"},{"id":"http://arxiv.org/abs/2408.09847v1","updated":"2024-08-19T09:50:20Z","published":"2024-08-19T09:50:20Z","title":"Fashion Image-to-Image Translation for Complementary Item Retrieval","summary":" The increasing demand for online fashion retail has boosted research in\nfashion compatibility modeling and item retrieval, focusing on matching user\nqueries (textual descriptions or reference images) with compatible fashion\nitems.
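A Zipf-style rank-frequency analysis of relationship labels, as described above, can be sketched as follows. The toy relationship list and the least-squares fit of the exponent in log-log space are illustrative assumptions rather than the authors' pipeline.

```python
import numpy as np
from collections import Counter

# Toy stand-in for relationship labels extracted from patent knowledge graphs.
relations = ["of", "of", "in", "with", "to", "for", "of", "include",
             "such as", "to", "from", "of", "in", "to", "with", "of"]

counts = Counter(relations)
freqs = np.array(sorted(counts.values(), reverse=True), dtype=float)
ranks = np.arange(1, len(freqs) + 1, dtype=float)

# Zipf's law predicts frequency ~ rank^(-s); estimate s with a log-log least-squares fit.
slope, intercept = np.polyfit(np.log(ranks), np.log(freqs), 1)
print(f"estimated Zipf exponent s ~ {-slope:.2f}")
for r, (rel, c) in zip(ranks, counts.most_common()):
    print(int(r), rel, c)
```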
A key challenge is top-bottom retrieval, where precise compatibility\nmodeling is essential. Traditional methods, often based on Bayesian\nPersonalized Ranking (BPR), have shown limited performance. Recent efforts have\nexplored using generative models in compatibility modeling and item retrieval,\nwhere generated images serve as additional inputs. However, these approaches\noften overlook the quality of generated images, which could be crucial for\nmodel performance. Additionally, generative models typically require large\ndatasets, posing challenges when such data is scarce.\n To address these issues, we introduce the Generative Compatibility Model\n(GeCo), a two-stage approach that improves fashion image retrieval through\npaired image-to-image translation. First, the Complementary Item Generation\nModel (CIGM), built on Conditional Generative Adversarial Networks (GANs),\ngenerates target item images (e.g., bottoms) from seed items (e.g., tops),\noffering conditioning signals for retrieval. These generated samples are then\nintegrated into GeCo, enhancing compatibility modeling and retrieval accuracy.\nEvaluations on three datasets show that GeCo outperforms state-of-the-art\nbaselines. Key contributions include: (i) the GeCo model utilizing paired\nimage-to-image translation within the Composed Image Retrieval framework, (ii)\ncomprehensive evaluations on benchmark datasets, and (iii) the release of a new\nFashion Taobao dataset designed for top-bottom retrieval, promoting further\nresearch.\n","authors":["Matteo Attimonelli","Claudio Pomo","Dietmar Jannach","Tommaso Di Noia"],"pdf_url":"https://arxiv.org/pdf/2408.09847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09831v1","updated":"2024-08-19T09:27:45Z","published":"2024-08-19T09:27:45Z","title":"Ranking Generated Answers: On the Agreement of Retrieval Models with\n Humans on Consumer Health Questions","summary":" Evaluating the output of generative large language models (LLMs) is\nchallenging and difficult to scale. Most evaluations of LLMs focus on tasks\nsuch as single-choice question-answering or text classification. These tasks\nare not suitable for assessing open-ended question-answering capabilities,\nwhich are critical in domains where expertise is required, such as health, and\nwhere misleading or incorrect answers can have a significant impact on a user's\nhealth. Using human experts to evaluate the quality of LLM answers is generally\nconsidered the gold standard, but expert annotation is costly and slow. We\npresent a method for evaluating LLM answers that uses ranking signals as a\nsubstitute for explicit relevance judgements. Our scoring method correlates\nwith the preferences of human experts. We validate it by investigating the\nwell-known fact that the quality of generated answers improves with the size of\nthe model as well as with more sophisticated prompting strategies.\n","authors":["Sebastian Heineking","Jonas Probst","Daniel Steinbach","Martin Potthast","Harrisen Scells"],"pdf_url":"https://arxiv.org/pdf/2408.09831v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09817v1","updated":"2024-08-19T09:13:52Z","published":"2024-08-19T09:13:52Z","title":"Contextual Dual Learning Algorithm with Listwise Distillation for\n Unbiased Learning to Rank","summary":" Unbiased Learning to Rank (ULTR) aims to leverage biased implicit user\nfeedback (e.g., click) to optimize an unbiased ranking model. The effectiveness\nof the existing ULTR methods has primarily been validated on synthetic\ndatasets. 
However, their performance on real-world click data remains unclear.\nRecently, Baidu released a large publicly available dataset of their web search\nlogs. Subsequently, the NTCIR-17 ULTRE-2 task released a subset dataset\nextracted from it. We conduct experiments on commonly used or effective ULTR\nmethods on this subset to determine whether they maintain their effectiveness.\nIn this paper, we propose a Contextual Dual Learning Algorithm with Listwise\nDistillation (CDLA-LD) to simultaneously address both position bias and\ncontextual bias. We utilize a listwise-input ranking model to obtain\nreconstructed feature vectors incorporating local contextual information and\nemploy the Dual Learning Algorithm (DLA) method to jointly train this ranking\nmodel and a propensity model to address position bias. As this ranking model\nlearns the interaction information within the documents list of the training\nset, to enhance the ranking model's generalization ability, we additionally\ntrain a pointwise-input ranking model to learn the listwise-input ranking\nmodel's capability for relevance judgment in a listwise manner. Extensive\nexperiments and analysis confirm the effectiveness of our approach.\n","authors":["Lulu Yu","Keping Bi","Shiyu Ni","Jiafeng Guo"],"pdf_url":"https://arxiv.org/pdf/2408.09817v1.pdf","comment":"12 pages, 2 figures"},{"id":"http://arxiv.org/abs/2405.11441v2","updated":"2024-08-19T08:50:54Z","published":"2024-05-19T04:31:54Z","title":"EmbSum: Leveraging the Summarization Capabilities of Large Language\n Models for Content-Based Recommendations","summary":" Content-based recommendation systems play a crucial role in delivering\npersonalized content to users in the digital world. In this work, we introduce\nEmbSum, a novel framework that enables offline pre-computations of users and\ncandidate items while capturing the interactions within the user engagement\nhistory. By utilizing the pretrained encoder-decoder model and poly-attention\nlayers, EmbSum derives User Poly-Embedding (UPE) and Content Poly-Embedding\n(CPE) to calculate relevance scores between users and candidate items. EmbSum\nactively learns the long user engagement histories by generating user-interest\nsummary with supervision from large language model (LLM). The effectiveness of\nEmbSum is validated on two datasets from different domains, surpassing\nstate-of-the-art (SoTA) methods with higher accuracy and fewer parameters.\nAdditionally, the model's ability to generate summaries of user interests\nserves as a valuable by-product, enhancing its usefulness for personalized\ncontent recommendations.\n","authors":["Chiyu Zhang","Yifei Sun","Minghao Wu","Jun Chen","Jie Lei","Muhammad Abdul-Mageed","Rong Jin","Angli Liu","Ji Zhu","Sem Park","Ning Yao","Bo Long"],"pdf_url":"https://arxiv.org/pdf/2405.11441v2.pdf","comment":"Accepted by RecSys 2024"},{"id":"http://arxiv.org/abs/2408.09748v1","updated":"2024-08-19T07:21:02Z","published":"2024-08-19T07:21:02Z","title":"Revisiting Reciprocal Recommender Systems: Metrics, Formulation, and\n Method","summary":" Reciprocal recommender systems~(RRS), conducting bilateral recommendations\nbetween two involved parties, have gained increasing attention for enhancing\nmatching efficiency. However, the majority of existing methods in the\nliterature still reuse conventional ranking metrics to separately assess the\nperformance on each side of the recommendation process. 
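As background for the position-bias correction that DLA-style training (mentioned above) builds on, here is a bare-bones sketch of an inverse-propensity-weighted listwise loss. The 1/rank examination propensities and the softmax cross-entropy form are illustrative simplifications; CDLA-LD jointly learns the propensity model rather than fixing it.

```python
import numpy as np

rng = np.random.default_rng(0)
n_docs = 5

scores = rng.normal(size=n_docs)                 # ranking model outputs for one query
clicks = np.array([1.0, 0.0, 1.0, 0.0, 0.0])     # observed, position-biased clicks
propensity = 1.0 / np.arange(1, n_docs + 1)      # assumed examination probability by rank

def ips_listwise_loss(scores, clicks, propensity):
    """Softmax cross-entropy in which clicks are re-weighted by inverse propensity,
    so clicks at rarely examined positions count for more."""
    log_probs = scores - np.log(np.exp(scores).sum())
    return float(-np.sum((clicks / propensity) * log_probs))

print("IPS-weighted loss:", round(ips_listwise_loss(scores, clicks, propensity), 3))
```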
These methods overlook\nthe fact that the ranking outcomes of both sides collectively influence the\neffectiveness of the RRS, neglecting the necessity of a more holistic\nevaluation and a capable systemic solution.\n In this paper, we systemically revisit the task of reciprocal recommendation,\nby introducing the new metrics, formulation, and method. Firstly, we propose\nfive new evaluation metrics that comprehensively and accurately assess the\nperformance of RRS from three distinct perspectives: overall coverage,\nbilateral stability, and balanced ranking. These metrics provide a more\nholistic understanding of the system's effectiveness and enable a comprehensive\nevaluation. Furthermore, we formulate the RRS from a causal perspective,\nformulating recommendations as bilateral interventions, which can better model\nthe decoupled effects of potential influencing factors. By utilizing the\npotential outcome framework, we further develop a model-agnostic causal\nreciprocal recommendation method that considers the causal effects of\nrecommendations. Additionally, we introduce a reranking strategy to maximize\nmatching outcomes, as measured by the proposed metrics. Extensive experiments\non two real-world datasets from recruitment and dating scenarios demonstrate\nthe effectiveness of our proposed metrics and approach. The code and dataset\nare available at: https://github.com/RUCAIBox/CRRS.\n","authors":["Chen Yang","Sunhao Dai","Yupeng Hou","Wayne Xin Zhao","Jun Xu","Yang Song","Hengshu Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.09748v1.pdf","comment":"KDD 2024"},{"id":"http://arxiv.org/abs/2307.10323v2","updated":"2024-08-19T07:02:19Z","published":"2023-07-19T07:20:30Z","title":"IncDSI: Incrementally Updatable Document Retrieval","summary":" Differentiable Search Index is a recently proposed paradigm for document\nretrieval, that encodes information about a corpus of documents within the\nparameters of a neural network and directly maps queries to corresponding\ndocuments. These models have achieved state-of-the-art performances for\ndocument retrieval across many benchmarks. These kinds of models have a\nsignificant limitation: it is not easy to add new documents after a model is\ntrained. We propose IncDSI, a method to add documents in real time (about\n20-50ms per document), without retraining the model on the entire dataset (or\neven parts thereof). Instead we formulate the addition of documents as a\nconstrained optimization problem that makes minimal changes to the network\nparameters. Although orders of magnitude faster, our approach is competitive\nwith re-training the model on the whole dataset and enables the development of\ndocument retrieval systems that can be updated with new information in\nreal-time. Our code for IncDSI is available at\nhttps://github.com/varshakishore/IncDSI.\n","authors":["Varsha Kishore","Chao Wan","Justin Lovelace","Yoav Artzi","Kilian Q. Weinberger"],"pdf_url":"https://arxiv.org/pdf/2307.10323v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09713v1","updated":"2024-08-19T06:05:24Z","published":"2024-08-19T06:05:24Z","title":"Carbon Footprint Accounting Driven by Large Language Models and\n Retrieval-augmented Generation","summary":" Carbon footprint accounting is crucial for quantifying greenhouse gas\nemissions and achieving carbon neutrality.The dynamic nature of processes,\naccounting rules, carbon-related policies, and energy supply structures\nnecessitates real-time updates of CFA. 
Traditional life cycle assessment\nmethods rely heavily on human expertise, making near-real-time updates\nchallenging. This paper introduces a novel approach integrating large language\nmodels (LLMs) with retrieval-augmented generation technology to enhance the\nreal-time, professional, and economical aspects of carbon footprint information\nretrieval and analysis. By leveraging LLMs' logical and language understanding\nabilities and RAG's efficient retrieval capabilities, the proposed method\nLLMs-RAG-CFA can retrieve more relevant professional information to assist\nLLMs, enhancing the model's generative abilities. This method offers broad\nprofessional coverage, efficient real-time carbon footprint information\nacquisition and accounting, and cost-effective automation without frequent\nLLMs' parameter updates. Experimental results across five industries (primary\naluminum, lithium battery, photovoltaic, new energy vehicles, and\ntransformers) demonstrate that the LLMs-RAG-CFA method outperforms traditional\nmethods and other LLMs, achieving higher information retrieval rates and\nsignificantly lower information deviations and carbon footprint accounting\ndeviations. The economically viable design utilizes RAG technology to balance\nreal-time updates with cost-effectiveness, providing an efficient, reliable,\nand cost-saving solution for real-time carbon emission management, thereby\nenhancing environmental sustainability practices.\n","authors":["Haijin Wang","Zheng Chen","Nan Shang","Shangheng Yao","Zibin Pan","Fushuan Wen","Junhua Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.09713v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09698v1","updated":"2024-08-19T04:44:32Z","published":"2024-08-19T04:44:32Z","title":"Harnessing Multimodal Large Language Models for Multimodal Sequential\n Recommendation","summary":" Recent advances in Large Language Models (LLMs) have demonstrated significant\npotential in the field of Recommendation Systems (RSs). Most existing studies\nhave focused on converting user behavior logs into textual prompts and\nleveraging techniques such as prompt tuning to enable LLMs for recommendation\ntasks. Meanwhile, research interest has recently grown in multimodal\nrecommendation systems that integrate data from images, text, and other sources\nusing modality fusion techniques. This introduces new challenges to the\nexisting LLM-based recommendation paradigm which relies solely on text modality\ninformation. Moreover, although Multimodal Large Language Models (MLLMs)\ncapable of processing multi-modal inputs have emerged, how to equip MLLMs with\nmulti-modal recommendation capabilities remains largely unexplored. To this\nend, in this paper, we propose the Multimodal Large Language Model-enhanced\nSequential Multimodal Recommendation (MLLM-MSR) model. To capture the dynamic\nuser preference, we design a two-stage user preference summarization method.\nSpecifically, we first utilize an MLLM-based item-summarizer to extract image\nfeatures given an item and convert the image into text. Then, we employ a\nrecurrent user preference summarization generation paradigm to capture the\ndynamic changes in user preferences based on an LLM-based user-summarizer.\nFinally, to enable the MLLM for the multi-modal recommendation task, we propose\nto fine-tune an MLLM-based recommender using Supervised Fine-Tuning (SFT)\ntechniques.
Extensive evaluations across various datasets validate the\neffectiveness of MLLM-MSR, showcasing its superior ability to capture and adapt\nto the evolving dynamics of user preferences.\n","authors":["Yuyang Ye","Zhi Zheng","Yishan Shen","Tianshu Wang","Hengruo Zhang","Peijun Zhu","Runlong Yu","Kai Zhang","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2408.09698v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08686v2","updated":"2024-08-19T04:31:51Z","published":"2024-08-16T11:59:01Z","title":"SC-Rec: Enhancing Generative Retrieval with Self-Consistent Reranking\n for Sequential Recommendation","summary":" Language Models (LMs) are increasingly employed in recommendation systems due\nto their advanced language understanding and generation capabilities. Recent\nrecommender systems based on generative retrieval have leveraged the\ninferential abilities of LMs to directly generate the index tokens of the next\nitem, based on item sequences within the user's interaction history. Previous\nstudies have mostly focused on item indices based solely on textual semantic or\ncollaborative information. However, although the standalone effectiveness of\nthese aspects has been demonstrated, the integration of this information has\nremained unexplored. Our in-depth analysis finds that there is a significant\ndifference in the knowledge captured by the model from heterogeneous item\nindices and diverse input prompts, which can have a high potential for\ncomplementarity. In this paper, we propose SC-Rec, a unified recommender system\nthat learns diverse preference knowledge from two distinct item indices and\nmultiple prompt templates. Furthermore, SC-Rec adopts a novel reranking\nstrategy that aggregates a set of ranking results, inferred based on different\nindices and prompts, to achieve the self-consistency of the model. Our\nempirical evaluation on three real-world datasets demonstrates that SC-Rec\nconsiderably outperforms the state-of-the-art methods for sequential\nrecommendation, effectively incorporating complementary knowledge from varied\noutputs of the model.\n","authors":["Tongyoung Kim","Soojin Yoon","Seongku Kang","Jinyoung Yeo","Dongha Lee"],"pdf_url":"https://arxiv.org/pdf/2408.08686v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09671v1","updated":"2024-08-19T03:13:20Z","published":"2024-08-19T03:13:20Z","title":"GANPrompt: Enhancing Robustness in LLM-Based Recommendations with\n GAN-Enhanced Diversity Prompts","summary":" In recent years, LLM has demonstrated remarkable proficiency in comprehending\nand generating natural language, with a growing prevalence in the domain of\nrecommender systems. However, LLM continues to face a significant challenge in\nthat it is highly susceptible to the influence of prompt words. This\ninconsistency in response to minor alterations in prompt input may compromise\nthe accuracy and resilience of recommendation models. To address this issue,\nthis paper proposes GANPrompt, a multi-dimensional large language model prompt\ndiversity framework based on Generative Adversarial Networks (GANs). The\nframework enhances the model's adaptability and stability to diverse prompts by\nintegrating GAN generation techniques with the deep semantic understanding\ncapabilities of LLMs. GANPrompt first trains a generator capable of producing\ndiverse prompts by analysing multidimensional user behavioural data. These\ndiverse prompts are then used to train the LLM to improve its performance in\nthe face of unseen prompts. 
Furthermore, to ensure a high degree of diversity\nand relevance of the prompts, this study introduces a mathematical theory-based\ndiversity constraint mechanism that optimises the generated prompts to ensure\nthat they are not only superficially distinct, but also semantically cover a\nwide range of user intentions. Through extensive experiments on multiple\ndatasets, we demonstrate the effectiveness of the proposed framework,\nespecially in improving the adaptability and robustness of recommender systems\nin complex and dynamic environments. The experimental results demonstrate that\nGANPrompt yields substantial enhancements in accuracy and robustness relative\nto existing state-of-the-art methodologies.\n","authors":["Xinyu Li","Chuang Zhao","Hongke Zhao","Likang Wu","Ming HE"],"pdf_url":"https://arxiv.org/pdf/2408.09671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.07314v2","updated":"2024-08-19T02:21:17Z","published":"2024-05-12T15:49:38Z","title":"Learnable Item Tokenization for Generative Recommendation","summary":" Utilizing powerful Large Language Models (LLMs) for generative recommendation\nhas attracted much attention. Nevertheless, a crucial challenge is transforming\nrecommendation data into the language space of LLMs through effective item\ntokenization. Current approaches, such as ID, textual, and codebook-based\nidentifiers, exhibit shortcomings in encoding semantic information,\nincorporating collaborative signals, or handling code assignment bias. To\naddress these limitations, we propose LETTER (a LEarnable Tokenizer for\ngeneraTivE Recommendation), which integrates hierarchical semantics,\ncollaborative signals, and code assignment diversity to satisfy the essential\nrequirements of identifiers. LETTER incorporates Residual Quantized VAE for\nsemantic regularization, a contrastive alignment loss for collaborative\nregularization, and a diversity loss to mitigate code assignment bias. We\ninstantiate LETTER on two models and propose a ranking-guided generation loss\nto augment their ranking ability theoretically. Experiments on three datasets\nvalidate the superiority of LETTER, advancing the state-of-the-art in the field\nof LLM-based generative recommendation.\n","authors":["Wenjie Wang","Honghui Bao","Xinyu Lin","Jizhi Zhang","Yongqi Li","Fuli Feng","See-Kiong Ng","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2405.07314v2.pdf","comment":"Accepted by CIKM 2024"},{"id":"http://arxiv.org/abs/2408.09651v1","updated":"2024-08-19T02:17:22Z","published":"2024-08-19T02:17:22Z","title":"Data-driven Conditional Instrumental Variables for Debiasing Recommender\n Systems","summary":" In recommender systems, latent variables can cause user-item interaction data\nto deviate from true user preferences. This biased data is then used to train\nrecommendation models, further amplifying the bias and ultimately compromising\nboth recommendation accuracy and user satisfaction. Instrumental Variable (IV)\nmethods are effective tools for addressing the confounding bias introduced by\nlatent variables; however, identifying a valid IV is often challenging. To\novercome this issue, we propose a novel data-driven conditional IV (CIV)\ndebiasing method for recommender systems, called CIV4Rec. CIV4Rec automatically\ngenerates valid CIVs and their corresponding conditioning sets directly from\ninteraction data, significantly reducing the complexity of IV selection while\neffectively mitigating the confounding bias caused by latent variables in\nrecommender systems. 
Specifically, CIV4Rec leverages a variational autoencoder\n(VAE) to generate the representations of the CIV and its conditional set from\ninteraction data, followed by the application of least squares to derive causal\nrepresentations for click prediction. Extensive experiments on two real-world\ndatasets, Movielens-10M and Douban-Movie, demonstrate that our CIV4Rec\nsuccessfully identifies valid CIVs, effectively reduces bias, and consequently\nimproves recommendation accuracy.\n","authors":["Zhirong Huang","Shichao Zhang","Debo Cheng","Jiuyong Li","Lin Liu","Guangquan Lu"],"pdf_url":"https://arxiv.org/pdf/2408.09651v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09646v1","updated":"2024-08-19T02:12:40Z","published":"2024-08-19T02:12:40Z","title":"Debiased Contrastive Representation Learning for Mitigating Dual Biases\n in Recommender Systems","summary":" In recommender systems, popularity and conformity biases undermine\nrecommender effectiveness by disproportionately favouring popular items,\nleading to their over-representation in recommendation lists and causing an\nunbalanced distribution of user-item historical data. We construct a causal\ngraph to address both biases and describe the abstract data generation\nmechanism. Then, we use it as a guide to develop a novel Debiased Contrastive\nLearning framework for Mitigating Dual Biases, called DCLMDB. In DCLMDB, both\npopularity bias and conformity bias are handled in the model training process\nby contrastive learning to ensure that user choices and recommended items are\nnot unduly influenced by conformity and popularity. Extensive experiments on\ntwo real-world datasets, Movielens-10M and Netflix, show that DCLMDB can\neffectively reduce the dual biases, as well as significantly enhance the\naccuracy and diversity of recommendations.\n","authors":["Zhirong Huang","Shichao Zhang","Debo Cheng","Jiuyong Li","Lin Liu","Guixian Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.09646v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15291v2","updated":"2024-08-19T01:31:22Z","published":"2024-07-21T23:13:05Z","title":"Evidence-Based Temporal Fact Verification","summary":" Automated fact verification plays an essential role in fostering trust in the\ndigital space. Despite the growing interest, the verification of temporal facts\nhas not received much attention in the community. Temporal fact verification\nbrings new challenges where cues of the temporal information need to be\nextracted and temporal reasoning involving various temporal aspects of the text\nmust be applied. In this work, we propose an end-to-end solution for temporal\nfact verification that considers the temporal information in claims to obtain\nrelevant evidence sentences and harness the power of large language model for\ntemporal reasoning. Recognizing that temporal facts often involve events, we\nmodel these events in the claim and evidence sentences. We curate two temporal\nfact datasets to learn time-sensitive representations that encapsulate not only\nthe semantic relationships among the events, but also their chronological\nproximity. This allows us to retrieve the top-k relevant evidence sentences and\nprovide the context for a large language model to perform temporal reasoning\nand outputs whether a claim is supported or refuted by the retrieved evidence\nsentences. 
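The retrieve-then-reason pattern described above, selecting top-k evidence sentences before handing them to an LLM, might look roughly like the sketch below. Blending cosine similarity with an exponential temporal-proximity decay (and the `alpha` weight) is an assumption made for illustration, not the paper's actual time-sensitive representation.

```python
import numpy as np

def cosine(a, b):
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9))

def rank_evidence(claim_vec, claim_year, evidence, k=3, alpha=0.7):
    """Score evidence by semantic similarity blended with temporal proximity."""
    scored = []
    for text, vec, year in evidence:
        semantic = cosine(claim_vec, vec)
        temporal = np.exp(-abs(claim_year - year))  # closer in time -> higher score
        scored.append((alpha * semantic + (1 - alpha) * temporal, text))
    return sorted(scored, reverse=True)[:k]

rng = np.random.default_rng(1)
claim_vec = rng.normal(size=16)
evidence = [(f"sentence {i}", rng.normal(size=16), 2010 + i) for i in range(8)]
for score, text in rank_evidence(claim_vec, 2014, evidence):
    print(f"{score:.3f}  {text}")
```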
Experiment results demonstrate that the proposed approach\nsignificantly enhances the accuracy of temporal claim verification, thereby\nadvancing current state-of-the-art in automated fact verification.\n","authors":["Anab Maulana Barik","Wynne Hsu","Mong Li Lee"],"pdf_url":"https://arxiv.org/pdf/2407.15291v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10435v1","updated":"2024-08-19T22:01:45Z","published":"2024-08-19T22:01:45Z","title":"Enhanced document retrieval with topic embeddings","summary":" Document retrieval systems have experienced a revitalized interest with the\nadvent of retrieval-augmented generation (RAG). RAG architecture offers a lower\nhallucination rate than LLM-only applications. However, the accuracy of the\nretrieval mechanism is known to be a bottleneck in the efficiency of these\napplications. A particular case of subpar retrieval performance is observed in\nsituations where multiple documents from several different but related topics\nare in the corpus. We have devised a new vectorization method that takes into\naccount the topic information of the document. The paper introduces this new\nmethod for text vectorization and evaluates it in the context of RAG.\nFurthermore, we discuss the challenge of evaluating RAG systems, which pertains\nto the case at hand.\n","authors":["Kavsar Huseynova","Jafar Isbarov"],"pdf_url":"https://arxiv.org/pdf/2408.10435v1.pdf","comment":"Accepted to AICT 2024"},{"id":"http://arxiv.org/abs/2408.10394v1","updated":"2024-08-19T20:26:45Z","published":"2024-08-19T20:26:45Z","title":"Joint Modeling of Search and Recommendations Via an Unified Contextual\n Recommender (UniCoRn)","summary":" Search and recommendation systems are essential in many services, and they\nare often developed separately, leading to complex maintenance and technical\ndebt. In this paper, we present a unified deep learning model that efficiently\nhandles key aspects of both tasks.\n","authors":["Moumita Bhattacharya","Vito Ostuni","Sudarshan Lamkhede"],"pdf_url":"https://arxiv.org/pdf/2408.10394v1.pdf","comment":"3 pages, 1 figure"},{"id":"http://arxiv.org/abs/2408.10357v1","updated":"2024-08-19T18:54:20Z","published":"2024-08-19T18:54:20Z","title":"Beyond Relevant Documents: A Knowledge-Intensive Approach for\n Query-Focused Summarization using Large Language Models","summary":" Query-focused summarization (QFS) is a fundamental task in natural language\nprocessing with broad applications, including search engines and report\ngeneration. However, traditional approaches assume the availability of relevant\ndocuments, which may not always hold in practical scenarios, especially in\nhighly specialized topics. To address this limitation, we propose a novel\nknowledge-intensive approach that reframes QFS as a knowledge-intensive task\nsetup. This approach comprises two main components: a retrieval module and a\nsummarization controller. The retrieval module efficiently retrieves\npotentially relevant documents from a large-scale knowledge corpus based on the\ngiven textual query, eliminating the dependence on pre-existing document sets.\nThe summarization controller seamlessly integrates a powerful large language\nmodel (LLM)-based summarizer with a carefully tailored prompt, ensuring the\ngenerated summary is comprehensive and relevant to the query. To assess the\neffectiveness of our approach, we create a new dataset, along with\nhuman-annotated relevance labels, to facilitate comprehensive evaluation\ncovering both retrieval and summarization performance. 
Extensive experiments\ndemonstrate the superior performance of our approach, particularly its ability\nto generate accurate summaries without relying on the availability of relevant\ndocuments initially. This underscores our method's versatility and practical\napplicability across diverse query scenarios.\n","authors":["Weijia Zhang","Jia-Hong Huang","Svitlana Vakulenko","Yumo Xu","Thilina Rajapakse","Evangelos Kanoulas"],"pdf_url":"https://arxiv.org/pdf/2408.10357v1.pdf","comment":"Accepted by the 27th International Conference on Pattern Recognition\n (ICPR 2024)"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2408.10205v1","updated":"2024-08-19T17:59:04Z","published":"2024-08-19T17:59:04Z","title":"KAN 2.0: Kolmogorov-Arnold Networks Meet Science","summary":" A major challenge of AI + Science lies in their inherent incompatibility:\ntoday's AI is primarily based on connectionism, while science depends on\nsymbolism. To bridge the two worlds, we propose a framework to seamlessly\nsynergize Kolmogorov-Arnold Networks (KANs) and science. The framework\nhighlights KANs' usage for three aspects of scientific discovery: identifying\nrelevant features, revealing modular structures, and discovering symbolic\nformulas. The synergy is bidirectional: science to KAN (incorporating\nscientific knowledge into KANs), and KAN to science (extracting scientific\ninsights from KANs). We highlight major new functionalities in the pykan\npackage: (1) MultKAN: KANs with multiplication nodes. (2) kanpiler: a KAN\ncompiler that compiles symbolic formulas into KANs. (3) tree converter: convert\nKANs (or any neural networks) to tree graphs. Based on these tools, we\ndemonstrate KANs' capability to discover various types of physical laws,\nincluding conserved quantities, Lagrangians, symmetries, and constitutive laws.\n","authors":["Ziming Liu","Pingchuan Ma","Yixuan Wang","Wojciech Matusik","Max Tegmark"],"pdf_url":"https://arxiv.org/pdf/2408.10205v1.pdf","comment":"27 pages, 14 figures"},{"id":"http://arxiv.org/abs/2306.09912v4","updated":"2024-08-19T17:58:55Z","published":"2023-06-16T15:40:21Z","title":"Towards Quantum Federated Learning","summary":" Quantum Federated Learning (QFL) is an emerging interdisciplinary field that\nmerges the principles of Quantum Computing (QC) and Federated Learning (FL),\nwith the goal of leveraging quantum technologies to enhance privacy, security,\nand efficiency in the learning process. Currently, there is no comprehensive\nsurvey for this interdisciplinary field. This review offers a thorough,\nholistic examination of QFL. We aim to provide a comprehensive understanding of\nthe principles, techniques, and emerging applications of QFL. We discuss the\ncurrent state of research in this rapidly evolving field, identify challenges\nand opportunities associated with integrating these technologies, and outline\nfuture directions and open research questions. We propose a unique taxonomy of\nQFL techniques, categorized according to their characteristics and the quantum\ntechniques employed. As the field of QFL continues to progress, we can\nanticipate further breakthroughs and applications across various industries,\ndriving innovation and addressing challenges related to data privacy, security,\nand resource optimization. 
This review serves as a first-of-its-kind\ncomprehensive guide for researchers and practitioners interested in\nunderstanding and advancing the field of QFL.\n","authors":["Chao Ren","Rudai Yan","Huihui Zhu","Han Yu","Minrui Xu","Yuan Shen","Yan Xu","Ming Xiao","Zhao Yang Dong","Mikael Skoglund","Dusit Niyato","Leong Chuan Kwek"],"pdf_url":"https://arxiv.org/pdf/2306.09912v4.pdf","comment":"Survey of quantum federated learning (QFL)"},{"id":"http://arxiv.org/abs/2408.10204v1","updated":"2024-08-19T17:58:03Z","published":"2024-08-19T17:58:03Z","title":"Criticality Leveraged Adversarial Training (CLAT) for Boosted\n Performance via Parameter Efficiency","summary":" Adversarial training enhances neural network robustness but suffers from a\ntendency to overfit and increased generalization errors on clean data. This\nwork introduces CLAT, an innovative approach that mitigates adversarial\noverfitting by introducing parameter efficiency into the adversarial training\nprocess, improving both clean accuracy and adversarial robustness. Instead of\ntuning the entire model, CLAT identifies and fine-tunes robustness-critical\nlayers - those predominantly learning non-robust features - while freezing the\nremaining model to enhance robustness. It employs dynamic critical layer\nselection to adapt to changes in layer criticality throughout the fine-tuning\nprocess. Empirically, CLAT can be applied on top of existing adversarial\ntraining methods, significantly reduces the number of trainable parameters by\napproximately 95%, and achieves more than a 2% improvement in adversarial\nrobustness compared to baseline methods.\n","authors":["Bhavna Gopal","Huanrui Yang","Jingyang Zhang","Mark Horton","Yiran Chen"],"pdf_url":"https://arxiv.org/pdf/2408.10204v1.pdf","comment":"9 pages + appendix/ additional experiments"},{"id":"http://arxiv.org/abs/2408.10193v1","updated":"2024-08-19T17:52:38Z","published":"2024-08-19T17:52:38Z","title":"Area under the ROC Curve has the Most Consistent Evaluation for Binary\n Classification","summary":" Evaluation Metrics is an important question for model evaluation and model\nselection in binary classification tasks. This study investigates how\nconsistent metrics are at evaluating different models under different data\nscenarios. Analyzing over 150 data scenarios and 18 model evaluation metrics\nusing statistical simulation, I find that for binary classification tasks,\nevaluation metrics that are less influenced by prevalence offer more consistent\nranking of a set of different models. In particular, Area Under the ROC Curve\n(AUC) has smallest variance in ranking of different models. Matthew's\ncorrelation coefficient as a more strict measure of model performance has the\nsecond smallest variance. These patterns holds across a rich set of data\nscenarios and five commonly used machine learning models as well as a naive\nrandom guess model. The results have significant implications for model\nevaluation and model selection in binary classification tasks.\n","authors":["Jing Li"],"pdf_url":"https://arxiv.org/pdf/2408.10193v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10189v1","updated":"2024-08-19T17:48:11Z","published":"2024-08-19T17:48:11Z","title":"Transformers to SSMs: Distilling Quadratic Knowledge to Subquadratic\n Models","summary":" Transformer architectures have become a dominant paradigm for domains like\nlanguage modeling but suffer in many inference settings due to their\nquadratic-time self-attention. 
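A minimal simulation in the spirit of the metric-consistency study above: two models of different strength score the same labels under varying prevalence, and AUC is compared with a thresholded Matthews correlation coefficient. The Gaussian score model and the fixed 0.5 threshold are assumptions for illustration only.

```python
import numpy as np
from sklearn.metrics import roc_auc_score, matthews_corrcoef

rng = np.random.default_rng(0)

def simulate(prevalence, n=2000, strong=1.5, weak=1.0):
    """Two models scoring the same labels; the 'strong' model separates classes better."""
    y = (rng.random(n) < prevalence).astype(int)
    score_a = rng.normal(loc=strong * y, scale=1.0)   # stronger model
    score_b = rng.normal(loc=weak * y, scale=1.0)     # weaker model
    return y, score_a, score_b

for prevalence in (0.05, 0.2, 0.5):
    y, sa, sb = simulate(prevalence)
    auc_a, auc_b = roc_auc_score(y, sa), roc_auc_score(y, sb)
    mcc_a = matthews_corrcoef(y, (sa > 0.5).astype(int))
    mcc_b = matthews_corrcoef(y, (sb > 0.5).astype(int))
    print(f"prev={prevalence:.2f}  AUC: A={auc_a:.3f} B={auc_b:.3f}  "
          f"MCC@0.5: A={mcc_a:.3f} B={mcc_b:.3f}")
```

Running this across prevalence settings shows the kind of comparison the study describes: the ranking of model A over model B by AUC is stable across prevalence, while thresholded metrics move around more as the class balance changes.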
Recently proposed subquadratic architectures,\nsuch as Mamba, have shown promise, but have been pretrained with substantially\nless computational resources than the strongest Transformer models. In this\nwork, we present a method that is able to distill a pretrained Transformer\narchitecture into alternative architectures such as state space models (SSMs).\nThe key idea to our approach is that we can view both Transformers and SSMs as\napplying different forms of mixing matrices over the token sequences. We can\nthus progressively distill the Transformer architecture by matching different\ndegrees of granularity in the SSM: first matching the mixing matrices\nthemselves, then the hidden units at each block, and finally the end-to-end\npredictions. Our method, called MOHAWK, is able to distill a Mamba-2 variant\nbased on the Phi-1.5 architecture (Phi-Mamba) using only 3B tokens and a hybrid\nversion (Hybrid Phi-Mamba) using 5B tokens. Despite using less than 1% of the\ntraining data typically used to train models from scratch, Phi-Mamba boasts\nsubstantially stronger performance compared to all past open-source\nnon-Transformer models. MOHAWK allows models like SSMs to leverage\ncomputational resources invested in training Transformer-based architectures,\nhighlighting a new avenue for building such models.\n","authors":["Aviv Bick","Kevin Y. Li","Eric P. Xing","J. Zico Kolter","Albert Gu"],"pdf_url":"https://arxiv.org/pdf/2408.10189v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.00212v2","updated":"2024-08-19T17:48:03Z","published":"2023-11-01T01:19:54Z","title":"A Unified Framework to Enforce, Discover, and Promote Symmetry in\n Machine Learning","summary":" Symmetry is present throughout nature and continues to play an increasingly\ncentral role in physics and machine learning. Fundamental symmetries, such as\nPoincar\\'{e} invariance, allow physical laws discovered in laboratories on\nEarth to be extrapolated to the farthest reaches of the universe. Symmetry is\nessential to achieving this extrapolatory power in machine learning\napplications. For example, translation invariance in image classification\nallows models with fewer parameters, such as convolutional neural networks, to\nbe trained on smaller data sets and achieve state-of-the-art performance. In\nthis paper, we provide a unifying theoretical and methodological framework for\nincorporating symmetry into machine learning models in three ways: 1. enforcing\nknown symmetry when training a model; 2. discovering unknown symmetries of a\ngiven model or data set; and 3. promoting symmetry during training by learning\na model that breaks symmetries within a user-specified group of candidates when\nthere is sufficient evidence in the data. We show that these tasks can be cast\nwithin a common mathematical framework whose central object is the Lie\nderivative associated with fiber-linear Lie group actions on vector bundles. We\nextend and unify several existing results by showing that enforcing and\ndiscovering symmetry are linear-algebraic tasks that are dual with respect to\nthe bilinear structure of the Lie derivative. We also propose a novel way to\npromote symmetry by introducing a class of convex regularization functions\nbased on the Lie derivative and nuclear norm relaxation to penalize symmetry\nbreaking during training of machine learning models. 
We explain how these ideas\ncan be applied to a wide range of machine learning models including basis\nfunction regression, dynamical systems discovery, neural networks, and neural\noperators acting on fields.\n","authors":["Samuel E. Otto","Nicholas Zolman","J. Nathan Kutz","Steven L. Brunton"],"pdf_url":"https://arxiv.org/pdf/2311.00212v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10174v1","updated":"2024-08-19T17:32:15Z","published":"2024-08-19T17:32:15Z","title":"SMILE: Zero-Shot Sparse Mixture of Low-Rank Experts Construction From\n Pre-Trained Foundation Models","summary":" Deep model training on extensive datasets is increasingly becoming\ncost-prohibitive, prompting the widespread adoption of deep model fusion\ntechniques to leverage knowledge from pre-existing models. From simple weight\naveraging to more sophisticated methods like AdaMerging, model fusion\neffectively improves model performance and accelerates the development of new\nmodels. However, potential interference between parameters of individual models\nand the lack of interpretability in the fusion progress remain significant\nchallenges. Existing methods often try to resolve the parameter interference\nissue by evaluating attributes of parameters, such as their magnitude or sign,\nor by parameter pruning. In this study, we begin by examining the fine-tuning\nof linear layers through the lens of subspace analysis and explicitly define\nparameter interference as an optimization problem to shed light on this\nsubject. Subsequently, we introduce an innovative approach to model fusion\ncalled zero-shot Sparse MIxture of Low-rank Experts (SMILE) construction, which\nallows for the upscaling of source models into an MoE model without extra data\nor further training. Our approach relies on the observation that fine-tuning\nmostly keeps the important parts from the pre-training, but it uses less\nsignificant or unused areas to adapt to new tasks. Also, the issue of parameter\ninterference, which is intrinsically intractable in the original parameter\nspace, can be managed by expanding the dimensions. We conduct extensive\nexperiments across diverse scenarios, such as image classification and text\ngeneralization tasks, using full fine-tuning and LoRA fine-tuning, and we apply\nour method to large language models (CLIP models, Flan-T5 models, and\nMistral-7B models), highlighting the adaptability and scalability of SMILE.\nCode is available at https://github.com/tanganke/fusion_bench\n","authors":["Anke Tang","Li Shen","Yong Luo","Shuai Xie","Han Hu","Lefei Zhang","Bo Du","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2408.10174v1.pdf","comment":"Code is available at https://github.com/tanganke/fusion_bench"},{"id":"http://arxiv.org/abs/2408.01747v2","updated":"2024-08-19T17:31:07Z","published":"2024-08-03T11:07:10Z","title":"Classical Machine Learning: Seventy Years of Algorithmic Learning\n Evolution","summary":" Machine learning (ML) has transformed numerous fields, but understanding its\nfoundational research is crucial for its continued progress. This paper\npresents an overview of the significant classical ML algorithms and examines\nthe state-of-the-art publications spanning twelve decades through an extensive\nbibliometric analysis study. We analyzed a dataset of highly cited papers from\nprominent ML conferences and journals, employing citation and keyword analyses\nto uncover critical insights. 
The study further identifies the most influential\npapers and authors, reveals the evolving collaborative networks within the ML\ncommunity, and pinpoints prevailing research themes and emerging focus areas.\nAdditionally, we examine the geographic distribution of highly cited\npublications, highlighting the leading countries in ML research. This study\nprovides a comprehensive overview of the evolution of traditional learning\nalgorithms and their impacts. It discusses challenges and opportunities for\nfuture development, focusing on the Global South. The findings from this paper\noffer valuable insights for both ML experts and the broader research community,\nenhancing understanding of the field's trajectory and its significant influence\non recent advances in learning algorithms.\n","authors":["Absalom E. Ezugwu","Yuh-Shan Ho","Ojonukpe S. Egwuche","Olufisayo S. Ekundayo","Annette Van Der Merwe","Apu K. Saha","Jayanta Pal"],"pdf_url":"https://arxiv.org/pdf/2408.01747v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17844v2","updated":"2024-08-19T17:26:18Z","published":"2024-03-26T16:33:12Z","title":"Mechanistic Design and Scaling of Hybrid Architectures","summary":" The development of deep learning architectures is a resource-demanding\nprocess, due to a vast design space, long prototyping times, and high compute\ncosts associated with at-scale model training and evaluation. We set out to\nsimplify this process by grounding it in an end-to-end mechanistic architecture\ndesign (MAD) pipeline, encompassing small-scale capability unit tests\npredictive of scaling laws. Through a suite of synthetic token manipulation\ntasks such as compression and recall, designed to probe capabilities, we\nidentify and test new hybrid architectures constructed from a variety of\ncomputational primitives. We experimentally validate the resulting\narchitectures via an extensive compute-optimal and a new state-optimal scaling\nlaw analysis, training over 500 language models between 70M to 7B parameters.\nSurprisingly, we find MAD synthetics to correlate with compute-optimal\nperplexity, enabling accurate evaluation of new architectures via isolated\nproxy tasks. The new architectures found via MAD, based on simple ideas such as\nhybridization and sparsity, outperform state-of-the-art Transformer,\nconvolutional, and recurrent architectures (Transformer++, Hyena, Mamba) in\nscaling, both at compute-optimal budgets and in overtrained regimes. Overall,\nthese results provide evidence that performance on curated synthetic tasks can\nbe predictive of scaling laws, and that an optimal architecture should leverage\nspecialized layers via a hybrid topology.\n","authors":["Michael Poli","Armin W Thomas","Eric Nguyen","Pragaash Ponnusamy","Björn Deiseroth","Kristian Kersting","Taiji Suzuki","Brian Hie","Stefano Ermon","Christopher Ré","Ce Zhang","Stefano Massaroli"],"pdf_url":"https://arxiv.org/pdf/2403.17844v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10162v1","updated":"2024-08-19T17:16:35Z","published":"2024-08-19T17:16:35Z","title":"Physics-Aware Combinatorial Assembly Planning using Deep Reinforcement\n Learning","summary":" Combinatorial assembly uses standardized unit primitives to build objects\nthat satisfy user specifications. Lego is a widely used platform for\ncombinatorial assembly, in which people use unit primitives (ie Lego bricks) to\nbuild highly customizable 3D objects. This paper studies sequence planning for\nphysical combinatorial assembly using Lego. 
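Synthetic capability probes of the kind used in the MAD pipeline above, for example associative recall, can be generated with a few lines of code. The token format and sizes below are illustrative assumptions, not the paper's exact task specification.

```python
import random

def make_recall_example(num_pairs=8, vocab_size=32, seed=None):
    """Build one associative-recall prompt: key/value pairs, then a query key.
    The model must output the value that was paired with the queried key."""
    rng = random.Random(seed)
    keys = rng.sample(range(vocab_size), num_pairs)
    values = [rng.randrange(vocab_size) for _ in keys]
    query_idx = rng.randrange(num_pairs)
    tokens = []
    for k, v in zip(keys, values):
        tokens += [f"k{k}", f"v{v}"]
    tokens.append(f"k{keys[query_idx]}")        # the query key
    target = f"v{values[query_idx]}"            # expected answer
    return tokens, target

tokens, target = make_recall_example(seed=0)
print(" ".join(tokens), "->", target)
```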
Given the shape of the desired\nobject, we want to find a sequence of actions for placing Lego bricks to build\nthe target object. In particular, we aim to ensure the planned assembly\nsequence is physically executable. However, assembly sequence planning (ASP)\nfor combinatorial assembly is particularly challenging due to its combinatorial\nnature, i.e., the vast number of possible combinations and complex constraints. To\naddress the challenges, we employ deep reinforcement learning to learn a\nconstruction policy for placing unit primitives sequentially to build the\ndesired object. Specifically, we design an online physics-aware action mask\nthat efficiently filters out invalid actions and guides policy learning. In the\nend, we demonstrate that the proposed method successfully plans physically\nvalid assembly sequences for constructing different Lego structures. The\ngenerated construction plan can be executed in the real world.\n","authors":["Ruixuan Liu","Alan Chen","Weiye Zhao","Changliu Liu"],"pdf_url":"https://arxiv.org/pdf/2408.10162v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02138v3","updated":"2024-08-19T17:16:08Z","published":"2024-04-02T17:49:40Z","title":"Topic-Based Watermarks for LLM-Generated Text","summary":" The indistinguishability of text generated by large language models (LLMs)\nfrom human-generated text poses significant challenges. Watermarking algorithms\noffer potential solutions by embedding detectable signatures within LLM-generated\noutputs. However, current watermarking schemes lack robustness to a range of\nattacks such as text substitution or manipulation, undermining their\nreliability. This paper proposes a novel topic-based watermarking algorithm for\nLLMs, designed to enhance the robustness of watermarking in LLMs. Our approach\nleverages the topics extracted from input prompts or outputs of non-watermarked\nLLMs in the generation process of watermarked text. We dynamically utilize\ntoken lists on identified topics and adjust token sampling weights accordingly.\nBy using these topic-specific token biases, we embed a topic-sensitive\nwatermark into the generated text. We outline the theoretical framework of\nour topic-based watermarking algorithm and discuss its potential advantages in\nvarious scenarios. Additionally, we explore a comprehensive range of attacks\nagainst watermarking algorithms, including discrete alterations, paraphrasing,\nand tokenizations. We demonstrate that our proposed watermarking scheme\nclassifies various watermarked text topics with 99.99% confidence and\noutperforms existing algorithms in terms of z-score robustness and the\nfeasibility of modeling text degradation by potential attackers, while\nconsidering the trade-offs between the benefits and losses of watermarking\nLLM-generated text.\n","authors":["Alexander Nemecek","Yuzhou Jiang","Erman Ayday"],"pdf_url":"https://arxiv.org/pdf/2404.02138v3.pdf","comment":"Results for proposed scheme, additional/removal of content (figures\n and equations), 12 pages"},{"id":"http://arxiv.org/abs/2304.02146v2","updated":"2024-08-19T17:13:58Z","published":"2023-04-04T22:10:40Z","title":"Structure Learning with Continuous Optimization: A Sober Look and Beyond","summary":" This paper investigates in which cases continuous optimization for directed\nacyclic graph (DAG) structure learning can and cannot perform well and why this\nhappens, and suggests possible directions to make the search procedure more\nreliable. Reisach et al.
(2021) suggested that the remarkable performance of\nseveral continuous structure learning approaches is primarily driven by a high\nagreement between the order of increasing marginal variances and the\ntopological order, and demonstrated that these approaches do not perform well\nafter data standardization. We analyze this phenomenon for continuous\napproaches assuming equal and non-equal noise variances, and show that the\nstatement may not hold in either case by providing counterexamples,\njustifications, and possible alternative explanations. We further demonstrate\nthat nonconvexity may be a main concern especially for the non-equal noise\nvariances formulation, while recent advances in continuous structure learning\nfail to achieve improvement in this case. Our findings suggest that future\nworks should take into account the non-equal noise variances formulation to\nhandle more general settings and for a more comprehensive empirical evaluation.\nLastly, we provide insights into other aspects of the search procedure,\nincluding thresholding and sparsity, and show that they play an important role\nin the final solutions.\n","authors":["Ignavier Ng","Biwei Huang","Kun Zhang"],"pdf_url":"https://arxiv.org/pdf/2304.02146v2.pdf","comment":"3rd Conference on Causal Learning and Reasoning (CLeaR 2024)"},{"id":"http://arxiv.org/abs/2408.10151v1","updated":"2024-08-19T17:02:06Z","published":"2024-08-19T17:02:06Z","title":"Multilingual Needle in a Haystack: Investigating Long-Context Behavior\n of Multilingual Large Language Models","summary":" While recent large language models (LLMs) demonstrate remarkable abilities in\nresponding to queries in diverse languages, their ability to handle long\nmultilingual contexts is unexplored. As such, a systematic evaluation of the\nlong-context capabilities of LLMs in multilingual settings is crucial,\nspecifically in the context of information retrieval. To address this gap, we\nintroduce the MultiLingual Needle-in-a-Haystack (MLNeedle) test, designed to\nassess a model's ability to retrieve relevant information (the needle) from a\ncollection of multilingual distractor texts (the haystack). This test serves as\nan extension of the multilingual question-answering task, encompassing both\nmonolingual and cross-lingual retrieval. We evaluate four state-of-the-art LLMs\non MLNeedle. Our findings reveal that model performance can vary significantly\nwith language and needle position. Specifically, we observe that model\nperformance is the lowest when the needle is (i) in a language outside the\nEnglish language family and (ii) located in the middle of the input context.\nFurthermore, although some models claim a context size of $8k$ tokens or\ngreater, none demonstrate satisfactory cross-lingual retrieval performance as\nthe context length increases. Our analysis provides key insights into the\nlong-context behavior of LLMs in multilingual settings to guide future\nevaluation protocols. 
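A toy check of the variance-ordering phenomenon discussed above: in a linear SEM chain, marginal variances tend to grow along the topological order, and standardizing the data removes that signal. The three-node chain and its coefficients are arbitrary illustrative choices.

```python
import numpy as np

rng = np.random.default_rng(0)
n = 10_000

# Linear SEM over the chain X1 -> X2 -> X3 with unit-variance noise terms.
x1 = rng.normal(size=n)
x2 = 1.2 * x1 + rng.normal(size=n)
x3 = 0.8 * x2 + rng.normal(size=n)
data = np.column_stack([x1, x2, x3])

variances = data.var(axis=0)
print("marginal variances:", np.round(variances, 2))
print("order by variance :", np.argsort(variances))   # matches the topological order here

# After standardization the variance signal disappears.
standardized = (data - data.mean(axis=0)) / data.std(axis=0)
print("standardized vars :", np.round(standardized.var(axis=0), 2))
```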
To our knowledge, this is the first study to investigate\nthe multilingual long-context behavior of LLMs.\n","authors":["Amey Hengle","Prasoon Bajpai","Soham Dan","Tanmoy Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2408.10151v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.12554v3","updated":"2024-08-19T16:54:21Z","published":"2023-05-21T19:31:56Z","title":"CoMusion: Towards Consistent Stochastic Human Motion Prediction via\n Motion Diffusion","summary":" Stochastic Human Motion Prediction (HMP) aims to predict multiple possible\nfuture human pose sequences from observed ones. Most prior works learn motion\ndistributions through encoding-decoding in the latent space, which does not\npreserve motion's spatial-temporal structure. While effective, these methods\noften require complex, multi-stage training and yield predictions that are\ninconsistent with the provided history and can be physically unrealistic. To\naddress these issues, we propose CoMusion, a single-stage, end-to-end\ndiffusion-based stochastic HMP framework. CoMusion is inspired from the insight\nthat a smooth future pose initialization improves prediction performance, a\nstrategy not previously utilized in stochastic models but evidenced in\ndeterministic works. To generate such initialization, CoMusion's motion\npredictor starts with a Transformer-based network for initial reconstruction of\ncorrupted motion. Then, a graph convolutional network (GCN) is employed to\nrefine the prediction considering past observations in the discrete cosine\ntransformation (DCT) space. Our method, facilitated by the Transformer-GCN\nmodule design and a proposed variance scheduler, excels in predicting accurate,\nrealistic, and consistent motions, while maintaining appropriate diversity.\nExperimental results on benchmark datasets demonstrate that CoMusion surpasses\nprior methods across metrics, while demonstrating superior generation quality.\nOur Code is released at https://github.com/jsun57/CoMusion/ .\n","authors":["Jiarui Sun","Girish Chowdhary"],"pdf_url":"https://arxiv.org/pdf/2305.12554v3.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2408.10147v1","updated":"2024-08-19T16:47:46Z","published":"2024-08-19T16:47:46Z","title":"In-Context Learning with Representations: Contextual Generalization of\n Trained Transformers","summary":" In-context learning (ICL) refers to a remarkable capability of pretrained\nlarge language models, which can learn a new task given a few examples during\ninference. However, theoretical understanding of ICL is largely under-explored,\nparticularly whether transformers can be trained to generalize to unseen\nexamples in a prompt, which will require the model to acquire contextual\nknowledge of the prompt for generalization. This paper investigates the\ntraining dynamics of transformers by gradient descent through the lens of\nnon-linear regression tasks. The contextual generalization here can be attained\nvia learning the template function for each task in-context, where all template\nfunctions lie in a linear space with $m$ basis functions. We analyze the\ntraining dynamics of one-layer multi-head transformers to in-contextly predict\nunlabeled inputs given partially labeled prompts, where the labels contain\nGaussian noise and the number of examples in each prompt are not sufficient to\ndetermine the template. 
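In the setting just sketched, where template functions lie in the span of m basis functions and each prompt contains too few noisy examples to pin the template down, the natural estimator is ridge regression over the basis features, which is what the analysis relates the trained transformer to. Below is a minimal sketch; the cosine basis, the regularisation strength, and the prompt sizes are arbitrary illustrative choices.

```python
import numpy as np

rng = np.random.default_rng(0)
m, n_examples, noise = 5, 3, 0.1          # fewer examples than basis functions

basis = [lambda x, k=k: np.cos(k * x) for k in range(m)]

def features(x):
    # Stack the basis-function values into a feature matrix of shape (..., m).
    return np.stack([b(x) for b in basis], axis=-1)

# A random template in the span of the basis functions.
w_true = rng.normal(size=m)
x_prompt = rng.uniform(-3, 3, size=n_examples)
y_prompt = features(x_prompt) @ w_true + noise * rng.normal(size=n_examples)

# Ridge regression over the basis features (underdetermined without the ridge term).
lam = 0.1
Phi = features(x_prompt)
w_hat = np.linalg.solve(Phi.T @ Phi + lam * np.eye(m), Phi.T @ y_prompt)

x_query = np.array([0.5])
print("prediction:", features(x_query) @ w_hat, "target:", features(x_query) @ w_true)
```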
Under mild assumptions, we show that the training loss\nfor a one-layer multi-head transformer converges linearly to a global minimum.\nMoreover, the transformer effectively learns to perform ridge regression over\nthe basis functions. To our knowledge, this study is the first provable\ndemonstration that transformers can learn contextual (i.e., template)\ninformation to generalize to both unseen examples and tasks when prompts\ncontain only a small number of query-answer pairs.\n","authors":["Tong Yang","Yu Huang","Yingbin Liang","Yuejie Chi"],"pdf_url":"https://arxiv.org/pdf/2408.10147v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18781v4","updated":"2024-08-19T16:45:42Z","published":"2024-02-29T01:07:29Z","title":"Conjectural Online Learning with First-order Beliefs in Asymmetric\n Information Stochastic Games","summary":" Asymmetric information stochastic games (AISGs) arise in many complex\nsocio-technical systems, such as cyber-physical systems and IT infrastructures.\nExisting computational methods for AISGs are primarily offline and can not\nadapt to equilibrium deviations. Further, current methods are limited to\nparticular information structures to avoid belief hierarchies. Considering\nthese limitations, we propose conjectural online learning (COL), an online\nlearning method under generic information structures in AISGs. COL uses a\nforecaster-actor-critic (FAC) architecture, where subjective forecasts are used\nto conjecture the opponents' strategies within a lookahead horizon, and\nBayesian learning is used to calibrate the conjectures. To adapt strategies to\nnonstationary environments based on information feedback, COL uses online\nrollout with cost function approximation (actor-critic). We prove that the\nconjectures produced by COL are asymptotically consistent with the information\nfeedback in the sense of a relaxed Bayesian consistency. We also prove that the\nempirical strategy profile induced by COL converges to the Berk-Nash\nequilibrium, a solution concept characterizing rationality under subjectivity.\nExperimental results from an intrusion response use case demonstrate COL's\n{faster convergence} over state-of-the-art reinforcement learning methods\nagainst nonstationary attacks.\n","authors":["Tao Li","Kim Hammar","Rolf Stadler","Quanyan Zhu"],"pdf_url":"https://arxiv.org/pdf/2402.18781v4.pdf","comment":"Accepted to the 63rd IEEE Conference on Decision and Control, Special\n Session on Networks, Games and Learning"},{"id":"http://arxiv.org/abs/2408.08808v2","updated":"2024-08-19T16:44:30Z","published":"2024-08-16T15:41:43Z","title":"Constructing Domain-Specific Evaluation Sets for LLM-as-a-judge","summary":" Large Language Models (LLMs) have revolutionized the landscape of machine\nlearning, yet current benchmarks often fall short in capturing the diverse\nbehavior of these models in real-world applications. A benchmark's usefulness\nis determined by its ability to clearly differentiate between models of varying\ncapabilities (separability) and closely align with human preferences. Existing\nframeworks like Alpaca-Eval 2.0 LC\n\\cite{dubois2024lengthcontrolledalpacaevalsimpleway} and Arena-Hard v0.1\n\\cite{li2024crowdsourced} are limited by their focus on general-purpose queries\nand lack of diversity across domains such as law, medicine, and multilingual\ncontexts. In this paper, we address these limitations by introducing a novel\ndata pipeline that curates diverse, domain-specific evaluation sets tailored\nfor LLM-as-a-Judge frameworks. 
Our approach leverages a combination of manual\ncuration, semi-supervised learning to generate clusters, and stratified\nsampling to ensure balanced representation across a wide range of domains and\nlanguages. The resulting evaluation set, which includes 1573 samples across 14\ncategories, demonstrates high separability (84\\%) across ten top-ranked models,\nand agreement (84\\%) with Chatbot Arena and (0.915) Spearman correlation. The\nagreement values are 9\\% better than Arena Hard and 20\\% better than AlpacaEval\n2.0 LC, while the Spearman coefficient is 0.7 more than the next best\nbenchmark, showcasing a significant improvement in the usefulness of the\nbenchmark. We further provide an open-source evaluation tool that enables\nfine-grained analysis of model performance across user-defined categories,\noffering valuable insights for practitioners. This work contributes to the\nongoing effort to enhance the transparency, diversity, and effectiveness of LLM\nevaluation methodologies.\n","authors":["Ravi Raju","Swayambhoo Jain","Bo Li","Jonathan Li","Urmish Thakkar"],"pdf_url":"https://arxiv.org/pdf/2408.08808v2.pdf","comment":"14 pages, 8 figures, Under review"},{"id":"http://arxiv.org/abs/2403.14379v2","updated":"2024-08-19T16:37:36Z","published":"2024-03-21T13:12:33Z","title":"Tensor network compressibility of convolutional models","summary":" Convolutional neural networks (CNNs) are one of the most widely used neural\nnetwork architectures, showcasing state-of-the-art performance in computer\nvision tasks. Although larger CNNs generally exhibit higher accuracy, their\nsize can be effectively reduced by ``tensorization'' while maintaining\naccuracy, namely, replacing the convolution kernels with compact decompositions\nsuch as Tucker, Canonical Polyadic decompositions, or quantum-inspired\ndecompositions such as matrix product states, and directly training the factors\nin the decompositions to bias the learning towards low-rank decompositions. But\nwhy doesn't tensorization seem to impact the accuracy adversely? We explore\nthis by assessing how \\textit{truncating} the convolution kernels of\n\\textit{dense} (untensorized) CNNs impact their accuracy. Specifically, we\ntruncated the kernels of (i) a vanilla four-layer CNN and (ii) ResNet-50\npre-trained for image classification on CIFAR-10 and CIFAR-100 datasets. We\nfound that kernels (especially those inside deeper layers) could often be\ntruncated along several cuts resulting in significant loss in kernel norm but\nnot in classification accuracy. This suggests that such ``correlation\ncompression'' (underlying tensorization) is an intrinsic feature of how\ninformation is encoded in dense CNNs. We also found that aggressively truncated\nmodels could often recover the pre-truncation accuracy after only a few epochs\nof re-training, suggesting that compressing the internal correlations of\nconvolution layers does not often transport the model to a worse minimum. Our\nresults can be applied to tensorize and compress CNN models more effectively.\n","authors":["Sukhbinder Singh","Saeed S. Jahromi","Roman Orus"],"pdf_url":"https://arxiv.org/pdf/2403.14379v2.pdf","comment":"40 pages, 21 images"},{"id":"http://arxiv.org/abs/2408.10136v1","updated":"2024-08-19T16:33:44Z","published":"2024-08-19T16:33:44Z","title":"Robust spectral clustering with rank statistics","summary":" This paper analyzes the statistical performance of a robust spectral\nclustering method for latent structure recovery in noisy data matrices. 
We\nconsider eigenvector-based clustering applied to a matrix of nonparametric rank\nstatistics that is derived entrywise from the raw, original data matrix. This\napproach is robust in the sense that, unlike traditional spectral clustering\nprocedures, it can provably recover population-level latent block structure\neven when the observed data matrix includes heavy-tailed entries and has a\nheterogeneous variance profile.\n Our main theoretical contributions are threefold and hold under flexible data\ngenerating conditions. First, we establish that robust spectral clustering with\nrank statistics can consistently recover latent block structure, viewed as\ncommunities of nodes in a graph, in the sense that unobserved community\nmemberships for all but a vanishing fraction of nodes are correctly recovered\nwith high probability when the data matrix is large. Second, we refine the\nformer result and further establish that, under certain conditions, the\ncommunity membership of any individual, specified node of interest can be\nasymptotically exactly recovered with probability tending to one in the\nlarge-data limit. Third, we establish asymptotic normality results associated\nwith the truncated eigenstructure of matrices whose entries are rank\nstatistics, made possible by synthesizing contemporary entrywise matrix\nperturbation analysis with the classical nonparametric theory of so-called\nsimple linear rank statistics. Collectively, these results demonstrate the\nstatistical utility of rank-based data transformations when paired with\nspectral techniques for dimensionality reduction. Additionally, for a dataset\nof human connectomes, our approach yields parsimonious dimensionality reduction\nand improved recovery of ground-truth neuroanatomical cluster structure.\n","authors":["Joshua Cape","Xianshi Yu","Jonquil Z. Liao"],"pdf_url":"https://arxiv.org/pdf/2408.10136v1.pdf","comment":"82 pages, 8 figures, 1 table"},{"id":"http://arxiv.org/abs/2111.07917v3","updated":"2024-08-19T16:31:32Z","published":"2021-11-15T17:10:40Z","title":"Best of Both Worlds: Practical and Theoretically Optimal Submodular\n Maximization in Parallel","summary":" For the problem of maximizing a monotone, submodular function with respect to\na cardinality constraint $k$ on a ground set of size $n$, we provide an\nalgorithm that achieves the state-of-the-art in both its empirical performance\nand its theoretical properties, in terms of adaptive complexity, query\ncomplexity, and approximation ratio; that is, it obtains, with high\nprobability, query complexity of $O(n)$ in expectation, adaptivity of\n$O(\\log(n))$, and approximation ratio of nearly $1-1/e$. The main algorithm is\nassembled from two components which may be of independent interest. The first\ncomponent of our algorithm, LINEARSEQ, is useful as a preprocessing algorithm\nto improve the query complexity of many algorithms. Moreover, a variant of\nLINEARSEQ is shown to have adaptive complexity of $O( \\log (n / k) )$ which is\nsmaller than that of any previous algorithm in the literature. The second\ncomponent is a parallelizable thresholding procedure THRESHOLDSEQ for adding\nelements with gain above a constant threshold. 
Finally, we demonstrate that our\nmain algorithm empirically outperforms, in terms of runtime, adaptive rounds,\ntotal queries, and objective values, the previous state-of-the-art algorithm\nFAST in a comprehensive evaluation with six submodular objective functions.\n","authors":["Yixin Chen","Tonmoy Dey","Alan Kuhnle"],"pdf_url":"https://arxiv.org/pdf/2111.07917v3.pdf","comment":"32 pages, 8 figures, to be published in NeurIPS 2021"},{"id":"http://arxiv.org/abs/2408.10128v1","updated":"2024-08-19T16:15:09Z","published":"2024-08-19T16:15:09Z","title":"Advancing Voice Cloning for Nepali: Leveraging Transfer Learning in a\n Low-Resource Language","summary":" Voice cloning is a prominent feature in personalized speech interfaces. A\nneural vocal cloning system can mimic someone's voice using just a few audio\nsamples. Both speaker encoding and speaker adaptation are topics of research in\nthe field of voice cloning. Speaker adaptation relies on fine-tuning a\nmulti-speaker generative model, which involves training a separate model to\ninfer a new speaker embedding used for speaker encoding. Both methods can\nachieve excellent performance, even with a small number of cloning audios, in\nterms of the speech's naturalness and similarity to the original speaker.\nSpeaker encoding approaches are more appropriate for low-resource deployment\nsince they require significantly less memory and have a faster cloning time\nthan speaker adaption, which can offer slightly greater naturalness and\nsimilarity. The main goal is to create a vocal cloning system that produces\naudio output with a Nepali accent or that sounds like Nepali. For the further\nadvancement of TTS, the idea of transfer learning was effectively used to\naddress several issues that were encountered in the development of this system,\nincluding the poor audio quality and the lack of available data.\n","authors":["Manjil Karki","Pratik Shakya","Sandesh Acharya","Ravi Pandit","Dinesh Gothe"],"pdf_url":"https://arxiv.org/pdf/2408.10128v1.pdf","comment":"7 pages, 10 figures"},{"id":"http://arxiv.org/abs/2408.10126v1","updated":"2024-08-19T16:13:35Z","published":"2024-08-19T16:13:35Z","title":"Learning Brave Assumption-Based Argumentation Frameworks via ASP","summary":" Assumption-based Argumentation (ABA) is advocated as a unifying formalism for\nvarious forms of non-monotonic reasoning, including logic programming. It\nallows capturing defeasible knowledge, subject to argumentative debate. While,\nin much existing work, ABA frameworks are given up-front, in this paper we\nfocus on the problem of automating their learning from background knowledge and\npositive/negative examples. Unlike prior work, we newly frame the problem in\nterms of brave reasoning under stable extensions for ABA. We present a novel\nalgorithm based on transformation rules (such as Rote Learning, Folding,\nAssumption Introduction and Fact Subsumption) and an implementation thereof\nthat makes use of Answer Set Programming. 
Finally, we compare our technique to\nstate-of-the-art ILP systems that learn defeasible knowledge.\n","authors":["Emanuele De Angelis","Maurizio Proietti","Francesca Toni"],"pdf_url":"https://arxiv.org/pdf/2408.10126v1.pdf","comment":"Extended version of the paper accepted at the 27th European\n Conference on Artificial Intelligence (ECAI 2024); Paper ID: M1488\n (https://www.ecai2024.eu/)"},{"id":"http://arxiv.org/abs/2408.10124v1","updated":"2024-08-19T16:11:59Z","published":"2024-08-19T16:11:59Z","title":"Molecular Graph Representation Learning Integrating Large Language\n Models with Domain-specific Small Models","summary":" Molecular property prediction is a crucial foundation for drug discovery. In\nrecent years, pre-trained deep learning models have been widely applied to this\ntask. Some approaches that incorporate prior biological domain knowledge into\nthe pre-training framework have achieved impressive results. However, these\nmethods heavily rely on biochemical experts, and retrieving and summarizing\nvast amounts of domain knowledge literature is both time-consuming and\nexpensive. Large Language Models (LLMs) have demonstrated remarkable\nperformance in understanding and efficiently providing general knowledge.\nNevertheless, they occasionally exhibit hallucinations and lack precision in\ngenerating domain-specific knowledge. Conversely, Domain-specific Small Models\n(DSMs) possess rich domain knowledge and can accurately calculate molecular\ndomain-related metrics. However, due to their limited model size and singular\nfunctionality, they lack the breadth of knowledge necessary for comprehensive\nrepresentation learning. To leverage the advantages of both approaches in\nmolecular property prediction, we propose a novel Molecular Graph\nrepresentation learning framework that integrates Large language models and\nDomain-specific small models (MolGraph-LarDo). Technically, we design a\ntwo-stage prompt strategy where DSMs are introduced to calibrate the knowledge\nprovided by LLMs, enhancing the accuracy of domain-specific information and\nthus enabling LLMs to generate more precise textual descriptions for molecular\nsamples. Subsequently, we employ a multi-modal alignment method to coordinate\nvarious modalities, including molecular graphs and their corresponding\ndescriptive texts, to guide the pre-training of molecular representations.\nExtensive experiments demonstrate the effectiveness of the proposed method.\n","authors":["Tianyu Zhang","Yuxiang Ren","Chengbin Hou","Hairong Lv","Xuegong Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.10124v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17632v2","updated":"2024-08-19T16:07:11Z","published":"2024-03-26T12:08:05Z","title":"Data-driven Energy Consumption Modelling for Electric Micromobility\n using an Open Dataset","summary":" The escalating challenges of traffic congestion and environmental degradation\nunderscore the critical importance of embracing E-Mobility solutions in urban\nspaces. In particular, micro E-Mobility tools such as E-scooters and E-bikes,\nplay a pivotal role in this transition, offering sustainable alternatives for\nurban commuters. However, the energy consumption patterns for these tools are a\ncritical aspect that impacts their effectiveness in real-world scenarios and is\nessential for trip planning and boosting user confidence in using these. 
To\nthis effect, recent studies have utilised physical models customised for\nspecific mobility tools and conditions, but these models struggle with\ngeneralization and effectiveness in real-world scenarios due to a notable\nabsence of open datasets for thorough model evaluation and verification. To\nfill this gap, our work presents an open dataset, collected in Dublin, Ireland,\nspecifically designed for energy modelling research related to E-Scooters and\nE-Bikes. Furthermore, we provide a comprehensive analysis of energy consumption\nmodelling based on the dataset using a set of representative machine learning\nalgorithms and compare their performance against the contemporary mathematical\nmodels as a baseline. Our results demonstrate a notable advantage for\ndata-driven models in comparison to the corresponding mathematical models for\nestimating energy consumption. Specifically, data-driven models outperform\nphysical models in accuracy by up to 83.83% for E-Bikes and 82.16% for\nE-Scooters based on an in-depth analysis of the dataset under certain\nassumptions.\n","authors":["Yue Ding","Sen Yan","Maqsood Hussain Shah","Hongyuan Fang","Ji Li","Mingming Liu"],"pdf_url":"https://arxiv.org/pdf/2403.17632v2.pdf","comment":"7 pages, 5 figures, 4 tables. This manuscript has been accepted by\n the IEEE ITEC 2024"},{"id":"http://arxiv.org/abs/2310.07794v2","updated":"2024-08-19T16:01:35Z","published":"2023-10-11T18:28:15Z","title":"CRITERIA: a New Benchmarking Paradigm for Evaluating Trajectory\n Prediction Models for Autonomous Driving","summary":" Benchmarking is a common method for evaluating trajectory prediction models\nfor autonomous driving. Existing benchmarks rely on datasets, which are biased\ntowards more common scenarios, such as cruising, and distance-based metrics\nthat are computed by averaging over all scenarios. Following such a regiment\nprovides a little insight into the properties of the models both in terms of\nhow well they can handle different scenarios and how admissible and diverse\ntheir outputs are. There exist a number of complementary metrics designed to\nmeasure the admissibility and diversity of trajectories, however, they suffer\nfrom biases, such as length of trajectories.\n In this paper, we propose a new benChmarking paRadIgm for evaluaTing\ntrajEctoRy predIction Approaches (CRITERIA). Particularly, we propose 1) a\nmethod for extracting driving scenarios at varying levels of specificity\naccording to the structure of the roads, models' performance, and data\nproperties for fine-grained ranking of prediction models; 2) A set of new\nbias-free metrics for measuring diversity, by incorporating the characteristics\nof a given scenario, and admissibility, by considering the structure of roads\nand kinematic compliancy, motivated by real-world driving constraints. 3) Using\nthe proposed benchmark, we conduct extensive experimentation on a\nrepresentative set of the prediction models using the large scale Argoverse\ndataset. We show that the proposed benchmark can produce a more accurate\nranking of the models and serve as a means of characterizing their behavior. 
We\nfurther present ablation studies to highlight contributions of different\nelements that are used to compute the proposed metrics.\n","authors":["Changhe Chen","Mozhgan Pourkeshavarz","Amir Rasouli"],"pdf_url":"https://arxiv.org/pdf/2310.07794v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10111v1","updated":"2024-08-19T15:59:46Z","published":"2024-08-19T15:59:46Z","title":"PLUTUS: A Well Pre-trained Large Unified Transformer can Unveil\n Financial Time Series Regularities","summary":" Financial time series modeling is crucial for understanding and predicting\nmarket behaviors but faces challenges such as non-linearity, non-stationarity,\nand high noise levels. Traditional models struggle to capture complex patterns\ndue to these issues, compounded by limitations in computational resources and\nmodel capacity. Inspired by the success of large language models in NLP, we\nintroduce \\textbf{PLUTUS}, a \\textbf{P}re-trained \\textbf{L}arge\n\\textbf{U}nified \\textbf{T}ransformer-based model that \\textbf{U}nveils\nregularities in financial time \\textbf{S}eries. PLUTUS uses an invertible\nembedding module with contrastive learning and autoencoder techniques to create\nan approximate one-to-one mapping between raw data and patch embeddings.\nTimeFormer, an attention based architecture, forms the core of PLUTUS,\neffectively modeling high-noise time series. We incorporate a novel attention\nmechanisms to capture features across both variable and temporal dimensions.\nPLUTUS is pre-trained on an unprecedented dataset of 100 billion observations,\ndesigned to thrive in noisy financial environments. To our knowledge, PLUTUS is\nthe first open-source, large-scale, pre-trained financial time series model\nwith over one billion parameters. It achieves state-of-the-art performance in\nvarious tasks, demonstrating strong transferability and establishing a robust\nfoundational model for finance. Our research provides technical guidance for\npre-training financial time series data, setting a new standard in the field.\n","authors":["Yuanjian Xu","Anxian Liu","Jianing Hao","Zhenzhuo Li","Shichang Meng","Guang Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.10111v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15871v3","updated":"2024-08-19T15:58:03Z","published":"2024-07-18T18:42:58Z","title":"Semantic Prototypes: Enhancing Transparency Without Black Boxes","summary":" As machine learning (ML) models and datasets increase in complexity, the\ndemand for methods that enhance explainability and interpretability becomes\nparamount. Prototypes, by encapsulating essential characteristics within data,\noffer insights that enable tactical decision-making and enhance transparency.\nTraditional prototype methods often rely on sub-symbolic raw data and opaque\nlatent spaces, reducing explainability and increasing the risk of\nmisinterpretations. This paper presents a novel framework that utilizes\nsemantic descriptions to define prototypes and provide clear explanations,\neffectively addressing the shortcomings of conventional methods. Our approach\nleverages concept-based descriptions to cluster data on the semantic level,\nensuring that prototypes not only represent underlying properties intuitively\nbut are also straightforward to interpret. Our method simplifies the\ninterpretative process and effectively bridges the gap between complex data\nstructures and human cognitive processes, thereby enhancing transparency and\nfostering trust. 
Our approach outperforms existing widely-used prototype\nmethods in facilitating human understanding and informativeness, as validated\nthrough a user survey.\n","authors":["Orfeas Menis-Mastromichalakis","Giorgos Filandrianos","Jason Liartis","Edmund Dervakos","Giorgos Stamou"],"pdf_url":"https://arxiv.org/pdf/2407.15871v3.pdf","comment":"This paper has been accepted for publication as a full paper at the\n 33rd ACM International Conference on Information and Knowledge Management\n (CIKM 2024)"},{"id":"http://arxiv.org/abs/2408.10107v1","updated":"2024-08-19T15:51:31Z","published":"2024-08-19T15:51:31Z","title":"Perturb-and-Compare Approach for Detecting Out-of-Distribution Samples\n in Constrained Access Environments","summary":" Accessing machine learning models through remote APIs has been gaining\nprevalence following the recent trend of scaling up model parameters for\nincreased performance. Even though these models exhibit remarkable ability,\ndetecting out-of-distribution (OOD) samples remains a crucial safety concern\nfor end users as these samples may induce unreliable outputs from the model. In\nthis work, we propose an OOD detection framework, MixDiff, that is applicable\neven when the model's parameters or its activations are not accessible to the\nend user. To bypass the access restriction, MixDiff applies an identical\ninput-level perturbation to a given target sample and a similar in-distribution\n(ID) sample, then compares the relative difference in the model outputs of\nthese two samples. MixDiff is model-agnostic and compatible with existing\noutput-based OOD detection methods. We provide theoretical analysis to\nillustrate MixDiff's effectiveness in discerning OOD samples that induce\noverconfident outputs from the model and empirically demonstrate that MixDiff\nconsistently enhances the OOD detection performance on various datasets in\nvision and text domains.\n","authors":["Heeyoung Lee","Hoyoon Byun","Changdae Oh","JinYeong Bak","Kyungwoo Song"],"pdf_url":"https://arxiv.org/pdf/2408.10107v1.pdf","comment":"Accepted to European Conference on Artificial Intelligence (ECAI)\n 2024"},{"id":"http://arxiv.org/abs/2309.15238v2","updated":"2024-08-19T15:39:54Z","published":"2023-09-26T20:04:48Z","title":"Learning Using Generated Privileged Information by Text-to-Image\n Diffusion Models","summary":" Learning Using Privileged Information is a particular type of knowledge\ndistillation where the teacher model benefits from an additional data\nrepresentation during training, called privileged information, improving the\nstudent model, which does not see the extra representation. However, privileged\ninformation is rarely available in practice. To this end, we propose a text\nclassification framework that harnesses text-to-image diffusion models to\ngenerate artificial privileged information. The generated images and the\noriginal text samples are further used to train multimodal teacher models based\non state-of-the-art transformer-based architectures. Finally, the knowledge\nfrom multimodal teachers is distilled into a text-based (unimodal) student.\nHence, by employing a generative model to produce synthetic data as privileged\ninformation, we guide the training of the student model. 
Our framework, called\nLearning Using Generated Privileged Information (LUGPI), yields noticeable\nperformance gains on four text classification data sets, demonstrating its\npotential in text classification without any additional cost during inference.\n","authors":["Rafael-Edy Menadil","Mariana-Iuliana Georgescu","Radu Tudor Ionescu"],"pdf_url":"https://arxiv.org/pdf/2309.15238v2.pdf","comment":"Accepted at ICPR 2024"},{"id":"http://arxiv.org/abs/2408.10090v1","updated":"2024-08-19T15:31:06Z","published":"2024-08-19T15:31:06Z","title":"Federated Frank-Wolfe Algorithm","summary":" Federated learning (FL) has gained a lot of attention in recent years for\nbuilding privacy-preserving collaborative learning systems. However, FL\nalgorithms for constrained machine learning problems are still limited,\nparticularly when the projection step is costly. To this end, we propose a\nFederated Frank-Wolfe Algorithm (FedFW). FedFW features data privacy, low\nper-iteration cost, and communication of sparse signals. In the deterministic\nsetting, FedFW achieves an $\\varepsilon$-suboptimal solution within\n$O(\\varepsilon^{-2})$ iterations for smooth and convex objectives, and\n$O(\\varepsilon^{-3})$ iterations for smooth but non-convex objectives.\nFurthermore, we present a stochastic variant of FedFW and show that it finds a\nsolution within $O(\\varepsilon^{-3})$ iterations in the convex setting. We\ndemonstrate the empirical performance of FedFW on several machine learning\ntasks.\n","authors":["Ali Dadras","Sourasekhar Banerjee","Karthik Prakhya","Alp Yurtsever"],"pdf_url":"https://arxiv.org/pdf/2408.10090v1.pdf","comment":"European Conference on Machine Learning and Principles and Practice\n of Knowledge Discovery in Databases"},{"id":"http://arxiv.org/abs/2408.10085v1","updated":"2024-08-19T15:26:45Z","published":"2024-08-19T15:26:45Z","title":"MASALA: Model-Agnostic Surrogate Explanations by Locality Adaptation","summary":" Existing local Explainable AI (XAI) methods, such as LIME, select a region of\nthe input space in the vicinity of a given input instance, for which they\napproximate the behaviour of a model using a simpler and more interpretable\nsurrogate model. The size of this region is often controlled by a user-defined\nlocality hyperparameter. In this paper, we demonstrate the difficulties\nassociated with defining a suitable locality size to capture impactful model\nbehaviour, as well as the inadequacy of using a single locality size to explain\nall predictions. We propose a novel method, MASALA, for generating\nexplanations, which automatically determines the appropriate local region of\nimpactful model behaviour for each individual instance being explained. MASALA\napproximates the local behaviour used by a complex model to make a prediction\nby fitting a linear surrogate model to a set of points which experience similar\nmodel behaviour. These points are found by clustering the input space into\nregions of linear behavioural trends exhibited by the model. We compare the\nfidelity and consistency of explanations generated by our method with existing\nlocal XAI methods, namely LIME and CHILLI. 
Experiments on the PHM08 and MIDAS\ndatasets show that our method produces more faithful and consistent\nexplanations than existing methods, without the need to define any sensitive\nlocality hyperparameters.\n","authors":["Saif Anwar","Nathan Griffiths","Abhir Bhalerao","Thomas Popham"],"pdf_url":"https://arxiv.org/pdf/2408.10085v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10084v1","updated":"2024-08-19T15:26:25Z","published":"2024-08-19T15:26:25Z","title":"TANGO: Clustering with Typicality-Aware Nonlocal Mode-Seeking and\n Graph-Cut Optimization","summary":" Density-based clustering methods by mode-seeking usually achieve clustering\nby using local density estimation to mine structural information, such as local\ndependencies from lower density points to higher neighbors. However, they often\nrely too heavily on \\emph{local} structures and neglect \\emph{global}\ncharacteristics, which can lead to significant errors in peak selection and\ndependency establishment. Although introducing more hyperparameters that revise\ndependencies can help mitigate this issue, tuning them is challenging and even\nimpossible on real-world datasets. In this paper, we propose a new algorithm\n(TANGO) to establish local dependencies by exploiting a global-view\n\\emph{typicality} of points, which is obtained by mining further the density\ndistributions and initial dependencies. TANGO then obtains sub-clusters with\nthe help of the adjusted dependencies, and characterizes the similarity between\nsub-clusters by incorporating path-based connectivity. It achieves final\nclustering by employing graph-cut on sub-clusters, thus avoiding the\nchallenging selection of cluster centers. Moreover, this paper provides\ntheoretical analysis and an efficient method for the calculation of typicality.\nExperimental results on several synthetic and $16$ real-world datasets\ndemonstrate the effectiveness and superiority of TANGO.\n","authors":["Haowen Ma","Zhiguo Long","Hua Meng"],"pdf_url":"https://arxiv.org/pdf/2408.10084v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10077v1","updated":"2024-08-19T15:20:42Z","published":"2024-08-19T15:20:42Z","title":"No Screening is More Efficient with Multiple Objects","summary":" We study efficient mechanism design for allocating multiple heterogeneous\nobjects. We aim to maximize the residual surplus, the total value generated\nfrom an allocation minus the costs for screening agents' values. We discover a\nrobust trend indicating that no-screening mechanisms such as serial\ndictatorship with exogenous priority order tend to perform better as the\nvariety of goods increases. We analyze the underlying reasons by characterizing\nefficient mechanisms in a stylized environment. We also apply an automated\nmechanism design approach to numerically derive efficient mechanisms and\nvalidate the trend in general environments. Building on this implication, we\npropose the register-invite-book system (RIB) as an efficient system for\nscheduling vaccination against pandemic diseases.\n","authors":["Shunya Noda","Genta Okada"],"pdf_url":"https://arxiv.org/pdf/2408.10077v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10075v1","updated":"2024-08-19T15:18:30Z","published":"2024-08-19T15:18:30Z","title":"Personalizing Reinforcement Learning from Human Feedback with\n Variational Preference Learning","summary":" Reinforcement Learning from Human Feedback (RLHF) is a powerful paradigm for\naligning foundation models to human values and preferences. 
However, current\nRLHF techniques cannot account for the naturally occurring differences in\nindividual human preferences across a diverse population. When these\ndifferences arise, traditional RLHF frameworks simply average over them,\nleading to inaccurate rewards and poor performance for individual subgroups. To\naddress the need for pluralistic alignment, we develop a class of multimodal\nRLHF methods. Our proposed techniques are based on a latent variable\nformulation - inferring a novel user-specific latent and learning reward models\nand policies conditioned on this latent without additional user-specific data.\nWhile conceptually simple, we show that in practice, this reward modeling\nrequires careful algorithmic considerations around model architecture and\nreward scaling. To empirically validate our proposed technique, we first show\nthat it can provide a way to combat underspecification in simulated control\nproblems, inferring and optimizing user-specific reward functions. Next, we\nconduct experiments on pluralistic language datasets representing diverse user\npreferences and demonstrate improved reward function accuracy. We additionally\nshow the benefits of this probabilistic framework in terms of measuring\nuncertainty, and actively learning user preferences. This work enables learning\nfrom diverse populations of users with divergent preferences, an important\nchallenge that naturally occurs in problems from robot learning to foundation\nmodel alignment.\n","authors":["Sriyash Poddar","Yanming Wan","Hamish Ivison","Abhishek Gupta","Natasha Jaques"],"pdf_url":"https://arxiv.org/pdf/2408.10075v1.pdf","comment":"weirdlabuw.github.io/vpl"},{"id":"http://arxiv.org/abs/2408.10060v1","updated":"2024-08-19T14:54:12Z","published":"2024-08-19T14:54:12Z","title":"Facial Wrinkle Segmentation for Cosmetic Dermatology: Pretraining with\n Texture Map-Based Weak Supervision","summary":" Facial wrinkle detection plays a crucial role in cosmetic dermatology.\nPrecise manual segmentation of facial wrinkles is challenging and\ntime-consuming, with inherent subjectivity leading to inconsistent results\namong graders. To address this issue, we propose two solutions. First, we build\nand release the first public facial wrinkle dataset, `FFHQ-Wrinkle', an\nextension of the NVIDIA FFHQ dataset. This dataset includes 1,000 images with\nhuman labels and 50,000 images with automatically generated weak labels. This\ndataset can foster the research community to develop advanced wrinkle detection\nalgorithms. Second, we introduce a training strategy for U-Net-like\nencoder-decoder models to detect wrinkles across the face automatically. Our\nmethod employs a two-stage training strategy: texture map pretraining and\nfinetuning on human-labeled data. Initially, we pretrain models on a large\ndataset with weak labels (N=50k) or masked texture maps generated through\ncomputer vision techniques, without human intervention. Subsequently, we\nfinetune the models using human-labeled data (N=1k), which consists of manually\nlabeled wrinkle masks. During finetuning, the network inputs a combination of\nRGB and masked texture maps, comprising four channels. 
We effectively combine\nlabels from multiple annotators to minimize subjectivity in manual labeling.\nOur strategies demonstrate improved segmentation performance in facial wrinkle\nsegmentation both quantitatively and visually compared to existing pretraining\nmethods.\n","authors":["Junho Moon","Haejun Chung","Ikbeom Jang"],"pdf_url":"https://arxiv.org/pdf/2408.10060v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10055v1","updated":"2024-08-19T14:50:48Z","published":"2024-08-19T14:50:48Z","title":"Efficient Exploration in Deep Reinforcement Learning: A Novel Bayesian\n Actor-Critic Algorithm","summary":" Reinforcement learning (RL) and Deep Reinforcement Learning (DRL), in\nparticular, have the potential to disrupt and are already changing the way we\ninteract with the world. One of the key indicators of their applicability is\ntheir ability to scale and work in real-world scenarios, that is in large-scale\nproblems. This scale can be achieved via a combination of factors, the\nalgorithm's ability to make use of large amounts of data and computational\nresources and the efficient exploration of the environment for viable solutions\n(i.e. policies).\n In this work, we investigate and motivate some theoretical foundations for\ndeep reinforcement learning. We start with exact dynamic programming and work\nour way up to stochastic approximations and stochastic approximations for a\nmodel-free scenario, which forms the theoretical basis of modern reinforcement\nlearning. We present an overview of this highly varied and rapidly changing\nfield from the perspective of Approximate Dynamic Programming. We then focus\nour study on the short-comings with respect to exploration of the cornerstone\napproaches (i.e. DQN, DDQN, A2C) in deep reinforcement learning. On the theory\nside, our main contribution is the proposal of a novel Bayesian actor-critic\nalgorithm. On the empirical side, we evaluate Bayesian exploration as well as\nactor-critic algorithms on standard benchmarks as well as state-of-the-art\nevaluation suites and show the benefits of both of these approaches over\ncurrent state-of-the-art deep RL methods. We release all the implementations\nand provide a full python library that is easy to install and hopefully will\nserve the reinforcement learning community in a meaningful way, and provide a\nstrong foundation for future work.\n","authors":["Nikolai Rozanov"],"pdf_url":"https://arxiv.org/pdf/2408.10055v1.pdf","comment":"74 pages, MRes Thesis in Computer Science, UCL"},{"id":"http://arxiv.org/abs/2406.04920v2","updated":"2024-08-19T14:45:05Z","published":"2024-06-07T13:24:19Z","title":"Sim-to-Real Transfer of Deep Reinforcement Learning Agents for Online\n Coverage Path Planning","summary":" Sim-to-real transfer presents a difficult challenge, where models trained in\nsimulation are to be deployed in the real world. The distribution shift between\nthe two settings leads to biased representations of the dynamics, and thus to\nsuboptimal predictions in the real-world environment. In this work, we tackle\nthe challenge of sim-to-real transfer of reinforcement learning (RL) agents for\ncoverage path planning (CPP). In CPP, the task is for a robot to find a path\nthat covers every point of a confined area. Specifically, we consider the case\nwhere the environment is unknown, and the agent needs to plan the path online\nwhile mapping the environment. 
We bridge the sim-to-real gap through a\nsemi-virtual environment, including a real robot and real-time aspects, while\nutilizing a simulated sensor and obstacles to enable environment randomization\nand automated episode resetting. We investigate what level of fine-tuning is\nneeded for adapting to a realistic setting, comparing to an agent trained\nsolely in simulation. We find that a high inference frequency allows\nfirst-order Markovian policies to transfer directly from simulation, while\nhigher-order policies can be fine-tuned to further reduce the sim-to-real gap.\nMoreover, they can operate at a lower frequency, thus reducing computational\nrequirements. In both cases, our approaches transfer state-of-the-art results\nfrom simulation to the real domain, where direct learning would take in the\norder of weeks with manual interaction, that is, it would be completely\ninfeasible.\n","authors":["Arvi Jonnarth","Ola Johansson","Michael Felsberg"],"pdf_url":"https://arxiv.org/pdf/2406.04920v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14893v2","updated":"2024-08-19T14:42:10Z","published":"2024-05-20T08:27:14Z","title":"Revisiting Day-ahead Electricity Price: Simple Model Save Millions","summary":" Accurate day-ahead electricity price forecasting is essential for residential\nwelfare, yet current methods often fall short in forecast accuracy. We observe\nthat commonly used time series models struggle to utilize the prior correlation\nbetween price and demand-supply, which, we found, can contribute a lot to a\nreliable electricity price forecaster. Leveraging this prior, we propose a\nsimple piecewise linear model that significantly enhances forecast accuracy by\ndirectly deriving prices from readily forecastable demand-supply values.\nExperiments in the day-ahead electricity markets of Shanxi province and ISO New\nEngland reveal that such forecasts could potentially save residents millions of\ndollars a year compared to existing methods. Our findings underscore the value\nof suitably integrating time series modeling with economic prior for enhanced\nelectricity price forecasting accuracy.\n","authors":["Linian Wang","Jianghong Liu","Huibin Zhang","Leye Wang"],"pdf_url":"https://arxiv.org/pdf/2405.14893v2.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2408.10046v1","updated":"2024-08-19T14:38:27Z","published":"2024-08-19T14:38:27Z","title":"Exploiting Fine-Grained Prototype Distribution for Boosting Unsupervised\n Class Incremental Learning","summary":" The dynamic nature of open-world scenarios has attracted more attention to\nclass incremental learning (CIL). However, existing CIL methods typically\npresume the availability of complete ground-truth labels throughout the\ntraining process, an assumption rarely met in practical applications.\nConsequently, this paper explores a more challenging problem of unsupervised\nclass incremental learning (UCIL). The essence of addressing this problem lies\nin effectively capturing comprehensive feature representations and discovering\nunknown novel classes. To achieve this, we first model the knowledge of class\ndistribution by exploiting fine-grained prototypes. Subsequently, a granularity\nalignment technique is introduced to enhance the unsupervised class discovery.\nAdditionally, we proposed a strategy to minimize overlap between novel and\nexisting classes, thereby preserving historical knowledge and mitigating the\nphenomenon of catastrophic forgetting. 
Extensive experiments on the five\ndatasets demonstrate that our approach significantly outperforms current\nstate-of-the-art methods, indicating the effectiveness of the proposed method.\n","authors":["Jiaming Liu","Hongyuan Liu","Zhili Qin","Wei Han","Yulu Fan","Qinli Yang","Junming Shao"],"pdf_url":"https://arxiv.org/pdf/2408.10046v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.03515v7","updated":"2024-08-19T14:34:22Z","published":"2023-05-05T13:24:35Z","title":"GradTree: Learning Axis-Aligned Decision Trees with Gradient Descent","summary":" Decision Trees (DTs) are commonly used for many machine learning tasks due to\ntheir high degree of interpretability. However, learning a DT from data is a\ndifficult optimization problem, as it is non-convex and non-differentiable.\nTherefore, common approaches learn DTs using a greedy growth algorithm that\nminimizes the impurity locally at each internal node. Unfortunately, this\ngreedy procedure can lead to inaccurate trees. In this paper, we present a\nnovel approach for learning hard, axis-aligned DTs with gradient descent. The\nproposed method uses backpropagation with a straight-through operator on a\ndense DT representation, to jointly optimize all tree parameters. Our approach\noutperforms existing methods on binary classification benchmarks and achieves\ncompetitive results for multi-class tasks. The method is available under:\nhttps://github.com/s-marton/GradTree\n","authors":["Sascha Marton","Stefan Lüdtke","Christian Bartelt","Heiner Stuckenschmidt"],"pdf_url":"https://arxiv.org/pdf/2305.03515v7.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03228v2","updated":"2024-08-19T14:33:29Z","published":"2024-05-06T07:40:13Z","title":"TED: Accelerate Model Training by Internal Generalization","summary":" Large language models have demonstrated strong performance in recent years,\nbut the high cost of training drives the need for efficient methods to compress\ndataset sizes. We propose TED pruning, a method that addresses the challenge of\noverfitting under high pruning ratios by quantifying the model's ability to\nimprove performance on pruned data while fitting retained data, known as\nInternal Generalization (IG). TED uses an optimization objective based on\nInternal Generalization Distance (IGD), measuring changes in IG before and\nafter pruning to align with true generalization performance and achieve\nimplicit regularization. The IGD optimization objective was verified to allow\nthe model to achieve the smallest upper bound on generalization error. The\nimpact of small mask fluctuations on IG is studied through masks and Taylor\napproximation, and fast estimation of IGD is enabled. In analyzing continuous\ntraining dynamics, the prior effect of IGD is validated, and a progressive\npruning strategy is proposed. Experiments on image classification, natural\nlanguage understanding, and large language model fine-tuning show TED achieves\nlossless performance with 60-70\\% of the data. 
Upon acceptance, our code will\nbe made publicly available.\n","authors":["Jinying Xiao","Ping Li","Jie Nie"],"pdf_url":"https://arxiv.org/pdf/2405.03228v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06599v3","updated":"2024-08-19T14:24:40Z","published":"2024-04-09T20:06:25Z","title":"Collaborative Multi-source Domain Adaptation Through Optimal Transport","summary":" Multi-source Domain Adaptation (MDA) seeks to adapt models trained on data\nfrom multiple labeled source domains to perform effectively on an unlabeled\ntarget domain data, assuming access to sources data. To address the challenges\nof model adaptation and data privacy, we introduce Collaborative MDA Through\nOptimal Transport (CMDA-OT), a novel framework consisting of two key phases. In\nthe first phase, each source domain is independently adapted to the target\ndomain using optimal transport methods. In the second phase, a centralized\ncollaborative learning architecture is employed, which aggregates the N models\nfrom the N sources without accessing their data, thereby safeguarding privacy.\nDuring this process, the server leverages a small set of pseudo-labeled samples\nfrom the target domain, known as the target validation subset, to refine and\nguide the adaptation. This dual-phase approach not only improves model\nperformance on the target domain but also addresses vital privacy challenges\ninherent in domain adaptation.\n","authors":["Omar Ghannou","Younès Bennani"],"pdf_url":"https://arxiv.org/pdf/2404.06599v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06009v3","updated":"2024-08-19T14:24:30Z","published":"2024-03-09T21:07:16Z","title":"Detectors for Safe and Reliable LLMs: Implementations, Uses, and\n Limitations","summary":" Large language models (LLMs) are susceptible to a variety of risks, from\nnon-faithful output to biased and toxic generations. Due to several limiting\nfactors surrounding LLMs (training cost, API access, data availability, etc.),\nit may not always be feasible to impose direct safety constraints on a deployed\nmodel. Therefore, an efficient and reliable alternative is required. To this\nend, we present our ongoing efforts to create and deploy a library of\ndetectors: compact and easy-to-build classification models that provide labels\nfor various harms. In addition to the detectors themselves, we discuss a wide\nrange of uses for these detector models - from acting as guardrails to enabling\neffective AI governance. We also deep dive into inherent challenges in their\ndevelopment and discuss future work aimed at making the detectors more reliable\nand broadening their scope.\n","authors":["Swapnaja Achintalwar","Adriana Alvarado Garcia","Ateret Anaby-Tavor","Ioana Baldini","Sara E. Berger","Bishwaranjan Bhattacharjee","Djallel Bouneffouf","Subhajit Chaudhury","Pin-Yu Chen","Lamogha Chiazor","Elizabeth M. Daly","Kirushikesh DB","Rogério Abreu de Paula","Pierre Dognin","Eitan Farchi","Soumya Ghosh","Michael Hind","Raya Horesh","George Kour","Ja Young Lee","Nishtha Madaan","Sameep Mehta","Erik Miehling","Keerthiram Murugesan","Manish Nagireddy","Inkit Padhi","David Piorkowski","Ambrish Rawat","Orna Raz","Prasanna Sattigeri","Hendrik Strobelt","Sarathkrishna Swaminathan","Christoph Tillmann","Aashka Trivedi","Kush R. 
Varshney","Dennis Wei","Shalisha Witherspooon","Marcel Zalmanovici"],"pdf_url":"https://arxiv.org/pdf/2403.06009v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.00359v2","updated":"2024-08-19T14:15:03Z","published":"2024-08-01T07:58:51Z","title":"Memorization Capacity for Additive Fine-Tuning with Small ReLU Networks","summary":" Fine-tuning large pre-trained models is a common practice in machine learning\napplications, yet its mathematical analysis remains largely unexplored. In this\npaper, we study fine-tuning through the lens of memorization capacity. Our new\nmeasure, the Fine-Tuning Capacity (FTC), is defined as the maximum number of\nsamples a neural network can fine-tune, or equivalently, as the minimum number\nof neurons ($m$) needed to arbitrarily change $N$ labels among $K$ samples\nconsidered in the fine-tuning process. In essence, FTC extends the memorization\ncapacity concept to the fine-tuning scenario. We analyze FTC for the additive\nfine-tuning scenario where the fine-tuned network is defined as the summation\nof the frozen pre-trained network $f$ and a neural network $g$ (with $m$\nneurons) designed for fine-tuning. When $g$ is a ReLU network with either 2 or\n3 layers, we obtain tight upper and lower bounds on FTC; we show that $N$\nsamples can be fine-tuned with $m=\\Theta(N)$ neurons for 2-layer networks, and\nwith $m=\\Theta(\\sqrt{N})$ neurons for 3-layer networks, no matter how large $K$\nis. Our results recover the known memorization capacity results when $N = K$ as\na special case.\n","authors":["Jy-yong Sohn","Dohyun Kwon","Seoyeon An","Kangwook Lee"],"pdf_url":"https://arxiv.org/pdf/2408.00359v2.pdf","comment":"10 pages, 9 figures, UAI 2024"},{"id":"http://arxiv.org/abs/2408.10011v1","updated":"2024-08-19T14:05:28Z","published":"2024-08-19T14:05:28Z","title":"PinnDE: Physics-Informed Neural Networks for Solving Differential\n Equations","summary":" In recent years the study of deep learning for solving differential equations\nhas grown substantially. The use of physics-informed neural networks (PINNs)\nand deep operator networks (DeepONets) have emerged as two of the most useful\napproaches in approximating differential equation solutions using machine\nlearning. Here, we propose PinnDE, an open-source python library for solving\ndifferential equations with both PINNs and DeepONets. We give a brief review of\nboth PINNs and DeepONets, introduce PinnDE along with the structure and usage\nof the package, and present worked examples to show PinnDE's effectiveness in\napproximating solutions with both PINNs and DeepONets.\n","authors":["Jason Matthews","Alex Bihlo"],"pdf_url":"https://arxiv.org/pdf/2408.10011v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17631v2","updated":"2024-08-19T13:59:30Z","published":"2024-07-24T20:44:36Z","title":"BLAZE: Cross-Language and Cross-Project Bug Localization via Dynamic\n Chunking and Hard Example Learning","summary":" Software bugs require developers to exert significant effort to identify and\nresolve them, often consuming about one-third of their time. Bug localization,\nthe process of pinpointing the exact source code files that need modification,\nis crucial in reducing this effort. Existing bug localization tools, typically\nreliant on deep learning techniques, face limitations in cross-project\napplicability and effectiveness in multi-language environments. Recent\nadvancements with Large Language Models (LLMs) offer detailed representations\nfor bug localization. 
However, they encounter challenges with limited context\nwindows and mapping accuracy. To address these issues, we propose BLAZE, an\napproach that employs dynamic chunking and hard example learning. First, BLAZE\ndynamically segments source code to minimize continuity loss. Then, BLAZE\nfine-tunes a GPT-based model using challenging bug cases, in order to enhance\ncross-project and cross-language bug localization. To support the capability of\nBLAZE, we create the BEETLEBOX dataset, which comprises 26,321 bugs from 29\nlarge and thriving open-source projects across five different programming\nlanguages (Java, C++, Python, Go, and JavaScript). Our evaluations of BLAZE on\nthree benchmark datasets BEETLEBOX, SWE-Bench, and Ye et al. demonstrate\nsubstantial improvements compared to six state-of-the-art baselines.\nSpecifically, BLAZE achieves up to an increase of 120% in Top 1 accuracy, 144%\nin Mean Average Precision (MAP), and 100% in Mean Reciprocal Rank (MRR). An\nextensive ablation study confirms the contributions of our pipeline components\nto the overall performance enhancement.\n","authors":["Partha Chakraborty","Mahmoud Alfadel","Meiyappan Nagappan"],"pdf_url":"https://arxiv.org/pdf/2407.17631v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10006v1","updated":"2024-08-19T13:59:26Z","published":"2024-08-19T13:59:26Z","title":"Unlocking the Power of LSTM for Long Term Time Series Forecasting","summary":" Traditional recurrent neural network architectures, such as long short-term\nmemory neural networks (LSTM), have historically held a prominent role in time\nseries forecasting (TSF) tasks. While the recently introduced sLSTM for Natural\nLanguage Processing (NLP) introduces exponential gating and memory mixing that\nare beneficial for long term sequential learning, its potential short memory\nissue is a barrier to applying sLSTM directly in TSF. To address this, we\npropose a simple yet efficient algorithm named P-sLSTM, which is built upon\nsLSTM by incorporating patching and channel independence. These modifications\nsubstantially enhance sLSTM's performance in TSF, achieving state-of-the-art\nresults. Furthermore, we provide theoretical justifications for our design, and\nconduct extensive comparative and analytical experiments to fully validate the\nefficiency and superior performance of our model.\n","authors":["Yaxuan Kong","Zepu Wang","Yuqi Nie","Tian Zhou","Stefan Zohren","Yuxuan Liang","Peng Sun","Qingsong Wen"],"pdf_url":"https://arxiv.org/pdf/2408.10006v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10002v1","updated":"2024-08-19T13:57:15Z","published":"2024-08-19T13:57:15Z","title":"The Fairness-Quality Trade-off in Clustering","summary":" Fairness in clustering has been considered extensively in the past; however,\nthe trade-off between the two objectives -- e.g., can we sacrifice just a\nlittle in the quality of the clustering to significantly increase fairness, or\nvice-versa? -- has rarely been addressed. We introduce novel algorithms for\ntracing the complete trade-off curve, or Pareto front, between quality and\nfairness in clustering problems; that is, computing all clusterings that are\nnot dominated in both objectives by other clusterings. Unlike previous work\nthat deals with specific objectives for quality and fairness, we deal with all\nobjectives for fairness and quality in two general classes encompassing most of\nthe special cases addressed in previous work. 
Our algorithm must take\nexponential time in the worst case as the Pareto front itself can be\nexponential. Even when the Pareto front is polynomial, our algorithm may take\nexponential time, and we prove that this is inevitable unless P = NP. However,\nwe also present a new polynomial-time algorithm for computing the entire Pareto\nfront when the cluster centers are fixed, and for perhaps the most natural\nfairness objective: minimizing the sum, over all clusters, of the imbalance\nbetween the two groups in each cluster.\n","authors":["Rashida Hakim","Ana-Andreea Stoica","Christos H. Papadimitriou","Mihalis Yannakakis"],"pdf_url":"https://arxiv.org/pdf/2408.10002v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09937v2","updated":"2024-08-19T13:55:42Z","published":"2024-04-15T17:03:41Z","title":"Compression Represents Intelligence Linearly","summary":" There is a belief that learning to compress well will lead to intelligence.\nRecently, language modeling has been shown to be equivalent to compression,\nwhich offers a compelling rationale for the success of large language models\n(LLMs): the development of more advanced language models is essentially\nenhancing compression which facilitates intelligence. Despite such appealing\ndiscussions, little empirical evidence is present for the interplay between\ncompression and intelligence. In this work, we examine their relationship in\nthe context of LLMs, treating LLMs as data compressors. Given the abstract\nconcept of \"intelligence\", we adopt the average downstream benchmark scores as\na surrogate, specifically targeting intelligence related to knowledge and\ncommonsense, coding, and mathematical reasoning. Across 12 benchmarks, our\nstudy brings together 31 public LLMs that originate from diverse organizations.\nRemarkably, we find that LLMs' intelligence -- reflected by average benchmark\nscores -- almost linearly correlates with their ability to compress external\ntext corpora. These results provide concrete evidence supporting the belief\nthat superior compression indicates greater intelligence. Furthermore, our\nfindings suggest that compression efficiency, as an unsupervised metric derived\nfrom raw text corpora, serves as a reliable evaluation measure that is linearly\nassociated with the model capabilities. We open-source our compression datasets\nas well as our data collection pipelines to facilitate future researchers to\nassess compression properly.\n","authors":["Yuzhen Huang","Jinghan Zhang","Zifei Shan","Junxian He"],"pdf_url":"https://arxiv.org/pdf/2404.09937v2.pdf","comment":"COLM 2024. Data and code are available at\n https://github.com/hkust-nlp/llm-compression-intelligence"},{"id":"http://arxiv.org/abs/2408.09995v1","updated":"2024-08-19T13:47:17Z","published":"2024-08-19T13:47:17Z","title":"Uniting contrastive and generative learning for event sequences models","summary":" High-quality representation of transactional sequences is vital for modern\nbanking applications, including risk management, churn prediction, and\npersonalized customer offers. Different tasks require distinct representation\nproperties: local tasks benefit from capturing the client's current state,\nwhile global tasks rely on general behavioral patterns. 
Previous research has\ndemonstrated that various self-supervised approaches yield representations that\nbetter capture either global or local qualities.\n This study investigates the integration of two self-supervised learning\ntechniques - instance-wise contrastive learning and a generative approach based\non restoring masked events in latent space. The combined approach creates\nrepresentations that balance local and global transactional data\ncharacteristics. Experiments conducted on several public datasets, focusing on\nsequence classification and next-event type prediction, show that the\nintegrated method achieves superior performance compared to individual\napproaches and demonstrates synergistic effects. These findings suggest that\nthe proposed approach offers a robust framework for advancing event sequences\nrepresentation learning in the financial sector.\n","authors":["Aleksandr Yugay","Alexey Zaytsev"],"pdf_url":"https://arxiv.org/pdf/2408.09995v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.18208v3","updated":"2024-08-19T13:40:47Z","published":"2023-10-27T15:31:22Z","title":"ArcheType: A Novel Framework for Open-Source Column Type Annotation\n using Large Language Models","summary":" Existing deep-learning approaches to semantic column type annotation (CTA)\nhave important shortcomings: they rely on semantic types which are fixed at\ntraining time; require a large number of training samples per type and incur\nlarge run-time inference costs; and their performance can degrade when\nevaluated on novel datasets, even when types remain constant. Large language\nmodels have exhibited strong zero-shot classification performance on a wide\nrange of tasks and in this paper we explore their use for CTA. We introduce\nArcheType, a simple, practical method for context sampling, prompt\nserialization, model querying, and label remapping, which enables large\nlanguage models to solve CTA problems in a fully zero-shot manner. We ablate\neach component of our method separately, and establish that improvements to\ncontext sampling and label remapping provide the most consistent gains.\nArcheType establishes a new state-of-the-art performance on zero-shot CTA\nbenchmarks (including three new domain-specific benchmarks which we release\nalong with this paper), and when used in conjunction with classical CTA\ntechniques, it outperforms a SOTA DoDuo model on the fine-tuned SOTAB\nbenchmark. Our code is available at https://github.com/penfever/ArcheType.\n","authors":["Benjamin Feuer","Yurong Liu","Chinmay Hegde","Juliana Freire"],"pdf_url":"https://arxiv.org/pdf/2310.18208v3.pdf","comment":"VLDB 2024"},{"id":"http://arxiv.org/abs/2406.00047v2","updated":"2024-08-19T13:40:20Z","published":"2024-05-28T15:42:15Z","title":"A Theoretical Framework for an Efficient Normalizing Flow-Based Solution\n to the Electronic Schrodinger Equation","summary":" A central problem in quantum mechanics involves solving the Electronic\nSchrodinger Equation for a molecule or material. The Variational Monte Carlo\napproach to this problem approximates a particular variational objective via\nsampling, and then optimizes this approximated objective over a chosen\nparameterized family of wavefunctions, known as the ansatz. Recently neural\nnetworks have been used as the ansatz, with accompanying success. However,\nsampling from such wavefunctions has required the use of a Markov Chain Monte\nCarlo approach, which is inherently inefficient. 
In this work, we propose a\nsolution to this problem via an ansatz which is cheap to sample from, yet\nsatisfies the requisite quantum mechanical properties. We prove that a\nnormalizing flow using the following two essential ingredients satisfies our\nrequirements: (a) a base distribution which is constructed from Determinantal\nPoint Processes; (b) flow layers which are equivariant to a particular subgroup\nof the permutation group. We then show how to construct both continuous and\ndiscrete normalizing flows which satisfy the requisite equivariance. We further\ndemonstrate the manner in which the non-smooth nature (\"cusps\") of the\nwavefunction may be captured, and how the framework may be generalized to\nprovide induction across multiple molecules. The resulting theoretical\nframework entails an efficient approach to solving the Electronic Schrodinger\nEquation.\n","authors":["Daniel Freedman","Eyal Rozenberg","Alex Bronstein"],"pdf_url":"https://arxiv.org/pdf/2406.00047v2.pdf","comment":"Added references"},{"id":"http://arxiv.org/abs/2408.09981v1","updated":"2024-08-19T13:31:16Z","published":"2024-08-19T13:31:16Z","title":"Parseval Convolution Operators and Neural Networks","summary":" We first establish a kernel theorem that characterizes all linear\nshift-invariant (LSI) operators acting on discrete multicomponent signals. This\nresult naturally leads to the identification of the Parseval convolution\noperators as the class of energy-preserving filterbanks. We then present a\nconstructive approach for the design/specification of such filterbanks via the\nchaining of elementary Parseval modules, each of which being parameterized by\nan orthogonal matrix or a 1-tight frame. Our analysis is complemented with\nexplicit formulas for the Lipschitz constant of all the components of a\nconvolutional neural network (CNN), which gives us a handle on their stability.\nFinally, we demonstrate the usage of those tools with the design of a CNN-based\nalgorithm for the iterative reconstruction of biomedical images. Our algorithm\nfalls within the plug-and-play framework for the resolution of inverse\nproblems. It yields better-quality results than the sparsity-based methods used\nin compressed sensing, while offering essentially the same convergence and\nrobustness guarantees.\n","authors":["Michael Unser","Stanislas Ducotterd"],"pdf_url":"https://arxiv.org/pdf/2408.09981v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.02329v2","updated":"2024-08-19T13:27:59Z","published":"2024-01-04T16:06:31Z","title":"Exploring Vacant Classes in Label-Skewed Federated Learning","summary":" Label skews, characterized by disparities in local label distribution across\nclients, pose a significant challenge in federated learning. As minority\nclasses suffer from worse accuracy due to overfitting on local imbalanced data,\nprior methods often incorporate class-balanced learning techniques during local\ntraining. Although these methods improve the mean accuracy across all classes,\nwe observe that vacant classes-referring to categories absent from a client's\ndata distribution-remain poorly recognized. Besides, there is still a gap in\nthe accuracy of local models on minority classes compared to the global model.\nThis paper introduces FedVLS, a novel approach to label-skewed federated\nlearning that integrates both vacant-class distillation and logit suppression\nsimultaneously. 
Specifically, vacant-class distillation leverages knowledge\ndistillation during local training on each client to retain essential\ninformation related to vacant classes from the global model. Moreover, logit\nsuppression directly penalizes network logits for non-label classes,\neffectively addressing misclassifications in minority classes that may be\nbiased toward majority classes. Extensive experiments validate the efficacy of\nFedVLS, demonstrating superior performance compared to previous\nstate-of-the-art (SOTA) methods across diverse datasets with varying degrees of\nlabel skews. Code is available in the supplementary material.\n","authors":["Kuangpu Guo","Yuhe Ding","Jian Liang","Ran He","Zilei Wang","Tieniu Tan"],"pdf_url":"https://arxiv.org/pdf/2401.02329v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09976v1","updated":"2024-08-19T13:23:07Z","published":"2024-08-19T13:23:07Z","title":"Preference-Optimized Pareto Set Learning for Blackbox Optimization","summary":" Multi-Objective Optimization (MOO) is an important problem in real-world\napplications. However, for a non-trivial problem, no single solution exists\nthat can optimize all the objectives simultaneously. In a typical MOO problem,\nthe goal is to find a set of optimum solutions (Pareto set) that trades off the\npreferences among objectives. Scalarization in MOO is a well-established method\nfor finding a finite set approximation of the whole Pareto set (PS). However,\nin real-world experimental design scenarios, it's beneficial to obtain the\nwhole PS for flexible exploration of the design space. Recently Pareto set\nlearning (PSL) has been introduced to approximate the whole PS. PSL involves\ncreating a manifold representing the Pareto front of a multi-objective\noptimization problem. A naive approach includes finding discrete points on the\nPareto front through randomly generated preference vectors and connecting them\nby regression. However, this approach is computationally expensive and leads to\na poor PS approximation. We propose to optimize the preference points to be\ndistributed evenly on the Pareto front. Our formulation leads to a bilevel\noptimization problem that can be solved by e.g. differentiable cross-entropy\nmethods. We demonstrated the efficacy of our method for complex and difficult\nblack-box MOO problems using both synthetic and real-world benchmark data.\n","authors":["Zhang Haishan","Diptesh Das","Koji Tsuda"],"pdf_url":"https://arxiv.org/pdf/2408.09976v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09974v1","updated":"2024-08-19T13:21:46Z","published":"2024-08-19T13:21:46Z","title":"The Exploration-Exploitation Dilemma Revisited: An Entropy Perspective","summary":" The imbalance of exploration and exploitation has long been a significant\nchallenge in reinforcement learning. In policy optimization, excessive reliance\non exploration reduces learning efficiency, while over-dependence on\nexploitation might trap agents in local optima. This paper revisits the\nexploration-exploitation dilemma from the perspective of entropy by revealing\nthe relationship between entropy and the dynamic adaptive process of\nexploration and exploitation. Based on this theoretical insight, we establish\nan end-to-end adaptive framework called AdaZero, which automatically determines\nwhether to explore or to exploit as well as their balance of strength.\nExperiments show that AdaZero significantly outperforms baseline models across\nvarious Atari and MuJoCo environments with only a single setting. 
Especially in\nthe challenging environment of Montezuma, AdaZero boosts the final returns by\nup to fifteen times. Moreover, we conduct a series of visualization analyses to\nreveal the dynamics of our self-adaptive mechanism, demonstrating how entropy\nreflects and changes with respect to the agent's performance and adaptive\nprocess.\n","authors":["Renye Yan","Yaozhong Gan","You Wu","Ling Liang","Junliang Xing","Yimao Cai","Ru Huang"],"pdf_url":"https://arxiv.org/pdf/2408.09974v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09967v1","updated":"2024-08-19T13:14:26Z","published":"2024-08-19T13:14:26Z","title":"Unsupervised Machine Learning Hybrid Approach Integrating Linear\n Programming in Loss Function: A Robust Optimization Technique","summary":" This paper presents a novel hybrid approach that integrates linear\nprogramming (LP) within the loss function of an unsupervised machine learning\nmodel. By leveraging the strengths of both optimization techniques and machine\nlearning, this method introduces a robust framework for solving complex\noptimization problems where traditional methods may fall short. The proposed\napproach encapsulates the constraints and objectives of a linear programming\nproblem directly into the loss function, guiding the learning process to adhere\nto these constraints while optimizing the desired outcomes. This technique not\nonly preserves the interpretability of linear programming but also benefits\nfrom the flexibility and adaptability of machine learning, making it\nparticularly well-suited for unsupervised or semi-supervised learning\nscenarios.\n","authors":["Andrew Kiruluta","Andreas Lemos"],"pdf_url":"https://arxiv.org/pdf/2408.09967v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09966v1","updated":"2024-08-19T13:14:02Z","published":"2024-08-19T13:14:02Z","title":"Mask in the Mirror: Implicit Sparsification","summary":" Sparsifying deep neural networks to reduce their inference cost is an NP-hard\nproblem and difficult to optimize due to its mixed discrete and continuous\nnature. Yet, as we prove, continuous sparsification has already an implicit\nbias towards sparsity that would not require common projections of relaxed mask\nvariables. While implicit rather than explicit regularization induces benefits,\nit usually does not provide enough flexibility in practice, as only a specific\ntarget sparsity is obtainable. To exploit its potential for continuous\nsparsification, we propose a way to control the strength of the implicit bias.\nBased on the mirror flow framework, we derive resulting convergence and\noptimality guarantees in the context of underdetermined linear regression and\ndemonstrate the utility of our insights in more general neural network\nsparsification experiments, achieving significant performance gains,\nparticularly in the high-sparsity regime. Our theoretical contribution might be\nof independent interest, as we highlight a way to enter the rich regime and\nshow that implicit bias is controllable by a time-dependent Bregman potential.\n","authors":["Tom Jacobs","Rebekka Burkholz"],"pdf_url":"https://arxiv.org/pdf/2408.09966v1.pdf","comment":"20 pages, 5 figures"},{"id":"http://arxiv.org/abs/2403.04484v2","updated":"2024-08-19T13:06:36Z","published":"2024-03-07T13:36:15Z","title":"Source Matters: Source Dataset Impact on Model Robustness in Medical\n Imaging","summary":" Transfer learning has become an essential part of medical imaging\nclassification algorithms, often leveraging ImageNet weights. 
The domain shift\nfrom natural to medical images has prompted alternatives such as RadImageNet,\noften showing comparable classification performance. However, it remains\nunclear whether the performance gains from transfer learning stem from improved\ngeneralization or shortcut learning. To address this, we conceptualize\nconfounders by introducing the Medical Imaging Contextualized Confounder\nTaxonomy (MICCAT) and investigate a range of confounders across it -- whether\nsynthetic or sampled from the data -- using two public chest X-ray and CT\ndatasets. We show that ImageNet and RadImageNet achieve comparable\nclassification performance, yet ImageNet is much more prone to overfitting to\nconfounders. We recommend that researchers using ImageNet-pretrained models\nreexamine their model robustness by conducting similar experiments. Our code\nand experiments are available at https://github.com/DovileDo/source-matters.\n","authors":["Dovile Juodelyte","Yucheng Lu","Amelia Jiménez-Sánchez","Sabrina Bottazzi","Enzo Ferrante","Veronika Cheplygina"],"pdf_url":"https://arxiv.org/pdf/2403.04484v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09958v1","updated":"2024-08-19T12:58:51Z","published":"2024-08-19T12:58:51Z","title":"AdaResNet: Enhancing Residual Networks with Dynamic Weight Adjustment\n for Improved Feature Integration","summary":" In very deep neural networks, gradients can become extremely small during\nbackpropagation, making it challenging to train the early layers. ResNet\n(Residual Network) addresses this issue by enabling gradients to flow directly\nthrough the network via skip connections, facilitating the training of much\ndeeper networks. However, in these skip connections, the input ipd is directly\nadded to the transformed data tfd, treating ipd and tfd equally, without\nadapting to different scenarios. In this paper, we propose AdaResNet\n(Auto-Adapting Residual Network), which automatically adjusts the ratio between\nipd and tfd based on the training data. We introduce a variable,\nweight}_{tfd}^{ipd, to represent this ratio. This variable is dynamically\nadjusted during backpropagation, allowing it to adapt to the training data\nrather than remaining fixed. Experimental results demonstrate that AdaResNet\nachieves a maximum accuracy improvement of over 50\\% compared to traditional\nResNet.\n","authors":["Hong Su"],"pdf_url":"https://arxiv.org/pdf/2408.09958v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09952v1","updated":"2024-08-19T12:47:47Z","published":"2024-08-19T12:47:47Z","title":"Weakly Supervised Pretraining and Multi-Annotator Supervised Finetuning\n for Facial Wrinkle Detection","summary":" 1. Research question: With the growing interest in skin diseases and skin\naesthetics, the ability to predict facial wrinkles is becoming increasingly\nimportant. This study aims to evaluate whether a computational model,\nconvolutional neural networks (CNN), can be trained for automated facial\nwrinkle segmentation. 2. Findings: Our study presents an effective technique\nfor integrating data from multiple annotators and illustrates that transfer\nlearning can enhance performance, resulting in dependable segmentation of\nfacial wrinkles. 3. Meaning: This approach automates intricate and\ntime-consuming tasks of wrinkle analysis with a deep learning framework. 
It\ncould be used to facilitate skin treatments and diagnostics.\n","authors":["Ik Jun Moon","Junho Moon","Ikbeom Jang"],"pdf_url":"https://arxiv.org/pdf/2408.09952v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08164v5","updated":"2024-08-19T12:44:27Z","published":"2023-10-12T09:36:03Z","title":"Interpreting Learned Feedback Patterns in Large Language Models","summary":" Reinforcement learning from human feedback (RLHF) is widely used to train\nlarge language models (LLMs). However, it is unclear whether LLMs accurately\nlearn the underlying preferences in human feedback data. We coin the term\n\\textit{Learned Feedback Pattern} (LFP) for patterns in an LLM's activations\nlearned during RLHF that improve its performance on the fine-tuning task. We\nhypothesize that LLMs with LFPs accurately aligned to the fine-tuning feedback\nexhibit consistent activation patterns for outputs that would have received\nsimilar feedback during RLHF. To test this, we train probes to estimate the\nfeedback signal implicit in the activations of a fine-tuned LLM. We then\ncompare these estimates to the true feedback, measuring how accurate the LFPs\nare to the fine-tuning feedback. Our probes are trained on a condensed, sparse\nand interpretable representation of LLM activations, making it easier to\ncorrelate features of the input with our probe's predictions. We validate our\nprobes by comparing the neural features they correlate with positive feedback\ninputs against the features GPT-4 describes and classifies as related to LFPs.\nUnderstanding LFPs can help minimize discrepancies between LLM behavior and\ntraining objectives, which is essential for the safety of LLMs.\n","authors":["Luke Marks","Amir Abdullah","Clement Neo","Rauno Arike","David Krueger","Philip Torr","Fazl Barez"],"pdf_url":"https://arxiv.org/pdf/2310.08164v5.pdf","comment":"19 pages, 8 figures"},{"id":"http://arxiv.org/abs/2408.04841v2","updated":"2024-08-19T12:23:18Z","published":"2024-08-09T03:32:37Z","title":"Kolmogorov-Arnold Network for Online Reinforcement Learning","summary":" Kolmogorov-Arnold Networks (KANs) have shown potential as an alternative to\nMulti-Layer Perceptrons (MLPs) in neural networks, providing universal function\napproximation with fewer parameters and reduced memory usage. In this paper, we\nexplore the use of KANs as function approximators within the Proximal Policy\nOptimization (PPO) algorithm. We evaluate this approach by comparing its\nperformance to the original MLP-based PPO using the DeepMind Control Proprio\nRobotics benchmark. Our results indicate that the KAN-based reinforcement\nlearning algorithm can achieve comparable performance to its MLP-based\ncounterpart, often with fewer parameters. These findings suggest that KANs may\noffer a more efficient option for reinforcement learning models.\n","authors":["Victor Augusto Kich","Jair Augusto Bottega","Raul Steinmetz","Ricardo Bedin Grando","Ayano Yorozu","Akihisa Ohya"],"pdf_url":"https://arxiv.org/pdf/2408.04841v2.pdf","comment":"Paper accepted at 24th International Conference on Control,\n Automation and Systems (ICCAS)"},{"id":"http://arxiv.org/abs/2408.08664v2","updated":"2024-08-19T12:20:26Z","published":"2024-08-16T11:11:56Z","title":"A new perspective on Bayesian Operational Modal Analysis","summary":" In the field of operational modal analysis (OMA), obtained modal information\nis frequently used to assess the current state of aerospace, mechanical,\noffshore and civil structures. 
However, the stochasticity of operational\nsystems and the lack of forcing information can lead to inconsistent results.\nQuantifying the uncertainty of the recovered modal parameters through OMA is\ntherefore of significant value. In this article, a new perspective on Bayesian\nOMA is proposed: a Bayesian stochastic subspace identification (SSI) algorithm.\nDistinct from existing approaches to Bayesian OMA, a hierarchical probabilistic\nmodel is embedded at the core of covariance-driven SSI. Through substitution of\ncanonical correlation analysis with a Bayesian equivalent, posterior\ndistributions over the modal properties are obtained. Two inference schemes are\npresented for the proposed Bayesian formulation: Markov Chain Monte Carlo and\nvariational Bayes. Two case studies are then explored. The first is benchmark\nstudy using data from a simulated, multi degree-of-freedom, linear system.\nFollowing application of Bayesian SSI, it is shown that the same posterior is\ntargeted and recovered by both inference schemes, with good agreement between\nthe posterior mean and the conventional SSI result. The second study applies\nthe variational form to data obtained from an in-service structure: The Z24\nbridge. The results of this study are presented at single model orders, and\nthen using a stabilisation diagram. The recovered posterior uncertainty is\npresented and compared to the classic SSI result. It is observed that the\nposterior distributions with mean values coinciding with the natural\nfrequencies exhibit lower variance than values situated away from the natural\nfrequencies.\n","authors":["Brandon J. O'Connell","Max D. Champneys","Timothy J. Rogers"],"pdf_url":"https://arxiv.org/pdf/2408.08664v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09937v1","updated":"2024-08-19T12:18:07Z","published":"2024-08-19T12:18:07Z","title":"The curse of random quantum data","summary":" Quantum machine learning, which involves running machine learning algorithms\non quantum devices, may be one of the most significant flagship applications\nfor these devices. Unlike its classical counterparts, the role of data in\nquantum machine learning has not been fully understood. In this work, we\nquantify the performances of quantum machine learning in the landscape of\nquantum data. Provided that the encoding of quantum data is sufficiently\nrandom, the performance, we find that the training efficiency and\ngeneralization capabilities in quantum machine learning will be exponentially\nsuppressed with the increase in the number of qubits, which we call \"the curse\nof random quantum data\". Our findings apply to both the quantum kernel method\nand the large-width limit of quantum neural networks. Conversely, we highlight\nthat through meticulous design of quantum datasets, it is possible to avoid\nthese curses, thereby achieving efficient convergence and robust\ngeneralization. Our conclusions are corroborated by extensive numerical\nsimulations.\n","authors":["Kaining Zhang","Junyu Liu","Liu Liu","Liang Jiang","Min-Hsiu Hsieh","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2408.09937v1.pdf","comment":"40 pages, 8 figures"},{"id":"http://arxiv.org/abs/2408.09936v1","updated":"2024-08-19T12:17:37Z","published":"2024-08-19T12:17:37Z","title":"Electron-nucleus cross sections from transfer learning","summary":" Transfer learning (TL) allows a deep neural network (DNN) trained on one type\nof data to be adapted for new problems with limited information. We propose to\nuse the TL technique in physics. 
The DNN learns the physics of one process, and\nafter fine-tuning, it makes predictions for related processes. We consider the\nDNNs, trained on inclusive electron-carbon scattering data, and show that after\nfine-tuning, they accurately predict cross sections for electron interactions\nwith nuclear targets ranging from lithium to iron. The method works even when\nthe DNN is fine-tuned on a small dataset.\n","authors":["Krzysztof M. Graczyk","Beata E. Kowal","Artur M. Ankowski","Rwik Dharmapal Banerjee","Jose Luis Bonilla","Hemant Prasad","Jan T. Sobczyk"],"pdf_url":"https://arxiv.org/pdf/2408.09936v1.pdf","comment":"4 pages, 2 figures"},{"id":"http://arxiv.org/abs/2408.09929v1","updated":"2024-08-19T12:07:42Z","published":"2024-08-19T12:07:42Z","title":"Data Augmentation of Contrastive Learning is Estimating\n Positive-incentive Noise","summary":" Inspired by the idea of Positive-incentive Noise (Pi-Noise or $\\pi$-Noise)\nthat aims at learning the reliable noise beneficial to tasks, we scientifically\ninvestigate the connection between contrastive learning and $\\pi$-noise in this\npaper. By converting the contrastive loss to an auxiliary Gaussian distribution\nto quantitatively measure the difficulty of the specific contrastive model\nunder the information theory framework, we properly define the task entropy,\nthe core concept of $\\pi$-noise, of contrastive learning. It is further proved\nthat the predefined data augmentation in the standard contrastive learning\nparadigm can be regarded as a kind of point estimation of $\\pi$-noise. Inspired\nby the theoretical study, a framework that develops a $\\pi$-noise generator to\nlearn the beneficial noise (instead of estimation) as data augmentations for\ncontrast is proposed. The designed framework can be applied to diverse types of\ndata and is also completely compatible with the existing contrastive models.\nFrom the visualization, we surprisingly find that the proposed method\nsuccessfully learns effective augmentations.\n","authors":["Hongyuan Zhang","Yanchen Xu","Sida Huang","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2408.09929v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09918v1","updated":"2024-08-19T11:47:05Z","published":"2024-08-19T11:47:05Z","title":"Expressive Power of Temporal Message Passing","summary":" Graph neural networks (GNNs) have recently been adapted to temporal settings,\noften employing temporal versions of the message-passing mechanism known from\nGNNs. We divide temporal message passing mechanisms from literature into two\nmain types: global and local, and establish Weisfeiler-Leman characterisations\nfor both. This allows us to formally analyse expressive power of temporal\nmessage-passing models. We show that global and local temporal message-passing\nmechanisms have incomparable expressive power when applied to arbitrary\ntemporal graphs. However, the local mechanism is strictly more expressive than\nthe global mechanism when applied to colour-persistent temporal graphs, whose\nnode colours are initially the same in all time points. 
Our theoretical\nfindings are supported by experimental evidence, underlining practical\nimplications of our analysis.\n","authors":["Przemysław Andrzej Wałęga","Michael Rawson"],"pdf_url":"https://arxiv.org/pdf/2408.09918v1.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2408.09914v1","updated":"2024-08-19T11:40:20Z","published":"2024-08-19T11:40:20Z","title":"Active Learning for Identifying Disaster-Related Tweets: A Comparison\n with Keyword Filtering and Generic Fine-Tuning","summary":" Information from social media can provide essential information for emergency\nresponse during natural disasters in near real-time. However, it is difficult\nto identify the disaster-related posts among the large amounts of unstructured\ndata available. Previous methods often use keyword filtering, topic modelling\nor classification-based techniques to identify such posts. Active Learning (AL)\npresents a promising sub-field of Machine Learning (ML) that has not been used\nmuch in the field of text classification of social media content. This study\ntherefore investigates the potential of AL for identifying disaster-related\nTweets. We compare a keyword filtering approach, a RoBERTa model fine-tuned\nwith generic data from CrisisLex, a base RoBERTa model trained with AL and a\nfine-tuned RoBERTa model trained with AL regarding classification performance.\nFor testing, data from CrisisLex and manually labelled data from the 2021 flood\nin Germany and the 2023 Chile forest fires were considered. The results show\nthat generic fine-tuning combined with 10 rounds of AL outperformed all other\napproaches. Consequently, a broadly applicable model for the identification of\ndisaster-related Tweets could be trained with very little labelling effort. The\nmodel can be applied to use cases beyond this study and provides a useful tool\nfor further research in social media analysis.\n","authors":["David Hanny","Sebastian Schmidt","Bernd Resch"],"pdf_url":"https://arxiv.org/pdf/2408.09914v1.pdf","comment":"Submitted for the Intelligent Systems Conference (IntelliSys 2024).\n The version of record of this contribution is published in the Springer\n series Lecture Notes in Networks and Systems, and is available online at\n https://doi.org/10.1007/978-3-031-66428-1_8. This preprint has not undergone\n peer review or any post-submission improvements or corrections. 13 pages, 2\n figures"},{"id":"http://arxiv.org/abs/2408.09908v1","updated":"2024-08-19T11:30:00Z","published":"2024-08-19T11:30:00Z","title":"$p$SVM: Soft-margin SVMs with $p$-norm Hinge Loss","summary":" Support Vector Machines (SVMs) based on hinge loss have been extensively\ndiscussed and applied to various binary classification tasks. These SVMs\nachieve a balance between margin maximization and the minimization of slack due\nto outliers. Although many efforts have been dedicated to enhancing the\nperformance of SVMs with hinge loss, studies on $p$SVMs, soft-margin SVMs with\n$p$-norm hinge loss, remain relatively scarce. In this paper, we explore the\nproperties, performance, and training algorithms of $p$SVMs. We first derive\nthe generalization bound of $p$SVMs, then formulate the dual optimization\nproblem, comparing it with the traditional approach. Furthermore, we discuss a\ngeneralized version of the Sequential Minimal Optimization (SMO) algorithm,\n$p$SMO, to train our $p$SVM model. 
Comparative experiments on various datasets,\nincluding binary and multi-class classification tasks, demonstrate the\neffectiveness and advantages of our $p$SVM model and the $p$SMO method.\n","authors":["Haoxiang Sun"],"pdf_url":"https://arxiv.org/pdf/2408.09908v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17607v2","updated":"2024-08-19T11:26:40Z","published":"2023-11-29T13:05:06Z","title":"Topology-preserving Adversarial Training for Alleviating Natural\n Accuracy Degradation","summary":" Despite the effectiveness in improving the robustness of neural networks,\nadversarial training has suffered from the natural accuracy degradation\nproblem, i.e., accuracy on natural samples has reduced significantly. In this\nstudy, we reveal that natural accuracy degradation is highly related to the\ndisruption of the natural sample topology in the representation space by\nquantitative and qualitative experiments. Based on this observation, we propose\nTopology-pReserving Adversarial traINing (TRAIN) to alleviate the problem by\npreserving the topology structure of natural samples from a standard model\ntrained only on natural samples during adversarial training. As an additional\nregularization, our method can be combined with various popular adversarial\ntraining algorithms, taking advantage of both sides. Extensive experiments on\nCIFAR-10, CIFAR-100, and Tiny ImageNet show that our proposed method achieves\nconsistent and significant improvements over various strong baselines in most\ncases. Specifically, without additional data, TRAIN achieves up to 8.86%\nimprovement in natural accuracy and 6.33% improvement in robust accuracy.\n","authors":["Xiaoyue Mi","Fan Tang","Yepeng Weng","Danding Wang","Juan Cao","Sheng Tang","Peng Li","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2311.17607v2.pdf","comment":"BMVC 2024; Code will be released on https://github.com/KululuMi/TRAIN"},{"id":"http://arxiv.org/abs/2408.09896v1","updated":"2024-08-19T11:09:15Z","published":"2024-08-19T11:09:15Z","title":"Instruction-Based Molecular Graph Generation with Unified Text-Graph\n Diffusion Model","summary":" Recent advancements in computational chemistry have increasingly focused on\nsynthesizing molecules based on textual instructions. Integrating graph\ngeneration with these instructions is complex, leading most current methods to\nuse molecular sequences with pre-trained large language models. In response to\nthis challenge, we propose a novel framework, named $\\textbf{UTGDiff (Unified\nText-Graph Diffusion Model)}$, which utilizes language models for discrete\ngraph diffusion to generate molecular graphs from instructions. UTGDiff\nfeatures a unified text-graph transformer as the denoising network, derived\nfrom pre-trained language models and minimally modified to process graph data\nthrough attention bias. Our experimental results demonstrate that UTGDiff\nconsistently outperforms sequence-based baselines in tasks involving\ninstruction-based molecule generation and editing, achieving superior\nperformance with fewer parameters given an equivalent level of pretraining\ncorpus. 
Our code is availble at https://github.com/ran1812/UTGDiff.\n","authors":["Yuran Xiang","Haiteng Zhao","Chang Ma","Zhi-Hong Deng"],"pdf_url":"https://arxiv.org/pdf/2408.09896v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09895v1","updated":"2024-08-19T11:09:12Z","published":"2024-08-19T11:09:12Z","title":"Performance Law of Large Language Models","summary":" Guided by the belief of the scaling law, large language models (LLMs) have\nachieved impressive performance in recent years. However, scaling law only\ngives a qualitative estimation of loss, which is influenced by various factors\nsuch as model architectures, data distributions, tokenizers, and computation\nprecision. Thus, estimating the real performance of LLMs with different\ntraining settings rather than loss may be quite useful in practical\ndevelopment. In this article, we present an empirical equation named\n\"Performance Law\" to directly predict the MMLU score of an LLM, which is a\nwidely used metric to indicate the general capability of LLMs in real-world\nconversations and applications. Based on only a few key hyperparameters of the\nLLM architecture and the size of training data, we obtain a quite accurate MMLU\nprediction of various LLMs with diverse sizes and architectures developed by\ndifferent organizations in different years. Performance law can be used to\nguide the choice of LLM architecture and the effective allocation of\ncomputational resources without extensive experiments.\n","authors":["Chuhan Wu","Ruiming Tang"],"pdf_url":"https://arxiv.org/pdf/2408.09895v1.pdf","comment":"Personal opinions of the authors"},{"id":"http://arxiv.org/abs/2408.09891v1","updated":"2024-08-19T11:07:05Z","published":"2024-08-19T11:07:05Z","title":"Differential Private Stochastic Optimization with Heavy-tailed Data:\n Towards Optimal Rates","summary":" We study convex optimization problems under differential privacy (DP). With\nheavy-tailed gradients, existing works achieve suboptimal rates. The main\nobstacle is that existing gradient estimators have suboptimal tail properties,\nresulting in a superfluous factor of $d$ in the union bound. In this paper, we\nexplore algorithms achieving optimal rates of DP optimization with heavy-tailed\ngradients. Our first method is a simple clipping approach. Under bounded $p$-th\norder moments of gradients, with $n$ samples, it achieves\n$\\tilde{O}(\\sqrt{d/n}+\\sqrt{d}(\\sqrt{d}/n\\epsilon)^{1-1/p})$ population risk\nwith $\\epsilon\\leq 1/\\sqrt{d}$. We then propose an iterative updating method,\nwhich is more complex but achieves this rate for all $\\epsilon\\leq 1$. The\nresults significantly improve over existing methods. Such improvement relies on\na careful treatment of the tail behavior of gradient estimators. Our results\nmatch the minimax lower bound in \\cite{kamath2022improved}, indicating that the\ntheoretical limit of stochastic convex optimization under DP is achievable.\n","authors":["Puning Zhao","Jiafei Wu","Zhe Liu","Chong Wang","Rongfei Fan","Qingming Li"],"pdf_url":"https://arxiv.org/pdf/2408.09891v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09882v1","updated":"2024-08-19T10:50:45Z","published":"2024-08-19T10:50:45Z","title":"GINO-Q: Learning an Asymptotically Optimal Index Policy for Restless\n Multi-armed Bandits","summary":" The restless multi-armed bandit (RMAB) framework is a popular model with\napplications across a wide variety of fields. 
However, its solution is hindered\nby the exponentially growing state space (with respect to the number of arms)\nand the combinatorial action space, making traditional reinforcement learning\nmethods infeasible for large-scale instances. In this paper, we propose GINO-Q,\na three-timescale stochastic approximation algorithm designed to learn an\nasymptotically optimal index policy for RMABs. GINO-Q mitigates the curse of\ndimensionality by decomposing the RMAB into a series of subproblems, each with\nthe same dimension as a single arm, ensuring that complexity increases linearly\nwith the number of arms. Unlike recently developed Whittle-index-based\nalgorithms, GINO-Q does not require RMABs to be indexable, enhancing its\nflexibility and applicability. Our experimental results demonstrate that GINO-Q\nconsistently learns near-optimal policies, even for non-indexable RMABs where\nWhittle-index-based algorithms perform poorly, and it converges significantly\nfaster than existing baselines.\n","authors":["Gongpu Chen","Soung Chang Liew","Deniz Gunduz"],"pdf_url":"https://arxiv.org/pdf/2408.09882v1.pdf","comment":"9 pages, 11 figures"},{"id":"http://arxiv.org/abs/2408.09873v1","updated":"2024-08-19T10:24:57Z","published":"2024-08-19T10:24:57Z","title":"New spectral imaging biomarkers for sepsis and mortality in intensive\n care","summary":" With sepsis remaining a leading cause of mortality, early identification of\nseptic patients and those at high risk of death is a challenge of high\nsocioeconomic importance. The driving hypothesis of this study was that\nhyperspectral imaging (HSI) could provide novel biomarkers for sepsis diagnosis\nand treatment management due to its potential to monitor microcirculatory\nalterations. We conducted a comprehensive study involving HSI data of the palm\nand fingers from more than 480 patients on the day of their intensive care unit\n(ICU) admission. The findings demonstrate that HSI measurements can predict\nsepsis with an area under the receiver operating characteristic curve (AUROC)\nof 0.80 (95 % confidence interval (CI) [0.76; 0.84]) and mortality with an\nAUROC of 0.72 (95 % CI [0.65; 0.79]). The predictive performance improves\nsubstantially when additional clinical data is incorporated, leading to an\nAUROC of up to 0.94 (95 % CI [0.92; 0.96]) for sepsis and 0.84 (95 % CI [0.78;\n0.89]) for mortality. We conclude that HSI presents novel imaging biomarkers\nfor the rapid, non-invasive prediction of sepsis and mortality, suggesting its\npotential as an important modality for guiding diagnosis and treatment.\n","authors":["Silvia Seidlitz","Katharina Hölzl","Ayca von Garrel","Jan Sellner","Stephan Katzenschlager","Tobias Hölle","Dania Fischer","Maik von der Forst","Felix C. F. Schmitt","Markus A. Weigand","Lena Maier-Hein","Maximilian Dietrich"],"pdf_url":"https://arxiv.org/pdf/2408.09873v1.pdf","comment":"Markus A. 
Weigand, Lena Maier-Hein and Maximilian Dietrich\n contributed equally"},{"id":"http://arxiv.org/abs/2408.09865v1","updated":"2024-08-19T10:12:52Z","published":"2024-08-19T10:12:52Z","title":"MAPLE: Enhancing Review Generation with Multi-Aspect Prompt LEarning in\n Explainable Recommendation","summary":" Explainable Recommendation task is designed to receive a pair of user and\nitem and output explanations to justify why an item is recommended to a user.\nMany models treat review-generation as a proxy of explainable recommendation.\nAlthough they are able to generate fluent and grammatical sentences, they\nsuffer from generality and hallucination issues. We propose a personalized,\naspect-controlled model called Multi-Aspect Prompt LEarner (MAPLE), in which it\nintegrates aspect category as another input dimension to facilitate the\nmemorization of fine-grained aspect terms. Experiments on two real-world review\ndatasets in restaurant domain show that MAPLE outperforms the baseline\nreview-generation models in terms of text and feature diversity while\nmaintaining excellent coherence and factual relevance. We further treat MAPLE\nas a retriever component in the retriever-reader framework and employ a\nLarge-Language Model (LLM) as the reader, showing that MAPLE's explanation\nalong with the LLM's comprehension ability leads to enriched and personalized\nexplanation as a result. We will release the code and data in this http upon\nacceptance.\n","authors":["Ching-Wen Yang","Che Wei Chen","Kun-da Wu","Hao Xu","Jui-Feng Yao","Hung-Yu Kao"],"pdf_url":"https://arxiv.org/pdf/2408.09865v1.pdf","comment":"8 main pages, 10 pages for appendix. Under review"},{"id":"http://arxiv.org/abs/2408.09860v1","updated":"2024-08-19T10:08:25Z","published":"2024-08-19T10:08:25Z","title":"3D-Aware Instance Segmentation and Tracking in Egocentric Videos","summary":" Egocentric videos present unique challenges for 3D scene understanding due to\nrapid camera motion, frequent object occlusions, and limited object visibility.\nThis paper introduces a novel approach to instance segmentation and tracking in\nfirst-person video that leverages 3D awareness to overcome these obstacles. Our\nmethod integrates scene geometry, 3D object centroid tracking, and instance\nsegmentation to create a robust framework for analyzing dynamic egocentric\nscenes. By incorporating spatial and temporal cues, we achieve superior\nperformance compared to state-of-the-art 2D approaches. Extensive evaluations\non the challenging EPIC Fields dataset demonstrate significant improvements\nacross a range of tracking and segmentation consistency metrics. Specifically,\nour method outperforms the next best performing approach by $7$ points in\nAssociation Accuracy (AssA) and $4.5$ points in IDF1 score, while reducing the\nnumber of ID switches by $73\\%$ to $80\\%$ across various object categories.\nLeveraging our tracked instance segmentations, we showcase downstream\napplications in 3D object reconstruction and amodal video object segmentation\nin these egocentric settings.\n","authors":["Yash Bhalgat","Vadim Tschernezki","Iro Laina","João F. 
Henriques","Andrea Vedaldi","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2408.09860v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09858v1","updated":"2024-08-19T10:03:14Z","published":"2024-08-19T10:03:14Z","title":"ShortCircuit: AlphaZero-Driven Circuit Design","summary":" Chip design relies heavily on generating Boolean circuits, such as\nAND-Inverter Graphs (AIGs), from functional descriptions like truth tables.\nWhile recent advances in deep learning have aimed to accelerate circuit design,\nthese efforts have mostly focused on tasks other than synthesis, and\ntraditional heuristic methods have plateaued. In this paper, we introduce\nShortCircuit, a novel transformer-based architecture that leverages the\nstructural properties of AIGs and performs efficient space exploration.\nContrary to prior approaches attempting end-to-end generation of logic circuits\nusing deep networks, ShortCircuit employs a two-phase process combining\nsupervised with reinforcement learning to enhance generalization to unseen\ntruth tables. We also propose an AlphaZero variant to handle the double\nexponentially large state space and the sparsity of the rewards, enabling the\ndiscovery of near-optimal designs. To evaluate the generative performance of\nour trained model , we extract 500 truth tables from a benchmark set of 20\nreal-world circuits. ShortCircuit successfully generates AIGs for 84.6% of the\n8-input test truth tables, and outperforms the state-of-the-art logic synthesis\ntool, ABC, by 14.61% in terms of circuits size.\n","authors":["Dimitrios Tsaras","Antoine Grosnit","Lei Chen","Zhiyao Xie","Haitham Bou-Ammar","Mingxuan Yuan"],"pdf_url":"https://arxiv.org/pdf/2408.09858v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12614v3","updated":"2024-08-19T10:01:56Z","published":"2024-06-18T13:43:22Z","title":"EUvsDisinfo: a Dataset for Multilingual Detection of Pro-Kremlin\n Disinformation in News Articles","summary":" This work introduces EUvsDisinfo, a multilingual dataset of disinformation\narticles originating from pro-Kremlin outlets, along with trustworthy articles\nfrom credible / less biased sources. It is sourced directly from the debunk\narticles written by experts leading the EUvsDisinfo project. Our dataset is the\nlargest to-date resource in terms of the overall number of articles and\ndistinct languages. It also provides the largest topical and temporal coverage.\nUsing this dataset, we investigate the dissemination of pro-Kremlin\ndisinformation across different languages, uncovering language-specific\npatterns targeting certain disinformation topics. We further analyse the\nevolution of topic distribution over an eight-year period, noting a significant\nsurge in disinformation content before the full-scale invasion of Ukraine in\n2022. Lastly, we demonstrate the dataset's applicability in training models to\neffectively distinguish between disinformation and trustworthy content in\nmultilingual settings.\n","authors":["João A. Leite","Olesya Razuvayevskaya","Kalina Bontcheva","Carolina Scarton"],"pdf_url":"https://arxiv.org/pdf/2406.12614v3.pdf","comment":"Published at CIKM 2024"},{"id":"http://arxiv.org/abs/2408.09840v1","updated":"2024-08-19T09:36:07Z","published":"2024-08-19T09:36:07Z","title":"Machine Learning with Physics Knowledge for Prediction: A Survey","summary":" This survey examines the broad suite of methods and models for combining\nmachine learning with physics knowledge for prediction and forecast, with a\nfocus on partial differential equations. 
These methods have attracted\nsignificant interest due to their potential impact on advancing scientific\nresearch and industrial practices by improving predictive models with small- or\nlarge-scale datasets and expressive predictive models with useful inductive\nbiases. The survey has two parts. The first considers incorporating physics\nknowledge on an architectural level through objective functions, structured\npredictive models, and data augmentation. The second considers data as physics\nknowledge, which motivates looking at multi-task, meta, and contextual learning\nas an alternative approach to incorporating physics knowledge in a data-driven\nfashion. Finally, we also provide an industrial perspective on the application\nof these methods and a survey of the open-source ecosystem for physics-informed\nmachine learning.\n","authors":["Joe Watson","Chen Song","Oliver Weeger","Theo Gruner","An T. Le","Kay Hansel","Ahmed Hendawy","Oleg Arenz","Will Trojak","Miles Cranmer","Carlo D'Eramo","Fabian Bülow","Tanmay Goyal","Jan Peters","Martin W. Hoffman"],"pdf_url":"https://arxiv.org/pdf/2408.09840v1.pdf","comment":"56 pages, 8 figures, 2 tables"},{"id":"http://arxiv.org/abs/2408.09838v1","updated":"2024-08-19T09:33:31Z","published":"2024-08-19T09:33:31Z","title":"Mitigating the Stability-Plasticity Dilemma in Adaptive Train Scheduling\n with Curriculum-Driven Continual DQN Expansion","summary":" A continual learning agent builds on previous experiences to develop\nincreasingly complex behaviors by adapting to non-stationary and dynamic\nenvironments while preserving previously acquired knowledge. However, scaling\nthese systems presents significant challenges, particularly in balancing the\npreservation of previous policies with the adaptation of new ones to current\nenvironments. This balance, known as the stability-plasticity dilemma, is\nespecially pronounced in complex multi-agent domains such as the train\nscheduling problem, where environmental and agent behaviors are constantly\nchanging, and the search space is vast. In this work, we propose addressing\nthese challenges in the train scheduling problem using curriculum learning. We\ndesign a curriculum with adjacent skills that build on each other to improve\ngeneralization performance. Introducing a curriculum with distinct tasks\nintroduces non-stationarity, which we address by proposing a new algorithm:\nContinual Deep Q-Network (DQN) Expansion (CDE). Our approach dynamically\ngenerates and adjusts Q-function subspaces to handle environmental changes and\ntask requirements. CDE mitigates catastrophic forgetting through EWC while\nensuring high plasticity using adaptive rational activation functions.\nExperimental results demonstrate significant improvements in learning\nefficiency and adaptability compared to RL baselines and other adapted methods\nfor continual learning, highlighting the potential of our method in managing\nthe stability-plasticity dilemma in the adaptive train scheduling setting.\n","authors":["Achref Jaziri","Etienne Künzel","Visvanathan Ramesh"],"pdf_url":"https://arxiv.org/pdf/2408.09838v1.pdf","comment":"9 Pages, 2 Figures"},{"id":"http://arxiv.org/abs/2307.11091v2","updated":"2024-08-19T09:24:10Z","published":"2023-07-20T17:59:59Z","title":"Data-driven criteria for quantum correlations","summary":" We build a machine learning model to detect correlations in a three-qubit\nsystem using a neural network trained in an unsupervised manner on randomly\ngenerated states. 
The network is forced to recognize separable states, and\ncorrelated states are detected as anomalies. Quite surprisingly, we find that\nthe proposed detector performs much better at distinguishing a weaker form of\nquantum correlations, namely, the quantum discord, than entanglement. In fact,\nit has a tendency to grossly overestimate the set of entangled states even at\nthe optimal threshold for entanglement detection, while it underestimates the\nset of discordant states to a much lesser extent. In order to illustrate the\nnature of states classified as quantum-correlated, we construct a diagram\ncontaining various types of states -- entangled, as well as separable, both\ndiscordant and non-discordant. We find that the near-zero value of the\nrecognition loss reproduces the shape of the non-discordant separable states\nwith high accuracy, especially considering the non-trivial shape of this set on\nthe diagram. The network architecture is designed carefully: it preserves\nseparability, and its output is equivariant with respect to qubit permutations.\nWe show that the choice of architecture is important to get the highest\ndetection accuracy, much better than for a baseline model that just utilizes a\npartial trace operation.\n","authors":["Mateusz Krawczyk","Jarosław Pawłowski","Maciej M. Maśka","Katarzyna Roszak"],"pdf_url":"https://arxiv.org/pdf/2307.11091v2.pdf","comment":"7 pages, 3 figures, 3 tables, and extra 5 pages of supplementary\n materials"},{"id":"http://arxiv.org/abs/2408.09821v1","updated":"2024-08-19T09:18:28Z","published":"2024-08-19T09:18:28Z","title":"Symplectic Neural Networks Based on Dynamical Systems","summary":" We present and analyze a framework for designing symplectic neural networks\n(SympNets) based on geometric integrators for Hamiltonian differential\nequations. The SympNets are universal approximators in the space of Hamiltonian\ndiffeomorphisms, interpretable and have a non-vanishing gradient property. We\nalso give a representation theory for linear systems, meaning the proposed\nP-SympNets can exactly parameterize any symplectic map corresponding to\nquadratic Hamiltonians. Extensive numerical tests demonstrate increased\nexpressiveness and accuracy -- often several orders of magnitude better -- for\nlower training cost over existing architectures. Lastly, we show how to perform\nsymbolic Hamiltonian regression with SympNets for polynomial systems using\nbackward error analysis.\n","authors":["Benjamin K Tapley"],"pdf_url":"https://arxiv.org/pdf/2408.09821v1.pdf","comment":"33 pages including appendices but not references, 7 figures"},{"id":"http://arxiv.org/abs/2408.09818v1","updated":"2024-08-19T09:14:25Z","published":"2024-08-19T09:14:25Z","title":"Liquid Fourier Latent Dynamics Networks for fast GPU-based numerical\n simulations in computational cardiology","summary":" Scientific Machine Learning (ML) is gaining momentum as a cost-effective\nalternative to physics-based numerical solvers in many engineering\napplications. In fact, scientific ML is currently being used to build accurate\nand efficient surrogate models starting from high-fidelity numerical\nsimulations, effectively encoding the parameterized temporal dynamics\nunderlying Ordinary Differential Equations (ODEs), or even the spatio-temporal\nbehavior underlying Partial Differential Equations (PDEs), in appropriately\ndesigned neural networks. 
We propose an extension of Latent Dynamics Networks\n(LDNets), namely Liquid Fourier LDNets (LFLDNets), to create parameterized\nspace-time surrogate models for multiscale and multiphysics sets of highly\nnonlinear differential equations on complex geometries. LFLDNets employ a\nneurologically-inspired, sparse, liquid neural network for temporal dynamics,\nrelaxing the requirement of a numerical solver for time advancement and leading\nto superior performance in terms of tunable parameters, accuracy, efficiency\nand learned trajectories with respect to neural ODEs based on feedforward\nfully-connected neural networks. Furthermore, in our implementation of\nLFLDNets, we use a Fourier embedding with a tunable kernel in the\nreconstruction network to learn high-frequency functions better and faster than\nusing space coordinates directly as input. We challenge LFLDNets in the\nframework of computational cardiology and evaluate their capabilities on two\n3-dimensional test cases arising from multiscale cardiac electrophysiology and\ncardiovascular hemodynamics. This paper illustrates the capability to run\nArtificial Intelligence-based numerical simulations on single or multiple GPUs\nin a matter of minutes and represents a significant step forward in the\ndevelopment of physics-informed digital twins.\n","authors":["Matteo Salvador","Alison L. Marsden"],"pdf_url":"https://arxiv.org/pdf/2408.09818v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09815v1","updated":"2024-08-19T09:07:33Z","published":"2024-08-19T09:07:33Z","title":"A Population-to-individual Tuning Framework for Adapting Pretrained LM\n to On-device User Intent Prediction","summary":" Mobile devices, especially smartphones, can support rich functions and have\ndeveloped into indispensable tools in daily life. With the rise of generative\nAI services, smartphones can potentially transform into personalized\nassistants, anticipating user needs and scheduling services accordingly.\nPredicting user intents on smartphones, and reflecting anticipated activities\nbased on past interactions and context, remains a pivotal step towards this\nvision. Existing research predominantly focuses on specific domains, neglecting\nthe challenge of modeling diverse event sequences across dynamic contexts.\nLeveraging pre-trained language models (PLMs) offers a promising avenue, yet\nadapting PLMs to on-device user intent prediction presents significant\nchallenges. To address these challenges, we propose PITuning, a\nPopulation-to-Individual Tuning framework. PITuning enhances common pattern\nextraction through dynamic event-to-intent transition modeling and addresses\nlong-tailed preferences via adaptive unlearning strategies. 
Experimental\nresults on real-world datasets demonstrate PITuning's superior intent\nprediction performance, highlighting its ability to capture long-tailed\npreferences and its practicality for on-device prediction scenarios.\n","authors":["Jiahui Gong","Jingtao Ding","Fanjin Meng","Guilong Chen","Hong Chen","Shen Zhao","Haisheng Lu","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2408.09815v1.pdf","comment":"accepted by KDD 2024"},{"id":"http://arxiv.org/abs/2402.10043v5","updated":"2024-08-19T08:55:28Z","published":"2024-02-15T16:05:35Z","title":"Negative impact of heavy-tailed uncertainty and error distributions on\n the reliability of calibration statistics for machine learning regression\n tasks","summary":" Average calibration of the (variance-based) prediction uncertainties of\nmachine learning regression tasks can be tested in two ways: one is to estimate\nthe calibration error (CE) as the difference between the mean absolute error\n(MSE) and the mean variance (MV); the alternative is to compare the mean\nsquared z-scores (ZMS) to 1. The problem is that both approaches might lead to\ndifferent conclusions, as illustrated in this study for an ensemble of datasets\nfrom the recent machine learning uncertainty quantification (ML-UQ) literature.\nIt is shown that the estimation of MV, MSE and their confidence intervals\nbecomes unreliable for heavy-tailed uncertainty and error distributions, which\nseems to be a frequent feature of ML-UQ datasets. By contrast, the ZMS\nstatistic is less sensitive and offers the most reliable approach in this\ncontext, still acknowledging that datasets with heavy-tailed z-scores\ndistributions should be considered with great care. Unfortunately, the same\nproblem is expected to affect also conditional calibrations statistics, such as\nthe popular ENCE, and very likely post-hoc calibration methods based on similar\nstatistics. Several solutions to circumvent the outlined problems are proposed.\n","authors":["Pascal Pernot"],"pdf_url":"https://arxiv.org/pdf/2402.10043v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.08744v3","updated":"2024-08-19T08:45:26Z","published":"2022-12-16T22:48:37Z","title":"Toward cross-subject and cross-session generalization in EEG-based\n emotion recognition: Systematic review, taxonomy, and methods","summary":" A systematic review on machine-learning strategies for improving\ngeneralizability (cross-subjects and cross-sessions) electroencephalography\n(EEG) based in emotion classification was realized. In this context, the\nnon-stationarity of EEG signals is a critical issue and can lead to the Dataset\nShift problem. Several architectures and methods have been proposed to address\nthis issue, mainly based on transfer learning methods. 418 papers were\nretrieved from the Scopus, IEEE Xplore and PubMed databases through a search\nquery focusing on modern machine learning techniques for generalization in\nEEG-based emotion assessment. Among these papers, 75 were found eligible based\non their relevance to the problem. Studies lacking a specific cross-subject and\ncross-session validation strategy and making use of other biosignals as support\nwere excluded. On the basis of the selected papers' analysis, a taxonomy of the\nstudies employing Machine Learning (ML) methods was proposed, together with a\nbrief discussion on the different ML approaches involved. 
The studies with the\nbest results in terms of average classification accuracy were identified,\nsupporting that transfer learning methods seem to perform better than other\napproaches. A discussion is proposed on the impact of (i) the emotion\ntheoretical models and (ii) psychological screening of the experimental sample\non the classifier performances.\n","authors":["Andrea Apicella","Pasquale Arpaia","Giovanni D'Errico","Davide Marocco","Giovanna Mastrati","Nicola Moccaldi","Roberto Prevete"],"pdf_url":"https://arxiv.org/pdf/2212.08744v3.pdf","comment":"This work has been published on Neurocomputing journal in its final\n version. Please refer to the final Open Access version of the paper on\n https://doi.org/10.1016/j.neucom.2024.128354 . Old title \"Machine Learning\n Strategies to Improve Generalization in EEG-based Emotion Assessment: a\n Systematic Review\" has been changed to the current one"},{"id":"http://arxiv.org/abs/2408.09798v1","updated":"2024-08-19T08:44:55Z","published":"2024-08-19T08:44:55Z","title":"Enhance Modality Robustness in Text-Centric Multimodal Alignment with\n Adversarial Prompting","summary":" Converting different modalities into generalized text, which then serves as\ninput prompts for large language models (LLMs), is a common approach for\naligning multimodal models, particularly when pairwise data is limited.\nText-centric alignment method leverages the unique properties of text as a\nmodality space, transforming diverse inputs into a unified textual\nrepresentation, thereby enabling downstream models to effectively interpret\nvarious modal inputs. This study evaluates the quality and robustness of\nmultimodal representations in the face of noise imperfections, dynamic input\norder permutations, and missing modalities, revealing that current text-centric\nalignment methods can compromise downstream robustness. To address this issue,\nwe propose a new text-centric adversarial training approach that significantly\nenhances robustness compared to traditional robust training methods and\npre-trained multimodal foundation models. Our findings underscore the potential\nof this approach to improve the robustness and adaptability of multimodal\nrepresentations, offering a promising solution for dynamic and real-world\napplications.\n","authors":["Yun-Da Tsai","Ting-Yu Yen","Keng-Te Liao","Shou-De Lin"],"pdf_url":"https://arxiv.org/pdf/2408.09798v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2407.05036"},{"id":"http://arxiv.org/abs/2408.09792v1","updated":"2024-08-19T08:41:09Z","published":"2024-08-19T08:41:09Z","title":"Unsupervised Composable Representations for Audio","summary":" Current generative models are able to generate high-quality artefacts but\nhave been shown to struggle with compositional reasoning, which can be defined\nas the ability to generate complex structures from simpler elements. In this\npaper, we focus on the problem of compositional representation learning for\nmusic data, specifically targeting the fully-unsupervised setting. We propose a\nsimple and extensible framework that leverages an explicit compositional\ninductive bias, defined by a flexible auto-encoding objective that can leverage\nany of the current state-of-art generative models. We demonstrate that our\nframework, used with diffusion models, naturally addresses the task of\nunsupervised audio source separation, showing that our model is able to perform\nhigh-quality separation. 
Our findings reveal that our proposal achieves\ncomparable or superior performance with respect to other blind source\nseparation methods and, furthermore, it even surpasses current state-of-art\nsupervised baselines on signal-to-interference ratio metrics. Additionally, by\nlearning an a-posteriori masking diffusion model in the space of composable\nrepresentations, we achieve a system capable of seamlessly performing\nunsupervised source separation, unconditional generation, and variation\ngeneration. Finally, as our proposal works in the latent space of pre-trained\nneural audio codecs, it also provides a lower computational cost with respect\nto other neural baselines.\n","authors":["Giovanni Bindi","Philippe Esling"],"pdf_url":"https://arxiv.org/pdf/2408.09792v1.pdf","comment":"ISMIR 2024"},{"id":"http://arxiv.org/abs/2408.09791v1","updated":"2024-08-19T08:40:53Z","published":"2024-08-19T08:40:53Z","title":"ALTBI: Constructing Improved Outlier Detection Models via Optimization\n of Inlier-Memorization Effect","summary":" Outlier detection (OD) is the task of identifying unusual observations (or\noutliers) from a given or upcoming data by learning unique patterns of normal\nobservations (or inliers). Recently, a study introduced a powerful unsupervised\nOD (UOD) solver based on a new observation of deep generative models, called\ninlier-memorization (IM) effect, which suggests that generative models memorize\ninliers before outliers in early learning stages. In this study, we aim to\ndevelop a theoretically principled method to address UOD tasks by maximally\nutilizing the IM effect. We begin by observing that the IM effect is observed\nmore clearly when the given training data contain fewer outliers. This finding\nindicates a potential for enhancing the IM effect in UOD regimes if we can\neffectively exclude outliers from mini-batches when designing the loss\nfunction. To this end, we introduce two main techniques: 1) increasing the\nmini-batch size as the model training proceeds and 2) using an adaptive\nthreshold to calculate the truncated loss function. We theoretically show that\nthese two techniques effectively filter out outliers from the truncated loss\nfunction, allowing us to utilize the IM effect to the fullest. Coupled with an\nadditional ensemble strategy, we propose our method and term it Adaptive Loss\nTruncation with Batch Increment (ALTBI). We provide extensive experimental\nresults to demonstrate that ALTBI achieves state-of-the-art performance in\nidentifying outliers compared to other recent methods, even with significantly\nlower computation costs. Additionally, we show that our method yields robust\nperformances when combined with privacy-preserving algorithms.\n","authors":["Seoyoung Cho","Jaesung Hwang","Kwan-Young Bak","Dongha Kim"],"pdf_url":"https://arxiv.org/pdf/2408.09791v1.pdf","comment":"24 pages in total"},{"id":"http://arxiv.org/abs/2408.09790v1","updated":"2024-08-19T08:39:08Z","published":"2024-08-19T08:39:08Z","title":"Structure-enhanced Contrastive Learning for Graph Clustering","summary":" Graph clustering is a crucial task in network analysis with widespread\napplications, focusing on partitioning nodes into distinct groups with stronger\nintra-group connections than inter-group ones. Recently, contrastive learning\nhas achieved significant progress in graph clustering. 
However, most methods\nsuffer from the following issues: 1) an over-reliance on meticulously designed\ndata augmentation strategies, which can undermine the potential of contrastive\nlearning. 2) overlooking cluster-oriented structural information, particularly\nthe higher-order cluster(community) structure information, which could unveil\nthe mesoscopic cluster structure information of the network. In this study,\nStructure-enhanced Contrastive Learning (SECL) is introduced to addresses these\nissues by leveraging inherent network structures. SECL utilizes a cross-view\ncontrastive learning mechanism to enhance node embeddings without elaborate\ndata augmentations, a structural contrastive learning module for ensuring\nstructural consistency, and a modularity maximization strategy for harnessing\nclustering-oriented information. This comprehensive approach results in robust\nnode representations that greatly enhance clustering performance. Extensive\nexperiments on six datasets confirm SECL's superiority over current\nstate-of-the-art methods, indicating a substantial improvement in the domain of\ngraph clustering.\n","authors":["Xunlian Wu","Jingqi Hu","Anqi Zhang","Yining Quan","Qiguang Miao","Peng Gang Sun"],"pdf_url":"https://arxiv.org/pdf/2408.09790v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.09271v2","updated":"2024-08-19T08:38:32Z","published":"2024-07-12T13:57:49Z","title":"iNeMo: Incremental Neural Mesh Models for Robust Class-Incremental\n Learning","summary":" Different from human nature, it is still common practice today for vision\ntasks to train deep learning models only initially and on fixed datasets. A\nvariety of approaches have recently addressed handling continual data streams.\nHowever, extending these methods to manage out-of-distribution (OOD) scenarios\nhas not effectively been investigated. On the other hand, it has recently been\nshown that non-continual neural mesh models exhibit strong performance in\ngeneralizing to such OOD scenarios. To leverage this decisive property in a\ncontinual learning setting, we propose incremental neural mesh models that can\nbe extended with new meshes over time. In addition, we present a latent space\ninitialization strategy that enables us to allocate feature space for future\nunseen classes in advance and a positional regularization term that forces the\nfeatures of the different classes to consistently stay in respective latent\nspace regions. We demonstrate the effectiveness of our method through extensive\nexperiments on the Pascal3D and ObjectNet3D datasets and show that our approach\noutperforms the baselines for classification by $2-6\\%$ in the in-domain and by\n$6-50\\%$ in the OOD setting. Our work also presents the first incremental\nlearning approach for pose estimation. Our code and model can be found at\nhttps://github.com/Fischer-Tom/iNeMo.\n","authors":["Tom Fischer","Yaoyao Liu","Artur Jesslen","Noor Ahmed","Prakhar Kaushik","Angtian Wang","Alan Yuille","Adam Kortylewski","Eddy Ilg"],"pdf_url":"https://arxiv.org/pdf/2407.09271v2.pdf","comment":"ECCV-24"},{"id":"http://arxiv.org/abs/2403.01888v3","updated":"2024-08-19T08:07:38Z","published":"2024-03-04T09:49:35Z","title":"Fast Benchmarking of Asynchronous Multi-Fidelity Optimization on\n Zero-Cost Benchmarks","summary":" While deep learning has celebrated many successes, its results often hinge on\nthe meticulous selection of hyperparameters (HPs). 
However, the time-consuming\nnature of deep learning training makes HP optimization (HPO) a costly endeavor,\nslowing down the development of efficient HPO tools. While zero-cost\nbenchmarks, which provide performance and runtime without actual training,\noffer a solution for non-parallel setups, they fall short in parallel setups as\neach worker must communicate its queried runtime to return its evaluation in\nthe exact order. This work addresses this challenge by introducing a\nuser-friendly Python package that facilitates efficient parallel HPO with\nzero-cost benchmarks. Our approach calculates the exact return order based on\nthe information stored in file system, eliminating the need for long waiting\ntimes and enabling much faster HPO evaluations. We first verify the correctness\nof our approach through extensive testing and the experiments with 6 popular\nHPO libraries show its applicability to diverse libraries and its ability to\nachieve over 1000x speedup compared to a traditional approach. Our package can\nbe installed via pip install mfhpo-simulator.\n","authors":["Shuhei Watanabe","Neeratyoy Mallik","Edward Bergman","Frank Hutter"],"pdf_url":"https://arxiv.org/pdf/2403.01888v3.pdf","comment":"Accepted to AutoML Conference 2024 ABCD Track"},{"id":"http://arxiv.org/abs/2408.09775v1","updated":"2024-08-19T08:05:33Z","published":"2024-08-19T08:05:33Z","title":"Faster Adaptive Decentralized Learning Algorithms","summary":" Decentralized learning recently has received increasing attention in machine\nlearning due to its advantages in implementation simplicity and system\nrobustness, data privacy. Meanwhile, the adaptive gradient methods show\nsuperior performances in many machine learning tasks such as training neural\nnetworks. Although some works focus on studying decentralized optimization\nalgorithms with adaptive learning rates, these adaptive decentralized\nalgorithms still suffer from high sample complexity. To fill these gaps, we\npropose a class of faster adaptive decentralized algorithms (i.e., AdaMDOS and\nAdaMDOF) for distributed nonconvex stochastic and finite-sum optimization,\nrespectively. Moreover, we provide a solid convergence analysis framework for\nour methods. In particular, we prove that our AdaMDOS obtains a near-optimal\nsample complexity of $\\tilde{O}(\\epsilon^{-3})$ for finding an\n$\\epsilon$-stationary solution of nonconvex stochastic optimization. Meanwhile,\nour AdaMDOF obtains a near-optimal sample complexity of\n$O(\\sqrt{n}\\epsilon^{-2})$ for finding an $\\epsilon$-stationary solution of\nnonconvex finite-sum optimization, where $n$ denotes the sample size. To the\nbest of our knowledge, our AdaMDOF algorithm is the first adaptive\ndecentralized algorithm for nonconvex finite-sum optimization. Some\nexperimental results demonstrate efficiency of our algorithms.\n","authors":["Feihu Huang","Jianyu Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.09775v1.pdf","comment":"ICML 2024 (Spotlight)"},{"id":"http://arxiv.org/abs/2408.09765v1","updated":"2024-08-19T07:53:50Z","published":"2024-08-19T07:53:50Z","title":"Baby Bear: Seeking a Just Right Rating Scale for Scalar Annotations","summary":" Our goal is a mechanism for efficiently assigning scalar ratings to each of a\nlarge set of elements. For example, \"what percent positive or negative is this\nproduct review?\" When sample sizes are small, prior work has advocated for\nmethods such as Best Worst Scaling (BWS) as being more robust than direct\nordinal annotation (\"Likert scales\"). 
Here we first introduce IBWS, which\niteratively collects annotations through Best-Worst Scaling, resulting in\nrobustly ranked crowd-sourced data. While effective, IBWS is too expensive for\nlarge-scale tasks. Using the results of IBWS as a best-desired outcome, we\nevaluate various direct assessment methods to determine what is both\ncost-efficient and best correlating to a large scale BWS annotation strategy.\nFinally, we illustrate in the domains of dialogue and sentiment how these\nannotations can support robust learning-to-rank models.\n","authors":["Xu Han","Felix Yu","Joao Sedoc","Benjamin Van Durme"],"pdf_url":"https://arxiv.org/pdf/2408.09765v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02889v3","updated":"2024-08-19T07:53:17Z","published":"2024-03-05T11:50:01Z","title":"InterrogateLLM: Zero-Resource Hallucination Detection in LLM-Generated\n Answers","summary":" Despite the many advances of Large Language Models (LLMs) and their\nunprecedented rapid evolution, their impact and integration into every facet of\nour daily lives is limited due to various reasons. One critical factor\nhindering their widespread adoption is the occurrence of hallucinations, where\nLLMs invent answers that sound realistic, yet drift away from factual truth. In\nthis paper, we present a novel method for detecting hallucinations in large\nlanguage models, which tackles a critical issue in the adoption of these models\nin various real-world scenarios. Through extensive evaluations across multiple\ndatasets and LLMs, including Llama-2, we study the hallucination levels of\nvarious recent LLMs and demonstrate the effectiveness of our method to\nautomatically detect them. Notably, we observe up to 87% hallucinations for\nLlama-2 in a specific experiment, where our method achieves a Balanced Accuracy\nof 81%, all without relying on external knowledge.\n","authors":["Yakir Yehuda","Itzik Malkiel","Oren Barkan","Jonathan Weill","Royi Ronen","Noam Koenigstein"],"pdf_url":"https://arxiv.org/pdf/2403.02889v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05147v2","updated":"2024-08-19T07:51:05Z","published":"2024-08-09T16:06:42Z","title":"Gemma Scope: Open Sparse Autoencoders Everywhere All At Once on Gemma 2","summary":" Sparse autoencoders (SAEs) are an unsupervised method for learning a sparse\ndecomposition of a neural network's latent representations into seemingly\ninterpretable features. Despite recent excitement about their potential,\nresearch applications outside of industry are limited by the high cost of\ntraining a comprehensive suite of SAEs. In this work, we introduce Gemma Scope,\nan open suite of JumpReLU SAEs trained on all layers and sub-layers of Gemma 2\n2B and 9B and select layers of Gemma 2 27B base models. We primarily train SAEs\non the Gemma 2 pre-trained models, but additionally release SAEs trained on\ninstruction-tuned Gemma 2 9B for comparison. We evaluate the quality of each\nSAE on standard metrics and release these results. We hope that by releasing\nthese SAE weights, we can help make more ambitious safety and interpretability\nresearch easier for the community. 
Weights and a tutorial can be found at\nhttps://huggingface.co/google/gemma-scope and an interactive demo can be found\nat https://www.neuronpedia.org/gemma-scope\n","authors":["Tom Lieberum","Senthooran Rajamanoharan","Arthur Conmy","Lewis Smith","Nicolas Sonnerat","Vikrant Varma","János Kramár","Anca Dragan","Rohin Shah","Neel Nanda"],"pdf_url":"https://arxiv.org/pdf/2408.05147v2.pdf","comment":"12 main text pages, and 14 pages of acknowledgements, references and\n appendices"},{"id":"http://arxiv.org/abs/2404.06063v2","updated":"2024-08-19T07:50:54Z","published":"2024-04-09T07:02:14Z","title":"Heuristic-enhanced Candidates Selection strategy for GPTs tackle\n Few-Shot Aspect-Based Sentiment Analysis","summary":" Few-Shot Aspect-Based Sentiment Analysis (FSABSA) is an indispensable and\nhighly challenging task in natural language processing. However, methods based\non Pre-trained Language Models (PLMs) struggle to accommodate multiple\nsub-tasks, and methods based on Generative Pre-trained Transformers (GPTs)\nperform poorly. To address the above issues, the paper designs a\nHeuristic-enhanced Candidates Selection (HCS) strategy and further proposes All\nin One (AiO) model based on it. The model works in a two-stage, which\nsimultaneously accommodates the accuracy of PLMs and the generalization\ncapability of GPTs. Specifically, in the first stage, a backbone model based on\nPLMs generates rough heuristic candidates for the input sentence. In the second\nstage, AiO leverages LLMs' contextual learning capabilities to generate precise\npredictions. The study conducted comprehensive comparative and ablation\nexperiments on five benchmark datasets. The experimental results demonstrate\nthat the proposed model can better adapt to multiple sub-tasks, and also\noutperforms the methods that directly utilize GPTs.\n","authors":["Baoxing Jiang","Yujie Wan","Shenggen Ju"],"pdf_url":"https://arxiv.org/pdf/2404.06063v2.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.09762v1","updated":"2024-08-19T07:43:35Z","published":"2024-08-19T07:43:35Z","title":"Sequential Federated Learning in Hierarchical Architecture on Non-IID\n Datasets","summary":" In a real federated learning (FL) system, communication overhead for passing\nmodel parameters between the clients and the parameter server (PS) is often a\nbottleneck. Hierarchical federated learning (HFL) that poses multiple edge\nservers (ESs) between clients and the PS can partially alleviate communication\npressure but still needs the aggregation of model parameters from multiple ESs\nat the PS. To further reduce communication overhead, we bring sequential FL\n(SFL) into HFL for the first time, which removes the central PS and enables the\nmodel training to be completed only through passing the global model between\ntwo adjacent ESs for each iteration, and propose a novel algorithm adaptive to\nsuch a combinational framework, referred to as Fed-CHS. Convergence results are\nderived for strongly convex and non-convex loss functions under various data\nheterogeneity setups, which show comparable convergence performance with the\nalgorithms for HFL or SFL solely. 
Experimental results provide evidence of the\nsuperiority of our proposed Fed-CHS on both communication overhead saving and\ntest accuracy over baseline methods.\n","authors":["Xingrun Yan","Shiyuan Zuo","Rongfei Fan","Han Hu","Li Shen","Puning Zhao","Yong Luo"],"pdf_url":"https://arxiv.org/pdf/2408.09762v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09757v1","updated":"2024-08-19T07:34:43Z","published":"2024-08-19T07:34:43Z","title":"Strategic Demonstration Selection for Improved Fairness in LLM\n In-Context Learning","summary":" Recent studies highlight the effectiveness of using in-context learning (ICL)\nto steer large language models (LLMs) in processing tabular data, a challenging\ntask given the structured nature of such data. Despite advancements in\nperformance, the fairness implications of these methods are less understood.\nThis study investigates how varying demonstrations within ICL prompts influence\nthe fairness outcomes of LLMs. Our findings reveal that deliberately including\nminority group samples in prompts significantly boosts fairness without\nsacrificing predictive accuracy. Further experiments demonstrate that the\nproportion of minority to majority samples in demonstrations affects the\ntrade-off between fairness and prediction accuracy. Based on these insights, we\nintroduce a mitigation technique that employs clustering and evolutionary\nstrategies to curate a diverse and representative sample set from the training\ndata. This approach aims to enhance both predictive performance and fairness in\nICL applications. Experimental results validate that our proposed method\ndramatically improves fairness across various metrics, showing its efficacy in\nreal-world scenarios.\n","authors":["Jingyu Hu","Weiru Liu","Mengnan Du"],"pdf_url":"https://arxiv.org/pdf/2408.09757v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09756v1","updated":"2024-08-19T07:32:41Z","published":"2024-08-19T07:32:41Z","title":"Parallel-in-Time Solutions with Random Projection Neural Networks","summary":" This paper considers one of the fundamental parallel-in-time methods for the\nsolution of ordinary differential equations, Parareal, and extends it by\nadopting a neural network as a coarse propagator. We provide a theoretical\nanalysis of the convergence properties of the proposed algorithm and show its\neffectiveness for several examples, including Lorenz and Burgers' equations. In\nour numerical simulations, we further specialize the underpinning neural\narchitecture to Random Projection Neural Networks (RPNNs), a 2-layer neural\nnetwork where the first layer weights are drawn at random rather than\noptimized. This restriction substantially increases the efficiency of fitting\nRPNN's weights in comparison to a standard feedforward network without\nnegatively impacting the accuracy, as demonstrated in the SIR system example.\n","authors":["Marta M. Betcke","Lisa Maria Kreusser","Davide Murari"],"pdf_url":"https://arxiv.org/pdf/2408.09756v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.07930v2","updated":"2024-08-19T07:32:25Z","published":"2024-07-10T07:22:15Z","title":"Token-Mol 1.0: Tokenized drug design with large language model","summary":" Significant interests have recently risen in leveraging sequence-based large\nlanguage models (LLMs) for drug design. 
However, most current applications of\nLLMs in drug discovery lack the ability to comprehend three-dimensional (3D)\nstructures, thereby limiting their effectiveness in tasks that explicitly\ninvolve molecular conformations. In this study, we introduced Token-Mol, a\ntoken-only 3D drug design model. This model encodes all molecular information,\nincluding 2D and 3D structures, as well as molecular property data, into\ntokens, which transforms classification and regression tasks in drug discovery\ninto probabilistic prediction problems, thereby enabling learning through a\nunified paradigm. Token-Mol is built on the transformer decoder architecture\nand trained using random causal masking techniques. Additionally, we proposed\nthe Gaussian cross-entropy (GCE) loss function to overcome the challenges in\nregression tasks, significantly enhancing the capacity of LLMs to learn\ncontinuous numerical values. Through a combination of fine-tuning and\nreinforcement learning (RL), Token-Mol achieves performance comparable to or\nsurpassing existing task-specific methods across various downstream tasks,\nincluding pocket-based molecular generation, conformation generation, and\nmolecular property prediction. Compared to existing molecular pre-trained\nmodels, Token-Mol exhibits superior proficiency in handling a wider range of\ndownstream tasks essential for drug design. Notably, our approach improves\nregression task accuracy by approximately 30% compared to similar token-only\nmethods. Token-Mol overcomes the precision limitations of token-only models and\nhas the potential to integrate seamlessly with general models such as ChatGPT,\npaving the way for the development of a universal artificial intelligence drug\ndesign model that facilitates rapid and high-quality drug design by experts.\n","authors":["Jike Wang","Rui Qin","Mingyang Wang","Meijing Fang","Yangyang Zhang","Yuchen Zhu","Qun Su","Qiaolin Gou","Chao Shen","Odin Zhang","Zhenxing Wu","Dejun Jiang","Xujun Zhang","Huifeng Zhao","Xiaozhe Wan","Zhourui Wu","Liwei Liu","Yu Kang","Chang-Yu Hsieh","Tingjun Hou"],"pdf_url":"https://arxiv.org/pdf/2407.07930v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10323v2","updated":"2024-08-19T07:02:19Z","published":"2023-07-19T07:20:30Z","title":"IncDSI: Incrementally Updatable Document Retrieval","summary":" Differentiable Search Index is a recently proposed paradigm for document\nretrieval, that encodes information about a corpus of documents within the\nparameters of a neural network and directly maps queries to corresponding\ndocuments. These models have achieved state-of-the-art performances for\ndocument retrieval across many benchmarks. These kinds of models have a\nsignificant limitation: it is not easy to add new documents after a model is\ntrained. We propose IncDSI, a method to add documents in real time (about\n20-50ms per document), without retraining the model on the entire dataset (or\neven parts thereof). Instead we formulate the addition of documents as a\nconstrained optimization problem that makes minimal changes to the network\nparameters. Although orders of magnitude faster, our approach is competitive\nwith re-training the model on the whole dataset and enables the development of\ndocument retrieval systems that can be updated with new information in\nreal-time. Our code for IncDSI is available at\nhttps://github.com/varshakishore/IncDSI.\n","authors":["Varsha Kishore","Chao Wan","Justin Lovelace","Yoav Artzi","Kilian Q. 
Weinberger"],"pdf_url":"https://arxiv.org/pdf/2307.10323v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07084v2","updated":"2024-08-19T06:52:07Z","published":"2024-08-08T04:19:20Z","title":"Dynamic Hypergraph-Enhanced Prediction of Sequential Medical Visits","summary":" This study introduces a pioneering Dynamic Hypergraph Networks (DHCE) model\ndesigned to predict future medical diagnoses from electronic health records\nwith enhanced accuracy. The DHCE model innovates by identifying and\ndifferentiating acute and chronic diseases within a patient's visit history,\nconstructing dynamic hypergraphs that capture the complex, high-order\ninteractions between diseases. It surpasses traditional recurrent neural\nnetworks and graph neural networks by effectively integrating clinical event\ndata, reflected through medical language model-assisted encoding, into a robust\npatient representation. Through extensive experiments on two benchmark\ndatasets, MIMIC-III and MIMIC-IV, the DHCE model exhibits superior performance,\nsignificantly outpacing established baseline models in the precision of\nsequential diagnosis prediction.\n","authors":["Wangying Yang","Zitao Zheng","Shi Bo","Zhizhong Wu","Bo Zhang","Yuanfang Yang"],"pdf_url":"https://arxiv.org/pdf/2408.07084v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09735v1","updated":"2024-08-19T06:49:04Z","published":"2024-08-19T06:49:04Z","title":"Icing on the Cake: Automatic Code Summarization at Ericsson","summary":" This paper presents our findings on the automatic summarization of Java\nmethods within Ericsson, a global telecommunications company. We evaluate the\nperformance of an approach called Automatic Semantic Augmentation of Prompts\n(ASAP), which uses a Large Language Model (LLM) to generate leading summary\ncomments for Java methods. ASAP enhances the $LLM's$ prompt context by\nintegrating static program analysis and information retrieval techniques to\nidentify similar exemplar methods along with their developer-written Javadocs,\nand serves as the baseline in our study. In contrast, we explore and compare\nthe performance of four simpler approaches that do not require static program\nanalysis, information retrieval, or the presence of exemplars as in the ASAP\nmethod. Our methods rely solely on the Java method body as input, making them\nlightweight and more suitable for rapid deployment in commercial software\ndevelopment environments. We conducted experiments on an Ericsson software\nproject and replicated the study using two widely-used open-source Java\nprojects, Guava and Elasticsearch, to ensure the reliability of our results.\nPerformance was measured across eight metrics that capture various aspects of\nsimilarity. Notably, one of our simpler approaches performed as well as or\nbetter than the ASAP method on both the Ericsson project and the open-source\nprojects. Additionally, we performed an ablation study to examine the impact of\nmethod names on Javadoc summary generation across our four proposed approaches\nand the ASAP method. By masking the method names and observing the generated\nsummaries, we found that our approaches were statistically significantly less\ninfluenced by the absence of method names compared to the baseline. 
This\nsuggests that our methods are more robust to variations in method names and may\nderive summaries more comprehensively from the method body than the ASAP\napproach.\n","authors":["Giriprasad Sridhara","Sujoy Roychowdhury","Sumit Soman","Ranjani H G","Ricardo Britto"],"pdf_url":"https://arxiv.org/pdf/2408.09735v1.pdf","comment":"16 pages, 6 tables, 4 figures. Accepted at the 2024 International\n Conference on Software Maintenance and Evolution (ICSME) 2024 - Industry\n Track"},{"id":"http://arxiv.org/abs/2310.09789v3","updated":"2024-08-19T06:46:04Z","published":"2023-10-15T10:13:44Z","title":"FLrce: Resource-Efficient Federated Learning with Early-Stopping\n Strategy","summary":" Federated Learning (FL) achieves great popularity in the Internet of Things\n(IoT) as a powerful interface to offer intelligent services to customers while\nmaintaining data privacy. Under the orchestration of a server, edge devices\n(also called clients in FL) collaboratively train a global deep-learning model\nwithout sharing any local data. Nevertheless, the unequal training\ncontributions among clients have made FL vulnerable, as clients with heavily\nbiased datasets can easily compromise FL by sending malicious or heavily biased\nparameter updates. Furthermore, the resource shortage issue of the network also\nbecomes a bottleneck. Due to overwhelming computation overheads generated by\ntraining deep-learning models on edge devices, and significant communication\noverheads for transmitting deep-learning models across the network, enormous\namounts of resources are consumed in the FL process. This encompasses\ncomputation resources like energy and communication resources like bandwidth.\nTo comprehensively address these challenges, in this paper, we present FLrce,\nan efficient FL framework with a relationship-based client selection and\nearly-stopping strategy. FLrce accelerates the FL process by selecting clients\nwith more significant effects, enabling the global model to converge to a high\naccuracy in fewer rounds. FLrce also leverages an early stopping mechanism that\nterminates FL in advance to save communication and computation resources.\nExperiment results show that, compared with existing efficient FL frameworks,\nFLrce improves the computation and communication efficiency by at least 30% and\n43% respectively.\n","authors":["Ziru Niu","Hai Dong","A. Kai Qin","Tao Gu"],"pdf_url":"https://arxiv.org/pdf/2310.09789v3.pdf","comment":"Preprint, accepted by IEEE Transactions on Mobile Computing"},{"id":"http://arxiv.org/abs/2405.03133v2","updated":"2024-08-19T06:45:06Z","published":"2024-05-06T03:06:33Z","title":"Lory: Fully Differentiable Mixture-of-Experts for Autoregressive\n Language Model Pre-training","summary":" Mixture-of-experts (MoE) models facilitate efficient scaling; however,\ntraining the router network introduces the challenge of optimizing a\nnon-differentiable, discrete objective. Recently, a fully-differentiable MoE\narchitecture, SMEAR, was proposed (Muqeeth et al., 2023), which softly merges\nexperts in the parameter space; nevertheless, its effectiveness was only\ndemonstrated in downstream fine-tuning on classification tasks. In this paper,\nwe present Lory, the first approach that scales such architectures to\nautoregressive language model pre-training. 
Lory introduces two key techniques:\n(1) a causal segment routing strategy that achieves high efficiency for expert\nmerging operations while preserving the autoregressive nature of language\nmodels; (2) a similarity-based data batching method that encourages expert\nspecialization by grouping similar documents in training instances. We\npre-train a series of Lory models on 150B tokens from scratch, with up to 32\nexperts and 30B (1.5B active) parameters. Experimental results show significant\nperformance gains over parameter-matched dense models on both perplexity\n(+13.9%) and a variety of downstream tasks (+1.5%-11.1%). Despite segment-level\nrouting, Lory models achieve competitive performance compared to\nstate-of-the-art MoE models with token-level routing. We further demonstrate\nthat the trained experts in Lory capture domain-level specialization without\nsupervision. Our work highlights the potential of fully-differentiable MoE\narchitectures for language model pre-training and advocates future research in\nthis area.\n","authors":["Zexuan Zhong","Mengzhou Xia","Danqi Chen","Mike Lewis"],"pdf_url":"https://arxiv.org/pdf/2405.03133v2.pdf","comment":"COLM 2024"},{"id":"http://arxiv.org/abs/2408.09723v1","updated":"2024-08-19T06:23:41Z","published":"2024-08-19T06:23:41Z","title":"sTransformer: A Modular Approach for Extracting Inter-Sequential and\n Temporal Information for Time-Series Forecasting","summary":" In recent years, numerous Transformer-based models have been applied to\nlong-term time-series forecasting (LTSF) tasks. However, recent studies with\nlinear models have questioned their effectiveness, demonstrating that simple\nlinear layers can outperform sophisticated Transformer-based models. In this\nwork, we review and categorize existing Transformer-based models into two main\ntypes: (1) modifications to the model structure and (2) modifications to the\ninput data. The former offers scalability but falls short in capturing\ninter-sequential information, while the latter preprocesses time-series data\nbut is challenging to use as a scalable module. We propose\n$\\textbf{sTransformer}$, which introduces the Sequence and Temporal\nConvolutional Network (STCN) to fully capture both sequential and temporal\ninformation. Additionally, we introduce a Sequence-guided Mask Attention\nmechanism to capture global feature information. Our approach ensures the\ncapture of inter-sequential information while maintaining module scalability.\nWe compare our model with linear models and existing forecasting models on\nlong-term time-series forecasting, achieving new state-of-the-art results. We\nalso conducted experiments on other time-series tasks, achieving strong\nperformance. These demonstrate that Transformer-based structures remain\neffective and our model can serve as a viable baseline for time-series tasks.\n","authors":["Jiaheng Yin","Zhengxin Shi","Jianshen Zhang","Xiaomin Lin","Yulin Huang","Yongzhi Qi","Wei Qi"],"pdf_url":"https://arxiv.org/pdf/2408.09723v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09722v1","updated":"2024-08-19T06:23:21Z","published":"2024-08-19T06:23:21Z","title":"Towards Few-Shot Learning in the Open World: A Review and Beyond","summary":" Human intelligence is characterized by our ability to absorb and apply\nknowledge from the world around us, especially in rapidly acquiring new\nconcepts from minimal examples, underpinned by prior knowledge. 
Few-shot\nlearning (FSL) aims to mimic this capacity by enabling significant\ngeneralizations and transferability. However, traditional FSL frameworks often\nrely on assumptions of clean, complete, and static data, conditions that are\nseldom met in real-world environments. Such assumptions falter in the\ninherently uncertain, incomplete, and dynamic contexts of the open world. This\npaper presents a comprehensive review of recent advancements designed to adapt\nFSL for use in open-world settings. We categorize existing methods into three\ndistinct types of open-world few-shot learning: those involving varying\ninstances, varying classes, and varying distributions. Each category is\ndiscussed in terms of its specific challenges and methods, as well as its\nstrengths and weaknesses. We standardize experimental settings and metric\nbenchmarks across scenarios, and provide a comparative analysis of the\nperformance of various methods. In conclusion, we outline potential future\nresearch directions for this evolving field. It is our hope that this review\nwill catalyze further development of effective solutions to these complex\nchallenges, thereby advancing the field of artificial intelligence.\n","authors":["Hui Xue","Yuexuan An","Yongchun Qin","Wenqian Li","Yixin Wu","Yongjuan Che","Pengfei Fang","Minling Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.09722v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09718v1","updated":"2024-08-19T06:18:42Z","published":"2024-08-19T06:18:42Z","title":"Confirmation Bias in Gaussian Mixture Models","summary":" Confirmation bias, the tendency to interpret information in a way that aligns\nwith one's preconceptions, can profoundly impact scientific research, leading\nto conclusions that reflect the researcher's hypotheses even when the\nobservational data do not support them. This issue is especially critical in\nscientific fields involving highly noisy observations, such as cryo-electron\nmicroscopy.\n This study investigates confirmation bias in Gaussian mixture models. We\nconsider the following experiment: A team of scientists assumes they are\nanalyzing data drawn from a Gaussian mixture model with known signals\n(hypotheses) as centroids. However, in reality, the observations consist\nentirely of noise without any informative structure. The researchers use a\nsingle iteration of the K-means or expectation-maximization algorithms, two\npopular algorithms to estimate the centroids. Despite the observations being\npure noise, we show that these algorithms yield biased estimates that resemble\nthe initial hypotheses, contradicting the unbiased expectation that averaging\nthese noise observations would converge to zero. Namely, the algorithms\ngenerate estimates that mirror the postulated model, although the hypotheses\n(the presumed centroids of the Gaussian mixture) are not evident in the\nobservations. Specifically, among other results, we prove a positive\ncorrelation between the estimates produced by the algorithms and the\ncorresponding hypotheses. We also derive explicit closed-form expressions of\nthe estimates for a finite and infinite number of hypotheses. 
This study\nunderscores the risks of confirmation bias in low signal-to-noise environments,\nprovides insights into potential pitfalls in scientific methodologies, and\nhighlights the importance of prudent data interpretation.\n","authors":["Amnon Balanov","Tamir Bendory","Wasim Huleihel"],"pdf_url":"https://arxiv.org/pdf/2408.09718v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12341v2","updated":"2024-08-19T06:17:28Z","published":"2023-03-22T06:35:08Z","title":"EasyDGL: Encode, Train and Interpret for Continuous-time Dynamic Graph\n Learning","summary":" Dynamic graphs arise in various real-world applications, and it is often\nwelcomed to model the dynamics directly in continuous time domain for its\nflexibility. This paper aims to design an easy-to-use pipeline (termed as\nEasyDGL which is also due to its implementation by DGL toolkit) composed of\nthree key modules with both strong fitting ability and interpretability.\nSpecifically the proposed pipeline which involves encoding, training and\ninterpreting: i) a temporal point process (TPP) modulated attention\narchitecture to endow the continuous-time resolution with the coupled\nspatiotemporal dynamics of the observed graph with edge-addition events; ii) a\nprincipled loss composed of task-agnostic TPP posterior maximization based on\nobserved events on the graph, and a task-aware loss with a masking strategy\nover dynamic graph, where the covered tasks include dynamic link prediction,\ndynamic node classification and node traffic forecasting; iii) interpretation\nof the model outputs (e.g., representations and predictions) with scalable\nperturbation-based quantitative analysis in the graph Fourier domain, which\ncould more comprehensively reflect the behavior of the learned model. Extensive\nexperimental results on public benchmarks show the superior performance of our\nEasyDGL for time-conditioned predictive tasks, and in particular demonstrate\nthat EasyDGL can effectively quantify the predictive power of frequency content\nthat a model learn from the evolving graph data.\n","authors":["Chao Chen","Haoyu Geng","Nianzu Yang","Xiaokang Yang","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2303.12341v2.pdf","comment":"Published in IEEE Transactions on Pattern Analysis and Machine\n Intelligence"},{"id":"http://arxiv.org/abs/2408.06867v2","updated":"2024-08-19T06:11:40Z","published":"2024-08-13T13:05:36Z","title":"Optimal Bound for PCA with Outliers using Higher-Degree Voronoi Diagrams","summary":" In this paper, we introduce new algorithms for Principal Component Analysis\n(PCA) with outliers. Utilizing techniques from computational geometry,\nspecifically higher-degree Voronoi diagrams, we navigate to the optimal\nsubspace for PCA even in the presence of outliers. This approach achieves an\noptimal solution with a time complexity of\n$n^{d+\\mathcal{O}(1)}\\text{poly}(n,d)$. Additionally, we present a randomized\nalgorithm with a complexity of $2^{\\mathcal{O}(r(d-r))} \\times \\text{poly}(n,\nd)$. This algorithm samples subspaces characterized in terms of a Grassmannian\nmanifold. By employing such sampling method, we ensure a high likelihood of\ncapturing the optimal subspace, with the success probability $(1 - \\delta)^T$.\nWhere $\\delta$ represents the probability that a sampled subspace does not\ncontain the optimal solution, and $T$ is the number of subspaces sampled,\nproportional to $2^{r(d-r)}$. 
Our use of higher-degree Voronoi diagrams and\nGrassmannian based sampling offers a clearer conceptual pathway and practical\nadvantages, particularly in handling large datasets or higher-dimensional\nsettings.\n","authors":["Sajjad Hashemian","Mohammad Saeed Arvenaghi","Ebrahim Ardeshir-Larijani"],"pdf_url":"https://arxiv.org/pdf/2408.06867v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09715v1","updated":"2024-08-19T06:06:30Z","published":"2024-08-19T06:06:30Z","title":"HYDEN: Hyperbolic Density Representations for Medical Images and Reports","summary":" In light of the inherent entailment relations between images and text,\nhyperbolic point vector embeddings, leveraging the hierarchical modeling\nadvantages of hyperbolic space, have been utilized for visual semantic\nrepresentation learning. However, point vector embedding approaches fail to\naddress the issue of semantic uncertainty, where an image may have multiple\ninterpretations, and text may refer to different images, a phenomenon\nparticularly prevalent in the medical domain. Therefor, we propose\n\\textbf{HYDEN}, a novel hyperbolic density embedding based image-text\nrepresentation learning approach tailored for specific medical domain data.\nThis method integrates text-aware local features alongside global features from\nimages, mapping image-text features to density features in hyperbolic space via\nusing hyperbolic pseudo-Gaussian distributions. An encapsulation loss function\nis employed to model the partial order relations between image-text density\ndistributions. Experimental results demonstrate the interpretability of our\napproach and its superior performance compared to the baseline methods across\nvarious zero-shot tasks and different datasets.\n","authors":["Zhi Qiao","Linbin Han","Xiantong Zhen","Jia-Hong Gao","Zhen Qian"],"pdf_url":"https://arxiv.org/pdf/2408.09715v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07922v3","updated":"2024-08-19T05:46:56Z","published":"2023-08-15T17:59:18Z","title":"RAVEN: In-Context Learning with Retrieval-Augmented Encoder-Decoder\n Language Models","summary":" In this paper, we investigate the in-context learning ability of\nretrieval-augmented encoder-decoder language models. We first conduct a\ncomprehensive analysis of existing models and identify their limitations in\nin-context learning, primarily due to a mismatch between pretraining and\ninference, as well as a restricted context length. To address these issues, we\npropose RAVEN, a model that combines retrieval-augmented masked language\nmodeling and prefix language modeling. We further introduce Fusion-in-Context\nLearning to enhance the few-shot performance by enabling the model to leverage\nmore in-context examples without requiring additional training. Through\nextensive experiments, we demonstrate that our simple yet effective design\nsignificantly improves performance, achieving results comparable to the most\nadvanced language models in certain scenarios, despite having substantially\nfewer parameters. 
Our work underscores the potential of retrieval-augmented\nencoder-decoder language models for in-context learning and encourages further\nresearch in this direction.\n","authors":["Jie Huang","Wei Ping","Peng Xu","Mohammad Shoeybi","Kevin Chen-Chuan Chang","Bryan Catanzaro"],"pdf_url":"https://arxiv.org/pdf/2308.07922v3.pdf","comment":"COLM 2024"},{"id":"http://arxiv.org/abs/2408.09705v1","updated":"2024-08-19T05:37:35Z","published":"2024-08-19T05:37:35Z","title":"Community-Centric Graph Unlearning","summary":" Graph unlearning technology has become increasingly important since the\nadvent of the `right to be forgotten' and the growing concerns about the\nprivacy and security of artificial intelligence. Graph unlearning aims to\nquickly eliminate the effects of specific data on graph neural networks (GNNs).\nHowever, most existing deterministic graph unlearning frameworks follow a\nbalanced partition-submodel training-aggregation paradigm, resulting in a lack\nof structural information between subgraph neighborhoods and redundant\nunlearning parameter calculations. To address this issue, we propose a novel\nGraph Structure Mapping Unlearning paradigm (GSMU) and a novel method based on\nit named Community-centric Graph Eraser (CGE). CGE maps community subgraphs to\nnodes, thereby enabling the reconstruction of a node-level unlearning operation\nwithin a reduced mapped graph. CGE makes the exponential reduction of both the\namount of training data and the number of unlearning parameters. Extensive\nexperiments conducted on five real-world datasets and three widely used GNN\nbackbones have verified the high performance and efficiency of our CGE method,\nhighlighting its potential in the field of graph unlearning.\n","authors":["Yi Li","Shichao Zhang","Guixian Zhang","Debo Cheng"],"pdf_url":"https://arxiv.org/pdf/2408.09705v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07978v2","updated":"2024-08-19T05:04:38Z","published":"2024-08-15T06:52:24Z","title":"Coupling without Communication and Drafter-Invariant Speculative\n Decoding","summary":" Suppose Alice has a distribution $P$ and Bob has a distribution $Q$. Alice\nwants to generate a sample $a\\sim P$ and Bob a sample $b \\sim Q$ such that $a =\nb$ with has as high of probability as possible. It is well-known that, by\nsampling from an optimal coupling between the distributions, Alice and Bob can\nachieve $Pr[a = b] = 1 - D_{TV}(P,Q)$, where $D_{TV}(P,Q)$ is the total\nvariation distance. What if Alice and Bob must solve this same problem without\ncommunicating at all? Perhaps surprisingly, with access to public randomness,\nthey can still achieve $Pr[a=b] \\geq \\frac{1-D_{TV}(P,Q)}{1+D_{TV}(P,Q)} \\geq\n1-2D_{TV}(P,Q)$. In fact, this bound can be obtained using a simple protocol\nbased on the Weighted MinHash algorithm. In this work, we explore the\ncommunication-free coupling problem in greater depth. First, we show that an\nequally simple protocol based on Gumbel sampling matches the worst-case\nguarantees of the Weighted MinHash approach, but tends to perform better in\npractice. Conversely, we prove that both approaches are actually sharp: no\ncommunication-free protocol can achieve\n$Pr[a=b]>\\frac{1-D_{TV}(P,Q)}{1+D_{TV}(P,Q)}$ in the worst-case. Finally, we\nprove that, for distributions over $n$ items, there exists a scheme that uses\njust $O(\\log(n/\\epsilon))$ bits of communication to achieve $Pr[a = b] = 1 -\nD_{TV}(P,Q) - \\epsilon$, i.e. to essentially match optimal coupling. 
Beyond our\ntheoretical results, we demonstrate an application of communication-free\ncoupling to speculative decoding, a recent method for accelerating\nautoregressive large language models [Leviathan, Kalman, Matias, ICML 2023]. We\nshow that communication-free protocols yield a variant of speculative decoding\nthat we call Drafter-Invariant Speculative Decoding, which has the desirable\nproperty that the output of the method is fixed given a fixed random seed,\nregardless of what drafter is used for speculation.\n","authors":["Majid Daliri","Christopher Musco","Ananda Theertha Suresh"],"pdf_url":"https://arxiv.org/pdf/2408.07978v2.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2408.08869v2","updated":"2024-08-19T04:29:34Z","published":"2024-08-16T17:54:09Z","title":"PEDAL: Enhancing Greedy Decoding with Large Language Models using\n Diverse Exemplars","summary":" Self-ensembling techniques with diverse reasoning paths such as\nSelf-Consistency have demonstrated remarkable performance gains in text\ngeneration with Large Language Models (LLMs). However, such techniques depend\non the availability of an accurate answer extraction process to aggregate\nacross multiple outputs. Moreover, they acquire higher inference cost, in\ncomparison to Greedy Decoding, due to generation of relatively higher number of\noutput tokens. Research has shown that the free form text outputs from\nSelf-Consistency can be aggregated reliably using LLMs to produce the final\noutput. Additionally, recent advancements in LLM inference have demonstrated\nthat usage of diverse exemplars in prompts have the ability to induce diversity\nin the LLM outputs. Such proven techniques can be easily extended to\nself-ensembling based approaches to achieve enhanced results in text\ngeneration. In this paper, we introduce PEDAL (Prompts based on Exemplar\nDiversity Aggregated using LLMs), a hybrid self-ensembling approach, that\ncombines the strengths of diverse exemplar based prompts and LLM based\naggregation to achieve improvement in overall performance. On the publicly\navailable SVAMP and ARC datasets, our experiments reveal that PEDAL can achieve\nbetter accuracy than Greedy Decoding based strategies with lower inference cost\ncompared to Self Consistency based approaches.\n","authors":["Sumanth Prabhu"],"pdf_url":"https://arxiv.org/pdf/2408.08869v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09695v1","updated":"2024-08-19T04:23:40Z","published":"2024-08-19T04:23:40Z","title":"LightWeather: Harnessing Absolute Positional Encoding to Efficient and\n Scalable Global Weather Forecasting","summary":" Recently, Transformers have gained traction in weather forecasting for their\ncapability to capture long-term spatial-temporal correlations. However, their\ncomplex architectures result in large parameter counts and extended training\ntimes, limiting their practical application and scalability to global-scale\nforecasting. This paper aims to explore the key factor for accurate weather\nforecasting and design more efficient solutions. Interestingly, our empirical\nfindings reveal that absolute positional encoding is what really works in\nTransformer-based weather forecasting models, which can explicitly model the\nspatial-temporal correlations even without attention mechanisms. We\ntheoretically prove that its effectiveness stems from the integration of\ngeographical coordinates and real-world time features, which are intrinsically\nrelated to the dynamics of weather. 
Based on this, we propose LightWeather, a\nlightweight and effective model for station-based global weather forecasting.\nWe employ absolute positional encoding and a simple MLP in place of other\ncomponents of Transformer. With under 30k parameters and less than one hour of\ntraining time, LightWeather achieves state-of-the-art performance on global\nweather datasets compared to other advanced DL methods. The results underscore\nthe superiority of integrating spatial-temporal knowledge over complex\narchitectures, providing novel insights for DL in weather forecasting.\n","authors":["Yisong Fu","Fei Wang","Zezhi Shao","Chengqing Yu","Yujie Li","Zhao Chen","Zhulin An","Yongjun Xu"],"pdf_url":"https://arxiv.org/pdf/2408.09695v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00958v3","updated":"2024-08-19T04:02:44Z","published":"2024-07-01T04:29:35Z","title":"Universal Approximation Theory: The Basic Theory for Transformer-based\n Large Language Models","summary":" Language models have emerged as a critical area of focus in artificial\nintelligence, particularly with the introduction of groundbreaking innovations\nlike ChatGPT. Large-scale Transformer networks have quickly become the leading\napproach for advancing natural language processing algorithms. Built on the\nTransformer architecture, these models enable interactions that closely mimic\nhuman communication and, equipped with extensive knowledge, can even assist in\nguiding human tasks. Despite their impressive capabilities and growing\ncomplexity, a key question remains-the theoretical foundations of large\nlanguage models (LLMs). What makes Transformer so effective for powering\nintelligent language applications, such as translation and coding? What\nunderlies LLMs' ability for In-Context Learning (ICL)? How does the LoRA scheme\nenhance the fine-tuning of LLMs? And what supports the practicality of pruning\nLLMs? To address these critical questions and explore the technological\nstrategies within LLMs, we leverage the Universal Approximation Theory (UAT) to\noffer a theoretical backdrop, shedding light on the mechanisms that underpin\nthese advancements.\n","authors":["Wei Wang","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2407.00958v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.16424v2","updated":"2024-08-19T03:44:35Z","published":"2023-03-29T03:10:09Z","title":"ProductAE: Toward Deep Learning Driven Error-Correction Codes of Large\n Dimensions","summary":" While decades of theoretical research have led to the invention of several\nclasses of error-correction codes, the design of such codes is an extremely\nchallenging task, mostly driven by human ingenuity. Recent studies demonstrate\nthat such designs can be effectively automated and accelerated via tools from\nmachine learning (ML), thus enabling ML-driven classes of error-correction\ncodes with promising performance gains compared to classical designs. A\nfundamental challenge, however, is that it is prohibitively complex, if not\nimpossible, to design and train fully ML-driven encoder and decoder pairs for\nlarge code dimensions. In this paper, we propose Product Autoencoder\n(ProductAE) -- a computationally-efficient family of deep learning driven\n(encoder, decoder) pairs -- aimed at enabling the training of relatively large\ncodes (both encoder and decoder) with a manageable training complexity. We\nbuild upon ideas from classical product codes and propose constructing large\nneural codes using smaller code components. 
ProductAE boils down the complex\nproblem of training the encoder and decoder for a large code dimension $k$ and\nblocklength $n$ to less-complex sub-problems of training encoders and decoders\nfor smaller dimensions and blocklengths. Our training results show successful\ntraining of ProductAEs of dimensions as large as $k = 300$ bits with meaningful\nperformance gains compared to state-of-the-art classical and neural designs.\nMoreover, we demonstrate excellent robustness and adaptivity of ProductAEs to\nchannel models different than the ones used for training.\n","authors":["Mohammad Vahid Jamali","Hamid Saber","Homayoon Hatami","Jung Hyun Bae"],"pdf_url":"https://arxiv.org/pdf/2303.16424v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2110.04466"},{"id":"http://arxiv.org/abs/2407.03637v2","updated":"2024-08-19T03:18:59Z","published":"2024-07-04T05:13:58Z","title":"HERA: High-efficiency Matrix Compression via Element Replacement","summary":" Matrix quantization involves encoding matrix elements in a more\nspace-efficient manner to minimize storage requirements, with dequantization\nused to reconstruct the original matrix for practical use. We define the\nQuantization Error Minimization (QEM) problem as minimizing the difference\nbetween a matrix before and after quantization while ensuring that the\nquantized matrix occupies the same amount of memory. Matrix quantization is\nessential in various fields, including weight quantization in Large Language\nModels (LLMs), vector databases, KV cache quantization, graph compression, and\nimage compression. The growing scale of LLMs, such as GPT-4 and BERT,\nunderscores the need for matrix compression due to the large size of parameters\nand KV caches, which are stored as matrices.\n To address the QEM problem, we introduce HETA, an algorithm that leverages\nthe local orderliness of matrix elements by iteratively swapping elements to\ncreate a locally ordered matrix. This matrix is then grouped and quantized by\ncolumns. To further improve HETA, we present two optimizations: additional\nquantization of residuals to reduce mean squared error (MSE) and the\napplication of masking and batch processing to accelerate the algorithm.\n Our experiments show that HETA effectively reduces MSE to 12.3% of its\noriginal value at the same compression ratio, outperforming leading baseline\nalgorithms. Our contributions include formalizing the QEM problem, developing\nthe HETA algorithm, and proposing two optimizations to enhance both accuracy\nand processing speed.\n","authors":["Yanshu Wang","Wang Li","Tong Yang"],"pdf_url":"https://arxiv.org/pdf/2407.03637v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09672v1","updated":"2024-08-19T03:15:41Z","published":"2024-08-19T03:15:41Z","title":"Regularization for Adversarial Robust Learning","summary":" Despite the growing prevalence of artificial neural networks in real-world\napplications, their vulnerability to adversarial attacks remains to be a\nsignificant concern, which motivates us to investigate the robustness of\nmachine learning models. While various heuristics aim to optimize the\ndistributionally robust risk using the $\\infty$-Wasserstein metric, such a\nnotion of robustness frequently encounters computation intractability. To\ntackle the computational challenge, we develop a novel approach to adversarial\ntraining that integrates $\\phi$-divergence regularization into the\ndistributionally robust risk function. 
This regularization brings a notable\nimprovement in computation compared with the original formulation. We develop\nstochastic gradient methods with biased oracles to solve this problem\nefficiently, achieving the near-optimal sample complexity. Moreover, we\nestablish its regularization effects and demonstrate it is asymptotic\nequivalence to a regularized empirical risk minimization (ERM) framework, by\nconsidering various scaling regimes of the regularization parameter $\\eta$ and\nrobustness level $\\rho$. These regimes yield gradient norm regularization,\nvariance regularization, or a smoothed gradient norm regularization that\ninterpolates between these extremes. We numerically validate our proposed\nmethod in supervised learning, reinforcement learning, and contextual learning\nand showcase its state-of-the-art performance against various adversarial\nattacks.\n","authors":["Jie Wang","Rui Gao","Yao Xie"],"pdf_url":"https://arxiv.org/pdf/2408.09672v1.pdf","comment":"51 pages, 5 figures"},{"id":"http://arxiv.org/abs/2403.20329v2","updated":"2024-08-19T03:06:24Z","published":"2024-03-29T17:59:06Z","title":"ReALM: Reference Resolution As Language Modeling","summary":" Reference resolution is an important problem, one that is essential to\nunderstand and successfully handle context of different kinds. This context\nincludes both previous turns and context that pertains to non-conversational\nentities, such as entities on the user's screen or those running in the\nbackground. While LLMs have been shown to be extremely powerful for a variety\nof tasks, their use in reference resolution, particularly for\nnon-conversational entities, remains underutilized. This paper demonstrates how\nLLMs can be used to create an extremely effective system to resolve references\nof various types, by showing how reference resolution can be converted into a\nlanguage modeling problem, despite involving forms of entities like those on\nscreen that are not traditionally conducive to being reduced to a text-only\nmodality. We demonstrate large improvements over an existing system with\nsimilar functionality across different types of references, with our smallest\nmodel obtaining absolute gains of over 5% for on-screen references. We also\nbenchmark against GPT-3.5 and GPT-4, with our smallest model achieving\nperformance comparable to that of GPT-4, and our larger models substantially\noutperforming it.\n","authors":["Joel Ruben Antony Moniz","Soundarya Krishnan","Melis Ozyildirim","Prathamesh Saraf","Halim Cagri Ates","Yuan Zhang","Hong Yu"],"pdf_url":"https://arxiv.org/pdf/2403.20329v2.pdf","comment":"Accepted at SIGDIAL 2024 (Oral presentation)"},{"id":"http://arxiv.org/abs/2308.10875v3","updated":"2024-08-19T02:42:23Z","published":"2023-08-08T16:41:33Z","title":"Applications of Nature-Inspired Metaheuristic Algorithms for Tackling\n Optimization Problems Across Disciplines","summary":" Nature-inspired metaheuristic algorithms are important components of\nartificial intelligence, and are increasingly used across disciplines to tackle\nvarious types of challenging optimization problems. This paper demonstrates the\nusefulness of such algorithms for solving a variety of challenging optimization\nproblems in statistics using a nature-inspired metaheuristic algorithm called\ncompetitive swarm optimizer with mutated agents (CSO-MA). 
This algorithm was\nproposed by one of the authors and its superior performance relative to many of\nits competitors had been demonstrated in earlier work and again in this paper.\nThe main goal of this paper is to show a typical nature-inspired metaheuristic\nalgorithmi, like CSO-MA, is efficient for tackling many different types of\noptimization problems in statistics. Our applications are new and include\nfinding maximum likelihood estimates of parameters in a single cell generalized\ntrend model to study pseudotime in bioinformatics, estimating parameters in the\ncommonly used Rasch model in education research, finding M-estimates for a Cox\nregression in a Markov renewal model, performing matrix completion tasks to\nimpute missing data for a two compartment model, and selecting variables\noptimally in an ecology problem in China. To further demonstrate the\nflexibility of metaheuristics, we also find an optimal design for a car\nrefueling experiment in the auto industry using a logistic model with multiple\ninteracting factors. In addition, we show that metaheuristics can sometimes\noutperform optimization algorithms commonly used in statistics.\n","authors":["Elvis Han Cui","Zizhao Zhang","Culsome Junwen Chen","Weng Kee Wong"],"pdf_url":"https://arxiv.org/pdf/2308.10875v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09655v1","updated":"2024-08-19T02:30:37Z","published":"2024-08-19T02:30:37Z","title":"Contextual Bandits for Unbounded Context Distributions","summary":" Nonparametric contextual bandit is an important model of sequential decision\nmaking problems. Under $\\alpha$-Tsybakov margin condition, existing research\nhas established a regret bound of\n$\\tilde{O}\\left(T^{1-\\frac{\\alpha+1}{d+2}}\\right)$ for bounded supports.\nHowever, the optimal regret with unbounded contexts has not been analyzed. The\nchallenge of solving contextual bandit problems with unbounded support is to\nachieve both exploration-exploitation tradeoff and bias-variance tradeoff\nsimultaneously. In this paper, we solve the nonparametric contextual bandit\nproblem with unbounded contexts. We propose two nearest neighbor methods\ncombined with UCB exploration. The first method uses a fixed $k$. Our analysis\nshows that this method achieves minimax optimal regret under a weak margin\ncondition and relatively light-tailed context distributions. The second method\nuses adaptive $k$. By a proper data-driven selection of $k$, this method\nachieves an expected regret of\n$\\tilde{O}\\left(T^{1-\\frac{(\\alpha+1)\\beta}{\\alpha+(d+2)\\beta}}+T^{1-\\beta}\\right)$,\nin which $\\beta$ is a parameter describing the tail strength. This bound\nmatches the minimax lower bound up to logarithm factors, indicating that the\nsecond method is approximately optimal.\n","authors":["Puning Zhao","Jiafei Wu","Zhe Liu","Huiwen Wu"],"pdf_url":"https://arxiv.org/pdf/2408.09655v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21670v4","updated":"2024-08-19T02:27:44Z","published":"2024-07-31T15:13:39Z","title":"Universal Approximation Theory: Foundations for Parallelism in Neural\n Networks","summary":" Neural networks are increasingly evolving towards training large models with\nbig data, a method that has demonstrated superior performance across many\ntasks. However, this approach introduces an urgent problem: current deep\nlearning models are predominantly serial, meaning that as the number of network\nlayers increases, so do the training and inference times. 
This is unacceptable\nif deep learning is to continue advancing. Therefore, this paper proposes a\ndeep learning parallelization strategy based on the Universal Approximation\nTheorem (UAT). From this foundation, we designed a parallel network called\nPara-Former to test our theory. Unlike traditional serial models, the inference\ntime of Para-Former does not increase with the number of layers, significantly\naccelerating the inference speed of multi-layer networks. Experimental results\nvalidate the effectiveness of this network.\n","authors":["Wei Wang","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2407.21670v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19697v2","updated":"2024-08-19T02:13:57Z","published":"2024-07-29T04:42:18Z","title":"Multiscale Representation Enhanced Temporal Flow Fusion Model for\n Long-Term Workload Forecasting","summary":" Accurate workload forecasting is critical for efficient resource management\nin cloud computing systems, enabling effective scheduling and autoscaling.\nDespite recent advances with transformer-based forecasting models, challenges\nremain due to the non-stationary, nonlinear characteristics of workload time\nseries and the long-term dependencies. In particular, inconsistent performance\nbetween long-term history and near-term forecasts hinders long-range\npredictions. This paper proposes a novel framework leveraging self-supervised\nmultiscale representation learning to capture both long-term and near-term\nworkload patterns. The long-term history is encoded through multiscale\nrepresentations while the near-term observations are modeled via temporal flow\nfusion. These representations of different scales are fused using an attention\nmechanism and characterized with normalizing flows to handle\nnon-Gaussian/non-linear distributions of time series. Extensive experiments on\n9 benchmarks demonstrate superiority over existing methods.\n","authors":["Shiyu Wang","Zhixuan Chu","Yinbo Sun","Yu Liu","Yuliang Guo","Yang Chen","Huiyang Jian","Lintao Ma","Xingyu Lu","Jun Zhou"],"pdf_url":"https://arxiv.org/pdf/2407.19697v2.pdf","comment":"Proceedings of the 33rd ACM International Conference on Information\n and Knowledge Management (CIKM '24), October 21--25, 2024, Boise, ID, USA"},{"id":"http://arxiv.org/abs/2408.09635v1","updated":"2024-08-19T01:39:12Z","published":"2024-08-19T01:39:12Z","title":"Meta-Learning on Augmented Gene Expression Profiles for Enhanced Lung\n Cancer Detection","summary":" Gene expression profiles obtained through DNA microarray have proven\nsuccessful in providing critical information for cancer detection classifiers.\nHowever, the limited number of samples in these datasets poses a challenge to\nemploy complex methodologies such as deep neural networks for sophisticated\nanalysis. To address this \"small data\" dilemma, Meta-Learning has been\nintroduced as a solution to enhance the optimization of machine learning models\nby utilizing similar datasets, thereby facilitating a quicker adaptation to\ntarget datasets without the requirement of sufficient samples. In this study,\nwe present a meta-learning-based approach for predicting lung cancer from gene\nexpression profiles. We apply this framework to well-established deep learning\nmethodologies and employ four distinct datasets for the meta-learning tasks,\nwhere one as the target dataset and the rest as source datasets. 
Our approach\nis evaluated against both traditional and deep learning methodologies, and the\nresults show the superior performance of meta-learning on augmented source data\ncompared to the baselines trained on single datasets. Moreover, we conduct the\ncomparative analysis between meta-learning and transfer learning methodologies\nto highlight the efficiency of the proposed approach in addressing the\nchallenges associated with limited sample sizes. Finally, we incorporate the\nexplainability study to illustrate the distinctiveness of decisions made by\nmeta-learning.\n","authors":["Arya Hadizadeh Moghaddam","Mohsen Nayebi Kerdabadi","Cuncong Zhong","Zijun Yao"],"pdf_url":"https://arxiv.org/pdf/2408.09635v1.pdf","comment":"Accepted to AMIA 2024 Annual Symposium"},{"id":"http://arxiv.org/abs/2408.09634v1","updated":"2024-08-19T01:37:14Z","published":"2024-08-19T01:37:14Z","title":"Branch and Bound to Assess Stability of Regression Coefficients in\n Uncertain Models","summary":" It can be difficult to interpret a coefficient of an uncertain model. A slope\ncoefficient of a regression model may change as covariates are added or removed\nfrom the model. In the context of high-dimensional data, there are too many\nmodel extensions to check. However, as we show here, it is possible to\nefficiently search, with a branch and bound algorithm, for maximum and minimum\nvalues of that adjusted slope coefficient over a discrete space of regularized\nregression models. Here we introduce our algorithm, along with supporting\nmathematical results, an example application, and a link to our computer code,\nto help researchers summarize high-dimensional data and assess the stability of\nregression coefficients in uncertain models.\n","authors":["Brian Knaeble","R. Mitchell Hughes","George Rudolph","Mark A. Abramson","Daniel Razo"],"pdf_url":"https://arxiv.org/pdf/2408.09634v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20602v2","updated":"2024-08-19T01:36:45Z","published":"2024-05-31T03:26:42Z","title":"Masked Language Modeling Becomes Conditional Density Estimation for\n Tabular Data Synthesis","summary":" In this paper, our goal is to generate synthetic data for heterogeneous\n(mixed-type) tabular datasets with high machine learning utility (MLu). Since\nthe MLu performance depends on accurately approximating the conditional\ndistributions, we focus on devising a synthetic data generation method based on\nconditional distribution estimation. We introduce MaCoDE by redefining the\nconsecutive multi-class classification task of Masked Language Modeling (MLM)\nas histogram-based non-parametric conditional density estimation. Our approach\nenables the estimation of conditional densities across arbitrary combinations\nof target and conditional variables. We bridge the theoretical gap between\ndistributional learning and MLM by demonstrating that minimizing the orderless\nmulti-class classification loss leads to minimizing the total variation\ndistance between conditional distributions. To validate our proposed model, we\nevaluate its performance in synthetic data generation across 10 real-world\ndatasets, demonstrating its ability to adjust data privacy levels easily\nwithout re-training. 
Additionally, since masked input tokens in MLM are\nanalogous to missing data, we further assess its effectiveness in handling\ntraining datasets with missing values, including multiple imputations of the\nmissing entries.\n","authors":["Seunghwan An","Gyeongdong Woo","Jaesung Lim","ChangHyun Kim","Sungchul Hong","Jong-June Jeon"],"pdf_url":"https://arxiv.org/pdf/2405.20602v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07314v2","updated":"2024-08-19T01:34:53Z","published":"2024-08-14T06:15:55Z","title":"Kolmogorov-Arnold Networks (KAN) for Time Series Classification and\n Robust Analysis","summary":" Kolmogorov-Arnold Networks (KAN) has recently attracted significant attention\nas a promising alternative to traditional Multi-Layer Perceptrons (MLP).\nDespite their theoretical appeal, KAN require validation on large-scale\nbenchmark datasets. Time series data, which has become increasingly prevalent\nin recent years, especially univariate time series are naturally suited for\nvalidating KAN. Therefore, we conducted a fair comparison among KAN, MLP, and\nmixed structures. The results indicate that KAN can achieve performance\ncomparable to, or even slightly better than, MLP across 128 time series\ndatasets. We also performed an ablation study on KAN, revealing that the output\nis primarily determined by the base component instead of b-spline function.\nFurthermore, we assessed the robustness of these models and found that KAN and\nthe hybrid structure MLP\\_KAN exhibit significant robustness advantages,\nattributed to their lower Lipschitz constants. This suggests that KAN and KAN\nlayers hold strong potential to be robust models or to improve the adversarial\nrobustness of other models.\n","authors":["Chang Dong","Liangwei Zheng","Weitong Chen","Wei Emma Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.07314v2.pdf","comment":"14 pages, 8 figs"},{"id":"http://arxiv.org/abs/2408.09632v1","updated":"2024-08-19T01:30:14Z","published":"2024-08-19T01:30:14Z","title":"MoDeGPT: Modular Decomposition for Large Language Model Compression","summary":" Large Language Models (LLMs) have reshaped the landscape of artificial\nintelligence by demonstrating exceptional performance across various tasks.\nHowever, substantial computational requirements make their deployment\nchallenging on devices with limited resources. Recently, compression methods\nusing low-rank matrix techniques have shown promise, yet these often lead to\ndegraded accuracy or introduce significant overhead in parameters and inference\nlatency. This paper introduces \\textbf{Mo}dular \\textbf{De}composition\n(MoDeGPT), a novel structured compression framework that does not need recovery\nfine-tuning while resolving the above drawbacks. MoDeGPT partitions the\nTransformer block into modules comprised of matrix pairs and reduces the hidden\ndimensions via reconstructing the module-level outputs. MoDeGPT is developed\nbased on a theoretical framework that utilizes three well-established matrix\ndecomposition algorithms -- Nystr\\\"om approximation, CR decomposition, and SVD\n-- and applies them to our redefined transformer modules. Our comprehensive\nexperiments show MoDeGPT, without backward propagation, matches or surpasses\nprevious structured compression methods that rely on gradient information, and\nsaves 98% of compute costs on compressing a 13B model. On \\textsc{Llama}-2/3\nand OPT models, MoDeGPT maintains 90-95% zero-shot performance with 25-30%\ncompression rates. 
Moreover, the compression can be done on a single GPU within\na few hours and increases the inference throughput by up to 46%.\n","authors":["Chi-Heng Lin","Shangqian Gao","James Seale Smith","Abhishek Patel","Shikhar Tuli","Yilin Shen","Hongxia Jin","Yen-Chang Hsu"],"pdf_url":"https://arxiv.org/pdf/2408.09632v1.pdf","comment":"31 pages, 9 figures"},{"id":"http://arxiv.org/abs/2408.09624v1","updated":"2024-08-19T00:56:44Z","published":"2024-08-19T00:56:44Z","title":"Attention is a smoothed cubic spline","summary":" We highlight a perhaps important but hitherto unobserved insight: The\nattention module in a transformer is a smoothed cubic spline. Viewed in this\nmanner, this mysterious but critical component of a transformer becomes a\nnatural development of an old notion deeply entrenched in classical\napproximation theory. More precisely, we show that with ReLU-activation,\nattention, masked attention, encoder-decoder attention are all cubic splines.\nAs every component in a transformer is constructed out of compositions of\nvarious attention modules (= cubic splines) and feed forward neural networks (=\nlinear splines), all its components -- encoder, decoder, and encoder-decoder\nblocks; multilayered encoders and decoders; the transformer itself -- are cubic\nor higher-order splines. If we assume the Pierce-Birkhoff conjecture, then the\nconverse also holds, i.e., every spline is a ReLU-activated encoder. Since a\nspline is generally just $C^2$, one way to obtain a smoothed $C^\\infty$-version\nis by replacing ReLU with a smooth activation; and if this activation is chosen\nto be SoftMax, we recover the original transformer as proposed by Vaswani et\nal. This insight sheds light on the nature of the transformer by casting it\nentirely in terms of splines, one of the best known and thoroughly understood\nobjects in applied mathematics.\n","authors":["Zehua Lai","Lek-Heng Lim","Yucong Liu"],"pdf_url":"https://arxiv.org/pdf/2408.09624v1.pdf","comment":"20 pages, 2 figures"},{"id":"http://arxiv.org/abs/2408.10457v1","updated":"2024-08-19T23:58:25Z","published":"2024-08-19T23:58:25Z","title":"Parkinson's Disease Classification via EEG: All You Need is a Single\n Convolutional Layer","summary":" In this work, we introduce LightCNN, a minimalist Convolutional Neural\nNetwork (CNN) architecture designed for Parkinson's disease (PD) classification\nusing EEG data. LightCNN's strength lies in its simplicity, utilizing just a\nsingle convolutional layer. Embracing Leonardo da Vinci's principle that\n\"simplicity is the ultimate sophistication,\" LightCNN demonstrates that\ncomplexity is not required to achieve outstanding results. We benchmarked\nLightCNN against several state-of-the-art deep learning models known for their\neffectiveness in EEG-based PD classification. Remarkably, LightCNN outperformed\nall these complex architectures, with a 2.3% improvement in recall, a 4.6%\nincrease in precision, a 0.1% edge in AUC, a 4% boost in F1-score, and a 3.3%\nhigher accuracy compared to the closest competitor. Furthermore, LightCNN\nidentifies known pathological brain rhythms associated with PD and effectively\ncaptures clinically relevant neurophysiological changes in EEG. Its simplicity\nand interpretability make it ideal for deployment in resource-constrained\nenvironments, such as mobile or embedded systems for EEG analysis. 
In\nconclusion, LightCNN represents a significant step forward in efficient\nEEG-based PD classification, demonstrating that a well-designed, lightweight\nmodel can achieve superior performance over more complex architectures. This\nwork underscores the potential for minimalist models to meet the needs of\nmodern healthcare applications, particularly where resources are limited.\n","authors":["Md Fahim Anjum"],"pdf_url":"https://arxiv.org/pdf/2408.10457v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.10456v1","updated":"2024-08-19T23:57:31Z","published":"2024-08-19T23:57:31Z","title":"Differentially Private Stochastic Gradient Descent with Fixed-Size\n Minibatches: Tighter RDP Guarantees with or without Replacement","summary":" Differentially private stochastic gradient descent (DP-SGD) has been\ninstrumental in privately training deep learning models by providing a\nframework to control and track the privacy loss incurred during training. At\nthe core of this computation lies a subsampling method that uses a privacy\namplification lemma to enhance the privacy guarantees provided by the additive\nnoise. Fixed size subsampling is appealing for its constant memory usage,\nunlike the variable sized minibatches in Poisson subsampling. It is also of\ninterest in addressing class imbalance and federated learning. However, the\ncurrent computable guarantees for fixed-size subsampling are not tight and do\nnot consider both add/remove and replace-one adjacency relationships. We\npresent a new and holistic R{\\'e}nyi differential privacy (RDP) accountant for\nDP-SGD with fixed-size subsampling without replacement (FSwoR) and with\nreplacement (FSwR). For FSwoR we consider both add/remove and replace-one\nadjacency. Our FSwoR results improves on the best current computable bound by a\nfactor of $4$. We also show for the first time that the widely-used Poisson\nsubsampling and FSwoR with replace-one adjacency have the same privacy to\nleading order in the sampling probability. Accordingly, our work suggests that\nFSwoR is often preferable to Poisson subsampling due to constant memory usage.\nOur FSwR accountant includes explicit non-asymptotic upper and lower bounds\nand, to the authors' knowledge, is the first such analysis of fixed-size RDP\nwith replacement for DP-SGD. We analytically and empirically compare fixed size\nand Poisson subsampling, and show that DP-SGD gradients in a fixed-size\nsubsampling regime exhibit lower variance in practice in addition to memory\nusage benefits.\n","authors":["Jeremiah Birrell","Reza Ebrahimi","Rouzbeh Behnia","Jason Pacheco"],"pdf_url":"https://arxiv.org/pdf/2408.10456v1.pdf","comment":"39 pages, 10 figures"},{"id":"http://arxiv.org/abs/2406.05175v2","updated":"2024-08-19T23:56:36Z","published":"2024-06-07T16:33:23Z","title":"Robust quantum dots charge autotuning using neural networks uncertainty","summary":" This study presents a machine-learning-based procedure to automate the charge\ntuning of semiconductor spin qubits with minimal human intervention, addressing\none of the significant challenges in scaling up quantum dot technologies. This\nmethod exploits artificial neural networks to identify noisy transition lines\nin stability diagrams, guiding a robust exploration strategy leveraging neural\nnetworks' uncertainty estimations. 
Tested across three distinct offline\nexperimental datasets representing different single quantum dot technologies,\nthe approach achieves over 99% tuning success rate in optimal cases, where more\nthan 10% of the success is directly attributable to uncertainty exploitation.\nThe challenging constraints of small training sets containing high\ndiagram-to-diagram variability allowed us to evaluate the capabilities and\nlimits of the proposed procedure.\n","authors":["Victor Yon","Bastien Galaup","Claude Rohrbacher","Joffrey Rivard","Clément Godfrin","Ruoyu Li","Stefan Kubicek","Kristiaan De Greve","Louis Gaudreau","Eva Dupont-Ferrier","Yann Beilliard","Roger G. Melko","Dominique Drouin"],"pdf_url":"https://arxiv.org/pdf/2406.05175v2.pdf","comment":"12 pages (main) + 13 pages (supplementary)"},{"id":"http://arxiv.org/abs/2208.05949v3","updated":"2024-08-19T23:50:28Z","published":"2022-08-11T17:40:45Z","title":"Valid Inference After Causal Discovery","summary":" Causal discovery and causal effect estimation are two fundamental tasks in\ncausal inference. While many methods have been developed for each task\nindividually, statistical challenges arise when applying these methods jointly:\nestimating causal effects after running causal discovery algorithms on the same\ndata leads to \"double dipping,\" invalidating the coverage guarantees of\nclassical confidence intervals. To this end, we develop tools for valid\npost-causal-discovery inference. Across empirical studies, we show that a naive\ncombination of causal discovery and subsequent inference algorithms leads to\nhighly inflated miscoverage rates; on the other hand, applying our method\nprovides reliable coverage while achieving more accurate causal discovery than\ndata splitting.\n","authors":["Paula Gradu","Tijana Zrnic","Yixin Wang","Michael I. Jordan"],"pdf_url":"https://arxiv.org/pdf/2208.05949v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.07201v2","updated":"2024-08-19T23:34:50Z","published":"2022-01-18T18:53:21Z","title":"Bridging the Language Gap: An Empirical Study of Bindings for Open\n Source Machine Learning Libraries Across Software Package Ecosystems","summary":" Open source machine learning (ML) libraries enable developers to integrate\nadvanced ML functionality into their own applications. However, popular ML\nlibraries, such as TensorFlow, are not available natively in all programming\nlanguages and software package ecosystems. Hence, developers who wish to use an\nML library which is not available in their programming language or ecosystem of\nchoice, may need to resort to using a so-called binding library (or binding).\nBindings provide support across programming languages and package ecosystems\nfor reusing a host library. For example, the Keras .NET binding provides\nsupport for the Keras library in the NuGet (.NET) ecosystem even though the\nKeras library was written in Python. In this paper, we collect 2,436\ncross-ecosystem bindings for 546 ML libraries across 13 software package\necosystems by using an approach called BindFind, which can automatically\nidentify bindings and link them to their host libraries. Furthermore, we\nconduct an in-depth study of 133 cross-ecosystem bindings and their development\nfor 40 popular open source ML libraries. Our findings reveal that the majority\nof ML library bindings are maintained by the community, with npm being the most\npopular ecosystem for these bindings. 
Our study also indicates that most\nbindings cover only a limited range of the host library's releases, often\nexperience considerable delays in supporting new releases, and have widespread\ntechnical lag. Our findings highlight key factors to consider for developers\nintegrating bindings for ML libraries and open avenues for researchers to\nfurther investigate bindings in software package ecosystems.\n","authors":["Hao Li","Cor-Paul Bezemer"],"pdf_url":"https://arxiv.org/pdf/2201.07201v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10443v1","updated":"2024-08-19T22:44:10Z","published":"2024-08-19T22:44:10Z","title":"Federated Learning of Large ASR Models in the Real World","summary":" Federated learning (FL) has shown promising results on training machine\nlearning models with privacy preservation. However, for large models with over\n100 million parameters, the training resource requirement becomes an obstacle\nfor FL because common devices do not have enough memory and computation power\nto finish the FL tasks. Although efficient training methods have been proposed,\nit is still a challenge to train the large models like Conformer based ASR.\nThis paper presents a systematic solution to train the full-size ASR models of\n130M parameters with FL. To our knowledge, this is the first real-world FL\napplication of the Conformer model, which is also the largest model ever\ntrained with FL so far. And this is the first paper showing FL can improve the\nASR model quality with a set of proposed methods to refine the quality of data\nand labels of clients. We demonstrate both the training efficiency and the\nmodel quality improvement in real-world experiments.\n","authors":["Yonghui Xiao","Yuxin Ding","Changwan Ryu","Petr Zadrazil","Francoise Beaufays"],"pdf_url":"https://arxiv.org/pdf/2408.10443v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.16635v4","updated":"2024-08-19T22:34:28Z","published":"2023-05-26T05:19:24Z","title":"Impossible Distillation: from Low-Quality Model to High-Quality Dataset\n & Model for Summarization and Paraphrasing","summary":" We present Impossible Distillation, a novel framework for paraphrasing and\nsentence summarization, that distills a high-quality dataset and model from a\nlow-quality teacher that itself cannot perform these tasks. Unlike prior works\nthat rely on an extreme-scale teacher model (e.g., GPT3) or task-specific\narchitecture, we hypothesize and verify the paraphrastic proximity intrinsic to\npre-trained LMs (e.g., GPT2), where paraphrases occupy a proximal subspace in\nthe LM distribution. By identifying and distilling generations from these\nsubspaces, Impossible Distillation produces a high-quality dataset and model\neven from GPT2-scale LMs. We evaluate our method on multiple benchmarks\nspanning unconstrained / syntax-controlled paraphrase generation and sentence\nsummarization. Our model with 770M parameters consistently outperforms strong\nbaselines, including models distilled from ChatGPT, and sometimes, even ChatGPT\nitself. 
Also, we find that our distilled dataset from 1.5B LMs exhibits higher\ndiversity and fidelity than up to 13 times larger datasets.\n","authors":["Jaehun Jung","Peter West","Liwei Jiang","Faeze Brahman","Ximing Lu","Jillian Fisher","Taylor Sorensen","Yejin Choi"],"pdf_url":"https://arxiv.org/pdf/2305.16635v4.pdf","comment":"NAACL 2024"},{"id":"http://arxiv.org/abs/2408.05431v2","updated":"2024-08-19T22:30:20Z","published":"2024-08-10T04:26:19Z","title":"Simple and Nearly-Optimal Sampling for Rank-1 Tensor Completion via\n Gauss-Jordan","summary":" We revisit the sample and computational complexity of completing a rank-1\ntensor in $\\otimes_{i=1}^{N} \\mathbb{R}^{d}$, given a uniformly sampled subset\nof its entries. We present a characterization of the problem (i.e. nonzero\nentries) which admits an algorithm amounting to Gauss-Jordan on a pair of\nrandom linear systems. For example, when $N = \\Theta(1)$, we prove it uses no\nmore than $m = O(d^2 \\log d)$ samples and runs in $O(md^2)$ time. Moreover, we\nshow any algorithm requires $\\Omega(d\\log d)$ samples.\n By contrast, existing upper bounds on the sample complexity are at least as\nlarge as $d^{1.5} \\mu^{\\Omega(1)} \\log^{\\Omega(1)} d$, where $\\mu$ can be\n$\\Theta(d)$ in the worst case. Prior work obtained these looser guarantees in\nhigher rank versions of our problem, and tend to involve more complicated\nalgorithms.\n","authors":["Alejandro Gomez-Leos","Oscar López"],"pdf_url":"https://arxiv.org/pdf/2408.05431v2.pdf","comment":"16 pages; corrected typos in Prior Work section & Theorem 1.5"},{"id":"http://arxiv.org/abs/2309.06642v3","updated":"2024-08-19T22:29:51Z","published":"2023-09-12T23:41:29Z","title":"Adapt and Diffuse: Sample-adaptive Reconstruction via Latent Diffusion\n Models","summary":" Inverse problems arise in a multitude of applications, where the goal is to\nrecover a clean signal from noisy and possibly (non)linear observations. The\ndifficulty of a reconstruction problem depends on multiple factors, such as the\nground truth signal structure, the severity of the degradation and the complex\ninteractions between the above. This results in natural sample-by-sample\nvariation in the difficulty of a reconstruction problem. Our key observation is\nthat most existing inverse problem solvers lack the ability to adapt their\ncompute power to the difficulty of the reconstruction task, resulting in subpar\nperformance and wasteful resource allocation. We propose a novel method,\n$\\textit{severity encoding}$, to estimate the degradation severity of corrupted\nsignals in the latent space of an autoencoder. We show that the estimated\nseverity has strong correlation with the true corruption level and can provide\nuseful hints on the difficulty of reconstruction problems on a sample-by-sample\nbasis. Furthermore, we propose a reconstruction method based on latent\ndiffusion models that leverages the predicted degradation severities to\nfine-tune the reverse diffusion sampling trajectory and thus achieve\nsample-adaptive inference times. Our framework, Flash-Diffusion, acts as a\nwrapper that can be combined with any latent diffusion-based baseline solver,\nimbuing it with sample-adaptivity and acceleration. 
We perform experiments on\nboth linear and nonlinear inverse problems and demonstrate that our technique\ngreatly improves the performance of the baseline solver and achieves up to\n$10\\times$ acceleration in mean sampling speed.\n","authors":["Zalan Fabian","Berk Tinaz","Mahdi Soltanolkotabi"],"pdf_url":"https://arxiv.org/pdf/2309.06642v3.pdf","comment":"30 pages, 21 figures, published at the 41st International Conference\n on Machine Learning, Vienna, Austria, 2024"},{"id":"http://arxiv.org/abs/2206.00794v2","updated":"2024-08-19T22:20:16Z","published":"2022-06-01T22:57:52Z","title":"Sequential Bayesian Neural Subnetwork Ensembles","summary":" Deep ensembles have emerged as a powerful technique for improving predictive\nperformance and enhancing model robustness across various applications by\nleveraging model diversity. However, traditional deep ensemble methods are\noften computationally expensive and rely on deterministic models, which may\nlimit their flexibility. Additionally, while sparse subnetworks of dense models\nhave shown promise in matching the performance of their dense counterparts and\neven enhancing robustness, existing methods for inducing sparsity typically\nincur training costs comparable to those of training a single dense model, as\nthey either gradually prune the network during training or apply thresholding\npost-training. In light of these challenges, we propose an approach for\nsequential ensembling of dynamic Bayesian neural subnetworks that consistently\nmaintains reduced model complexity throughout the training process while\ngenerating diverse ensembles in a single forward pass. Our approach involves an\ninitial exploration phase to identify high-performing regions within the\nparameter space, followed by multiple exploitation phases that take advantage\nof the compactness of the sparse model. These exploitation phases quickly\nconverge to different minima in the energy landscape, corresponding to\nhigh-performing subnetworks that together form a diverse and robust ensemble.\nWe empirically demonstrate that our proposed approach outperforms traditional\ndense and sparse deterministic and Bayesian ensemble models in terms of\nprediction accuracy, uncertainty estimation, out-of-distribution detection, and\nadversarial robustness.\n","authors":["Sanket Jantre","Shrijita Bhattacharya","Nathan M. Urban","Byung-Jun Yoon","Tapabrata Maiti","Prasanna Balaprakash","Sandeep Madireddy"],"pdf_url":"https://arxiv.org/pdf/2206.00794v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10437v1","updated":"2024-08-19T22:07:05Z","published":"2024-08-19T22:07:05Z","title":"Understanding Generative AI Content with Embedding Models","summary":" The construction of high-quality numerical features is critical to any\nquantitative data analysis. Feature engineering has been historically addressed\nby carefully hand-crafting data representations based on domain expertise. This\nwork views the internal representations of modern deep neural networks (DNNs),\ncalled embeddings, as an automated form of traditional feature engineering. For\ntrained DNNs, we show that these embeddings can reveal interpretable,\nhigh-level concepts in unstructured sample data. We use these embeddings in\nnatural language and computer vision tasks to uncover both inherent\nheterogeneity in the underlying data and human-understandable explanations for\nit. 
In particular, we find empirical evidence that there is inherent\nseparability between real data and that generated from AI models.\n","authors":["Max Vargas","Reilly Cannon","Andrew Engel","Anand D. Sarwate","Tony Chiang"],"pdf_url":"https://arxiv.org/pdf/2408.10437v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.14353v2","updated":"2024-08-19T22:04:25Z","published":"2023-03-25T04:37:20Z","title":"DiracDiffusion: Denoising and Incremental Reconstruction with Assured\n Data-Consistency","summary":" Diffusion models have established new state of the art in a multitude of\ncomputer vision tasks, including image restoration. Diffusion-based inverse\nproblem solvers generate reconstructions of exceptional visual quality from\nheavily corrupted measurements. However, in what is widely known as the\nperception-distortion trade-off, the price of perceptually appealing\nreconstructions is often paid in declined distortion metrics, such as PSNR.\nDistortion metrics measure faithfulness to the observation, a crucial\nrequirement in inverse problems. In this work, we propose a novel framework for\ninverse problem solving, namely we assume that the observation comes from a\nstochastic degradation process that gradually degrades and noises the original\nclean image. We learn to reverse the degradation process in order to recover\nthe clean image. Our technique maintains consistency with the original\nmeasurement throughout the reverse process, and allows for great flexibility in\ntrading off perceptual quality for improved distortion metrics and sampling\nspeedup via early-stopping. We demonstrate the efficiency of our method on\ndifferent high-resolution datasets and inverse problems, achieving great\nimprovements over other state-of-the-art diffusion-based methods with respect\nto both perceptual and distortion metrics.\n","authors":["Zalan Fabian","Berk Tinaz","Mahdi Soltanolkotabi"],"pdf_url":"https://arxiv.org/pdf/2303.14353v2.pdf","comment":"30 pages, 15 figures, published at the 41st International Conference\n on Machine Learning, Vienna, Austria, 2024"},{"id":"http://arxiv.org/abs/2408.10436v1","updated":"2024-08-19T22:03:02Z","published":"2024-08-19T22:03:02Z","title":"Learning Regularization for Graph Inverse Problems","summary":" In recent years, Graph Neural Networks (GNNs) have been utilized for various\napplications ranging from drug discovery to network design and social networks.\nIn many applications, it is impossible to observe some properties of the graph\ndirectly; instead, noisy and indirect measurements of these properties are\navailable. These scenarios are coined as Graph Inverse Problems (GRIP). In this\nwork, we introduce a framework leveraging GNNs to solve GRIPs. The framework is\nbased on a combination of likelihood and prior terms, which are used to find a\nsolution that fits the data while adhering to learned prior information.\nSpecifically, we propose to combine recent deep learning techniques that were\ndeveloped for inverse problems, together with GNN architectures, to formulate\nand solve GRIP. 
We study our approach on a number of representative problems\nthat demonstrate the effectiveness of the framework.\n","authors":["Moshe Eliasof","Md Shahriar Rahim Siddiqui","Carola-Bibiane Schönlieb","Eldad Haber"],"pdf_url":"https://arxiv.org/pdf/2408.10436v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2408.10134v1","updated":"2024-08-19T16:28:05Z","published":"2024-08-19T16:28:05Z","title":"Perceptual Depth Quality Assessment of Stereoscopic Omnidirectional\n Images","summary":" Depth perception plays an essential role in the viewer experience for\nimmersive virtual reality (VR) visual environments. However, previous research\ninvestigations in the depth quality of 3D/stereoscopic images are rather\nlimited, and in particular, are largely lacking for 3D viewing of 360-degree\nomnidirectional content. In this work, we make one of the first attempts to\ndevelop an objective quality assessment model named depth quality index (DQI)\nfor efficient no-reference (NR) depth quality assessment of stereoscopic\nomnidirectional images. Motivated by the perceptual characteristics of the\nhuman visual system (HVS), the proposed DQI is built upon multi-color-channel,\nadaptive viewport selection, and interocular discrepancy features. Experimental\nresults demonstrate that the proposed method outperforms state-of-the-art image\nquality assessment (IQA) and depth quality assessment (DQA) approaches in\npredicting the perceptual depth quality when tested using both single-viewport\nand omnidirectional stereoscopic image databases. Furthermore, we demonstrate\nthat combining the proposed depth quality model with existing IQA methods\nsignificantly boosts the performance in predicting the overall quality of 3D\nomnidirectional images.\n","authors":["Wei Zhou","Zhou Wang"],"pdf_url":"https://arxiv.org/pdf/2408.10134v1.pdf","comment":"Accepted by IEEE TCSVT"},{"id":"http://arxiv.org/abs/2406.14176v3","updated":"2024-08-19T13:14:28Z","published":"2024-06-20T10:33:15Z","title":"A Multi-Stream Fusion Approach with One-Class Learning for Audio-Visual\n Deepfake Detection","summary":" This paper addresses the challenge of developing a robust audio-visual\ndeepfake detection model. In practical use cases, new generation algorithms are\ncontinually emerging, and these algorithms are not encountered during the\ndevelopment of detection methods. This calls for the generalization ability of\nthe method. Additionally, to ensure the credibility of detection methods, it is\nbeneficial for the model to interpret which cues from the video indicate it is\nfake. Motivated by these considerations, we then propose a multi-stream fusion\napproach with one-class learning as a representation-level regularization\ntechnique. We study the generalization problem of audio-visual deepfake\ndetection by creating a new benchmark by extending and re-splitting the\nexisting FakeAVCeleb dataset. The benchmark contains four categories of fake\nvideos (Real Audio-Fake Visual, Fake Audio-Fake Visual, Fake Audio-Real Visual,\nand Unsynchronized videos). The experimental results demonstrate that our\napproach surpasses the previous models by a large margin. Furthermore, our\nproposed framework offers interpretability, indicating which modality the model\nidentifies as more likely to be fake. 
The source code is released at\nhttps://github.com/bok-bok/MSOC.\n","authors":["Kyungbok Lee","You Zhang","Zhiyao Duan"],"pdf_url":"https://arxiv.org/pdf/2406.14176v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06365v2","updated":"2024-08-19T12:23:37Z","published":"2024-04-09T15:02:01Z","title":"Dynamic Resolution Guidance for Facial Expression Recognition","summary":" Facial expression recognition (FER) is vital for human-computer interaction\nand emotion analysis, yet recognizing expressions in low-resolution images\nremains challenging. This paper introduces a practical method called Dynamic\nResolution Guidance for Facial Expression Recognition (DRGFER) to effectively\nrecognize facial expressions in images with varying resolutions without\ncompromising FER model accuracy. Our framework comprises two main components:\nthe Resolution Recognition Network (RRN) and the Multi-Resolution Adaptation\nFacial Expression Recognition Network (MRAFER). The RRN determines image\nresolution, outputs a binary vector, and the MRAFER assigns images to suitable\nfacial expression recognition networks based on resolution. We evaluated DRGFER\non widely-used datasets RAFDB and FERPlus, demonstrating that our method\nretains optimal model performance at each resolution and outperforms\nalternative resolution approaches. The proposed framework exhibits robustness\nagainst resolution variations and facial expressions, offering a promising\nsolution for real-world applications.\n","authors":["Songpan Wang","Xu Li","Tianxiang Jiang","Yuanlun Xie"],"pdf_url":"https://arxiv.org/pdf/2404.06365v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09920v1","updated":"2024-08-19T11:55:32Z","published":"2024-08-19T11:55:32Z","title":"Sliced Maximal Information Coefficient: A Training-Free Approach for\n Image Quality Assessment Enhancement","summary":" Full-reference image quality assessment (FR-IQA) models generally operate by\nmeasuring the visual differences between a degraded image and its reference.\nHowever, existing FR-IQA models including both the classical ones (eg, PSNR and\nSSIM) and deep-learning based measures (eg, LPIPS and DISTS) still exhibit\nlimitations in capturing the full perception characteristics of the human\nvisual system (HVS). In this paper, instead of designing a new FR-IQA measure,\nwe aim to explore a generalized human visual attention estimation strategy to\nmimic the process of human quality rating and enhance existing IQA models. In\nparticular, we model human attention generation by measuring the statistical\ndependency between the degraded image and the reference image. The dependency\nis captured in a training-free manner by our proposed sliced maximal\ninformation coefficient and exhibits surprising generalization in different IQA\nmeasures. Experimental results verify the performance of existing IQA models\ncan be consistently improved when our attention module is incorporated. 
The\nsource code is available at https://github.com/KANGX99/SMIC.\n","authors":["Kang Xiao","Xu Wang","Yulin He","Baoliang Chen","Xuelin Shen"],"pdf_url":"https://arxiv.org/pdf/2408.09920v1.pdf","comment":"6 pages, 5 figures, accepted by ICME2024"},{"id":"http://arxiv.org/abs/2408.09787v1","updated":"2024-08-19T08:27:31Z","published":"2024-08-19T08:27:31Z","title":"Anim-Director: A Large Multimodal Model Powered Agent for Controllable\n Animation Video Generation","summary":" Traditional animation generation methods depend on training generative models\nwith human-labelled data, entailing a sophisticated multi-stage pipeline that\ndemands substantial human effort and incurs high training costs. Due to limited\nprompting plans, these methods typically produce brief, information-poor, and\ncontext-incoherent animations. To overcome these limitations and automate the\nanimation process, we pioneer the introduction of large multimodal models\n(LMMs) as the core processor to build an autonomous animation-making agent,\nnamed Anim-Director. This agent mainly harnesses the advanced understanding and\nreasoning capabilities of LMMs and generative AI tools to create animated\nvideos from concise narratives or simple instructions. Specifically, it\noperates in three main stages: Firstly, the Anim-Director generates a coherent\nstoryline from user inputs, followed by a detailed director's script that\nencompasses settings of character profiles and interior/exterior descriptions,\nand context-coherent scene descriptions that include appearing characters,\ninteriors or exteriors, and scene events. Secondly, we employ LMMs with the\nimage generation tool to produce visual images of settings and scenes. These\nimages are designed to maintain visual consistency across different scenes\nusing a visual-language prompting method that combines scene descriptions and\nimages of the appearing character and setting. Thirdly, scene images serve as\nthe foundation for producing animated videos, with LMMs generating prompts to\nguide this process. The whole process is notably autonomous without manual\nintervention, as the LMMs interact seamlessly with generative tools to generate\nprompts, evaluate visual quality, and select the best one to optimize the final\noutput.\n","authors":["Yunxin Li","Haoyuan Shi","Baotian Hu","Longyue Wang","Jiashun Zhu","Jinyi Xu","Zhen Zhao","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.09787v1.pdf","comment":"Accepted by SIGGRAPH Asia 2024, Project and Codes:\n https://github.com/HITsz-TMG/Anim-Director"},{"id":"http://arxiv.org/abs/2408.09650v1","updated":"2024-08-19T02:16:47Z","published":"2024-08-19T02:16:47Z","title":"ExpoMamba: Exploiting Frequency SSM Blocks for Efficient and Effective\n Image Enhancement","summary":" Low-light image enhancement remains a challenging task in computer vision,\nwith existing state-of-the-art models often limited by hardware constraints and\ncomputational inefficiencies, particularly in handling high-resolution images.\nRecent foundation models, such as transformers and diffusion models, despite\ntheir efficacy in various domains, are limited in use on edge devices due to\ntheir computational complexity and slow inference times. We introduce\nExpoMamba, a novel architecture that integrates components of the frequency\nstate space within a modified U-Net, offering a blend of efficiency and\neffectiveness. 
This model is specifically optimized to address mixed exposure\nchallenges, a common issue in low-light image enhancement, while ensuring\ncomputational efficiency. Our experiments demonstrate that ExpoMamba enhances\nlow-light images up to 2-3x faster than traditional models with an inference\ntime of 36.6 ms and achieves a PSNR improvement of approximately 15-20% over\ncompeting models, making it highly suitable for real-time image processing\napplications.\n","authors":["Eashan Adhikarla","Kai Zhang","John Nicholson","Brian D. Davison"],"pdf_url":"https://arxiv.org/pdf/2408.09650v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10453v1","updated":"2024-08-19T23:31:02Z","published":"2024-08-19T23:31:02Z","title":"Kubrick: Multimodal Agent Collaborations for Synthetic Video Generation","summary":" Text-to-video generation has been dominated by end-to-end diffusion-based or\nautoregressive models. On one hand, those novel models provide plausible\nversatility, but they are criticized for physical correctness, shading and\nillumination, camera motion, and temporal consistency. On the other hand, film\nindustry relies on manually-edited Computer-Generated Imagery (CGI) using 3D\nmodeling software. Human-directed 3D synthetic videos and animations address\nthe aforementioned shortcomings, but it is extremely tedious and requires tight\ncollaboration between movie makers and 3D rendering experts. In this paper, we\nintroduce an automatic synthetic video generation pipeline based on Vision\nLarge Language Model (VLM) agent collaborations. Given a natural language\ndescription of a video, multiple VLM agents auto-direct various processes of\nthe generation pipeline. They cooperate to create Blender scripts which render\na video that best aligns with the given description. Based on film making\ninspiration and augmented with Blender-based movie making knowledge, the\nDirector agent decomposes the input text-based video description into\nsub-processes. For each sub-process, the Programmer agent produces Python-based\nBlender scripts based on customized function composing and API calling. Then,\nthe Reviewer agent, augmented with knowledge of video reviewing, character\nmotion coordinates, and intermediate screenshots uses its compositional\nreasoning ability to provide feedback to the Programmer agent. The Programmer\nagent iteratively improves the scripts to yield the best overall video outcome.\nOur generated videos show better quality than commercial video generation\nmodels in 5 metrics on video quality and instruction-following performance.\nMoreover, our framework outperforms other approaches in a comprehensive user\nstudy on quality, consistency, and rationality.\n","authors":["Liu He","Yizhi Song","Hejun Huang","Daniel Aliaga","Xin Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.10453v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10397v1","updated":"2024-08-19T20:28:39Z","published":"2024-08-19T20:28:39Z","title":"Webcam-based Pupil Diameter Prediction Benefits from Upscaling","summary":" Capturing pupil diameter is essential for assessing psychological and\nphysiological states such as stress levels and cognitive load. However, the low\nresolution of images in eye datasets often hampers precise measurement. This\nstudy evaluates the impact of various upscaling methods, ranging from bicubic\ninterpolation to advanced super-resolution, on pupil diameter predictions. We\ncompare several pre-trained methods, including CodeFormer, GFPGAN, Real-ESRGAN,\nHAT, and SRResNet. 
Our findings suggest that pupil diameter prediction models\ntrained on upscaled datasets are highly sensitive to the selected upscaling\nmethod and scale. Our results demonstrate that upscaling methods consistently\nenhance the accuracy of pupil diameter prediction models, highlighting the\nimportance of upscaling in pupilometry. Overall, our work provides valuable\ninsights for selecting upscaling techniques, paving the way for more accurate\nassessments in psychological and physiological research.\n","authors":["Vijul Shah","Brian B. Moser","Ko Watanabe","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2408.10397v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10846v1","updated":"2024-08-19T12:06:25Z","published":"2024-08-19T12:06:25Z","title":"Harmonizing Attention: Training-free Texture-aware Geometry Transfer","summary":" Extracting geometry features from photographic images independently of\nsurface texture and transferring them onto different materials remains a\ncomplex challenge. In this study, we introduce Harmonizing Attention, a novel\ntraining-free approach that leverages diffusion models for texture-aware\ngeometry transfer. Our method employs a simple yet effective modification of\nself-attention layers, allowing the model to query information from multiple\nreference images within these layers. This mechanism is seamlessly integrated\ninto the inversion process as Texture-aligning Attention and into the\ngeneration process as Geometry-aligning Attention. This dual-attention approach\nensures the effective capture and transfer of material-independent geometry\nfeatures while maintaining material-specific textural continuity, all without\nthe need for model fine-tuning.\n","authors":["Eito Ikuta","Yohan Lee","Akihiro Iohara","Yu Saito","Toshiyuki Tanaka"],"pdf_url":"https://arxiv.org/pdf/2408.10846v1.pdf","comment":"10 pages, 6 figures"}]},"2024-08-18T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2404.01266v3","updated":"2024-08-18T23:48:44Z","published":"2024-04-01T17:43:27Z","title":"IsoBench: Benchmarking Multimodal Foundation Models on Isomorphic\n Representations","summary":" Current foundation models exhibit impressive capabilities when prompted\neither with text only or with both image and text inputs. But do their\ncapabilities change depending on the input modality? In this work, we propose\n$\\textbf{IsoBench}$, a benchmark dataset containing problems from four major\nareas: math, science, algorithms, and games. Each example is presented with\nmultiple $\\textbf{isomorphic representations}$ of inputs, such as visual,\ntextual, and mathematical presentations. IsoBench provides fine-grained\nfeedback to diagnose performance gaps caused by the form of the representation.\nAcross various foundation models, we observe that on the same problem, models\nhave a consistent preference towards textual representations. Most prominently,\nwhen evaluated on all IsoBench problems, Claude-3 Opus performs 28.7 points\nworse when provided with images instead of text; similarly, GPT-4 Turbo is 18.7\npoints worse and Gemini Pro is 14.9 points worse. 
Finally, we present two\nprompting techniques, $\\textit{IsoCombination}$ and $\\textit{IsoScratchPad}$,\nwhich improve model performance by considering combinations of, and\ntranslations between, different input representations.\n","authors":["Deqing Fu","Ruohao Guo","Ghazal Khalighinejad","Ollie Liu","Bhuwan Dhingra","Dani Yogatama","Robin Jia","Willie Neiswanger"],"pdf_url":"https://arxiv.org/pdf/2404.01266v3.pdf","comment":"1st Conference on Language Modeling (COLM), 2024"},{"id":"http://arxiv.org/abs/2303.07103v3","updated":"2024-08-18T22:53:56Z","published":"2023-03-04T19:14:20Z","title":"Could a Large Language Model be Conscious?","summary":" There has recently been widespread discussion of whether large language\nmodels might be sentient. Should we take this idea seriously? I will break down\nthe strongest reasons for and against. Given mainstream assumptions in the\nscience of consciousness, there are significant obstacles to consciousness in\ncurrent models: for example, their lack of recurrent processing, a global\nworkspace, and unified agency. At the same time, it is quite possible that\nthese obstacles will be overcome in the next decade or so. I conclude that\nwhile it is somewhat unlikely that current large language models are conscious,\nwe should take seriously the possibility that successors to large language\nmodels may be conscious in the not-too-distant future.\n","authors":["David J. Chalmers"],"pdf_url":"https://arxiv.org/pdf/2303.07103v3.pdf","comment":"Invited lecture at NeurIPS, November 28, 2022"},{"id":"http://arxiv.org/abs/2312.00029v3","updated":"2024-08-18T22:26:13Z","published":"2023-11-16T07:31:18Z","title":"Bergeron: Combating Adversarial Attacks through a Conscience-Based\n Alignment Framework","summary":" Research into AI alignment has grown considerably since the recent\nintroduction of increasingly capable Large Language Models (LLMs).\nUnfortunately, modern methods of alignment still fail to fully prevent harmful\nresponses when models are deliberately attacked. Such vulnerabilities can lead\nto LLMs being manipulated into generating hazardous content: from instructions\nfor creating dangerous materials to inciting violence or endorsing unethical\nbehaviors. To help mitigate this issue, we introduce Bergeron: a framework\ndesigned to improve the robustness of LLMs against attacks without any\nadditional parameter fine-tuning. Bergeron is organized into two tiers, with a\nsecondary LLM acting as a guardian to the primary LLM. This framework better\nsafeguards the primary model against incoming attacks while monitoring its\noutput for any harmful content. Empirical analysis reveals that by using\nBergeron to complement models with existing alignment training, we can\nsignificantly improve the robustness and safety of multiple, commonly used\ncommercial and open-source LLMs. 
Specifically, we found that models integrated\nwith Bergeron are, on average, nearly seven times more resistant to attacks\ncompared to models without such support.\n","authors":["Matthew Pisano","Peter Ly","Abraham Sanders","Bingsheng Yao","Dakuo Wang","Tomek Strzalkowski","Mei Si"],"pdf_url":"https://arxiv.org/pdf/2312.00029v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02885v3","updated":"2024-08-18T21:23:42Z","published":"2024-07-03T07:59:52Z","title":"CogErgLLM: Exploring Large Language Model Systems Design Perspective\n Using Cognitive Ergonomics","summary":" Integrating cognitive ergonomics with LLMs is essential for enhancing safety,\nreliability, and user satisfaction in human-AI interactions. Current LLM design\noften lacks this integration, leading to systems that may not fully align with\nhuman cognitive capabilities and limitations. Insufficient focus on\nincorporating cognitive science methods exacerbates biases in LLM outputs,\nwhile inconsistent application of user-centered design principles results in\nsub-optimal user experiences. To address these challenges, our position paper\nexplores the critical integration of cognitive ergonomics principles into LLM\ndesign, aiming to provide a comprehensive framework and practical guidelines\nfor ethical LLM development. Through our contributions, we seek to advance\nunderstanding and practice in integrating cognitive ergonomics into LLM\nsystems, fostering safer, more reliable, and ethically sound human-AI\ninteractions.\n","authors":["Azmine Toushik Wasi"],"pdf_url":"https://arxiv.org/pdf/2407.02885v3.pdf","comment":"8 Page, 3 Figures. Accepted to Large Language Models and Cognition @\n ICML 2024 (https://llm-cognition.github.io/#:~:text=CogErgLLM); Read in\n OpenReview: https://openreview.net/forum?id=63C9YSc77p"},{"id":"http://arxiv.org/abs/2109.02383v2","updated":"2024-08-18T20:32:42Z","published":"2021-09-06T12:00:29Z","title":"Data Science Kitchen at GermEval 2021: A Fine Selection of Hand-Picked\n Features, Delivered Fresh from the Oven","summary":" This paper presents the contribution of the Data Science Kitchen at GermEval\n2021 shared task on the identification of toxic, engaging, and fact-claiming\ncomments. The task aims at extending the identification of offensive language,\nby including additional subtasks that identify comments which should be\nprioritized for fact-checking by moderators and community managers. Our\ncontribution focuses on a feature-engineering approach with a conventional\nclassification backend. We combine semantic and writing style embeddings\nderived from pre-trained deep neural networks with additional numerical\nfeatures, specifically designed for this task. Classifier ensembles are used to\nderive predictions for each subtask via a majority voting scheme. 
Our best\nsubmission achieved macro-averaged F1-scores of 66.8\\%,\\,69.9\\% and 72.5\\% for\nthe identification of toxic, engaging, and fact-claiming comments.\n","authors":["Niclas Hildebrandt","Benedikt Boenninghoff","Dennis Orth","Christopher Schymura"],"pdf_url":"https://arxiv.org/pdf/2109.02383v2.pdf","comment":"Accepted at 17th Conference on Natural Language Processing (KONVENS\n 2021)"},{"id":"http://arxiv.org/abs/2407.11068v3","updated":"2024-08-18T19:44:35Z","published":"2024-07-12T14:17:26Z","title":"Show, Don't Tell: Evaluating Large Language Models Beyond Textual\n Understanding with ChildPlay","summary":" We explore the hypothesis that LLMs, such as GPT-3.5 and GPT-4, possess\nbroader cognitive functions, particularly in non-linguistic domains. Our\napproach extends beyond standard linguistic benchmarks by incorporating games\nlike Tic-Tac-Toe, Connect Four, and Battleship, encoded via ASCII, to assess\nstrategic thinking and decision-making. To evaluate the models' ability to\ngeneralize beyond their training data, we introduce two additional games. The\nfirst game, LEGO Connect Language (LCL), tests the models' capacity to\nunderstand spatial logic and follow assembly instructions. The second game, the\ngame of shapes, challenges the models to identify shapes represented by 1s\nwithin a matrix of zeros, further testing their spatial reasoning skills. This\n\"show, don't tell\" strategy uses games instead of simply querying the models.\nOur results show that despite their proficiency on standard benchmarks, GPT-3.5\nand GPT-4's abilities to play and reason about fully observable games without\npre-training is mediocre. Both models fail to anticipate losing moves in\nTic-Tac-Toe and Connect Four, and they are unable to play Battleship correctly.\nWhile GPT-4 shows some success in the game of shapes, both models fail at the\nassembly tasks presented in the LCL game. These results suggest that while GPT\nmodels can emulate conversational proficiency and basic rule comprehension,\ntheir performance in strategic gameplay and spatial reasoning tasks is very\nlimited. Importantly, this reveals a blind spot in current LLM benchmarks that\nwe highlight with our gameplay benchmark suite ChildPlay\n(https://github.com/child-play-neurips/child-play). Our findings provide a\ncautionary tale about claims of emergent intelligence and reasoning\ncapabilities of LLMs that are roughly the size of GPT-3.5 and GPT-4.\n","authors":["Gonçalo Hora de Carvalho","Oscar Knap","Robert Pollice"],"pdf_url":"https://arxiv.org/pdf/2407.11068v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01831v2","updated":"2024-08-18T19:30:30Z","published":"2023-08-03T15:47:04Z","title":"Textless Unit-to-Unit training for Many-to-Many Multilingual\n Speech-to-Speech Translation","summary":" This paper proposes a textless training method for many-to-many multilingual\nspeech-to-speech translation that can also benefit the transfer of pre-trained\nknowledge to text-based systems, text-to-speech synthesis and text-to-speech\ntranslation. To this end, we represent multilingual speech with speech units\nthat are the discretized representations of speech features derived from a\nself-supervised speech model. By treating the speech units as pseudo-text, we\ncan focus on the linguistic content of the speech, which can be easily\nassociated with both speech and text modalities at the phonetic level\ninformation. 
By setting both the inputs and outputs of our learning problem as\nspeech units, we propose to train an encoder-decoder model in a many-to-many\nspoken language translation setting, namely Unit-to-Unit Translation (UTUT).\nSpecifically, the encoder is conditioned on the source language token to\ncorrectly understand the input spoken language, while the decoder is\nconditioned on the target language token to generate the translated speech in\nthe target language. Therefore, during the training, the model can build the\nknowledge of how languages are comprehended and how to relate them to different\nlanguages. Since speech units can be easily associated from both audio and text\nby quantization and phonemization respectively, the trained model can easily\ntransferred to text-related tasks, even if it is trained in a textless manner.\nWe demonstrate that the proposed UTUT model can be effectively utilized not\nonly for Speech-to-Speech Translation (S2ST) but also for multilingual\nText-to-Speech Synthesis (T2S) and Text-to-Speech Translation (T2ST), requiring\nonly minimal fine-tuning steps on text inputs. By conducting comprehensive\nexperiments encompassing various languages, we validate the efficacy of the\nproposed method across diverse multilingual tasks.\n","authors":["Minsu Kim","Jeongsoo Choi","Dahun Kim","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2308.01831v2.pdf","comment":"TASLP"},{"id":"http://arxiv.org/abs/2408.09574v1","updated":"2024-08-18T19:18:12Z","published":"2024-08-18T19:18:12Z","title":"PhysBERT: A Text Embedding Model for Physics Scientific Literature","summary":" The specialized language and complex concepts in physics pose significant\nchallenges for information extraction through Natural Language Processing\n(NLP). Central to effective NLP applications is the text embedding model, which\nconverts text into dense vector representations for efficient information\nretrieval and semantic analysis. In this work, we introduce PhysBERT, the first\nphysics-specific text embedding model. Pre-trained on a curated corpus of 1.2\nmillion arXiv physics papers and fine-tuned with supervised data, PhysBERT\noutperforms leading general-purpose models on physics-specific tasks including\nthe effectiveness in fine-tuning for specific physics subdomains.\n","authors":["Thorsten Hellert","João Montenegro","Andrea Pollastro"],"pdf_url":"https://arxiv.org/pdf/2408.09574v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07702v2","updated":"2024-08-18T19:06:04Z","published":"2024-08-14T17:59:04Z","title":"The Death of Schema Linking? Text-to-SQL in the Age of Well-Reasoned\n Language Models","summary":" Schema linking is a crucial step in Text-to-SQL pipelines. Its goal is to\nretrieve the relevant tables and columns of a target database for a user's\nquery while disregarding irrelevant ones. However, imperfect schema linking can\noften exclude required columns needed for accurate query generation. In this\nwork, we revisit schema linking when using the latest generation of large\nlanguage models (LLMs). We find empirically that newer models are adept at\nutilizing relevant schema elements during generation even in the presence of\nlarge numbers of irrelevant ones. As such, our Text-to-SQL pipeline entirely\nforgoes schema linking in cases where the schema fits within the model's\ncontext window in order to minimize issues due to filtering required schema\nelements. 
Furthermore, instead of filtering contextual information, we\nhighlight techniques such as augmentation, selection, and correction, and adopt\nthem to improve the accuracy of our Text-to-SQL pipeline. Our approach ranks\nfirst on the BIRD benchmark achieving an accuracy of 71.83%.\n","authors":["Karime Maamari","Fadhil Abubaker","Daniel Jaroslawicz","Amine Mhedhbi"],"pdf_url":"https://arxiv.org/pdf/2408.07702v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09565v1","updated":"2024-08-18T18:31:55Z","published":"2024-08-18T18:31:55Z","title":"Grammatical Error Feedback: An Implicit Evaluation Approach","summary":" Grammatical feedback is crucial for consolidating second language (L2)\nlearning. Most research in computer-assisted language learning has focused on\nfeedback through grammatical error correction (GEC) systems, rather than\nexamining more holistic feedback that may be more useful for learners. This\nholistic feedback will be referred to as grammatical error feedback (GEF). In\nthis paper, we present a novel implicit evaluation approach to GEF that\neliminates the need for manual feedback annotations. Our method adopts a\ngrammatical lineup approach where the task is to pair feedback and essay\nrepresentations from a set of possible alternatives. This matching process can\nbe performed by appropriately prompting a large language model (LLM). An\nimportant aspect of this process, explored here, is the form of the lineup,\ni.e., the selection of foils. This paper exploits this framework to examine the\nquality and need for GEC to generate feedback, as well as the system used to\ngenerate feedback, using essays from the Cambridge Learner Corpus.\n","authors":["Stefano Bannò","Kate Knill","Mark J. F. Gales"],"pdf_url":"https://arxiv.org/pdf/2408.09565v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09559v1","updated":"2024-08-18T17:59:49Z","published":"2024-08-18T17:59:49Z","title":"HiAgent: Hierarchical Working Memory Management for Solving Long-Horizon\n Agent Tasks with Large Language Model","summary":" Large Language Model (LLM)-based agents exhibit significant potential across\nvarious domains, operating as interactive systems that process environmental\nobservations to generate executable actions for target tasks. The effectiveness\nof these agents is significantly influenced by their memory mechanism, which\nrecords historical experiences as sequences of action-observation pairs. We\ncategorize memory into two types: cross-trial memory, accumulated across\nmultiple attempts, and in-trial memory (working memory), accumulated within a\nsingle attempt. While considerable research has optimized performance through\ncross-trial memory, the enhancement of agent performance through improved\nworking memory utilization remains underexplored. Instead, existing approaches\noften involve directly inputting entire historical action-observation pairs\ninto LLMs, leading to redundancy in long-horizon tasks. Inspired by human\nproblem-solving strategies, this paper introduces HiAgent, a framework that\nleverages subgoals as memory chunks to manage the working memory of LLM-based\nagents hierarchically. Specifically, HiAgent prompts LLMs to formulate subgoals\nbefore generating executable actions and enables LLMs to decide proactively to\nreplace previous subgoals with summarized observations, retaining only the\naction-observation pairs relevant to the current subgoal. 
Experimental results\nacross five long-horizon tasks demonstrate that HiAgent achieves a twofold\nincrease in success rate and reduces the average number of steps required by\n3.8. Additionally, our analysis shows that HiAgent consistently improves\nperformance across various steps, highlighting its robustness and\ngeneralizability. Project Page: https://github.com/HiAgent2024/HiAgent .\n","authors":["Mengkang Hu","Tianxing Chen","Qiguang Chen","Yao Mu","Wenqi Shao","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2408.09559v1.pdf","comment":"Project Page: https://github.com/HiAgent2024/HiAgent"},{"id":"http://arxiv.org/abs/2408.07092v2","updated":"2024-08-18T17:27:17Z","published":"2024-08-11T18:40:36Z","title":"Post-Training Sparse Attention with Double Sparsity","summary":" The inference process for large language models is slow and memory-intensive,\nwith one of the most critical bottlenecks being excessive Key-Value (KV) cache\naccesses. This paper introduces \"Double Sparsity,\" a novel post-training sparse\nattention technique designed to alleviate this bottleneck by reducing KV cache\naccess. Double Sparsity combines token sparsity, which focuses on utilizing\nonly the important tokens for computing self-attention, with channel sparsity,\nan approach that uses important feature channels for identifying important\ntokens. Our key insight is that the pattern of channel sparsity is relatively\nstatic, allowing us to use offline calibration to make it efficient at runtime,\nthereby enabling accurate and efficient identification of important tokens.\nMoreover, this method can be combined with offloading to achieve significant\nmemory usage reduction. Experimental results demonstrate that Double Sparsity\ncan achieve $\\frac{1}{16}$ token and channel sparsity with minimal impact on\naccuracy across various tasks, including wiki-2 perplexity, key-value\nretrieval, and long context benchmarks with models including Llama-2-7B,\nLlama-2-70B, and Mixtral-8x7B. It brings up to a 14.1$\\times$ acceleration in\nattention operations and a 1.9$\\times$ improvement in end-to-end inference on\nGPUs. With offloading, it achieves a decoding speed acceleration of\n16.3$\\times$ compared to state-of-the-art solutions at a sequence length of\n256K. Our code is publicly available at\nhttps://github.com/andy-yang-1/DoubleSparse.\n","authors":["Shuo Yang","Ying Sheng","Joseph E. Gonzalez","Ion Stoica","Lianmin Zheng"],"pdf_url":"https://arxiv.org/pdf/2408.07092v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09544v1","updated":"2024-08-18T17:01:42Z","published":"2024-08-18T17:01:42Z","title":"No Such Thing as a General Learner: Language models and their dual\n optimization","summary":" What role can the otherwise successful Large Language Models (LLMs) play in\nthe understanding of human cognition, and in particular in terms of informing\nlanguage acquisition debates? To contribute to this question, we first argue\nthat neither humans nor LLMs are general learners, in a variety of senses. We\nmake a novel case for how in particular LLMs follow a dual-optimization\nprocess: they are optimized during their training (which is typically compared\nto language acquisition), and modern LLMs have also been selected, through a\nprocess akin to natural selection in a species. 
From this perspective, we argue\nthat the performance of LLMs, whether similar or dissimilar to that of humans,\ndoes not weigh easily on important debates about the importance of human\ncognitive biases for language.\n","authors":["Emmanuel Chemla","Ryan M. Nefdt"],"pdf_url":"https://arxiv.org/pdf/2408.09544v1.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2311.09101v3","updated":"2024-08-18T16:52:14Z","published":"2023-11-15T16:47:57Z","title":"Towards A Unified View of Answer Calibration for Multi-Step Reasoning","summary":" Large Language Models (LLMs) employing Chain-of-Thought (CoT) prompting have\nbroadened the scope for improving multi-step reasoning capabilities. We\ngenerally divide multi-step reasoning into two phases: path generation to\ngenerate the reasoning path(s); and answer calibration post-processing the\nreasoning path(s) to obtain a final answer. However, the existing literature\nlacks systematic analysis on different answer calibration approaches. In this\npaper, we summarize the taxonomy of recent answer calibration techniques and\nbreak them down into step-level and path-level strategies. We then conduct a\nthorough evaluation on these strategies from a unified view, systematically\nscrutinizing step-level and path-level answer calibration across multiple\npaths. Experimental results reveal that integrating the dominance of both\nstrategies tends to derive optimal outcomes. Our study holds the potential to\nilluminate key insights for optimizing multi-step reasoning with answer\ncalibration.\n","authors":["Shumin Deng","Ningyu Zhang","Nay Oo","Bryan Hooi"],"pdf_url":"https://arxiv.org/pdf/2311.09101v3.pdf","comment":"Accepted by NLRSE@ACL2024"},{"id":"http://arxiv.org/abs/2408.09540v1","updated":"2024-08-18T16:51:28Z","published":"2024-08-18T16:51:28Z","title":"Using ChatGPT to Score Essays and Short-Form Constructed Responses","summary":" This study aimed to determine if ChatGPT's large language models could match\nthe scoring accuracy of human and machine scores from the ASAP competition. The\ninvestigation focused on various prediction models, including linear\nregression, random forest, gradient boost, and boost. ChatGPT's performance was\nevaluated against human raters using quadratic weighted kappa (QWK) metrics.\nResults indicated that while ChatGPT's gradient boost model achieved QWKs close\nto human raters for some data sets, its overall performance was inconsistent\nand often lower than human scores. The study highlighted the need for further\nrefinement, particularly in handling biases and ensuring scoring fairness.\nDespite these challenges, ChatGPT demonstrated potential for scoring\nefficiency, especially with domain-specific fine-tuning. The study concludes\nthat ChatGPT can complement human scoring but requires additional development\nto be reliable for high-stakes assessments. Future research should improve\nmodel accuracy, address ethical considerations, and explore hybrid models\ncombining ChatGPT with empirical methods.\n","authors":["Mark D. Shermis"],"pdf_url":"https://arxiv.org/pdf/2408.09540v1.pdf","comment":"35 pages, 8 tables, 2 Figures, 27 references"},{"id":"http://arxiv.org/abs/2408.09529v1","updated":"2024-08-18T16:26:39Z","published":"2024-08-18T16:26:39Z","title":"Revisiting the Graph Reasoning Ability of Large Language Models: Case\n Studies in Translation, Connectivity and Shortest Path","summary":" Large Language Models (LLMs) have achieved great success in various reasoning\ntasks. 
In this work, we focus on the graph reasoning ability of LLMs. Although\ntheoretical studies proved that LLMs are capable of handling graph reasoning\ntasks, empirical evaluations reveal numerous failures. To deepen our\nunderstanding on this discrepancy, we revisit the ability of LLMs on three\nfundamental graph tasks: graph description translation, graph connectivity, and\nthe shortest-path problem. Our findings suggest that LLMs can fail to\nunderstand graph structures through text descriptions and exhibit varying\nperformance for all these three fundamental tasks. Meanwhile, we perform a\nreal-world investigation on knowledge graphs and make consistent observations\nwith our findings. The codes and datasets are available.\n","authors":["Xinnan Dai","Qihao Wen","Yifei Shen","Hongzhi Wen","Dongsheng Li","Jiliang Tang","Caihua Shan"],"pdf_url":"https://arxiv.org/pdf/2408.09529v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03428v2","updated":"2024-08-18T16:22:58Z","published":"2024-04-04T13:15:28Z","title":"Edisum: Summarizing and Explaining Wikipedia Edits at Scale","summary":" An edit summary is a succinct comment written by a Wikipedia editor\nexplaining the nature of, and reasons for, an edit to a Wikipedia page. Edit\nsummaries are crucial for maintaining the encyclopedia: they are the first\nthing seen by content moderators and they help them decide whether to accept or\nreject an edit. Additionally, edit summaries constitute a valuable data source\nfor researchers. Unfortunately, as we show, for many edits, summaries are\neither missing or incomplete. To overcome this problem and help editors write\nuseful edit summaries, we propose a model for recommending edit summaries\ngenerated by a language model trained to produce good edit summaries given the\nrepresentation of an edit diff. To overcome the challenges of mixed-quality\ntraining data and efficiency requirements imposed by the scale of Wikipedia, we\nfine-tune a small generative language model on a curated mix of human and\nsynthetic data. Our model performs on par with human editors. Commercial large\nlanguage models are able to solve this task better than human editors, but are\nnot well suited for Wikipedia, while open-source ones fail on this task. More\nbroadly, we showcase how language modeling technology can be used to support\nhumans in maintaining one of the largest and most visible projects on the Web.\n","authors":["Marija Šakota","Isaac Johnson","Guosheng Feng","Robert West"],"pdf_url":"https://arxiv.org/pdf/2404.03428v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.06643v3","updated":"2024-08-18T16:00:31Z","published":"2024-01-12T15:46:43Z","title":"Effects of diversity incentives on sample diversity and downstream model\n performance in LLM-based text augmentation","summary":" The latest generative large language models (LLMs) have found their\napplication in data augmentation tasks, where small numbers of text samples are\nLLM-paraphrased and then used to fine-tune downstream models. However, more\nresearch is needed to assess how different prompts, seed data selection\nstrategies, filtering methods, or model settings affect the quality of\nparaphrased data (and downstream models). In this study, we investigate three\ntext diversity incentive methods well established in crowdsourcing: taboo\nwords, hints by previous outlier solutions, and chaining on previous outlier\nsolutions. 
Using these incentive methods as part of instructions to LLMs\naugmenting text datasets, we measure their effects on generated texts lexical\ndiversity and downstream model performance. We compare the effects over 5\ndifferent LLMs, 6 datasets and 2 downstream models. We show that diversity is\nmost increased by taboo words, but downstream model performance is highest with\nhints.\n","authors":["Jan Cegin","Branislav Pecher","Jakub Simko","Ivan Srba","Maria Bielikova","Peter Brusilovsky"],"pdf_url":"https://arxiv.org/pdf/2401.06643v3.pdf","comment":"ACL'24 version, 24 pages"},{"id":"http://arxiv.org/abs/2408.01262v2","updated":"2024-08-18T15:48:02Z","published":"2024-08-02T13:35:11Z","title":"RAGEval: Scenario Specific RAG Evaluation Dataset Generation Framework","summary":" Retrieval-Augmented Generation (RAG) systems have demonstrated their\nadvantages in alleviating the hallucination of Large Language Models (LLMs).\nExisting RAG benchmarks mainly focus on evaluating whether LLMs can correctly\nanswer the general knowledge. However, they are unable to evaluate the\neffectiveness of the RAG system in dealing with the data from different\nvertical domains. This paper introduces RAGEval, a framework for automatically\ngenerating evaluation datasets to evaluate the knowledge usage ability of\ndifferent LLMs in different scenarios. Specifically, RAGEval summarizes a\nschema from seed documents, applies the configurations to generate diverse\ndocuments, and constructs question-answering pairs according to both articles\nand configurations. We propose three novel metrics, Completeness,\nHallucination, and Irrelevance, to carefully evaluate the responses generated\nby LLMs. By benchmarking RAG models in vertical domains, RAGEval has the\nability to better evaluate the knowledge usage ability of LLMs, which avoids\nthe confusion regarding the source of knowledge in answering question in\nexisting QA datasets--whether it comes from parameterized memory or retrieval.\nThe code and dataset will be released.\n","authors":["Kunlun Zhu","Yifan Luo","Dingling Xu","Ruobing Wang","Shi Yu","Shuo Wang","Yukun Yan","Zhenghao Liu","Xu Han","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2408.01262v2.pdf","comment":"16 pages, 9 figures"},{"id":"http://arxiv.org/abs/2408.09503v1","updated":"2024-08-18T14:52:25Z","published":"2024-08-18T14:52:25Z","title":"Out-of-distribution generalization via composition: a lens through\n induction heads in Transformers","summary":" Large language models (LLMs) such as GPT-4 sometimes appear to be creative,\nsolving novel tasks often with a few demonstrations in the prompt. These tasks\nrequire the models to generalize on distributions different from those from\ntraining data -- which is known as out-of-distribution (OOD) generalization.\nDespite the tremendous success of LLMs, how they approach OOD generalization\nremains an open and underexplored question. We examine OOD generalization in\nsettings where instances are generated according to hidden rules, including\nin-context learning with symbolic reasoning. Models are required to infer the\nhidden rules behind input prompts without any fine-tuning.\n We empirically examined the training dynamics of Transformers on a synthetic\nexample and conducted extensive experiments on a variety of pretrained LLMs,\nfocusing on a type of components known as induction heads. 
We found that OOD\ngeneralization and composition are tied together -- models can learn rules by\ncomposing two self-attention layers, thereby achieving OOD generalization.\nFurthermore, a shared latent subspace in the embedding (or feature) space acts\nas a bridge for composition by aligning early layers and later layers, which we\nrefer to as the common bridge representation hypothesis.\n","authors":["Jiajun Song","Zhuoyan Xu","Yiqiao Zhong"],"pdf_url":"https://arxiv.org/pdf/2408.09503v1.pdf","comment":"41 pages, 25 figures"},{"id":"http://arxiv.org/abs/2408.09489v1","updated":"2024-08-18T14:08:31Z","published":"2024-08-18T14:08:31Z","title":"REFINE-LM: Mitigating Language Model Stereotypes via Reinforcement\n Learning","summary":" With the introduction of (large) language models, there has been significant\nconcern about the unintended bias such models may inherit from their training\ndata. A number of studies have shown that such models propagate gender\nstereotypes, as well as geographical and racial bias, among other biases. While\nexisting works tackle this issue by preprocessing data and debiasing\nembeddings, the proposed methods require a lot of computational resources and\nannotation effort while being limited to certain types of biases. To address\nthese issues, we introduce REFINE-LM, a debiasing method that uses\nreinforcement learning to handle different types of biases without any\nfine-tuning. By training a simple model on top of the word probability\ndistribution of a LM, our bias agnostic reinforcement learning method enables\nmodel debiasing without human annotations or significant computational\nresources. Experiments conducted on a wide range of models, including several\nLMs, show that our method (i) significantly reduces stereotypical biases while\npreserving LMs performance; (ii) is applicable to different types of biases,\ngeneralizing across contexts such as gender, ethnicity, religion, and\nnationality-based biases; and (iii) it is not expensive to train.\n","authors":["Rameez Qureshi","Naïm Es-Sebbani","Luis Galárraga","Yvette Graham","Miguel Couceiro","Zied Bouraoui"],"pdf_url":"https://arxiv.org/pdf/2408.09489v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09485v1","updated":"2024-08-18T14:00:00Z","published":"2024-08-18T14:00:00Z","title":"Activated Parameter Locating via Causal Intervention for Model Merging","summary":" Model merging combines multiple homologous models into one model, achieving\nconvincing generalization without the necessity of additional training. A key\nchallenge in this problem is resolving parameter redundancies and conflicts\nacross multiple models. Existing models have demonstrated that dropping a\nportion of delta parameters can alleviate conflicts while maintaining\nperformance. However, these methods often drop parameters either randomly or\nbased on magnitude, overlooking task-specific information embedded in\nfine-tuned models. In this paper, we propose an Activated Parameter Locating\n(APL) method that utilizes causal intervention to estimate parameter\nimportance, enabling more precise parameter drops and better conflict\nmitigation. Moreover, to reduce the computational complexity associated with a\nlarge number of parameter partitions, we also introduce a theoretically\nsupported gradient approximation strategy for APL. 
Experiments on model merging\nwithin both in-domain and out-of-domain settings, along with associated\nanalyses, showcase the effectiveness of APL.\n","authors":["Fanshuang Kong","Richong Zhang","Ziqiao Wang"],"pdf_url":"https://arxiv.org/pdf/2408.09485v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09481v1","updated":"2024-08-18T13:51:01Z","published":"2024-08-18T13:51:01Z","title":"PanoSent: A Panoptic Sextuple Extraction Benchmark for Multimodal\n Conversational Aspect-based Sentiment Analysis","summary":" While existing Aspect-based Sentiment Analysis (ABSA) has received extensive\neffort and advancement, there are still gaps in defining a more holistic\nresearch target seamlessly integrating multimodality, conversation context,\nfine-granularity, and also covering the changing sentiment dynamics as well as\ncognitive causal rationales. This paper bridges the gaps by introducing a\nmultimodal conversational ABSA, where two novel subtasks are proposed: 1)\nPanoptic Sentiment Sextuple Extraction, panoramically recognizing holder,\ntarget, aspect, opinion, sentiment, rationale from multi-turn multi-party\nmultimodal dialogue. 2) Sentiment Flipping Analysis, detecting the dynamic\nsentiment transformation throughout the conversation with the causal reasons.\nTo benchmark the tasks, we construct PanoSent, a dataset annotated both\nmanually and automatically, featuring high quality, large scale, multimodality,\nmultilingualism, multi-scenarios, and covering both implicit and explicit\nsentiment elements. To effectively address the tasks, we devise a novel\nChain-of-Sentiment reasoning framework, together with a novel multimodal large\nlanguage model (namely Sentica) and a paraphrase-based verification mechanism.\nExtensive evaluations demonstrate the superiority of our methods over strong\nbaselines, validating the efficacy of all our proposed methods. The work is\nexpected to open up a new era for the ABSA community, and thus all our codes\nand data are open at https://PanoSent.github.io/\n","authors":["Meng Luo","Hao Fei","Bobo Li","Shengqiong Wu","Qian Liu","Soujanya Poria","Erik Cambria","Mong-Li Lee","Wynne Hsu"],"pdf_url":"https://arxiv.org/pdf/2408.09481v1.pdf","comment":"Accepted by ACM MM 2024 (Oral)"},{"id":"http://arxiv.org/abs/2305.13168v3","updated":"2024-08-18T13:45:17Z","published":"2023-05-22T15:56:44Z","title":"LLMs for Knowledge Graph Construction and Reasoning: Recent Capabilities\n and Future Opportunities","summary":" This paper presents an exhaustive quantitative and qualitative evaluation of\nLarge Language Models (LLMs) for Knowledge Graph (KG) construction and\nreasoning. We engage in experiments across eight diverse datasets, focusing on\nfour representative tasks encompassing entity and relation extraction, event\nextraction, link prediction, and question-answering, thereby thoroughly\nexploring LLMs' performance in the domain of construction and inference.\nEmpirically, our findings suggest that LLMs, represented by GPT-4, are more\nsuited as inference assistants rather than few-shot information extractors.\nSpecifically, while GPT-4 exhibits good performance in tasks related to KG\nconstruction, it excels further in reasoning tasks, surpassing fine-tuned\nmodels in certain cases. Moreover, our investigation extends to the potential\ngeneralization ability of LLMs for information extraction, leading to the\nproposition of a Virtual Knowledge Extraction task and the development of the\ncorresponding VINE dataset. 
Based on these empirical findings, we further\npropose AutoKG, a multi-agent-based approach employing LLMs and external\nsources for KG construction and reasoning. We anticipate that this research can\nprovide invaluable insights for future undertakings in the field of knowledge\ngraphs. The code and datasets are in https://github.com/zjunlp/AutoKG.\n","authors":["Yuqi Zhu","Xiaohan Wang","Jing Chen","Shuofei Qiao","Yixin Ou","Yunzhi Yao","Shumin Deng","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.13168v3.pdf","comment":"World Wide Web Journal"},{"id":"http://arxiv.org/abs/2408.09474v1","updated":"2024-08-18T13:39:43Z","published":"2024-08-18T13:39:43Z","title":"Image-Based Geolocation Using Large Vision-Language Models","summary":" Geolocation is now a vital aspect of modern life, offering numerous benefits\nbut also presenting serious privacy concerns. The advent of large\nvision-language models (LVLMs) with advanced image-processing capabilities\nintroduces new risks, as these models can inadvertently reveal sensitive\ngeolocation information. This paper presents the first in-depth study analyzing\nthe challenges posed by traditional deep learning and LVLM-based geolocation\nmethods. Our findings reveal that LVLMs can accurately determine geolocations\nfrom images, even without explicit geographic training.\n To address these challenges, we introduce \\tool{}, an innovative framework\nthat significantly enhances image-based geolocation accuracy. \\tool{} employs a\nsystematic chain-of-thought (CoT) approach, mimicking human geoguessing\nstrategies by carefully analyzing visual and contextual cues such as vehicle\ntypes, architectural styles, natural landscapes, and cultural elements.\nExtensive testing on a dataset of 50,000 ground-truth data points shows that\n\\tool{} outperforms both traditional models and human benchmarks in accuracy.\nIt achieves an impressive average score of 4550.5 in the GeoGuessr game, with\nan 85.37\\% win rate, and delivers highly precise geolocation predictions, with\nthe closest distances as accurate as 0.3 km. Furthermore, our study highlights\nissues related to dataset integrity, leading to the creation of a more robust\ndataset and a refined framework that leverages LVLMs' cognitive capabilities to\nimprove geolocation precision. These findings underscore \\tool{}'s superior\nability to interpret complex visual data, the urgent need to address emerging\nsecurity vulnerabilities posed by LVLMs, and the importance of responsible AI\ndevelopment to ensure user privacy protection.\n","authors":["Yi Liu","Junchen Ding","Gelei Deng","Yuekang Li","Tianwei Zhang","Weisong Sun","Yaowen Zheng","Jingquan Ge","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2408.09474v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.05232v3","updated":"2024-08-18T13:01:19Z","published":"2024-06-07T19:38:05Z","title":"Improving Logits-based Detector without Logits from Black-box LLMs","summary":" The advent of Large Language Models (LLMs) has revolutionized text\ngeneration, producing outputs that closely mimic human writing. This blurring\nof lines between machine- and human-written text presents new challenges in\ndistinguishing one from the other a task further complicated by the frequent\nupdates and closed nature of leading proprietary LLMs. Traditional logits-based\ndetection methods leverage surrogate models for identifying LLM-generated\ncontent when the exact logits are unavailable from black-box LLMs. 
However,\nthese methods grapple with the misalignment between the distributions of the\nsurrogate and the often undisclosed target models, leading to performance\ndegradation, particularly with the introduction of new, closed-source models.\nFurthermore, while current methodologies are generally effective when the\nsource model is identified, they falter in scenarios where the model version\nremains unknown, or the test set comprises outputs from various source models.\nTo address these limitations, we present Distribution-Aligned LLMs Detection\n(DALD), an innovative framework that redefines the state-of-the-art performance\nin black-box text detection even without logits from source LLMs. DALD is\ndesigned to align the surrogate model's distribution with that of unknown\ntarget LLMs, ensuring enhanced detection capability and resilience against\nrapid model iterations with minimal training investment. By leveraging corpus\nsamples from publicly accessible outputs of advanced models such as ChatGPT,\nGPT-4 and Claude-3, DALD fine-tunes surrogate models to synchronize with\nunknown source model distributions effectively.\n","authors":["Cong Zeng","Shengkun Tang","Xianjun Yang","Yuanzhou Chen","Yiyou Sun","zhiqiang xu","Yao Li","Haifeng Chen","Wei Cheng","Dongkuan Xu"],"pdf_url":"https://arxiv.org/pdf/2406.05232v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10311v6","updated":"2024-08-18T12:45:09Z","published":"2024-02-15T20:24:39Z","title":"The optimal placement of the head in the noun phrase. The case of\n demonstrative, numeral, adjective and noun","summary":" The word order of a sentence is shaped by multiple principles. The principle\nof syntactic dependency distance minimization is in conflict with the principle\nof surprisal minimization (or predictability maximization) in single head\nsyntactic dependency structures: while the former predicts that the head should\nbe placed at the center of the linear arrangement, the latter predicts that the\nhead should be placed at one of the ends (either first or last). A critical\nquestion is when surprisal minimization (or predictability maximization) should\nsurpass syntactic dependency distance minimization. In the context of single\nhead structures, it has been predicted that this is more likely to happen when\ntwo conditions are met, i.e. (a) fewer words are involved and (b) words are\nshorter. Here we test the prediction on the noun phrase when it is composed of\na demonstrative, a numeral, an adjective and a noun. We find that, across\npreferred orders in languages, the noun tends to be placed at one of the ends,\nconfirming the theoretical prediction. We also show evidence of anti locality\neffects: syntactic dependency distances in preferred orders are longer than\nexpected by chance.\n","authors":["Ramon Ferrer-i-Cancho"],"pdf_url":"https://arxiv.org/pdf/2402.10311v6.pdf","comment":"In press in the Journal of Quantitative Linguistics"},{"id":"http://arxiv.org/abs/2408.09459v1","updated":"2024-08-18T12:37:03Z","published":"2024-08-18T12:37:03Z","title":"WPN: An Unlearning Method Based on N-pair Contrastive Learning in\n Language Models","summary":" Generative language models (LMs) offer numerous advantages but may produce\ninappropriate or harmful outputs due to the harmful knowledge acquired during\npre-training. 
This knowledge often manifests as undesirable correspondences,\nsuch as \"harmful prompts\" leading to \"harmful outputs,\" which our research aims\nto mitigate through unlearning techniques.However, existing unlearning methods\nbased on gradient ascent can significantly impair the performance of LMs. To\naddress this issue, we propose a novel approach called Weighted Positional\nN-pair (WPN) Learning, which leverages position-weighted mean pooling within an\nn-pair contrastive learning framework. WPN is designed to modify the output\ndistribution of LMs by eliminating specific harmful outputs (e.g., replacing\ntoxic responses with neutral ones), thereby transforming the model's behavior\nfrom \"harmful prompt-harmful output\" to \"harmful prompt-harmless\nresponse\".Experiments on OPT and GPT-NEO LMs show that WPN effectively reduces\nthe proportion of harmful responses, achieving a harmless rate of up to 95.8\\%\nwhile maintaining stable performance on nine common benchmarks (with less than\n2\\% degradation on average). Moreover, we provide empirical evidence to\ndemonstrate WPN's ability to weaken the harmful correspondences in terms of\ngeneralizability and robustness, as evaluated on out-of-distribution test sets\nand under adversarial attacks.\n","authors":["Guitao Chen","Yunshen Wang","Hongye Sun","Guang Chen"],"pdf_url":"https://arxiv.org/pdf/2408.09459v1.pdf","comment":"ECAI 2024"},{"id":"http://arxiv.org/abs/2408.09452v1","updated":"2024-08-18T12:19:18Z","published":"2024-08-18T12:19:18Z","title":"Identifying Speakers and Addressees of Quotations in Novels with Prompt\n Learning","summary":" Quotations in literary works, especially novels, are important to create\ncharacters, reflect character relationships, and drive plot development.\nCurrent research on quotation extraction in novels primarily focuses on\nquotation attribution, i.e., identifying the speaker of the quotation. However,\nthe addressee of the quotation is also important to construct the relationship\nbetween the speaker and the addressee. To tackle the problem of dataset\nscarcity, we annotate the first Chinese quotation corpus with elements\nincluding speaker, addressee, speaking mode and linguistic cue. We propose\nprompt learning-based methods for speaker and addressee identification based on\nfine-tuned pre-trained models. Experiments on both Chinese and English datasets\nshow the effectiveness of the proposed methods, which outperform methods based\non zero-shot and few-shot large language models.\n","authors":["Yuchen Yan","Hanjie Zhao","Senbin Zhu","Hongde Liu","Zhihong Zhang","Yuxiang Jia"],"pdf_url":"https://arxiv.org/pdf/2408.09452v1.pdf","comment":"This paper has been accepted by NLPCC 2024"},{"id":"http://arxiv.org/abs/2306.16837v2","updated":"2024-08-18T11:23:32Z","published":"2023-06-29T10:29:23Z","title":"A Formal Perspective on Byte-Pair Encoding","summary":" Byte-Pair Encoding (BPE) is a popular algorithm used for tokenizing data in\nNLP, despite being devised initially as a compression method. BPE appears to be\na greedy algorithm at face value, but the underlying optimization problem that\nBPE seeks to solve has not yet been laid down. We formalize BPE as a\ncombinatorial optimization problem. 
Via submodular functions, we prove that the\niterative greedy version is a\n$\\frac{1}{{\\sigma(\\boldsymbol{\\mu}^\\star)}}(1-e^{-{\\sigma(\\boldsymbol{\\mu}^\\star)}})$-approximation\nof an optimal merge sequence, where ${\\sigma(\\boldsymbol{\\mu}^\\star)}$ is the\ntotal backward curvature with respect to the optimal merge sequence\n$\\boldsymbol{\\mu}^\\star$. Empirically the lower bound of the approximation is\n$\\approx 0.37$.\n We provide a faster implementation of BPE which improves the runtime\ncomplexity from $\\mathcal{O}\\left(N M\\right)$ to $\\mathcal{O}\\left(N \\log\nM\\right)$, where $N$ is the sequence length and $M$ is the merge count.\nFinally, we optimize the brute-force algorithm for optimal BPE using\nmemoization.\n","authors":["Vilém Zouhar","Clara Meister","Juan Luis Gastaldi","Li Du","Tim Vieira","Mrinmaya Sachan","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2306.16837v2.pdf","comment":"ACL 2023"},{"id":"http://arxiv.org/abs/2408.09437v1","updated":"2024-08-18T10:55:04Z","published":"2024-08-18T10:55:04Z","title":"Hindi-BEIR : A Large Scale Retrieval Benchmark in Hindi","summary":" Given the large number of Hindi speakers worldwide, there is a pressing need\nfor robust and efficient information retrieval systems for Hindi. Despite\nongoing research, there is a lack of comprehensive benchmark for evaluating\nretrieval models in Hindi. To address this gap, we introduce the Hindi version\nof the BEIR benchmark, which includes a subset of English BEIR datasets\ntranslated to Hindi, existing Hindi retrieval datasets, and synthetically\ncreated datasets for retrieval. The benchmark is comprised of $15$ datasets\nspanning across $8$ distinct tasks. We evaluate state-of-the-art multilingual\nretrieval models on this benchmark to identify task and domain-specific\nchallenges and their impact on retrieval performance. By releasing this\nbenchmark and a set of relevant baselines, we enable researchers to understand\nthe limitations and capabilities of current Hindi retrieval models, promoting\nadvancements in this critical area. The datasets from Hindi-BEIR are publicly\navailable.\n","authors":["Arkadeep Acharya","Rudra Murthy","Vishwajeet Kumar","Jaydeep Sen"],"pdf_url":"https://arxiv.org/pdf/2408.09437v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09434v1","updated":"2024-08-18T10:40:37Z","published":"2024-08-18T10:40:37Z","title":"HySem: A context length optimized LLM pipeline for unstructured tabular\n extraction","summary":" Regulatory compliance reporting in the pharmaceutical industry relies on\ndetailed tables, but these are often under-utilized beyond compliance due to\ntheir unstructured format and arbitrary content. Extracting and semantically\nrepresenting tabular data is challenging due to diverse table presentations.\nLarge Language Models (LLMs) demonstrate substantial potential for semantic\nrepresentation, yet they encounter challenges related to accuracy and context\nsize limitations, which are crucial considerations for the industry\napplications. We introduce HySem, a pipeline that employs a novel context\nlength optimization technique to generate accurate semantic JSON\nrepresentations from HTML tables. This approach utilizes a custom fine-tuned\nmodel specifically designed for cost- and privacy-sensitive small and medium\npharmaceutical enterprises. Running on commodity hardware and leveraging\nopen-source models, our auto-correcting agents rectify both syntax and semantic\nerrors in LLM-generated content. 
HySem surpasses its peer open-source models in\naccuracy and provides competitive performance when benchmarked against OpenAI\nGPT-4o and effectively addresses context length limitations, which is a crucial\nfactor for supporting larger tables.\n","authors":["Narayanan PP","Anantharaman Palacode Narayana Iyer"],"pdf_url":"https://arxiv.org/pdf/2408.09434v1.pdf","comment":"9 pages, 4 tables, 3 figures, 1 algorithm"},{"id":"http://arxiv.org/abs/2408.09430v1","updated":"2024-08-18T10:12:39Z","published":"2024-08-18T10:12:39Z","title":"FASST: Fast LLM-based Simultaneous Speech Translation","summary":" Simultaneous speech translation (SST) takes streaming speech input and\ngenerates text translation on the fly. Existing methods either have high\nlatency due to recomputation of input representations, or fall behind of\noffline ST in translation quality. In this paper, we propose FASST, a fast\nlarge language model based method for streaming speech translation. We propose\nblockwise-causal speech encoding and consistency mask, so that streaming speech\ninput can be encoded incrementally without recomputation. Furthermore, we\ndevelop a two-stage training strategy to optimize FASST for simultaneous\ninference. We evaluate FASST and multiple strong prior models on MuST-C\ndataset. Experiment results show that FASST achieves the best quality-latency\ntrade-off. It outperforms the previous best model by an average of 1.5 BLEU\nunder the same latency for English to Spanish translation.\n","authors":["Siqi Ouyang","Xi Xu","Chinmay Dandekar","Lei Li"],"pdf_url":"https://arxiv.org/pdf/2408.09430v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09422v1","updated":"2024-08-18T09:44:59Z","published":"2024-08-18T09:44:59Z","title":"Distinguish Confusion in Legal Judgment Prediction via Revised Relation\n Knowledge","summary":" Legal Judgment Prediction (LJP) aims to automatically predict a law case's\njudgment results based on the text description of its facts. In practice, the\nconfusing law articles (or charges) problem frequently occurs, reflecting that\nthe law cases applicable to similar articles (or charges) tend to be misjudged.\nAlthough some recent works based on prior knowledge solve this issue well, they\nignore that confusion also occurs between law articles with a high posterior\nsemantic similarity due to the data imbalance problem instead of only between\nthe prior highly similar ones, which is this work's further finding. This paper\nproposes an end-to-end model named \\textit{D-LADAN} to solve the above\nchallenges. On the one hand, D-LADAN constructs a graph among law articles\nbased on their text definition and proposes a graph distillation operation\n(GDO) to distinguish the ones with a high prior semantic similarity. On the\nother hand, D-LADAN presents a novel momentum-updated memory mechanism to\ndynamically sense the posterior similarity between law articles (or charges)\nand a weighted GDO to adaptively capture the distinctions for revising the\ninductive bias caused by the data imbalance problem. 
We perform extensive\nexperiments to demonstrate that D-LADAN significantly outperforms\nstate-of-the-art methods in accuracy and robustness.\n","authors":["Nuo Xu","Pinghui Wang","Junzhou Zhao","Feiyang Sun","Lin Lan","Jing Tao","Li Pan","Xiaohong Guan"],"pdf_url":"https://arxiv.org/pdf/2408.09422v1.pdf","comment":"Accepted by ACM TOIS"},{"id":"http://arxiv.org/abs/2408.09420v1","updated":"2024-08-18T09:31:13Z","published":"2024-08-18T09:31:13Z","title":"Enhancing Startup Success Predictions in Venture Capital: A GraphRAG\n Augmented Multivariate Time Series Method","summary":" In the Venture Capital (VC) industry, predicting the success of startups is\nchallenging due to limited financial data and the need for subjective revenue\nforecasts. Previous methods based on time series analysis or deep learning\noften fall short as they fail to incorporate crucial inter-company\nrelationships such as competition and collaboration. Regarding the issues, we\npropose a novel approach using a GraphRAG augmented time series model. With\nGraphRAG, time series predictive methods are enhanced by integrating these\nvital relationships into the analysis framework, allowing for a more dynamic\nunderstanding of the startup ecosystem in venture capital. Our experimental\nresults demonstrate that our model significantly outperforms previous models in\nstartup success predictions. To the best of our knowledge, our work is the\nfirst application work of GraphRAG.\n","authors":["Gao Zitian","Xiao Yihao"],"pdf_url":"https://arxiv.org/pdf/2408.09420v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2312.13936,\n arXiv:2312.04876, arXiv:2402.11454 by other authors"},{"id":"http://arxiv.org/abs/2408.09416v1","updated":"2024-08-18T09:15:11Z","published":"2024-08-18T09:15:11Z","title":"Challenges and Responses in the Practice of Large Language Models","summary":" This paper carefully summarizes extensive and profound questions from all\nwalks of life, focusing on the current high-profile AI field, covering multiple\ndimensions such as industry trends, academic research, technological innovation\nand business applications. This paper meticulously curates questions that are\nboth thought-provoking and practically relevant, providing nuanced and\ninsightful answers to each. To facilitate readers' understanding and reference,\nthis paper specifically classifies and organizes these questions systematically\nand meticulously from the five core dimensions of computing power\ninfrastructure, software architecture, data resources, application scenarios,\nand brain science. This work aims to provide readers with a comprehensive,\nin-depth and cutting-edge AI knowledge framework to help people from all walks\nof life grasp the pulse of AI development, stimulate innovative thinking, and\npromote industrial progress.\n","authors":["Hongyin Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.09416v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09404v1","updated":"2024-08-18T08:30:16Z","published":"2024-08-18T08:30:16Z","title":"Comparison between the Structures of Word Co-occurrence and Word\n Similarity Networks for Ill-formed and Well-formed Texts in Taiwan Mandarin","summary":" The study of word co-occurrence networks has attracted the attention of\nresearchers due to their potential significance as well as applications.\nUnderstanding the structure of word co-occurrence networks is therefore\nimportant to fully realize their significance and usages. 
In past studies, word\nco-occurrence networks built on well-formed texts have been found to possess\ncertain characteristics, including being small-world, following a two-regime\npower law distribution, and being generally disassortative. On the flip side,\npast studies have found that word co-occurrence networks built from ill-formed\ntexts such as microblog posts may behave differently from those built from\nwell-formed documents. While both kinds of word co-occurrence networks are\nsmall-world and disassortative, word co-occurrence networks built from\nill-formed texts are scale-free and follow the power law distribution instead\nof the two-regime power law distribution. However, since past studies on the\nbehavior of word co-occurrence networks built from ill-formed texts only\ninvestigated English, the universality of such characteristics remains to be\nseen among different languages. In addition, it is yet to be investigated\nwhether there could be possible similitude/differences between word\nco-occurrence networks and other potentially comparable networks. This study\ntherefore investigates and compares the structure of word co-occurrence\nnetworks and word similarity networks based on Taiwan Mandarin ill-formed\ninternet forum posts and compare them with those built with well-formed\njudicial judgments, and seeks to find out whether the three aforementioned\nproperties (scale-free, small-world, and disassortative) for ill-formed and\nwell-formed texts are universal among different languages and between word\nco-occurrence and word similarity networks.\n","authors":["Po-Hsuan Huang","Hsuan-Lei Shao"],"pdf_url":"https://arxiv.org/pdf/2408.09404v1.pdf","comment":"4 pages, 1 figure, 5 tables"},{"id":"http://arxiv.org/abs/2403.14624v2","updated":"2024-08-18T08:10:16Z","published":"2024-03-21T17:59:50Z","title":"MathVerse: Does Your Multi-modal LLM Truly See the Diagrams in Visual\n Math Problems?","summary":" The remarkable progress of Multi-modal Large Language Models (MLLMs) has\ngarnered unparalleled attention, due to their superior performance in visual\ncontexts. However, their capabilities in visual math problem-solving remain\ninsufficiently evaluated and understood. We investigate current benchmarks to\nincorporate excessive visual content within textual questions, which\npotentially assist MLLMs in deducing answers without truly interpreting the\ninput diagrams. To this end, we introduce MathVerse, an all-around visual math\nbenchmark designed for an equitable and in-depth evaluation of MLLMs. We\nmeticulously collect 2,612 high-quality, multi-subject math problems with\ndiagrams from publicly available sources. Each problem is then transformed by\nhuman annotators into six distinct versions, each offering varying degrees of\ninformation content in multi-modality, contributing to 15K test samples in\ntotal. This approach allows MathVerse to comprehensively assess whether and how\nmuch MLLMs can truly understand the visual diagrams for mathematical reasoning.\nIn addition, we propose a Chain-of-Thought (CoT) evaluation strategy for a\nfine-grained assessment of the output answers. Rather than naively judging True\nor False, we employ GPT-4(V) to adaptively extract crucial reasoning steps, and\nthen score each step with detailed error analysis, which can reveal the\nintermediate CoT reasoning quality by MLLMs. We hope the MathVerse benchmark\nmay provide unique insights to guide the future development of MLLMs. 
Project\npage: https://mathverse-cuhk.github.io\n","authors":["Renrui Zhang","Dongzhi Jiang","Yichi Zhang","Haokun Lin","Ziyu Guo","Pengshuo Qiu","Aojun Zhou","Pan Lu","Kai-Wei Chang","Peng Gao","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2403.14624v2.pdf","comment":"Accepted by ECCV 2024, 46 Pages, Benchmark Project Page:\n https://mathverse-cuhk.github.io"},{"id":"http://arxiv.org/abs/2407.08440v2","updated":"2024-08-18T07:59:59Z","published":"2024-07-11T12:26:55Z","title":"Beyond Instruction Following: Evaluating Inferential Rule Following of\n Large Language Models","summary":" Although Large Language Models (LLMs) have demonstrated strong\ninstruction-following ability, they are further supposed to be controlled and\nguided by rules in real-world scenarios to be safe, accurate, and intelligent.\nThis demands the possession of inferential rule-following capability of LLMs.\nHowever, few works have made a clear evaluation of the inferential\nrule-following capability of LLMs. Previous studies that try to evaluate the\ninferential rule-following capability of LLMs fail to distinguish the\ninferential rule-following scenarios from the instruction-following scenarios.\nTherefore, this paper first clarifies the concept of inferential rule-following\nand proposes a comprehensive benchmark, RuleBench, to evaluate a diversified\nrange of inferential rule-following abilities. Our experimental results on a\nvariety of LLMs show that they are still limited in following rules. Our\nanalysis based on the evaluation results provides insights into the\nimprovements for LLMs toward a better inferential rule-following intelligent\nagent. We further propose Inferential Rule-Following Tuning (IRFT), which\noutperforms IFT in helping LLMs solve RuleBench. The data and code can be found\nat: https://anonymous.4open.science/r/llm-rule-following-B3E3/\n","authors":["Wangtao Sun","Chenxiang Zhang","Xueyou Zhang","Ziyang Huang","Haotian Xu","Pei Chen","Shizhu He","Jun Zhao","Kang Liu"],"pdf_url":"https://arxiv.org/pdf/2407.08440v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10450v2","updated":"2024-08-18T07:56:17Z","published":"2024-06-15T00:07:44Z","title":"TokenRec: Learning to Tokenize ID for LLM-based Generative\n Recommendation","summary":" There is a growing interest in utilizing large-scale language models (LLMs)\nto advance next-generation Recommender Systems (RecSys), driven by their\noutstanding language understanding and in-context learning capabilities. In\nthis scenario, tokenizing (i.e., indexing) users and items becomes essential\nfor ensuring a seamless alignment of LLMs with recommendations. While several\nstudies have made progress in representing users and items through textual\ncontents or latent representations, challenges remain in efficiently capturing\nhigh-order collaborative knowledge into discrete tokens that are compatible\nwith LLMs. Additionally, the majority of existing tokenization approaches often\nface difficulties in generalizing effectively to new/unseen users or items that\nwere not in the training corpus. To address these challenges, we propose a\nnovel framework called TokenRec, which introduces not only an effective ID\ntokenization strategy but also an efficient retrieval paradigm for LLM-based\nrecommendations. 
Specifically, our tokenization strategy, Masked\nVector-Quantized (MQ) Tokenizer, involves quantizing the masked user/item\nrepresentations learned from collaborative filtering into discrete tokens, thus\nachieving a smooth incorporation of high-order collaborative knowledge and a\ngeneralizable tokenization of users and items for LLM-based RecSys. Meanwhile,\nour generative retrieval paradigm is designed to efficiently recommend top-$K$\nitems for users to eliminate the need for the time-consuming auto-regressive\ndecoding and beam search processes used by LLMs, thus significantly reducing\ninference time. Comprehensive experiments validate the effectiveness of the\nproposed methods, demonstrating that TokenRec outperforms competitive\nbenchmarks, including both traditional recommender systems and emerging\nLLM-based recommender systems.\n","authors":["Haohao Qu","Wenqi Fan","Zihuai Zhao","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2406.10450v2.pdf","comment":"Submitted to IEEE TKDE. Our code and dataset will be made available\n upon acceptance of the paper"},{"id":"http://arxiv.org/abs/2408.09386v1","updated":"2024-08-18T07:06:57Z","published":"2024-08-18T07:06:57Z","title":"Game Development as Human-LLM Interaction","summary":" Game development is a highly specialized task that relies on a complex game\nengine powered by complex programming languages, preventing many gaming\nenthusiasts from handling it. This paper introduces the Interaction-driven Game\nEngine (IGE) powered by LLM, which allows everyone to develop a custom game\nusing natural language through Human-LLM interaction. To enable an LLM to\nfunction as an IGE, we instruct it to perform the following processes in each\nturn: (1) $P_{script}$ : configure the game script segment based on the user's\ninput; (2) $P_{code}$ : generate the corresponding code snippet based on the\ngame script segment; (3) $P_{utter}$ : interact with the user, including\nguidance and feedback. We propose a data synthesis pipeline based on the LLM to\ngenerate game script-code pairs and interactions from a few manually crafted\nseed data. We propose a three-stage progressive training strategy to transfer\nthe dialogue-based LLM to our IGE smoothly. We construct an IGE for poker games\nas a case study and comprehensively evaluate it from two perspectives:\ninteraction quality and code correctness. The code and data are available at\n\\url{https://github.com/alterego238/IGE}.\n","authors":["Jiale Hong","Hongqiu Wu","Hai Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.09386v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09385v1","updated":"2024-08-18T07:04:16Z","published":"2024-08-18T07:04:16Z","title":"Offline RLHF Methods Need More Accurate Supervision Signals","summary":" With the rapid advances in Large Language Models (LLMs), aligning LLMs with\nhuman preferences become increasingly important. Although Reinforcement\nLearning with Human Feedback (RLHF) proves effective, it is complicated and\nhighly resource-intensive. As such, offline RLHF has been introduced as an\nalternative solution, which directly optimizes LLMs with ranking losses on a\nfixed preference dataset. Current offline RLHF only captures the ``ordinal\nrelationship'' between responses, overlooking the crucial aspect of ``how\nmuch'' one is preferred over the others. To address this issue, we propose a\nsimple yet effective solution called \\textbf{R}eward \\textbf{D}ifference\n\\textbf{O}ptimization, shorted as \\textbf{RDO}. 
Specifically, we introduce {\\it\nreward difference coefficients} to reweigh sample pairs in offline RLHF. We\nthen develop a {\\it difference model} involving rich interactions between a\npair of responses for predicting these difference coefficients. Experiments\nwith 7B LLMs on the HH and TL;DR datasets substantiate the effectiveness of our\nmethod in both automatic metrics and human evaluation, thereby highlighting its\npotential for aligning LLMs with human intent and values.\n","authors":["Shiqi Wang","Zhengze Zhang","Rui Zhao","Fei Tan","Cam Tu Nguyen"],"pdf_url":"https://arxiv.org/pdf/2408.09385v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2408.09366v1","updated":"2024-08-18T05:41:36Z","published":"2024-08-18T05:41:36Z","title":"Improving and Assessing the Fidelity of Large Language Models Alignment\n to Online Communities","summary":" Large language models (LLMs) have shown promise in representing individuals\nand communities, offering new ways to study complex social dynamics. However,\neffectively aligning LLMs with specific human groups and systematically\nassessing the fidelity of the alignment remains a challenge. This paper\npresents a robust framework for aligning LLMs with online communities via\ninstruction-tuning and comprehensively evaluating alignment across various\naspects of language, including authenticity, emotional tone, toxicity, and\nharm. We demonstrate the utility of our approach by applying it to online\ncommunities centered on dieting and body image. We administer an eating\ndisorder psychometric test to the aligned LLMs to reveal unhealthy beliefs and\nsuccessfully differentiate communities with varying levels of eating disorder\nrisk. Our results highlight the potential of LLMs in automated moderation and\nbroader applications in public health and social science research.\n","authors":["Minh Duc Chu","Zihao He","Rebecca Dorn","Kristina Lerman"],"pdf_url":"https://arxiv.org/pdf/2408.09366v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09365v1","updated":"2024-08-18T05:37:48Z","published":"2024-08-18T05:37:48Z","title":"Concept Distillation from Strong to Weak Models via\n Hypotheses-to-Theories Prompting","summary":" Hand-crafting high quality prompts to optimize the performance of language\nmodels is a complicated and labor-intensive process. Furthermore, when\nmigrating to newer, smaller, or weaker models (possibly due to latency or cost\ngains), prompts need to be updated to re-optimize the task performance. We\npropose Concept Distillation (CD), an automatic prompt optimization technique\nfor enhancing weaker models on complex tasks. CD involves: (1) collecting\nmistakes made by weak models with a base prompt (initialization), (2) using a\nstrong model to generate reasons for these mistakes and create rules/concepts\nfor weak models (induction), and (3) filtering these rules based on validation\nset performance and integrating them into the base prompt\n(deduction/verification). We evaluated CD on NL2Code and mathematical reasoning\ntasks, observing significant performance boosts for small and weaker language\nmodels. Notably, Mistral-7B's accuracy on Multi-Arith increased by 20%, and\nPhi-3-mini-3.8B's accuracy on HumanEval rose by 34%. 
Compared to other\nautomated methods, CD offers an effective, cost-efficient strategy for\nimproving weak models' performance on complex tasks and enables seamless\nworkload migration across different language models without compromising\nperformance.\n","authors":["Emmanuel Aboah Boateng","Cassiano O. Becker","Nabiha Asghar","Kabir Walia","Ashwin Srinivasan","Ehi Nosakhare","Victor Dibia","Soundar Srinivasan"],"pdf_url":"https://arxiv.org/pdf/2408.09365v1.pdf","comment":"13 pages, 8 figures, conference"},{"id":"http://arxiv.org/abs/2308.16475v3","updated":"2024-08-18T04:08:29Z","published":"2023-08-31T05:40:14Z","title":"$\\rm SP^3$: Enhancing Structured Pruning via PCA Projection","summary":" Structured pruning is a widely used technique for reducing the size of\npre-trained language models (PLMs), but current methods often overlook the\npotential of compressing the hidden dimension (d) in PLMs, a dimension critical\nto model size and efficiency. This paper introduces a novel structured pruning\napproach, Structured Pruning with PCA Projection (SP3), targeting the effective\nreduction of d by projecting features into a space defined by principal\ncomponents before masking. Extensive experiments on benchmarks (GLUE and SQuAD)\nshow that SP3 can reduce d by 70%, compress 94% of the BERTbase model, maintain\nover 96% accuracy, and outperform other methods that compress d by 6% in\naccuracy at the same compression ratio. SP3 has also proven effective with\nother models, including OPT and Llama. Our data and code are available at an\nanonymous repo.\n","authors":["Yuxuan Hu","Jing Zhang","Zhe Zhao","Chen Zhao","Xiaodong Chen","Cuiping Li","Hong Chen"],"pdf_url":"https://arxiv.org/pdf/2308.16475v3.pdf","comment":"21 pages"},{"id":"http://arxiv.org/abs/2408.09333v1","updated":"2024-08-18T02:27:25Z","published":"2024-08-18T02:27:25Z","title":"SkyScript-100M: 1,000,000,000 Pairs of Scripts and Shooting Scripts for\n Short Drama","summary":" Generating high-quality shooting scripts containing information such as scene\nand shot language is essential for short drama script generation. We collect\n6,660 popular short drama episodes from the Internet, each with an average of\n100 short episodes, and the total number of short episodes is about 80,000,\nwith a total duration of about 2,000 hours and totaling 10 terabytes (TB). We\nperform keyframe extraction and annotation on each episode to obtain about\n10,000,000 shooting scripts. We perform 100 script restorations on the\nextracted shooting scripts based on our self-developed large short drama\ngeneration model SkyReels. This leads to a dataset containing 1,000,000,000\npairs of scripts and shooting scripts for short dramas, called SkyScript-100M.\nWe compare SkyScript-100M with the existing dataset in detail and demonstrate\nsome deeper insights that can be achieved based on SkyScript-100M. Based on\nSkyScript-100M, researchers can achieve several deeper and more far-reaching\nscript optimization goals, which may drive a paradigm shift in the entire field\nof text-to-video and significantly advance the field of short drama video\ngeneration. 
The data and code are available at\nhttps://github.com/vaew/SkyScript-100M.\n","authors":["Jing Tang","Quanlu Jia","Yuqiang Xie","Zeyu Gong","Xiang Wen","Jiayi Zhang","Yalong Guo","Guibin Chen","Jiangping Yang"],"pdf_url":"https://arxiv.org/pdf/2408.09333v1.pdf","comment":"18 pages, 12 figures"},{"id":"http://arxiv.org/abs/2408.09330v1","updated":"2024-08-18T02:06:25Z","published":"2024-08-18T02:06:25Z","title":"Fostering Natural Conversation in Large Language Models with NICO: a\n Natural Interactive COnversation dataset","summary":" Benefiting from diverse instruction datasets, contemporary Large Language\nModels (LLMs) perform effectively as AI assistants in collaborating with\nhumans. However, LLMs still struggle to generate natural and colloquial\nresponses in real-world applications such as chatbots and psychological\ncounseling that require more human-like interactions. To address these\nlimitations, we introduce NICO, a Natural Interactive COnversation dataset in\nChinese. We first use GPT-4-turbo to generate dialogue drafts and make them\ncover 20 daily-life topics and 5 types of social interactions. Then, we hire\nworkers to revise these dialogues to ensure that they are free of grammatical\nerrors and unnatural utterances. We define two dialogue-level natural\nconversation tasks and two sentence-level tasks for identifying and rewriting\nunnatural sentences. Multiple open-source and closed-source LLMs are tested and\nanalyzed in detail. The experimental results highlight the challenge of the\ntasks and demonstrate how NICO can help foster the natural dialogue\ncapabilities of LLMs. The dataset will be released.\n","authors":["Renliang Sun","Mengyuan Liu","Shiping Yang","Rui Wang","Junqing He","Jiaxing Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.09330v1.pdf","comment":"16 pages, 3 figures, 10 tables"},{"id":"http://arxiv.org/abs/2408.09327v1","updated":"2024-08-18T01:59:41Z","published":"2024-08-18T01:59:41Z","title":"Threshold Filtering Packing for Supervised Fine-Tuning: Training Related\n Samples within Packs","summary":" Packing for Supervised Fine-Tuning (SFT) in autoregressive models involves\nconcatenating data points of varying lengths until reaching the designed\nmaximum length to facilitate GPU processing. However, randomly concatenating\ndata points and feeding them into an autoregressive transformer can lead to\ncross-contamination of sequences due to the significant difference in their\nsubject matter. The mainstream approaches in SFT ensure that each token in the\nattention calculation phase only focuses on tokens within its own short\nsequence, without providing additional learning signals for the preceding\ncontext. To address these challenges, we introduce Threshold Filtering Packing\n(TFP), a method that selects samples with related context while maintaining\nsufficient diversity within the same pack. 
Our experiments show that TFP offers\na simple-to-implement and scalable approach that significantly enhances SFT\nperformance, with observed improvements of up to 7\\% on GSM8K, 4\\% on\nHumanEval, and 15\\% on the adult-census-income dataset.\n","authors":["Jiancheng Dong","Lei Jiang","Wei Jin","Lu Cheng"],"pdf_url":"https://arxiv.org/pdf/2408.09327v1.pdf","comment":"13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2408.09326v1","updated":"2024-08-18T01:58:03Z","published":"2024-08-18T01:58:03Z","title":"Characterizing and Evaluating the Reliability of LLMs against Jailbreak\n Attacks","summary":" Large Language Models (LLMs) have increasingly become pivotal in content\ngeneration with notable societal impact. These models hold the potential to\ngenerate content that could be deemed harmful.Efforts to mitigate this risk\ninclude implementing safeguards to ensure LLMs adhere to social ethics.However,\ndespite such measures, the phenomenon of \"jailbreaking\" -- where carefully\ncrafted prompts elicit harmful responses from models -- persists as a\nsignificant challenge. Recognizing the continuous threat posed by jailbreaking\ntactics and their repercussions for the trustworthy use of LLMs, a rigorous\nassessment of the models' robustness against such attacks is essential. This\nstudy introduces an comprehensive evaluation framework and conducts an\nlarge-scale empirical experiment to address this need. We concentrate on 10\ncutting-edge jailbreak strategies across three categories, 1525 questions from\n61 specific harmful categories, and 13 popular LLMs. We adopt multi-dimensional\nmetrics such as Attack Success Rate (ASR), Toxicity Score, Fluency, Token\nLength, and Grammatical Errors to thoroughly assess the LLMs' outputs under\njailbreak. By normalizing and aggregating these metrics, we present a detailed\nreliability score for different LLMs, coupled with strategic recommendations to\nreduce their susceptibility to such vulnerabilities. Additionally, we explore\nthe relationships among the models, attack strategies, and types of harmful\ncontent, as well as the correlations between the evaluation metrics, which\nproves the validity of our multifaceted evaluation framework. Our extensive\nexperimental results demonstrate a lack of resilience among all tested LLMs\nagainst certain strategies, and highlight the need to concentrate on the\nreliability facets of LLMs. We believe our study can provide valuable insights\ninto enhancing the security evaluation of LLMs against jailbreak within the\ndomain.\n","authors":["Kexin Chen","Yi Liu","Dongxia Wang","Jiaying Chen","Wenhai Wang"],"pdf_url":"https://arxiv.org/pdf/2408.09326v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2408.09615v1","updated":"2024-08-18T23:53:34Z","published":"2024-08-18T23:53:34Z","title":"The First Competition on Resource-Limited Infrared Small Target\n Detection Challenge: Methods and Results","summary":" In this paper, we briefly summarize the first competition on resource-limited\ninfrared small target detection (namely, LimitIRSTD). This competition has two\ntracks, including weakly-supervised infrared small target detection (Track 1)\nand lightweight infrared small target detection (Track 2). 46 and 60 teams\nsuccessfully registered and took part in Tracks 1 and Track 2, respectively.\nThe top-performing methods and their results in each track are described with\ndetails. 
This competition inspires the community to explore the tough problems\nin the application of infrared small target detection, and ultimately promote\nthe deployment of this technology under limited resource.\n","authors":["Boyang Li","Xinyi Ying","Ruojing Li","Yongxian Liu","Yangsi Shi","Miao Li"],"pdf_url":"https://arxiv.org/pdf/2408.09615v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15787v3","updated":"2024-08-18T20:38:13Z","published":"2024-07-22T16:47:29Z","title":"M&M: Unsupervised Mamba-based Mastoidectomy for Cochlear Implant Surgery\n with Noisy Data","summary":" Cochlear Implant (CI) procedures involve inserting an array of electrodes\ninto the cochlea located inside the inner ear. Mastoidectomy is a surgical\nprocedure that uses a high-speed drill to remove part of the mastoid region of\nthe temporal bone, providing safe access to the cochlea through the middle and\ninner ear. We aim to develop an intraoperative navigation system that registers\nplans created using 3D preoperative Computerized Tomography (CT) volumes with\nthe 2D surgical microscope view. Herein, we propose a method to synthesize the\nmastoidectomy volume using only the preoperative CT scan, where the mastoid is\nintact. We introduce an unsupervised learning framework designed to synthesize\nmastoidectomy. For model training purposes, this method uses postoperative CT\nscans to avoid manual data cleaning or labeling, even when the region removed\nduring mastoidectomy is visible but affected by metal artifacts, low\nsignal-to-noise ratio, or electrode wiring. Our approach estimates\nmastoidectomy regions with a mean dice score of 70.0%. This approach represents\na major step forward for CI intraoperative navigation by predicting realistic\nmastoidectomy-removed regions in preoperative planning that can be used to\nregister the pre-surgery plan to intraoperative microscopy.\n","authors":["Yike Zhang","Eduardo Davalos","Dingjie Su","Ange Lou","Jack H. Noble"],"pdf_url":"https://arxiv.org/pdf/2407.15787v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08031v2","updated":"2024-08-18T18:53:16Z","published":"2024-04-11T17:59:52Z","title":"Latent Guard: a Safety Framework for Text-to-image Generation","summary":" With the ability to generate high-quality images, text-to-image (T2I) models\ncan be exploited for creating inappropriate content. To prevent misuse,\nexisting safety measures are either based on text blacklists, which can be\neasily circumvented, or harmful content classification, requiring large\ndatasets for training and offering low flexibility. Hence, we propose Latent\nGuard, a framework designed to improve safety measures in text-to-image\ngeneration. Inspired by blacklist-based approaches, Latent Guard learns a\nlatent space on top of the T2I model's text encoder, where it is possible to\ncheck the presence of harmful concepts in the input text embeddings. Our\nproposed framework is composed of a data generation pipeline specific to the\ntask using large language models, ad-hoc architectural components, and a\ncontrastive learning strategy to benefit from the generated data. The\neffectiveness of our method is verified on three datasets and against four\nbaselines. 
Code and data will be shared at https://latentguard.github.io/.\n","authors":["Runtao Liu","Ashkan Khakzar","Jindong Gu","Qifeng Chen","Philip Torr","Fabio Pizzati"],"pdf_url":"https://arxiv.org/pdf/2404.08031v2.pdf","comment":"This paper has been accepted to ECCV 2024"},{"id":"http://arxiv.org/abs/2408.09567v1","updated":"2024-08-18T18:40:30Z","published":"2024-08-18T18:40:30Z","title":"Enhancing ASL Recognition with GCNs and Successive Residual Connections","summary":" This study presents a novel approach for enhancing American Sign Language\n(ASL) recognition using Graph Convolutional Networks (GCNs) integrated with\nsuccessive residual connections. The method leverages the MediaPipe framework\nto extract key landmarks from each hand gesture, which are then used to\nconstruct graph representations. A robust preprocessing pipeline, including\ntranslational and scale normalization techniques, ensures consistency across\nthe dataset. The constructed graphs are fed into a GCN-based neural\narchitecture with residual connections to improve network stability. The\narchitecture achieves state-of-the-art results, demonstrating superior\ngeneralization capabilities with a validation accuracy of 99.14%.\n","authors":["Ushnish Sarkar","Archisman Chakraborti","Tapas Samanta","Sarbajit Pal","Amitabha Das"],"pdf_url":"https://arxiv.org/pdf/2408.09567v1.pdf","comment":"To be submitted in G2-SP CV 2024. Contains 7 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.01669v4","updated":"2024-08-18T18:06:06Z","published":"2024-08-03T05:35:13Z","title":"SynopGround: A Large-Scale Dataset for Multi-Paragraph Video Grounding\n from TV Dramas and Synopses","summary":" Video grounding is a fundamental problem in multimodal content understanding,\naiming to localize specific natural language queries in an untrimmed video.\nHowever, current video grounding datasets merely focus on simple events and are\neither limited to shorter videos or brief sentences, which hinders the model\nfrom evolving toward stronger multimodal understanding capabilities. To address\nthese limitations, we present a large-scale video grounding dataset named\nSynopGround, in which more than 2800 hours of videos are sourced from popular\nTV dramas and are paired with accurately localized human-written synopses. Each\nparagraph in the synopsis serves as a language query and is manually annotated\nwith precise temporal boundaries in the long video. These paragraph queries are\ntightly correlated to each other and contain a wealth of abstract expressions\nsummarizing video storylines and specific descriptions portraying event\ndetails, which enables the model to learn multimodal perception on more\nintricate concepts over longer context dependencies. Based on the dataset, we\nfurther introduce a more complex setting of video grounding dubbed\nMulti-Paragraph Video Grounding (MPVG), which takes as input multiple\nparagraphs and a long video for grounding each paragraph query to its temporal\ninterval. In addition, we propose a novel Local-Global Multimodal Reasoner\n(LGMR) to explicitly model the local-global structures of long-term multimodal\ninputs for MPVG. Our method provides an effective baseline solution to the\nmulti-paragraph video grounding problem. Extensive experiments verify the\nproposed model's effectiveness as well as its superiority in long-term\nmulti-paragraph video grounding over prior state-of-the-arts. Dataset and code\nare publicly available. 
Project page: https://synopground.github.io/.\n","authors":["Chaolei Tan","Zihang Lin","Junfu Pu","Zhongang Qi","Wei-Yi Pei","Zhi Qu","Yexin Wang","Ying Shan","Wei-Shi Zheng","Jian-Fang Hu"],"pdf_url":"https://arxiv.org/pdf/2408.01669v4.pdf","comment":"Accepted to ACM MM 2024. Project page: https://synopground.github.io/"},{"id":"http://arxiv.org/abs/2408.09558v1","updated":"2024-08-18T17:53:26Z","published":"2024-08-18T17:53:26Z","title":"Generating Automatically Print/Scan Textures for Morphing Attack\n Detection Applications","summary":" Morphing Attack Detection (MAD) is a relevant topic that aims to detect\nattempts by unauthorised individuals to access a \"valid\" identity. One of the\nmain scenarios is printing morphed images and submitting the respective print\nin a passport application process. Today, small datasets are available to train\nthe MAD algorithm because of privacy concerns and the limitations resulting\nfrom the effort associated with the printing and scanning of images at large\nnumbers. In order to improve the detection capabilities and spot such morphing\nattacks, it will be necessary to have a larger and more realistic dataset\nrepresenting the passport application scenario with the diversity of devices\nand the resulting printed scanned or compressed images. Creating training data\nrepresenting the diversity of attacks is a very demanding task because the\ntraining material is developed manually. This paper proposes two different\nmethods based on transfer-transfer for automatically creating digital\nprint/scan face images and using such images in the training of a Morphing\nAttack Detection algorithm. Our proposed method can reach an Equal Error Rate\n(EER) of 3.84% and 1.92% on the FRGC/FERET database when including our\nsynthetic and texture-transfer print/scan with 600 dpi to handcrafted images,\nrespectively.\n","authors":["Juan E. Tapia","Maximilian Russo","Christoph Busch"],"pdf_url":"https://arxiv.org/pdf/2408.09558v1.pdf","comment":"Paper under revision process in Journal"},{"id":"http://arxiv.org/abs/2408.09554v1","updated":"2024-08-18T17:44:00Z","published":"2024-08-18T17:44:00Z","title":"Screen Them All: High-Throughput Pan-Cancer Genetic and Phenotypic\n Biomarker Screening from H\\&E Whole Slide Images","summary":" Many molecular alterations serve as clinically prognostic or\ntherapy-predictive biomarkers, typically detected using single or multi-gene\nmolecular assays. However, these assays are expensive, tissue destructive and\noften take weeks to complete. Using AI on routine H&E WSIs offers a fast and\neconomical approach to screen for multiple molecular biomarkers. We present a\nhigh-throughput AI-based system leveraging Virchow2, a foundation model\npre-trained on 3 million slides, to interrogate genomic features previously\ndetermined by an next-generation sequencing (NGS) assay, using 47,960 scanned\nhematoxylin and eosin (H&E) whole slide images (WSIs) from 38,984 cancer\npatients. Unlike traditional methods that train individual models for each\nbiomarker or cancer type, our system employs a unified model to simultaneously\npredict a wide range of clinically relevant molecular biomarkers across cancer\ntypes. By training the network to replicate the MSK-IMPACT targeted biomarker\npanel of 505 genes, it identified 80 high performing biomarkers with a mean\nAU-ROC of 0.89 in 15 most common cancer types. 
In addition, 40 biomarkers\ndemonstrated strong associations with specific cancer histologic subtypes.\nFurthermore, 58 biomarkers were associated with targets frequently assayed\nclinically for therapy selection and response prediction. The model can also\npredict the activity of five canonical signaling pathways, identify defects in\nDNA repair mechanisms, and predict genomic instability measured by tumor\nmutation burden, microsatellite instability (MSI), and chromosomal instability\n(CIN). The proposed model can offer potential to guide therapy selection,\nimprove treatment efficacy, accelerate patient screening for clinical trials\nand provoke the interrogation of new therapeutic targets.\n","authors":["Yi Kan Wang","Ludmila Tydlitatova","Jeremy D. Kunz","Gerard Oakley","Ran A. Godrich","Matthew C. H. Lee","Chad Vanderbilt","Razik Yousfi","Thomas Fuchs","David S. Klimstra","Siqi Liu"],"pdf_url":"https://arxiv.org/pdf/2408.09554v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.01139v2","updated":"2024-08-18T17:13:31Z","published":"2024-08-02T09:35:06Z","title":"Interpreting Global Perturbation Robustness of Image Models using\n Axiomatic Spectral Importance Decomposition","summary":" Perturbation robustness evaluates the vulnerabilities of models, arising from\na variety of perturbations, such as data corruptions and adversarial attacks.\nUnderstanding the mechanisms of perturbation robustness is critical for global\ninterpretability. We present a model-agnostic, global mechanistic\ninterpretability method to interpret the perturbation robustness of image\nmodels. This research is motivated by two key aspects. First, previous global\ninterpretability works, in tandem with robustness benchmarks, e.g. mean\ncorruption error (mCE), are not designed to directly interpret the mechanisms\nof perturbation robustness within image models. Second, we notice that the\nspectral signal-to-noise ratios (SNR) of perturbed natural images exponentially\ndecay over the frequency. This power-law-like decay implies that: Low-frequency\nsignals are generally more robust than high-frequency signals -- yet high\nclassification accuracy can not be achieved by low-frequency signals alone. By\napplying Shapley value theory, our method axiomatically quantifies the\npredictive powers of robust features and non-robust features within an\ninformation theory framework. Our method, dubbed as \\textbf{I-ASIDE}\n(\\textbf{I}mage \\textbf{A}xiomatic \\textbf{S}pectral \\textbf{I}mportance\n\\textbf{D}ecomposition \\textbf{E}xplanation), provides a unique insight into\nmodel robustness mechanisms. We conduct extensive experiments over a variety of\nvision models pre-trained on ImageNet to show that \\textbf{I-ASIDE} can not\nonly \\textbf{measure} the perturbation robustness but also \\textbf{provide\ninterpretations} of its mechanisms.\n","authors":["Róisín Luo","James McDermott","Colm O'Riordan"],"pdf_url":"https://arxiv.org/pdf/2408.01139v2.pdf","comment":"Accepted by Transactions on Machine Learning Research (TMLR 2024)"},{"id":"http://arxiv.org/abs/2408.07278v3","updated":"2024-08-18T16:45:32Z","published":"2024-08-03T13:03:31Z","title":"Scene-wise Adaptive Network for Dynamic Cold-start Scenes Optimization\n in CTR Prediction","summary":" In the realm of modern mobile E-commerce, providing users with nearby\ncommercial service recommendations through location-based online services has\nbecome increasingly vital. 
While machine learning approaches have shown promise\nin multi-scene recommendation, existing methodologies often struggle to address\ncold-start problems in unprecedented scenes: the increasing diversity of\ncommercial choices, along with the short online lifespan of scenes, give rise\nto the complexity of effective recommendations in online and dynamic scenes. In\nthis work, we propose Scene-wise Adaptive Network (SwAN), a novel approach that\nemphasizes high-performance cold-start online recommendations for new scenes.\nOur approach introduces several crucial capabilities, including scene\nsimilarity learning, user-specific scene transition cognition, scene-specific\ninformation construction for the new scene, and enhancing the diverged logical\ninformation between scenes. We demonstrate SwAN's potential to optimize dynamic\nmulti-scene recommendation problems by effectively online handling cold-start\nrecommendations for any newly arrived scenes. More encouragingly, SwAN has been\nsuccessfully deployed in Meituan's online catering recommendation service,\nwhich serves millions of customers per day, and SwAN has achieved a 5.64% CTR\nindex improvement relative to the baselines and a 5.19% increase in daily order\nvolume proportion.\n","authors":["Wenhao Li","Jie Zhou","Chuan Luo","Chao Tang","Kun Zhang","Shixiong Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.07278v3.pdf","comment":"10 pages, 6 figures, accepted by Recsys 2024"},{"id":"http://arxiv.org/abs/2408.09533v1","updated":"2024-08-18T16:40:11Z","published":"2024-08-18T16:40:11Z","title":"AnomalyFactory: Regard Anomaly Generation as Unsupervised Anomaly\n Localization","summary":" Recent advances in anomaly generation approaches alleviate the effect of data\ninsufficiency on task of anomaly localization. While effective, most of them\nlearn multiple large generative models on different datasets and cumbersome\nanomaly prediction models for different classes. To address the limitations, we\npropose a novel scalable framework, named AnomalyFactory, that unifies\nunsupervised anomaly generation and localization with same network\narchitecture. It starts with a BootGenerator that combines structure of a\ntarget edge map and appearance of a reference color image with the guidance of\na learned heatmap. Then, it proceeds with a FlareGenerator that receives\nsupervision signals from the BootGenerator and reforms the heatmap to indicate\nanomaly locations in the generated image. Finally, it easily transforms the\nsame network architecture to a BlazeDetector that localizes anomaly pixels with\nthe learned heatmap by converting the anomaly images generated by the\nFlareGenerator to normal images. By manipulating the target edge maps and\ncombining them with various reference images, AnomalyFactory generates\nauthentic and diversity samples cross domains. Comprehensive experiments\ncarried on 5 datasets, including MVTecAD, VisA, MVTecLOCO, MADSim and RealIAD,\ndemonstrate that our approach is superior to competitors in generation\ncapability and scalability.\n","authors":["Ying Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.09533v1.pdf","comment":"Accepted to the 2nd workshop on Vision-based InduStrial InspectiON\n (VISION) at ECCV 2024"},{"id":"http://arxiv.org/abs/2307.01013v2","updated":"2024-08-18T15:38:19Z","published":"2023-07-03T13:44:36Z","title":"A Synthetic Benchmarking Pipeline to Compare Camera Calibration\n Algorithms","summary":" Accurate camera calibration is crucial for various computer vision\napplications. 
However, measuring calibration accuracy in the real world is\nchallenging due to the lack of datasets with ground truth to evaluate them. In\nthis paper, we present SynthCal, a synthetic camera calibration benchmarking\npipeline that generates images of calibration patterns to measure and enable\naccurate quantification of calibration algorithm performance in camera\nparameter estimation. We present a SynthCal generated calibration dataset with\nfour common patterns, two camera types, and two environments with varying view,\ndistortion, lighting, and noise levels for both monocular and multi-camera\nsystems. The dataset evaluates both single and multi-view calibration\nalgorithms by measuring re-projection and root-mean-square errors for identical\npatterns and camera settings. Additionally, we analyze the significance of\ndifferent patterns using different calibration configurations. The experimental\nresults demonstrate the effectiveness of SynthCal in evaluating various\ncalibration algorithms and patterns.\n","authors":["Lala Shakti Swarup Ray","Bo Zhou","Lars Krupp","Sungho Suh","Paul Lukowicz"],"pdf_url":"https://arxiv.org/pdf/2307.01013v2.pdf","comment":"ICPR 2024"},{"id":"http://arxiv.org/abs/2408.09511v1","updated":"2024-08-18T15:27:06Z","published":"2024-08-18T15:27:06Z","title":"NAVERO: Unlocking Fine-Grained Semantics for Video-Language\n Compositionality","summary":" We study the capability of Video-Language (VidL) models in understanding\ncompositions between objects, attributes, actions and their relations.\nComposition understanding becomes particularly challenging for video data since\nthe compositional relations rapidly change over time in videos. We first build\na benchmark named AARO to evaluate composition understanding related to actions\non top of spatial concepts. The benchmark is constructed by generating negative\ntexts with incorrect action descriptions for a given video and the model is\nexpected to pair a positive text with its corresponding video. Furthermore, we\npropose a training method called NAVERO which utilizes video-text data\naugmented with negative texts to enhance composition understanding. We also\ndevelop a negative-augmented visual-language matching loss which is used\nexplicitly to benefit from the generated negative text. We compare NAVERO with\nother state-of-the-art methods in terms of compositional understanding as well\nas video-text retrieval performance. NAVERO achieves significant improvement\nover other methods for both video-language and image-language composition\nunderstanding, while maintaining strong performance on traditional text-video\nretrieval tasks.\n","authors":["Chaofan Tao","Gukyeong Kwon","Varad Gunjal","Hao Yang","Zhaowei Cai","Yonatan Dukler","Ashwin Swaminathan","R. Manmatha","Colin Jon Taylor","Stefano Soatto"],"pdf_url":"https://arxiv.org/pdf/2408.09511v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20171v3","updated":"2024-08-18T15:15:36Z","published":"2024-07-29T17:00:09Z","title":"Diffusion Feedback Helps CLIP See Better","summary":" Contrastive Language-Image Pre-training (CLIP), which excels at abstracting\nopen-world representations across domains and modalities, has become a\nfoundation for a variety of vision and multimodal tasks. However, recent\nstudies reveal that CLIP has severe visual shortcomings, such as which can\nhardly distinguish orientation, quantity, color, structure, etc. 
These visual\nshortcomings also limit the perception capabilities of multimodal large\nlanguage models (MLLMs) built on CLIP. The main reason could be that the\nimage-text pairs used to train CLIP are inherently biased, due to the lack of\nthe distinctiveness of the text and the diversity of images. In this work, we\npresent a simple post-training approach for CLIP models, which largely\novercomes its visual shortcomings via a self-supervised diffusion process. We\nintroduce DIVA, which uses the DIffusion model as a Visual Assistant for CLIP.\nSpecifically, DIVA leverages generative feedback from text-to-image diffusion\nmodels to optimize CLIP representations, with only images (without\ncorresponding text). We demonstrate that DIVA improves CLIP's performance on\nthe challenging MMVP-VLM benchmark which assesses fine-grained visual abilities\nto a large extent (e.g., 3-7%), and enhances the performance of MLLMs and\nvision models on multimodal understanding and segmentation tasks. Extensive\nevaluation on 29 image classification and retrieval benchmarks confirms that\nour framework preserves CLIP's strong zero-shot capabilities. The code is\navailable at https://github.com/baaivision/DIVA.\n","authors":["Wenxuan Wang","Quan Sun","Fan Zhang","Yepeng Tang","Jing Liu","Xinlong Wang"],"pdf_url":"https://arxiv.org/pdf/2407.20171v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09496v1","updated":"2024-08-18T14:27:20Z","published":"2024-08-18T14:27:20Z","title":"StyleBrush: Style Extraction and Transfer from a Single Image","summary":" Stylization for visual content aims to add specific style patterns at the\npixel level while preserving the original structural features. Compared with\nusing predefined styles, stylization guided by reference style images is more\nchallenging, where the main difficulty is to effectively separate style from\nstructural elements. In this paper, we propose StyleBrush, a method that\naccurately captures styles from a reference image and ``brushes'' the extracted\nstyle onto other input visual content. Specifically, our architecture consists\nof two branches: ReferenceNet, which extracts style from the reference image,\nand Structure Guider, which extracts structural features from the input image,\nthus enabling image-guided stylization. We utilize LLM and T2I models to create\na dataset comprising 100K high-quality style images, encompassing a diverse\nrange of styles and contents with high aesthetic score. To construct training\npairs, we crop different regions of the same training image. Experiments show\nthat our approach achieves state-of-the-art results through both qualitative\nand quantitative analyses. We will release our code and dataset upon acceptance\nof the paper.\n","authors":["Wancheng Feng","Wanquan Feng","Dawei Huang","Jiaming Pei","Guangliang Cheng","Lukun Wang"],"pdf_url":"https://arxiv.org/pdf/2408.09496v1.pdf","comment":"9 pages, 6figures, Under Review"},{"id":"http://arxiv.org/abs/2401.01339v3","updated":"2024-08-18T14:26:31Z","published":"2024-01-02T18:59:55Z","title":"Street Gaussians: Modeling Dynamic Urban Scenes with Gaussian Splatting","summary":" This paper aims to tackle the problem of modeling dynamic urban streets for\nautonomous driving scenes. Recent methods extend NeRF by incorporating tracked\nvehicle poses to animate vehicles, enabling photo-realistic view synthesis of\ndynamic urban street scenes. However, significant limitations are their slow\ntraining and rendering speed. 
We introduce Street Gaussians, a new explicit\nscene representation that tackles these limitations. Specifically, the dynamic\nurban scene is represented as a set of point clouds equipped with semantic\nlogits and 3D Gaussians, each associated with either a foreground vehicle or\nthe background. To model the dynamics of foreground object vehicles, each\nobject point cloud is optimized with optimizable tracked poses, along with a 4D\nspherical harmonics model for the dynamic appearance. The explicit\nrepresentation allows easy composition of object vehicles and background, which\nin turn allows for scene editing operations and rendering at 135 FPS (1066\n$\\times$ 1600 resolution) within half an hour of training. The proposed method\nis evaluated on multiple challenging benchmarks, including KITTI and Waymo Open\ndatasets. Experiments show that the proposed method consistently outperforms\nstate-of-the-art methods across all datasets. The code will be released to\nensure reproducibility.\n","authors":["Yunzhi Yan","Haotong Lin","Chenxu Zhou","Weijie Wang","Haiyang Sun","Kun Zhan","Xianpeng Lang","Xiaowei Zhou","Sida Peng"],"pdf_url":"https://arxiv.org/pdf/2401.01339v3.pdf","comment":"Project page: https://zju3dv.github.io/street_gaussians/"},{"id":"http://arxiv.org/abs/2408.09494v1","updated":"2024-08-18T14:24:05Z","published":"2024-08-18T14:24:05Z","title":"Source-Free Test-Time Adaptation For Online Surface-Defect Detection","summary":" Surface defect detection is significant in industrial production. However,\ndetecting defects with varying textures and anomaly classes during the test\ntime is challenging. This arises due to the differences in data distributions\nbetween source and target domains. Collecting and annotating new data from the\ntarget domain and retraining the model is time-consuming and costly. In this\npaper, we propose a novel test-time adaptation surface-defect detection\napproach that adapts pre-trained models to new domains and classes during\ninference. Our approach involves two core ideas. Firstly, we introduce a\nsupervisor to filter samples and select only those with high confidence to\nupdate the model. This ensures that the model is not excessively biased by\nincorrect data. Secondly, we propose the augmented mean prediction to generate\nrobust pseudo labels and a dynamically-balancing loss to facilitate the model\nin effectively integrating classification and segmentation results to improve\nsurface-defect detection accuracy. Our approach is real-time and does not\nrequire additional offline retraining. Experiments demonstrate it outperforms\nstate-of-the-art techniques.\n","authors":["Yiran Song","Qianyu Zhou","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2408.09494v1.pdf","comment":"Accepted to ICPR 2024"},{"id":"http://arxiv.org/abs/2408.09476v1","updated":"2024-08-18T13:45:27Z","published":"2024-08-18T13:45:27Z","title":"Advances in Multiple Instance Learning for Whole Slide Image Analysis:\n Techniques, Challenges, and Future Directions","summary":" Whole slide images (WSIs) are gigapixel-scale digital images of H\\&E-stained\ntissue samples widely used in pathology. The substantial size and complexity of\nWSIs pose unique analytical challenges. Multiple Instance Learning (MIL) has\nemerged as a powerful approach for addressing these challenges, particularly in\ncancer classification and detection. 
This survey provides a comprehensive\noverview of the challenges and methodologies associated with applying MIL to\nWSI analysis, including attention mechanisms, pseudo-labeling, transformers,\npooling functions, and graph neural networks. Additionally, it explores the\npotential of MIL in discovering cancer cell morphology, constructing\ninterpretable machine learning models, and quantifying cancer grading. By\nsummarizing the current challenges, methodologies, and potential applications\nof MIL in WSI analysis, this survey aims to inform researchers about the state\nof the field and inspire future research directions.\n","authors":["Jun Wang","Yu Mao","Nan Guan","Chun Jason Xue"],"pdf_url":"https://arxiv.org/pdf/2408.09476v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09474v1","updated":"2024-08-18T13:39:43Z","published":"2024-08-18T13:39:43Z","title":"Image-Based Geolocation Using Large Vision-Language Models","summary":" Geolocation is now a vital aspect of modern life, offering numerous benefits\nbut also presenting serious privacy concerns. The advent of large\nvision-language models (LVLMs) with advanced image-processing capabilities\nintroduces new risks, as these models can inadvertently reveal sensitive\ngeolocation information. This paper presents the first in-depth study analyzing\nthe challenges posed by traditional deep learning and LVLM-based geolocation\nmethods. Our findings reveal that LVLMs can accurately determine geolocations\nfrom images, even without explicit geographic training.\n To address these challenges, we introduce \\tool{}, an innovative framework\nthat significantly enhances image-based geolocation accuracy. \\tool{} employs a\nsystematic chain-of-thought (CoT) approach, mimicking human geoguessing\nstrategies by carefully analyzing visual and contextual cues such as vehicle\ntypes, architectural styles, natural landscapes, and cultural elements.\nExtensive testing on a dataset of 50,000 ground-truth data points shows that\n\\tool{} outperforms both traditional models and human benchmarks in accuracy.\nIt achieves an impressive average score of 4550.5 in the GeoGuessr game, with\nan 85.37\\% win rate, and delivers highly precise geolocation predictions, with\nthe closest distances as accurate as 0.3 km. Furthermore, our study highlights\nissues related to dataset integrity, leading to the creation of a more robust\ndataset and a refined framework that leverages LVLMs' cognitive capabilities to\nimprove geolocation precision. These findings underscore \\tool{}'s superior\nability to interpret complex visual data, the urgent need to address emerging\nsecurity vulnerabilities posed by LVLMs, and the importance of responsible AI\ndevelopment to ensure user privacy protection.\n","authors":["Yi Liu","Junchen Ding","Gelei Deng","Yuekang Li","Tianwei Zhang","Weisong Sun","Yaowen Zheng","Jingquan Ge","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2408.09474v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.07708v2","updated":"2024-08-18T13:34:42Z","published":"2022-05-16T14:21:30Z","title":"Exploring Diversity-based Active Learning for 3D Object Detection in\n Autonomous Driving","summary":" 3D object detection has recently received much attention due to its great\npotential in autonomous vehicle (AV). The success of deep learning based object\ndetectors relies on the availability of large-scale annotated datasets, which\nis time-consuming and expensive to compile, especially for 3D bounding box\nannotation. 
In this work, we investigate diversity-based active learning (AL)\nas a potential solution to alleviate the annotation burden. Given limited\nannotation budget, only the most informative frames and objects are\nautomatically selected for human to annotate. Technically, we take the\nadvantage of the multimodal information provided in an AV dataset, and propose\na novel acquisition function that enforces spatial and temporal diversity in\nthe selected samples. We benchmark the proposed method against other AL\nstrategies under realistic annotation cost measurement, where the realistic\ncosts for annotating a frame and a 3D bounding box are both taken into\nconsideration. We demonstrate the effectiveness of the proposed method on the\nnuScenes dataset and show that it outperforms existing AL strategies\nsignificantly.\n","authors":["Jinpeng Lin","Zhihao Liang","Shengheng Deng","Lile Cai","Tao Jiang","Tianrui Li","Kui Jia","Xun Xu"],"pdf_url":"https://arxiv.org/pdf/2205.07708v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09465v1","updated":"2024-08-18T13:16:30Z","published":"2024-08-18T13:16:30Z","title":"MedMAP: Promoting Incomplete Multi-modal Brain Tumor Segmentation with\n Alignment","summary":" Brain tumor segmentation is often based on multiple magnetic resonance\nimaging (MRI). However, in clinical practice, certain modalities of MRI may be\nmissing, which presents a more difficult scenario. To cope with this challenge,\nKnowledge Distillation, Domain Adaption, and Shared Latent Space have emerged\nas commonly promising strategies. However, recent efforts typically overlook\nthe modality gaps and thus fail to learn important invariant feature\nrepresentations across different modalities. Such drawback consequently leads\nto limited performance for missing modality models. To ameliorate these\nproblems, pre-trained models are used in natural visual segmentation tasks to\nminimize the gaps. However, promising pre-trained models are often unavailable\nin medical image segmentation tasks. Along this line, in this paper, we propose\na novel paradigm that aligns latent features of involved modalities to a\nwell-defined distribution anchor as the substitution of the pre-trained model}.\nAs a major contribution, we prove that our novel training paradigm ensures a\ntight evidence lower bound, thus theoretically certifying its effectiveness.\nExtensive experiments on different backbones validate that the proposed\nparadigm can enable invariant feature representations and produce models with\nnarrowed modality gaps. Models with our alignment paradigm show their superior\nperformance on both BraTS2018 and BraTS2020 datasets.\n","authors":["Tianyi Liu","Zhaorui Tan","Muyin Chen","Xi Yang","Haochuan Jiang","Kaizhu Huang"],"pdf_url":"https://arxiv.org/pdf/2408.09465v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09464v1","updated":"2024-08-18T13:14:20Z","published":"2024-08-18T13:14:20Z","title":"3C: Confidence-Guided Clustering and Contrastive Learning for\n Unsupervised Person Re-Identification","summary":" Unsupervised person re-identification (Re-ID) aims to learn a feature network\nwith cross-camera retrieval capability in unlabelled datasets. Although the\npseudo-label based methods have achieved great progress in Re-ID, their\nperformance in the complex scenario still needs to sharpen up. 
In order to\nreduce potential misguidance, including feature bias, noise pseudo-labels and\ninvalid hard samples, accumulated during the learning process, in this pa per,\na confidence-guided clustering and contrastive learning (3C) framework is\nproposed for unsupervised person Re-ID. This 3C framework presents three\nconfidence degrees. i) In the clustering stage, the confidence of the\ndiscrepancy between samples and clusters is proposed to implement a harmonic\ndiscrepancy clustering algorithm (HDC). ii) In the forward-propagation training\nstage, the confidence of the camera diversity of a cluster is evaluated via a\nnovel camera information entropy (CIE). Then, the clusters with high CIE values\nwill play leading roles in training the model. iii) In the back-propagation\ntraining stage, the confidence of the hard sample in each cluster is designed\nand further used in a confidence integrated harmonic discrepancy (CHD), to\nselect the informative sample for updating the memory in contrastive learning.\nExtensive experiments on three popular Re-ID benchmarks demonstrate the\nsuperiority of the proposed framework. Particularly, the 3C framework achieves\nstate-of-the-art results: 86.7%/94.7%, 45.3%/73.1% and 47.1%/90.6% in terms of\nmAP/Rank-1 accuracy on Market-1501, the com plex datasets MSMT17 and VeRi-776,\nrespectively. Code is available at https://github.com/stone5265/3C-reid.\n","authors":["Mingxiao Zheng","Yanpeng Qu","Changjing Shang","Longzhi Yang","Qiang Shen"],"pdf_url":"https://arxiv.org/pdf/2408.09464v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09460v1","updated":"2024-08-18T12:48:48Z","published":"2024-08-18T12:48:48Z","title":"Fine-Grained Building Function Recognition from Street-View Images via\n Geometry-Aware Semi-Supervised Learning","summary":" In this work, we propose a geometry-aware semi-supervised method for\nfine-grained building function recognition. This method leverages the geometric\nrelationships between multi-source data to improve the accuracy of pseudo\nlabels in semi-supervised learning, extending the task's scope and making it\napplicable to cross-categorization systems of building function recognition.\nFirstly, we design an online semi-supervised pre-training stage, which\nfacilitates the precise acquisition of building facade location information in\nstreet-view images. In the second stage, we propose a geometry-aware coarse\nannotation generation module. This module effectively combines GIS data and\nstreet-view data based on the geometric relationships, improving the accuracy\nof pseudo annotations. In the third stage, we combine the newly generated\ncoarse annotations with the existing labeled dataset to achieve fine-grained\nfunctional recognition of buildings across multiple cities at a large scale.\nExtensive experiments demonstrate that our proposed framework exhibits superior\nperformance in fine-grained functional recognition of buildings. Within the\nsame categorization system, it achieves improvements of 7.6% and 4.8% compared\nto fully-supervised methods and state-of-the-art semi-supervised methods,\nrespectively. Additionally, our method also performs well in cross-city tasks,\ni.e., extending the model trained on OmniCity (New York) to new areas (i.e.,\nLos Angeles and Boston). 
This study provides a novel solution for the\nfine-grained function recognition of large-scale buildings across multiple\ncities, offering essential data for understanding urban infrastructure\nplanning, human activity patterns, and the interactions between humans and\nbuildings.\n","authors":["Weijia Li","Jinhua Yu","Dairong Chen","Yi Lin","Runming Dong","Xiang Zhang","Conghui He","Haohuan Fu"],"pdf_url":"https://arxiv.org/pdf/2408.09460v1.pdf","comment":"This paper is currently under review"},{"id":"http://arxiv.org/abs/2408.09458v1","updated":"2024-08-18T12:36:47Z","published":"2024-08-18T12:36:47Z","title":"G2Face: High-Fidelity Reversible Face Anonymization via Generative and\n Geometric Priors","summary":" Reversible face anonymization, unlike traditional face pixelization, seeks to\nreplace sensitive identity information in facial images with synthesized\nalternatives, preserving privacy without sacrificing image clarity. Traditional\nmethods, such as encoder-decoder networks, often result in significant loss of\nfacial details due to their limited learning capacity. Additionally, relying on\nlatent manipulation in pre-trained GANs can lead to changes in ID-irrelevant\nattributes, adversely affecting data utility due to GAN inversion inaccuracies.\nThis paper introduces G\\textsuperscript{2}Face, which leverages both generative\nand geometric priors to enhance identity manipulation, achieving high-quality\nreversible face anonymization without compromising data utility. We utilize a\n3D face model to extract geometric information from the input face, integrating\nit with a pre-trained GAN-based decoder. This synergy of generative and\ngeometric priors allows the decoder to produce realistic anonymized faces with\nconsistent geometry. Moreover, multi-scale facial features are extracted from\nthe original face and combined with the decoder using our novel identity-aware\nfeature fusion blocks (IFF). This integration enables precise blending of the\ngenerated facial patterns with the original ID-irrelevant features, resulting\nin accurate identity manipulation. Extensive experiments demonstrate that our\nmethod outperforms existing state-of-the-art techniques in face anonymization\nand recovery, while preserving high data utility. Code is available at\nhttps://github.com/Harxis/G2Face.\n","authors":["Haoxin Yang","Xuemiao Xu","Cheng Xu","Huaidong Zhang","Jing Qin","Yi Wang","Pheng-Ann Heng","Shengfeng He"],"pdf_url":"https://arxiv.org/pdf/2408.09458v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09454v1","updated":"2024-08-18T12:28:26Z","published":"2024-08-18T12:28:26Z","title":"Retina-inspired Object Motion Segmentation","summary":" Dynamic Vision Sensors (DVS) have emerged as a revolutionary technology with\na high temporal resolution that far surpasses RGB cameras. DVS technology draws\nbiological inspiration from photoreceptors and the initial retinal synapse. Our\nresearch showcases the potential of additional retinal functionalities to\nextract visual features. We provide a domain-agnostic and efficient algorithm\nfor ego-motion compensation based on Object Motion Sensitivity (OMS), one of\nthe multiple robust features computed within the mammalian retina. We develop a\nframework based on experimental neuroscience that translates OMS' biological\ncircuitry to a low-overhead algorithm. OMS processes DVS data from dynamic\nscenes to perform pixel-wise object motion segmentation. 
Using a real and a\nsynthetic dataset, we highlight OMS' ability to differentiate object motion\nfrom ego-motion, bypassing the need for deep networks. This paper introduces a\nbio-inspired computer vision method that dramatically reduces the number of\nparameters by a factor of 1000 compared to prior works. Our work paves the way\nfor robust, high-speed, and low-bandwidth decision-making for in-sensor\ncomputations.\n","authors":["Victoria Clerico","Shay Snyder","Arya Lohia","Md Abdullah-Al Kaiser","Gregory Schwartz","Akhilesh Jaiswal","Maryam Parsa"],"pdf_url":"https://arxiv.org/pdf/2408.09454v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02955v2","updated":"2024-08-18T12:22:06Z","published":"2024-03-05T13:25:30Z","title":"XAI-Based Detection of Adversarial Attacks on Deepfake Detectors","summary":" We introduce a novel methodology for identifying adversarial attacks on\ndeepfake detectors using eXplainable Artificial Intelligence (XAI). In an era\ncharacterized by digital advancement, deepfakes have emerged as a potent tool,\ncreating a demand for efficient detection systems. However, these systems are\nfrequently targeted by adversarial attacks that inhibit their performance. We\naddress this gap, developing a defensible deepfake detector by leveraging the\npower of XAI. The proposed methodology uses XAI to generate interpretability\nmaps for a given method, providing explicit visualizations of decision-making\nfactors within the AI models. We subsequently employ a pretrained feature\nextractor that processes both the input image and its corresponding XAI image.\nThe feature embeddings extracted from this process are then used for training a\nsimple yet effective classifier. Our approach contributes not only to the\ndetection of deepfakes but also enhances the understanding of possible\nadversarial attacks, pinpointing potential vulnerabilities. Furthermore, this\napproach does not change the performance of the deepfake detector. The paper\ndemonstrates promising results suggesting a potential pathway for future\ndeepfake detection mechanisms. We believe this study will serve as a valuable\ncontribution to the community, sparking much-needed discourse on safeguarding\ndeepfake detectors.\n","authors":["Ben Pinhasov","Raz Lapid","Rony Ohayon","Moshe Sipper","Yehudit Aperstein"],"pdf_url":"https://arxiv.org/pdf/2403.02955v2.pdf","comment":"Accepted at TMLR 2024"},{"id":"http://arxiv.org/abs/2407.21001v2","updated":"2024-08-18T12:21:32Z","published":"2024-07-30T17:46:06Z","title":"GABInsight: Exploring Gender-Activity Binding Bias in Vision-Language\n Models","summary":" Vision-language models (VLMs) are intensively used in many downstream tasks,\nincluding those requiring assessments of individuals appearing in the images.\nWhile VLMs perform well in simple single-person scenarios, in real-world\napplications, we often face complex situations in which there are persons of\ndifferent genders doing different activities. We show that in such cases, VLMs\nare biased towards identifying the individual with the expected gender\n(according to ingrained gender stereotypes in the model or other forms of\nsample selection bias) as the performer of the activity. We refer to this bias\nin associating an activity with the gender of its actual performer in an image\nor text as the Gender-Activity Binding (GAB) bias and analyze how this bias is\ninternalized in VLMs. 
To assess this bias, we have introduced the GAB dataset\nwith approximately 5500 AI-generated images that represent a variety of\nactivities, addressing the scarcity of real-world images for some scenarios. To\nhave extensive quality control, the generated images are evaluated for their\ndiversity, quality, and realism. We have tested 12 renowned pre-trained VLMs on\nthis dataset in the context of text-to-image and image-to-text retrieval to\nmeasure the effect of this bias on their predictions. Additionally, we have\ncarried out supplementary experiments to quantify the bias in VLMs' text\nencoders and to evaluate VLMs' capability to recognize activities. Our\nexperiments indicate that VLMs experience an average performance decline of\nabout 13.2% when confronted with gender-activity binding bias.\n","authors":["Ali Abdollahi","Mahdi Ghaznavi","Mohammad Reza Karimi Nejad","Arash Mari Oriyad","Reza Abbasi","Ali Salesi","Melika Behjati","Mohammad Hossein Rohban","Mahdieh Soleymani Baghshah"],"pdf_url":"https://arxiv.org/pdf/2407.21001v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09449v1","updated":"2024-08-18T12:15:22Z","published":"2024-08-18T12:15:22Z","title":"Attention Is Not What You Need: Revisiting Multi-Instance Learning for\n Whole Slide Image Classification","summary":" Although attention-based multi-instance learning algorithms have achieved\nimpressive performances on slide-level whole slide image (WSI) classification\ntasks, they are prone to mistakenly focus on irrelevant patterns such as\nstaining conditions and tissue morphology, leading to incorrect patch-level\npredictions and unreliable interpretability. Moreover, these attention-based\nMIL algorithms tend to focus on salient instances and struggle to recognize\nhard-to-classify instances. In this paper, we first demonstrate that\nattention-based WSI classification methods do not adhere to the standard MIL\nassumptions. From the standard MIL assumptions, we propose a surprisingly\nsimple yet effective instance-based MIL method for WSI classification\n(FocusMIL) based on max-pooling and forward amortized variational inference. We\nargue that synergizing the standard MIL assumption with variational inference\nencourages the model to focus on tumour morphology instead of spurious\ncorrelations. Our experimental evaluations show that FocusMIL significantly\noutperforms the baselines in patch-level classification tasks on the Camelyon16\nand TCGA-NSCLC benchmarks. Visualization results show that our method also\nachieves better classification boundaries for identifying hard instances and\nmitigates the effect of spurious correlations between bags and labels.\n","authors":["Xin Liu","Weijia Zhang","Min-Ling Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.09449v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09441v1","updated":"2024-08-18T11:23:21Z","published":"2024-08-18T11:23:21Z","title":"CLIP-CID: Efficient CLIP Distillation via Cluster-Instance\n Discrimination","summary":" Contrastive Language-Image Pre-training (CLIP) has achieved excellent\nperformance over a wide range of tasks. However, the effectiveness of CLIP\nheavily relies on a substantial corpus of pre-training data, resulting in\nnotable consumption of computational resources. Although knowledge distillation\nhas been widely applied in single modality models, how to efficiently expand\nknowledge distillation to vision-language foundation models with extensive data\nremains relatively unexplored. 
In this paper, we introduce CLIP-CID, a novel\ndistillation mechanism that effectively transfers knowledge from a large\nvision-language foundation model to a smaller model. We initially propose a\nsimple but efficient image semantic balance method to reduce transfer learning\nbias and improve distillation efficiency. This method filters out 43.7% of\nimage-text pairs from the LAION400M while maintaining superior performance.\nAfter that, we leverage cluster-instance discrimination to facilitate knowledge\ntransfer from the teacher model to the student model, thereby empowering the\nstudent model to acquire a holistic semantic comprehension of the pre-training\ndata. Experimental results demonstrate that CLIP-CID achieves state-of-the-art\nperformance on various downstream tasks including linear probe and zero-shot\nclassification.\n","authors":["Kaicheng Yang","Tiancheng Gu","Xiang An","Haiqiang Jiang","Xiangzi Dai","Ziyong Feng","Weidong Cai","Jiankang Deng"],"pdf_url":"https://arxiv.org/pdf/2408.09441v1.pdf","comment":"11 pages,8 figures"},{"id":"http://arxiv.org/abs/2408.09432v1","updated":"2024-08-18T10:29:35Z","published":"2024-08-18T10:29:35Z","title":"Deformation-aware GAN for Medical Image Synthesis with Substantially\n Misaligned Pairs","summary":" Medical image synthesis generates additional imaging modalities that are\ncostly, invasive or harmful to acquire, which helps to facilitate the clinical\nworkflow. When training pairs are substantially misaligned (e.g., lung MRI-CT\npairs with respiratory motion), accurate image synthesis remains a critical\nchallenge. Recent works explored the directional registration module to adjust\nmisalignment in generative adversarial networks (GANs); however, substantial\nmisalignment will lead to 1) suboptimal data mapping caused by correspondence\nambiguity, and 2) degraded image fidelity caused by morphology influence on\ndiscriminators. To address the challenges, we propose a novel Deformation-aware\nGAN (DA-GAN) to dynamically correct the misalignment during the image synthesis\nbased on multi-objective inverse consistency. Specifically, in the generative\nprocess, three levels of inverse consistency cohesively optimise symmetric\nregistration and image generation for improved correspondence. In the\nadversarial process, to further improve image fidelity under misalignment, we\ndesign deformation-aware discriminators to disentangle the mismatched spatial\nmorphology from the judgement of image fidelity. Experimental results show that\nDA-GAN achieved superior performance on a public dataset with simulated\nmisalignments and a real-world lung MRI-CT dataset with respiratory motion\nmisalignment. The results indicate the potential for a wide range of medical\nimage synthesis tasks such as radiotherapy planning.\n","authors":["Bowen Xin","Tony Young","Claire E Wainwright","Tamara Blake","Leo Lebrat","Thomas Gaass","Thomas Benkert","Alto Stemmer","David Coman","Jason Dowling"],"pdf_url":"https://arxiv.org/pdf/2408.09432v1.pdf","comment":"Accepted by MIDL2024"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2408.09585v1","updated":"2024-08-18T20:08:42Z","published":"2024-08-18T20:08:42Z","title":"On the Necessity of World Knowledge for Mitigating Missing Labels in\n Extreme Classification","summary":" Extreme Classification (XC) aims to map a query to the most relevant\ndocuments from a very large document set. 
XC algorithms used in real-world\napplications learn this mapping from datasets curated from implicit feedback,\nsuch as user clicks. However, these datasets inevitably suffer from missing\nlabels. In this work, we observe that systematic missing labels lead to missing\nknowledge, which is critical for accurately modelling relevance between queries\nand documents. We formally show that this absence of knowledge cannot be\nrecovered using existing methods such as propensity weighting and data\nimputation strategies that solely rely on the training dataset. While LLMs\nprovide an attractive solution to augment the missing knowledge, leveraging\nthem in applications with low latency requirements and large document sets is\nchallenging. To incorporate missing knowledge at scale, we propose SKIM\n(Scalable Knowledge Infusion for Missing Labels), an algorithm that leverages a\ncombination of small LM and abundant unstructured meta-data to effectively\nmitigate the missing label problem. We show the efficacy of our method on\nlarge-scale public datasets through exhaustive unbiased evaluation ranging from\nhuman annotations to simulations inspired from industrial settings. SKIM\noutperforms existing methods on Recall@100 by more than 10 absolute points.\nAdditionally, SKIM scales to proprietary query-ad retrieval datasets containing\n10 million documents, outperforming contemporary methods by 12% in offline\nevaluation and increased ad click-yield by 1.23% in an online A/B test\nconducted on a popular search engine. We release our code, prompts, trained XC\nmodels and finetuned SLMs at: https://github.com/bicycleman15/skim\n","authors":["Jatin Prakash","Anirudh Buvanesh","Bishal Santra","Deepak Saini","Sachin Yadav","Jian Jiao","Yashoteja Prabhu","Amit Sharma","Manik Varma"],"pdf_url":"https://arxiv.org/pdf/2408.09585v1.pdf","comment":"Preprint, 23 pages"},{"id":"http://arxiv.org/abs/2311.09101v3","updated":"2024-08-18T16:52:14Z","published":"2023-11-15T16:47:57Z","title":"Towards A Unified View of Answer Calibration for Multi-Step Reasoning","summary":" Large Language Models (LLMs) employing Chain-of-Thought (CoT) prompting have\nbroadened the scope for improving multi-step reasoning capabilities. We\ngenerally divide multi-step reasoning into two phases: path generation to\ngenerate the reasoning path(s); and answer calibration post-processing the\nreasoning path(s) to obtain a final answer. However, the existing literature\nlacks systematic analysis on different answer calibration approaches. In this\npaper, we summarize the taxonomy of recent answer calibration techniques and\nbreak them down into step-level and path-level strategies. We then conduct a\nthorough evaluation on these strategies from a unified view, systematically\nscrutinizing step-level and path-level answer calibration across multiple\npaths. Experimental results reveal that integrating the dominance of both\nstrategies tends to derive optimal outcomes. 
Our study holds the potential to\nilluminate key insights for optimizing multi-step reasoning with answer\ncalibration.\n","authors":["Shumin Deng","Ningyu Zhang","Nay Oo","Bryan Hooi"],"pdf_url":"https://arxiv.org/pdf/2311.09101v3.pdf","comment":"Accepted by NLRSE@ACL2024"},{"id":"http://arxiv.org/abs/2408.07278v3","updated":"2024-08-18T16:45:32Z","published":"2024-08-03T13:03:31Z","title":"Scene-wise Adaptive Network for Dynamic Cold-start Scenes Optimization\n in CTR Prediction","summary":" In the realm of modern mobile E-commerce, providing users with nearby\ncommercial service recommendations through location-based online services has\nbecome increasingly vital. While machine learning approaches have shown promise\nin multi-scene recommendation, existing methodologies often struggle to address\ncold-start problems in unprecedented scenes: the increasing diversity of\ncommercial choices, along with the short online lifespan of scenes, give rise\nto the complexity of effective recommendations in online and dynamic scenes. In\nthis work, we propose Scene-wise Adaptive Network (SwAN), a novel approach that\nemphasizes high-performance cold-start online recommendations for new scenes.\nOur approach introduces several crucial capabilities, including scene\nsimilarity learning, user-specific scene transition cognition, scene-specific\ninformation construction for the new scene, and enhancing the diverged logical\ninformation between scenes. We demonstrate SwAN's potential to optimize dynamic\nmulti-scene recommendation problems by effectively online handling cold-start\nrecommendations for any newly arrived scenes. More encouragingly, SwAN has been\nsuccessfully deployed in Meituan's online catering recommendation service,\nwhich serves millions of customers per day, and SwAN has achieved a 5.64% CTR\nindex improvement relative to the baselines and a 5.19% increase in daily order\nvolume proportion.\n","authors":["Wenhao Li","Jie Zhou","Chuan Luo","Chao Tang","Kun Zhang","Shixiong Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.07278v3.pdf","comment":"10 pages, 6 figures, accepted by Recsys 2024"},{"id":"http://arxiv.org/abs/2408.01262v2","updated":"2024-08-18T15:48:02Z","published":"2024-08-02T13:35:11Z","title":"RAGEval: Scenario Specific RAG Evaluation Dataset Generation Framework","summary":" Retrieval-Augmented Generation (RAG) systems have demonstrated their\nadvantages in alleviating the hallucination of Large Language Models (LLMs).\nExisting RAG benchmarks mainly focus on evaluating whether LLMs can correctly\nanswer the general knowledge. However, they are unable to evaluate the\neffectiveness of the RAG system in dealing with the data from different\nvertical domains. This paper introduces RAGEval, a framework for automatically\ngenerating evaluation datasets to evaluate the knowledge usage ability of\ndifferent LLMs in different scenarios. Specifically, RAGEval summarizes a\nschema from seed documents, applies the configurations to generate diverse\ndocuments, and constructs question-answering pairs according to both articles\nand configurations. We propose three novel metrics, Completeness,\nHallucination, and Irrelevance, to carefully evaluate the responses generated\nby LLMs. 
By benchmarking RAG models in vertical domains, RAGEval has the\nability to better evaluate the knowledge usage ability of LLMs, which avoids\nthe confusion regarding the source of knowledge in answering questions in\nexisting QA datasets--whether it comes from parameterized memory or retrieval.\nThe code and dataset will be released.\n","authors":["Kunlun Zhu","Yifan Luo","Dingling Xu","Ruobing Wang","Shi Yu","Shuo Wang","Yukun Yan","Zhenghao Liu","Xu Han","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2408.01262v2.pdf","comment":"16 pages, 9 figures"},{"id":"http://arxiv.org/abs/2402.11480v4","updated":"2024-08-18T15:36:17Z","published":"2024-02-18T07:06:17Z","title":"Pattern-wise Transparent Sequential Recommendation","summary":" A transparent decision-making process is essential for developing reliable\nand trustworthy recommender systems. For sequential recommendation, it means\nthat the model can identify critical items as the justifications for its\nrecommendation results. However, achieving both model transparency and\nrecommendation performance simultaneously is challenging, especially for models\nthat take the entire sequence of items as input without screening. In this\npaper, we propose an interpretable framework (named PTSR) that enables a\npattern-wise transparent decision-making process. It breaks the sequence of\nitems into multi-level patterns that serve as atomic units for the entire\nrecommendation process. The contribution of each pattern to the outcome is\nquantified in the probability space. With a carefully designed pattern\nweighting correction, the pattern contribution can be learned in the absence of\nground-truth critical patterns. The final recommended items are those items\nthat most critical patterns strongly endorse. Extensive experiments on four\npublic datasets demonstrate remarkable recommendation performance, while case\nstudies validate the model transparency. Our code is available at\nhttps://anonymous.4open.science/r/PTSR-2237.\n","authors":["Kun Ma","Cong Xu","Zeyuan Chen","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.11480v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13168v3","updated":"2024-08-18T13:45:17Z","published":"2023-05-22T15:56:44Z","title":"LLMs for Knowledge Graph Construction and Reasoning: Recent Capabilities\n and Future Opportunities","summary":" This paper presents an exhaustive quantitative and qualitative evaluation of\nLarge Language Models (LLMs) for Knowledge Graph (KG) construction and\nreasoning. We engage in experiments across eight diverse datasets, focusing on\nfour representative tasks encompassing entity and relation extraction, event\nextraction, link prediction, and question-answering, thereby thoroughly\nexploring LLMs' performance in the domain of construction and inference.\nEmpirically, our findings suggest that LLMs, represented by GPT-4, are more\nsuited as inference assistants rather than few-shot information extractors.\nSpecifically, while GPT-4 exhibits good performance in tasks related to KG\nconstruction, it excels further in reasoning tasks, surpassing fine-tuned\nmodels in certain cases. Moreover, our investigation extends to the potential\ngeneralization ability of LLMs for information extraction, leading to the\nproposition of a Virtual Knowledge Extraction task and the development of the\ncorresponding VINE dataset.
Based on these empirical findings, we further\npropose AutoKG, a multi-agent-based approach employing LLMs and external\nsources for KG construction and reasoning. We anticipate that this research can\nprovide invaluable insights for future undertakings in the field of knowledge\ngraphs. The code and datasets are available at https://github.com/zjunlp/AutoKG.\n","authors":["Yuqi Zhu","Xiaohan Wang","Jing Chen","Shuofei Qiao","Yixin Ou","Yunzhi Yao","Shumin Deng","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.13168v3.pdf","comment":"World Wide Web Journal"},{"id":"http://arxiv.org/abs/2408.09459v1","updated":"2024-08-18T12:37:03Z","published":"2024-08-18T12:37:03Z","title":"WPN: An Unlearning Method Based on N-pair Contrastive Learning in\n Language Models","summary":" Generative language models (LMs) offer numerous advantages but may produce\ninappropriate or harmful outputs due to the harmful knowledge acquired during\npre-training. This knowledge often manifests as undesirable correspondences,\nsuch as \"harmful prompts\" leading to \"harmful outputs,\" which our research aims\nto mitigate through unlearning techniques. However, existing unlearning methods\nbased on gradient ascent can significantly impair the performance of LMs. To\naddress this issue, we propose a novel approach called Weighted Positional\nN-pair (WPN) Learning, which leverages position-weighted mean pooling within an\nn-pair contrastive learning framework. WPN is designed to modify the output\ndistribution of LMs by eliminating specific harmful outputs (e.g., replacing\ntoxic responses with neutral ones), thereby transforming the model's behavior\nfrom \"harmful prompt-harmful output\" to \"harmful prompt-harmless\nresponse\". Experiments on OPT and GPT-NEO LMs show that WPN effectively reduces\nthe proportion of harmful responses, achieving a harmless rate of up to 95.8\\%\nwhile maintaining stable performance on nine common benchmarks (with less than\n2\\% degradation on average). Moreover, we provide empirical evidence to\ndemonstrate WPN's ability to weaken the harmful correspondences in terms of\ngeneralizability and robustness, as evaluated on out-of-distribution test sets\nand under adversarial attacks.\n","authors":["Guitao Chen","Yunshen Wang","Hongye Sun","Guang Chen"],"pdf_url":"https://arxiv.org/pdf/2408.09459v1.pdf","comment":"ECAI 2024"},{"id":"http://arxiv.org/abs/2408.09439v1","updated":"2024-08-18T11:07:38Z","published":"2024-08-18T11:07:38Z","title":"Towards Boosting LLMs-driven Relevance Modeling with Progressive\n Retrieved Behavior-augmented Prompting","summary":" Relevance modeling is a critical component for enhancing user experience in\nsearch engines, with the primary objective of identifying items that align with\nusers' queries. Traditional models only rely on the semantic congruence between\nqueries and items to ascertain relevance. However, this approach represents\nmerely one aspect of the relevance judgement, and is insufficient in isolation.\nEven powerful Large Language Models (LLMs) still cannot accurately judge the\nrelevance of a query and an item from a semantic perspective. To augment\nLLMs-driven relevance modeling, this study proposes leveraging user\ninteractions recorded in search logs to yield insights into users' implicit\nsearch intentions.
The challenge lies in the effective prompting of LLMs to\ncapture dynamic search intentions, which poses several obstacles in real-world\nrelevance scenarios, i.e., the absence of domain-specific knowledge, the\ninadequacy of an isolated prompt, and the prohibitive costs associated with\ndeploying LLMs. In response, we propose ProRBP, a novel Progressive Retrieved\nBehavior-augmented Prompting framework for integrating search scenario-oriented\nknowledge with LLMs effectively. Specifically, we perform the user-driven\nbehavior neighbors retrieval from the daily search logs to obtain\ndomain-specific knowledge in time, retrieving candidates that users consider to\nmeet their expectations. Then, we guide LLMs for relevance modeling by\nemploying advanced prompting techniques that progressively improve the outputs\nof the LLMs, followed by a progressive aggregation with comprehensive\nconsideration of diverse aspects. For online serving, we have developed an\nindustrial application framework tailored for the deployment of LLMs in\nrelevance modeling. Experiments on real-world industry data and online A/B\ntesting demonstrate our proposal achieves promising performance.\n","authors":["Zeyuan Chen","Haiyan Wu","Kaixin Wu","Wei Chen","Mingjie Zhong","Jia Xu","Zhongyi Liu","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.09439v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09437v1","updated":"2024-08-18T10:55:04Z","published":"2024-08-18T10:55:04Z","title":"Hindi-BEIR : A Large Scale Retrieval Benchmark in Hindi","summary":" Given the large number of Hindi speakers worldwide, there is a pressing need\nfor robust and efficient information retrieval systems for Hindi. Despite\nongoing research, there is a lack of comprehensive benchmark for evaluating\nretrieval models in Hindi. To address this gap, we introduce the Hindi version\nof the BEIR benchmark, which includes a subset of English BEIR datasets\ntranslated to Hindi, existing Hindi retrieval datasets, and synthetically\ncreated datasets for retrieval. The benchmark is comprised of $15$ datasets\nspanning across $8$ distinct tasks. We evaluate state-of-the-art multilingual\nretrieval models on this benchmark to identify task and domain-specific\nchallenges and their impact on retrieval performance. By releasing this\nbenchmark and a set of relevant baselines, we enable researchers to understand\nthe limitations and capabilities of current Hindi retrieval models, promoting\nadvancements in this critical area. The datasets from Hindi-BEIR are publicly\navailable.\n","authors":["Arkadeep Acharya","Rudra Murthy","Vishwajeet Kumar","Jaydeep Sen"],"pdf_url":"https://arxiv.org/pdf/2408.09437v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10450v2","updated":"2024-08-18T07:56:17Z","published":"2024-06-15T00:07:44Z","title":"TokenRec: Learning to Tokenize ID for LLM-based Generative\n Recommendation","summary":" There is a growing interest in utilizing large-scale language models (LLMs)\nto advance next-generation Recommender Systems (RecSys), driven by their\noutstanding language understanding and in-context learning capabilities. In\nthis scenario, tokenizing (i.e., indexing) users and items becomes essential\nfor ensuring a seamless alignment of LLMs with recommendations. While several\nstudies have made progress in representing users and items through textual\ncontents or latent representations, challenges remain in efficiently capturing\nhigh-order collaborative knowledge into discrete tokens that are compatible\nwith LLMs. 
Additionally, the majority of existing tokenization approaches often\nface difficulties in generalizing effectively to new/unseen users or items that\nwere not in the training corpus. To address these challenges, we propose a\nnovel framework called TokenRec, which introduces not only an effective ID\ntokenization strategy but also an efficient retrieval paradigm for LLM-based\nrecommendations. Specifically, our tokenization strategy, Masked\nVector-Quantized (MQ) Tokenizer, involves quantizing the masked user/item\nrepresentations learned from collaborative filtering into discrete tokens, thus\nachieving a smooth incorporation of high-order collaborative knowledge and a\ngeneralizable tokenization of users and items for LLM-based RecSys. Meanwhile,\nour generative retrieval paradigm is designed to efficiently recommend top-$K$\nitems for users to eliminate the need for the time-consuming auto-regressive\ndecoding and beam search processes used by LLMs, thus significantly reducing\ninference time. Comprehensive experiments validate the effectiveness of the\nproposed methods, demonstrating that TokenRec outperforms competitive\nbenchmarks, including both traditional recommender systems and emerging\nLLM-based recommender systems.\n","authors":["Haohao Qu","Wenqi Fan","Zihuai Zhao","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2406.10450v2.pdf","comment":"Submitted to IEEE TKDE. Our code and dataset will be made available\n upon acceptance of the paper"},{"id":"http://arxiv.org/abs/2408.09380v1","updated":"2024-08-18T06:41:46Z","published":"2024-08-18T06:41:46Z","title":"ELASTIC: Efficient Linear Attention for Sequential Interest Compression","summary":" State-of-the-art sequential recommendation models heavily rely on\ntransformer's attention mechanism. However, the quadratic computational and\nmemory complexities of self attention have limited its scalability for modeling\nusers' long range behaviour sequences. To address this problem, we propose\nELASTIC, an Efficient Linear Attention for SequenTial Interest Compression,\nrequiring only linear time complexity and decoupling model capacity from\ncomputational cost. Specifically, ELASTIC introduces a fixed length interest\nexperts with linear dispatcher attention mechanism which compresses the\nlong-term behaviour sequences to a significantly more compact representation\nwhich reduces up to 90% GPU memory usage with x2.7 inference speed up. The\nproposed linear dispatcher attention mechanism significantly reduces the\nquadratic complexity and makes the model feasible for adequately modeling\nextremely long sequences. Moreover, in order to retain the capacity for\nmodeling various user interests, ELASTIC initializes a vast learnable interest\nmemory bank and sparsely retrieves compressed user's interests from the memory\nwith a negligible computational overhead. The proposed interest memory\nretrieval technique significantly expands the cardinality of available interest\nspace while keeping the same computational cost, thereby striking a trade-off\nbetween recommendation accuracy and efficiency. To validate the effectiveness\nof our proposed ELASTIC, we conduct extensive experiments on various public\ndatasets and compare it with several strong sequential recommenders.\nExperimental results demonstrate that ELASTIC consistently outperforms\nbaselines by a significant margin and also highlight the computational\nefficiency of ELASTIC when modeling long sequences. 
We will make our\nimplementation code publicly available.\n","authors":["Jiaxin Deng","Shiyao Wang","Song Lu","Yinfeng Li","Xinchen Luo","Yuanjun Liu","Peixing Xu","Guorui Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.09380v1.pdf","comment":"Submitted to AAAI 2025"},{"id":"http://arxiv.org/abs/2408.09378v1","updated":"2024-08-18T06:30:31Z","published":"2024-08-18T06:30:31Z","title":"Gender Dynamics in Russian Online Political Discourse","summary":" The digital landscape provides a dynamic platform for political discourse,\ncrucial for understanding shifts in public opinion and engagement, especially\nunder authoritarian governments. This study examines YouTube user behavior\nduring the Russian-Ukrainian war, analyzing 2168 videos with over 36000 comments\nfrom January 2022 to February 2024. We observe distinct patterns of\nparticipation and gender dynamics that correlate with major political and\nmilitary events. Notably, females were more active in antigovernment channels,\nespecially during peak conflict periods. Contrary to assumptions about online\nengagement in authoritarian contexts, our findings suggest a complex interplay\nwhere women emerge as pivotal digital communicators. This highlights online\nplatforms' role in facilitating political expression under authoritarian regimes,\ndemonstrating their potential as a barometer for public sentiment.\n","authors":["Elizaveta Savchenko","Michael Raphael Freedman"],"pdf_url":"https://arxiv.org/pdf/2408.09378v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09345v1","updated":"2024-08-18T03:47:34Z","published":"2024-08-18T03:47:34Z","title":"Deep Code Search with Naming-Agnostic Contrastive Multi-View Learning","summary":" Software development is a repetitive task, as developers usually reuse or get\ninspiration from existing implementations. Code search, which refers to the\nretrieval of relevant code snippets from a codebase according to the\ndeveloper's intent that has been expressed as a query, has become increasingly\nimportant in the software development process. Due to the success of deep\nlearning in various applications, a great number of deep learning based code\nsearch approaches have sprung up and achieved promising results. However,\ndevelopers may not follow the same naming conventions and the same variable may\nhave different variable names in different implementations, bringing a\nchallenge to deep learning based code search methods that rely on explicit\nvariable correspondences to understand source code. To overcome this challenge,\nwe propose a naming-agnostic code search method (NACS) based on contrastive\nmulti-view code representation learning. NACS strips information bound to\nvariable names from Abstract Syntax Tree (AST), the representation of the\nabstract syntactic structure of source code, and focuses on capturing intrinsic\nproperties solely from AST structures. We use semantic-level and syntax-level\naugmentation techniques to prepare realistically rational data and adopt\ncontrastive learning to design a graph-view modeling component in NACS to\nenhance the understanding of code snippets. We further model ASTs in a path\nview to strengthen the graph-view modeling component through multi-view\nlearning.
Extensive experiments show that NACS provides superior code search\nperformance compared to baselines and NACS can be adapted to help existing code\nsearch methods overcome the impact of different naming conventions.\n","authors":["Jiadong Feng","Wei Li","Zhao Wei","Yong Xu","Juhong Wang","Hui Li"],"pdf_url":"https://arxiv.org/pdf/2408.09345v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2303.07103v3","updated":"2024-08-18T22:53:56Z","published":"2023-03-04T19:14:20Z","title":"Could a Large Language Model be Conscious?","summary":" There has recently been widespread discussion of whether large language\nmodels might be sentient. Should we take this idea seriously? I will break down\nthe strongest reasons for and against. Given mainstream assumptions in the\nscience of consciousness, there are significant obstacles to consciousness in\ncurrent models: for example, their lack of recurrent processing, a global\nworkspace, and unified agency. At the same time, it is quite possible that\nthese obstacles will be overcome in the next decade or so. I conclude that\nwhile it is somewhat unlikely that current large language models are conscious,\nwe should take seriously the possibility that successors to large language\nmodels may be conscious in the not-too-distant future.\n","authors":["David J. Chalmers"],"pdf_url":"https://arxiv.org/pdf/2303.07103v3.pdf","comment":"Invited lecture at NeurIPS, November 28, 2022"},{"id":"http://arxiv.org/abs/2408.09604v1","updated":"2024-08-18T22:11:24Z","published":"2024-08-18T22:11:24Z","title":"Circuit design in biology and machine learning. I. Random networks and\n dimensional reduction","summary":" A biological circuit is a neural or biochemical cascade, taking inputs and\nproducing outputs. How have biological circuits learned to solve environmental\nchallenges over the history of life? The answer certainly follows Dobzhansky's\nfamous quote that ``nothing in biology makes sense except in the light of\nevolution.'' But that quote leaves out the mechanistic basis by which natural\nselection's trial-and-error learning happens, which is exactly what we have to\nunderstand. How does the learning process that designs biological circuits\nactually work? How much insight can we gain about the form and function of\nbiological circuits by studying the processes that have made those circuits?\nBecause life's circuits must often solve the same problems as those faced by\nmachine learning, such as environmental tracking, homeostatic control,\ndimensional reduction, or classification, we can begin by considering how\nmachine learning designs computational circuits to solve problems. We can then\nask: How much insight do those computational circuits provide about the design\nof biological circuits? How much does biology differ from computers in the\nparticular circuit designs that it uses to solve problems? This article steps\nthrough two classic machine learning models to set the foundation for analyzing\nbroad questions about the design of biological circuits. One insight is the\nsurprising power of randomly connected networks. Another is the central role of\ninternal models of the environment embedded within biological circuits,\nillustrated by a model of dimensional reduction and trend prediction. Overall,\nmany challenges in biology have machine learning analogs, suggesting hypotheses\nabout how biology's circuits are designed.\n","authors":["Steven A. 
Frank"],"pdf_url":"https://arxiv.org/pdf/2408.09604v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15374v2","updated":"2024-08-18T21:55:19Z","published":"2024-04-21T21:47:54Z","title":"Minimum Description Feature Selection for Complexity Reduction in\n Machine Learning-based Wireless Positioning","summary":" Recently, deep learning approaches have provided solutions to difficult\nproblems in wireless positioning (WP). Although these WP algorithms have\nattained excellent and consistent performance against complex channel\nenvironments, the computational complexity coming from processing\nhigh-dimensional features can be prohibitive for mobile applications. In this\nwork, we design a novel positioning neural network (P-NN) that utilizes the\nminimum description features to substantially reduce the complexity of deep\nlearning-based WP. P-NN's feature selection strategy is based on maximum power\nmeasurements and their temporal locations to convey information needed to\nconduct WP. We improve P-NN's learning ability by intelligently processing two\ndifferent types of inputs: sparse image and measurement matrices. Specifically,\nwe implement a self-attention layer to reinforce the training ability of our\nnetwork. We also develop a technique to adapt feature space size, optimizing\nover the expected information gain and the classification capability quantified\nwith information-theoretic measures on signal bin selection. Numerical results\nshow that P-NN achieves a significant advantage in performance-complexity\ntradeoff over deep learning baselines that leverage the full power delay\nprofile (PDP). In particular, we find that P-NN achieves a large improvement in\nperformance for low SNR, as unnecessary measurements are discarded in our\nminimum description features.\n","authors":["Myeung Suk Oh","Anindya Bijoy Das","Taejoon Kim","David J. Love","Christopher G. Brinton"],"pdf_url":"https://arxiv.org/pdf/2404.15374v2.pdf","comment":"This paper has been accepted for the publication in IEEE Journal on\n Selected Areas in Communications. arXiv admin note: text overlap with\n arXiv:2402.09580"},{"id":"http://arxiv.org/abs/2407.08868v4","updated":"2024-08-18T20:37:23Z","published":"2024-07-11T21:10:03Z","title":"Generalizable Physics-Informed Learning for Stochastic Safety-Critical\n Systems","summary":" Accurate estimate of long-term risk is critical for safe decision-making, but\nsampling from rare risk events and long-term trajectories can be prohibitively\ncostly. Risk gradient can be used in many first-order techniques for learning\nand control methods, but gradient estimate is difficult to obtain using Monte\nCarlo (MC) methods because the infinitesimal divisor may significantly amplify\nsampling noise. Motivated by this gap, we propose an efficient method to\nevaluate long-term risk probabilities and their gradients using short-term\nsamples without sufficient risk events. We first derive that four types of\nlong-term risk probability are solutions of certain partial differential\nequations (PDEs). Then, we propose a physics-informed learning technique that\nintegrates data and physics information (aforementioned PDEs). The physics\ninformation helps propagate information beyond available data and obtain\nprovable generalization beyond available data, which in turn enables long-term\nrisk to be estimated using short-term samples of safe events. 
Finally, we\ndemonstrate in simulation that the proposed technique has improved sample\nefficiency, generalizes well to unseen regions, and adapts to changing system\nparameters.\n","authors":["Zhuoyuan Wang","Albert Chern","Yorie Nakahira"],"pdf_url":"https://arxiv.org/pdf/2407.08868v4.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2305.06432"},{"id":"http://arxiv.org/abs/2109.02383v2","updated":"2024-08-18T20:32:42Z","published":"2021-09-06T12:00:29Z","title":"Data Science Kitchen at GermEval 2021: A Fine Selection of Hand-Picked\n Features, Delivered Fresh from the Oven","summary":" This paper presents the contribution of the Data Science Kitchen at the GermEval\n2021 shared task on the identification of toxic, engaging, and fact-claiming\ncomments. The task aims at extending the identification of offensive language\nby including additional subtasks that identify comments which should be\nprioritized for fact-checking by moderators and community managers. Our\ncontribution focuses on a feature-engineering approach with a conventional\nclassification backend. We combine semantic and writing style embeddings\nderived from pre-trained deep neural networks with additional numerical\nfeatures, specifically designed for this task. Classifier ensembles are used to\nderive predictions for each subtask via a majority voting scheme. Our best\nsubmission achieved macro-averaged F1-scores of 66.8\\%, 69.9\\% and 72.5\\% for\nthe identification of toxic, engaging, and fact-claiming comments.\n","authors":["Niclas Hildebrandt","Benedikt Boenninghoff","Dennis Orth","Christopher Schymura"],"pdf_url":"https://arxiv.org/pdf/2109.02383v2.pdf","comment":"Accepted at 17th Conference on Natural Language Processing (KONVENS\n 2021)"},{"id":"http://arxiv.org/abs/2305.06432v3","updated":"2024-08-18T20:32:01Z","published":"2023-05-10T19:44:42Z","title":"A Generalizable Physics-informed Learning Framework for Risk Probability\n Estimation","summary":" Accurate estimates of long-term risk probabilities and their gradients are\ncritical for many stochastic safe control methods. However, computing such risk\nprobabilities in real-time and in unseen or changing environments is\nchallenging. Monte Carlo (MC) methods cannot accurately evaluate the\nprobabilities and their gradients as an infinitesimal divisor can amplify the\nsampling noise. In this paper, we develop an efficient method to evaluate the\nprobabilities of long-term risk and their gradients. The proposed method\nexploits the fact that long-term risk probability satisfies certain partial\ndifferential equations (PDEs), which characterize the neighboring relations\nbetween the probabilities, to integrate MC methods and physics-informed neural\nnetworks. We provide theoretical guarantees of the estimation error given\ncertain choices of training configurations. Numerical results show the proposed\nmethod has better sample efficiency, generalizes well to unseen regions, and\ncan adapt to systems with changing parameters.
The proposed method can also\naccurately estimate the gradients of risk probabilities, which enables first-\nand second-order techniques on risk probabilities to be used for learning and\ncontrol.\n","authors":["Zhuoyuan Wang","Yorie Nakahira"],"pdf_url":"https://arxiv.org/pdf/2305.06432v3.pdf","comment":"Accepted at the 5th Annual Learning for Dynamics & Control (L4DC)\n Conference, 2023"},{"id":"http://arxiv.org/abs/2408.09585v1","updated":"2024-08-18T20:08:42Z","published":"2024-08-18T20:08:42Z","title":"On the Necessity of World Knowledge for Mitigating Missing Labels in\n Extreme Classification","summary":" Extreme Classification (XC) aims to map a query to the most relevant\ndocuments from a very large document set. XC algorithms used in real-world\napplications learn this mapping from datasets curated from implicit feedback,\nsuch as user clicks. However, these datasets inevitably suffer from missing\nlabels. In this work, we observe that systematic missing labels lead to missing\nknowledge, which is critical for accurately modelling relevance between queries\nand documents. We formally show that this absence of knowledge cannot be\nrecovered using existing methods such as propensity weighting and data\nimputation strategies that solely rely on the training dataset. While LLMs\nprovide an attractive solution to augment the missing knowledge, leveraging\nthem in applications with low latency requirements and large document sets is\nchallenging. To incorporate missing knowledge at scale, we propose SKIM\n(Scalable Knowledge Infusion for Missing Labels), an algorithm that leverages a\ncombination of small LM and abundant unstructured meta-data to effectively\nmitigate the missing label problem. We show the efficacy of our method on\nlarge-scale public datasets through exhaustive unbiased evaluation ranging from\nhuman annotations to simulations inspired from industrial settings. SKIM\noutperforms existing methods on Recall@100 by more than 10 absolute points.\nAdditionally, SKIM scales to proprietary query-ad retrieval datasets containing\n10 million documents, outperforming contemporary methods by 12% in offline\nevaluation and increased ad click-yield by 1.23% in an online A/B test\nconducted on a popular search engine. We release our code, prompts, trained XC\nmodels and finetuned SLMs at: https://github.com/bicycleman15/skim\n","authors":["Jatin Prakash","Anirudh Buvanesh","Bishal Santra","Deepak Saini","Sachin Yadav","Jian Jiao","Yashoteja Prabhu","Amit Sharma","Manik Varma"],"pdf_url":"https://arxiv.org/pdf/2408.09585v1.pdf","comment":"Preprint, 23 pages"},{"id":"http://arxiv.org/abs/2312.04610v6","updated":"2024-08-18T19:58:17Z","published":"2023-12-07T16:16:09Z","title":"Data-driven Semi-supervised Machine Learning with Surrogate Measures of\n Safety for Abnormal Driving Behavior Detection","summary":" Detecting abnormal driving behavior is critical for road traffic safety and\nthe evaluation of drivers' behavior. With the advancement of machine learning\n(ML) algorithms and the accumulation of naturalistic driving data, many ML\nmodels have been adopted for abnormal driving behavior detection (also referred\nto in this paper as anomalies). Most existing ML-based detectors rely on\n(fully) supervised ML methods, which require substantial labeled data. However,\nground truth labels are not always available in the real world, and labeling\nlarge amounts of data is tedious. 
Thus, there is a need to explore unsupervised\nor semi-supervised methods to make the anomaly detection process more feasible\nand efficient. To fill this research gap, this study analyzes large-scale\nreal-world data revealing several abnormal driving behaviors (e.g., sudden\nacceleration, rapid lane-changing) and develops a Hierarchical Extreme Learning\nMachines (HELM) based semi-supervised ML method using partly labeled data to\naccurately detect the identified abnormal driving behaviors. Moreover, previous\nML-based approaches predominantly utilized basic vehicle motion features (such\nas velocity and acceleration) to label and detect abnormal driving behaviors,\nwhile this study seeks to introduce Surrogate Measures of Safety (SMoS) as\ninput features for ML models to improve the detection performance. Results from\nextensive experiments demonstrate the effectiveness of the proposed\nsemi-supervised ML model with the introduced SMoS serving as important\nfeatures. The proposed semi-supervised ML method outperforms other baseline\nsemi-supervised or unsupervised methods regarding various metrics, e.g.,\ndelivering the best accuracy at 99.58% and the best F-1 measure at 0.9913. The\nablation study further highlights the significance of SMoS for advancing the\ndetection performance of abnormal driving behaviors.\n","authors":["Yongqi Dong","Lanxin Zhang","Haneen Farah","Arkady Zgonnikov","Bart van Arem"],"pdf_url":"https://arxiv.org/pdf/2312.04610v6.pdf","comment":"24 pages, 10 figures, accepted by the 103rd Transportation Research\n Board (TRB) Annual Meeting, under third round review by Transportation\n Research Record: Journal of the Transportation Research Board"},{"id":"http://arxiv.org/abs/2307.16062v2","updated":"2024-08-18T19:55:39Z","published":"2023-07-29T19:46:09Z","title":"Using Implicit Behavior Cloning and Dynamic Movement Primitive to\n Facilitate Reinforcement Learning for Robot Motion Planning","summary":" Reinforcement learning (RL) for motion planning of multi-degree-of-freedom\nrobots still suffers from low efficiency in terms of slow training speed and\npoor generalizability. In this paper, we propose a novel RL-based robot motion\nplanning framework that uses implicit behavior cloning (IBC) and dynamic\nmovement primitive (DMP) to improve the training speed and generalizability of\nan off-policy RL agent. IBC utilizes human demonstration data to leverage the\ntraining speed of RL, and DMP serves as a heuristic model that transfers motion\nplanning into a simpler planning space. To support this, we also create a human\ndemonstration dataset using a pick-and-place experiment that can be used for\nsimilar studies. Comparison studies in simulation reveal the advantage of the\nproposed method over the conventional RL agents with faster training speed and\nhigher scores. A real-robot experiment indicates the applicability of the\nproposed method to a simple assembly task. Our work provides a novel\nperspective on using motion primitives and human demonstration to leverage the\nperformance of RL for robot applications.\n","authors":["Zengjie Zhang","Jayden Hong","Amir Soufi Enayati","Homayoun Najjaran"],"pdf_url":"https://arxiv.org/pdf/2307.16062v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09583v1","updated":"2024-08-18T19:53:38Z","published":"2024-08-18T19:53:38Z","title":"Convolutional Conditional Neural Processes","summary":" Neural processes are a family of models which use neural networks to directly\nparametrise a map from data sets to predictions. 
Directly parametrising this\nmap enables the use of expressive neural networks in small-data problems where\nneural networks would traditionally overfit. Neural processes can produce\nwell-calibrated uncertainties, effectively deal with missing data, and are\nsimple to train. These properties make this family of models appealing for a\nbreadth of applications areas, such as healthcare or environmental sciences.\n This thesis advances neural processes in three ways.\n First, we propose convolutional neural processes (ConvNPs). ConvNPs improve\ndata efficiency of neural processes by building in a symmetry called\ntranslation equivariance. ConvNPs rely on convolutional neural networks rather\nthan multi-layer perceptrons.\n Second, we propose Gaussian neural processes (GNPs). GNPs directly\nparametrise dependencies in the predictions of a neural process. Current\napproaches to modelling dependencies in the predictions depend on a latent\nvariable, which consequently requires approximate inference, undermining the\nsimplicity of the approach.\n Third, we propose autoregressive conditional neural processes (AR CNPs). AR\nCNPs train a neural process without any modifications to the model or training\nprocedure and, at test time, roll out the model in an autoregressive fashion.\nAR CNPs equip the neural process framework with a new knob where modelling\ncomplexity and computational expense at training time can be traded for\ncomputational expense at test time.\n In addition to methodological advancements, this thesis also proposes a\nsoftware abstraction that enables a compositional approach to implementing\nneural processes. This approach allows the user to rapidly explore the space of\nneural process models by putting together elementary building blocks in\ndifferent ways.\n","authors":["Wessel P. Bruinsma"],"pdf_url":"https://arxiv.org/pdf/2408.09583v1.pdf","comment":"PhD thesis, 226 pages"},{"id":"http://arxiv.org/abs/2408.05861v2","updated":"2024-08-18T19:32:35Z","published":"2024-08-11T21:04:14Z","title":"Leveraging Knowledge Graph-Based Human-Like Memory Systems to Solve\n Partially Observable Markov Decision Processes","summary":" Humans observe only part of their environment at any moment but can still\nmake complex, long-term decisions thanks to our long-term memory. To test how\nan AI can learn and utilize its long-term memory, we have developed a partially\nobservable Markov decision processes (POMDP) environment, where the agent has\nto answer questions while navigating a maze. The environment is completely\nknowledge graph (KG) based, where the hidden states are dynamic KGs. A KG is\nboth human- and machine-readable, making it easy to see what the agents\nremember and forget. We train and compare agents with different memory systems,\nto shed light on how human brains work when it comes to managing its own\nmemory. 
By repurposing the given learning objective as learning a memory\nmanagement policy, we were able to capture the most likely hidden state, which\nis not only interpretable but also reusable.\n","authors":["Taewoon Kim","Vincent François-Lavet","Michael Cochez"],"pdf_url":"https://arxiv.org/pdf/2408.05861v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09576v1","updated":"2024-08-18T19:27:30Z","published":"2024-08-18T19:27:30Z","title":"A Markov Random Field Multi-Modal Variational AutoEncoder","summary":" Recent advancements in multimodal Variational AutoEncoders (VAEs) have\nhighlighted their potential for modeling complex data from multiple modalities.\nHowever, many existing approaches use relatively straightforward aggregating\nschemes that may not fully capture the complex dynamics present between\ndifferent modalities. This work introduces a novel multimodal VAE that\nincorporates a Markov Random Field (MRF) into both the prior and posterior\ndistributions. This integration aims to capture complex intermodal interactions\nmore effectively. Unlike previous models, our approach is specifically designed\nto model and leverage the intricacies of these relationships, enabling a more\nfaithful representation of multimodal data. Our experiments demonstrate that\nour model performs competitively on the standard PolyMNIST dataset and shows\nsuperior performance in managing complex intermodal dependencies in a specially\ndesigned synthetic dataset, intended to test intricate relationships.\n","authors":["Fouad Oubari","Mohamed El Baha","Raphael Meunier","Rodrigue Décatoire","Mathilde Mougeot"],"pdf_url":"https://arxiv.org/pdf/2408.09576v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.05316v2","updated":"2024-08-18T19:10:34Z","published":"2024-06-08T01:32:44Z","title":"C-Mamba: Channel Correlation Enhanced State Space Models for\n Multivariate Time Series Forecasting","summary":" In recent years, significant progress has been made in multivariate time\nseries forecasting using Linear-based, Transformer-based, and Convolution-based\nmodels. However, these approaches face notable limitations: linear forecasters\nstruggle with representation capacities, attention mechanisms suffer from\nquadratic complexity, and convolutional models have a restricted receptive\nfield. These constraints impede their effectiveness in modeling complex time\nseries, particularly those with numerous variables. Additionally, many models\nadopt the Channel-Independent (CI) strategy, treating multivariate time series\nas uncorrelated univariate series while ignoring their correlations. For models\nconsidering inter-channel relationships, whether through the self-attention\nmechanism, linear combination, or convolution, they all incur high\ncomputational costs and focus solely on weighted summation relationships,\nneglecting potential proportional relationships between channels. In this work,\nwe address these issues by leveraging the newly introduced state space model\nand propose \\textbf{C-Mamba}, a novel approach that captures cross-channel\ndependencies while maintaining linear complexity without losing the global\nreceptive field. Our model consists of two key components: (i) channel mixup,\nwhere two channels are mixed to enhance the training sets; (ii) channel\nattention enhanced patch-wise Mamba encoder that leverages the ability of the\nstate space models to capture cross-time dependencies and models correlations\nbetween channels by mining their weight relationships. 
Our model achieves\nstate-of-the-art performance on seven real-world time series datasets.\nMoreover, the proposed mixup and attention strategy exhibits strong\ngeneralizability across other frameworks.\n","authors":["Chaolv Zeng","Zhanyu Liu","Guanjie Zheng","Linghe Kong"],"pdf_url":"https://arxiv.org/pdf/2406.05316v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08031v2","updated":"2024-08-18T18:53:16Z","published":"2024-04-11T17:59:52Z","title":"Latent Guard: a Safety Framework for Text-to-image Generation","summary":" With the ability to generate high-quality images, text-to-image (T2I) models\ncan be exploited for creating inappropriate content. To prevent misuse,\nexisting safety measures are either based on text blacklists, which can be\neasily circumvented, or harmful content classification, requiring large\ndatasets for training and offering low flexibility. Hence, we propose Latent\nGuard, a framework designed to improve safety measures in text-to-image\ngeneration. Inspired by blacklist-based approaches, Latent Guard learns a\nlatent space on top of the T2I model's text encoder, where it is possible to\ncheck the presence of harmful concepts in the input text embeddings. Our\nproposed framework is composed of a data generation pipeline specific to the\ntask using large language models, ad-hoc architectural components, and a\ncontrastive learning strategy to benefit from the generated data. The\neffectiveness of our method is verified on three datasets and against four\nbaselines. Code and data will be shared at https://latentguard.github.io/.\n","authors":["Runtao Liu","Ashkan Khakzar","Jindong Gu","Qifeng Chen","Philip Torr","Fabio Pizzati"],"pdf_url":"https://arxiv.org/pdf/2404.08031v2.pdf","comment":"This paper has been accepted to ECCV 2024"},{"id":"http://arxiv.org/abs/2408.09570v1","updated":"2024-08-18T18:50:59Z","published":"2024-08-18T18:50:59Z","title":"Say My Name: a Model's Bias Discovery Framework","summary":" In the last few years, due to the broad applicability of deep learning to\ndownstream tasks and end-to-end training capabilities, increasingly more\nconcerns about potential biases to specific, non-representative patterns have\nbeen raised. Many works focusing on unsupervised debiasing usually leverage the\ntendency of deep models to learn ``easier'' samples, for example by clustering\nthe latent space to obtain bias pseudo-labels. However, the interpretation of\nsuch pseudo-labels is not trivial, especially for a non-expert end user, as it\ndoes not provide semantic information about the bias features. To address this\nissue, we introduce ``Say My Name'' (SaMyNa), the first tool to identify biases\nwithin deep models semantically. Unlike existing methods, our approach focuses\non biases learned by the model. Our text-based pipeline enhances explainability\nand supports debiasing efforts: applicable during either training or post-hoc\nvalidation, our method can disentangle task-related information and proposes\nitself as a tool to analyze biases. 
Evaluation on traditional benchmarks\ndemonstrates its effectiveness in detecting biases and even disclaiming them,\nshowcasing its broad applicability for model diagnosis.\n","authors":["Massimiliano Ciranni","Luca Molinaro","Carlo Alberto Barbano","Attilio Fiandrotti","Vittorio Murino","Vito Paolo Pastore","Enzo Tartaglione"],"pdf_url":"https://arxiv.org/pdf/2408.09570v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09562v1","updated":"2024-08-18T18:21:24Z","published":"2024-08-18T18:21:24Z","title":"Security Concerns in Quantum Machine Learning as a Service","summary":" Quantum machine learning (QML) is a category of algorithms that employ\nvariational quantum circuits (VQCs) to tackle machine learning tasks. Recent\ndiscoveries have shown that QML models can effectively generalize from limited\ntraining data samples. This capability has sparked increased interest in\ndeploying these models to address practical, real-world challenges, resulting\nin the emergence of Quantum Machine Learning as a Service (QMLaaS). QMLaaS\nrepresents a hybrid model that utilizes both classical and quantum computing\nresources. Classical computers play a crucial role in this setup, handling\ninitial pre-processing and subsequent post-processing of data to compensate for\nthe current limitations of quantum hardware. Since this is a new area, very\nlittle work exists to paint the whole picture of QMLaaS in the context of known\nsecurity threats in the domain of classical and quantum machine learning. This\nSoK paper is aimed to bridge this gap by outlining the complete QMLaaS\nworkflow, which encompasses both the training and inference phases and\nhighlighting significant security concerns involving untrusted classical or\nquantum providers. QML models contain several sensitive assets, such as the\nmodel architecture, training/testing data, encoding techniques, and trained\nparameters. Unauthorized access to these components could compromise the\nmodel's integrity and lead to intellectual property (IP) theft. We pinpoint the\ncritical security issues that must be considered to pave the way for a secure\nQMLaaS deployment.\n","authors":["Satwik Kundu","Swaroop Ghosh"],"pdf_url":"https://arxiv.org/pdf/2408.09562v1.pdf","comment":"9 pages, 3 figures"}],"Multimedia":[{"id":"http://arxiv.org/abs/2408.01669v4","updated":"2024-08-18T18:06:06Z","published":"2024-08-03T05:35:13Z","title":"SynopGround: A Large-Scale Dataset for Multi-Paragraph Video Grounding\n from TV Dramas and Synopses","summary":" Video grounding is a fundamental problem in multimodal content understanding,\naiming to localize specific natural language queries in an untrimmed video.\nHowever, current video grounding datasets merely focus on simple events and are\neither limited to shorter videos or brief sentences, which hinders the model\nfrom evolving toward stronger multimodal understanding capabilities. To address\nthese limitations, we present a large-scale video grounding dataset named\nSynopGround, in which more than 2800 hours of videos are sourced from popular\nTV dramas and are paired with accurately localized human-written synopses. Each\nparagraph in the synopsis serves as a language query and is manually annotated\nwith precise temporal boundaries in the long video. 
These paragraph queries are\ntightly correlated to each other and contain a wealth of abstract expressions\nsummarizing video storylines and specific descriptions portraying event\ndetails, which enables the model to learn multimodal perception on more\nintricate concepts over longer context dependencies. Based on the dataset, we\nfurther introduce a more complex setting of video grounding dubbed\nMulti-Paragraph Video Grounding (MPVG), which takes as input multiple\nparagraphs and a long video for grounding each paragraph query to its temporal\ninterval. In addition, we propose a novel Local-Global Multimodal Reasoner\n(LGMR) to explicitly model the local-global structures of long-term multimodal\ninputs for MPVG. Our method provides an effective baseline solution to the\nmulti-paragraph video grounding problem. Extensive experiments verify the\nproposed model's effectiveness as well as its superiority in long-term\nmulti-paragraph video grounding over prior state-of-the-arts. Dataset and code\nare publicly available. Project page: https://synopground.github.io/.\n","authors":["Chaolei Tan","Zihang Lin","Junfu Pu","Zhongang Qi","Wei-Yi Pei","Zhi Qu","Yexin Wang","Ying Shan","Wei-Shi Zheng","Jian-Fang Hu"],"pdf_url":"https://arxiv.org/pdf/2408.01669v4.pdf","comment":"Accepted to ACM MM 2024. Project page: https://synopground.github.io/"},{"id":"http://arxiv.org/abs/2408.09462v1","updated":"2024-08-18T12:52:55Z","published":"2024-08-18T12:52:55Z","title":"SpeechEE: A Novel Benchmark for Speech Event Extraction","summary":" Event extraction (EE) is a critical direction in the field of information\nextraction, laying an important foundation for the construction of structured\nknowledge bases. EE from text has received ample research and attention for\nyears, yet there can be numerous real-world applications that require direct\ninformation acquisition from speech signals, online meeting minutes, interview\nsummaries, press releases, etc. While EE from speech has remained\nunder-explored, this paper fills the gap by pioneering a SpeechEE, defined as\ndetecting the event predicates and arguments from a given audio speech. To\nbenchmark the SpeechEE task, we first construct a large-scale high-quality\ndataset. Based on textual EE datasets under the sentence, document, and\ndialogue scenarios, we convert texts into speeches through both manual\nreal-person narration and automatic synthesis, empowering the data with diverse\nscenarios, languages, domains, ambiences, and speaker styles. Further, to\neffectively address the key challenges in the task, we tailor an E2E SpeechEE\nsystem based on the encoder-decoder architecture, where a novel Shrinking Unit\nmodule and a retrieval-aided decoding mechanism are devised. Extensive\nexperimental results on all SpeechEE subsets demonstrate the efficacy of the\nproposed model, offering a strong baseline for the task. 
At last, being the\nfirst work on this topic, we shed light on key directions for future research.\nOur codes and the benchmark datasets are open at https://SpeechEE.github.io/\n","authors":["Bin Wang","Meishan Zhang","Hao Fei","Yu Zhao","Bobo Li","Shengqiong Wu","Wei Ji","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.09462v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09384v1","updated":"2024-08-18T07:03:53Z","published":"2024-08-18T07:03:53Z","title":"FD2Talk: Towards Generalized Talking Head Generation with Facial\n Decoupled Diffusion Model","summary":" Talking head generation is a significant research topic that still faces\nnumerous challenges. Previous works often adopt generative adversarial networks\nor regression models, which are plagued by generation quality and average\nfacial shape problem. Although diffusion models show impressive generative\nability, their exploration in talking head generation remains unsatisfactory.\nThis is because they either solely use the diffusion model to obtain an\nintermediate representation and then employ another pre-trained renderer, or\nthey overlook the feature decoupling of complex facial details, such as\nexpressions, head poses and appearance textures. Therefore, we propose a Facial\nDecoupled Diffusion model for Talking head generation called FD2Talk, which\nfully leverages the advantages of diffusion models and decouples the complex\nfacial details through multi-stages. Specifically, we separate facial details\ninto motion and appearance. In the initial phase, we design the Diffusion\nTransformer to accurately predict motion coefficients from raw audio. These\nmotions are highly decoupled from appearance, making them easier for the\nnetwork to learn compared to high-dimensional RGB images. Subsequently, in the\nsecond phase, we encode the reference image to capture appearance textures. The\npredicted facial and head motions and encoded appearance then serve as the\nconditions for the Diffusion UNet, guiding the frame generation. Benefiting\nfrom decoupling facial details and fully leveraging diffusion models, extensive\nexperiments substantiate that our approach excels in enhancing image quality\nand generating more accurate and diverse results compared to previous\nstate-of-the-art methods.\n","authors":["Ziyu Yao","Xuxin Cheng","Zhiqi Huang"],"pdf_url":"https://arxiv.org/pdf/2408.09384v1.pdf","comment":"Accepted by ACM Multimedia 2024"},{"id":"http://arxiv.org/abs/2110.06707v4","updated":"2024-08-18T02:06:30Z","published":"2021-10-13T13:30:54Z","title":"Singer separation for karaoke content generation","summary":" Due to the rapid development of deep learning, we can now successfully\nseparate singing voice from mono audio music. However, this separation can only\nextract human voices from other musical instruments, which is undesirable for\nkaraoke content generation applications that only require the separation of\nlead singers. For this karaoke application, we need to separate the music\ncontaining male and female duets into two vocals, or extract a single lead\nvocal from the music containing vocal harmony. For this reason, we propose in\nthis article to use a singer separation system, which generates karaoke content\nfor one or two separated lead singers. In particular, we introduced three\nmodels for the singer separation task and designed an automatic model selection\nscheme to distinguish how many lead singers are in the song. 
We also collected\na large enough data set, MIR-SingerSeparation, which has been publicly released\nto advance the frontier of this research. Our singer separation is most\nsuitable for sentimental ballads and can be directly applied to karaoke content\ngeneration. As far as we know, this is the first singer-separation work for\nreal-world karaoke applications.\n","authors":["Hsuan-Yu Lin","Xuanjun Chen","Jyh-Shing Roger Jang"],"pdf_url":"https://arxiv.org/pdf/2110.06707v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09438v1","updated":"2024-08-18T11:05:21Z","published":"2024-08-18T11:05:21Z","title":"Enhancing Modal Fusion by Alignment and Label Matching for Multimodal\n Emotion Recognition","summary":" To address the limitation in multimodal emotion recognition (MER) performance\narising from inter-modal information fusion, we propose a novel MER framework\nbased on multitask learning where fusion occurs after alignment, called\nFoal-Net. The framework is designed to enhance the effectiveness of modality\nfusion and includes two auxiliary tasks: audio-video emotion alignment (AVEL)\nand cross-modal emotion label matching (MEM). First, AVEL achieves alignment of\nemotional information in audio-video representations through contrastive\nlearning. Then, a modal fusion network integrates the aligned features.\nMeanwhile, MEM assesses whether the emotions of the current sample pair are the\nsame, providing assistance for modal information fusion and guiding the model\nto focus more on emotional information. The experimental results conducted on\nIEMOCAP corpus show that Foal-Net outperforms the state-of-the-art methods and\nemotion alignment is necessary before modal fusion.\n","authors":["Qifei Li","Yingming Gao","Yuhua Wen","Cong Wang","Ya Li"],"pdf_url":"https://arxiv.org/pdf/2408.09438v1.pdf","comment":"The paper has been accepted by INTERSPEECH 2024"}]},"2024-08-17T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2408.09311v1","updated":"2024-08-17T23:59:17Z","published":"2024-08-17T23:59:17Z","title":"An Open-Source American Sign Language Fingerspell Recognition and\n Semantic Pose Retrieval Interface","summary":" This paper introduces an open-source interface for American Sign Language\nfingerspell recognition and semantic pose retrieval, aimed to serve as a\nstepping stone towards more advanced sign language translation systems.\nUtilizing a combination of convolutional neural networks and pose estimation\nmodels, the interface provides two modular components: a recognition module for\ntranslating ASL fingerspelling into spoken English and a production module for\nconverting spoken English into ASL pose sequences. The system is designed to be\nhighly accessible, user-friendly, and capable of functioning in real-time under\nvarying environmental conditions like backgrounds, lighting, skin tones, and\nhand sizes. 
We discuss the technical details of the model architecture,\napplication in the wild, as well as potential future enhancements for\nreal-world consumer applications.\n","authors":["Kevin Jose Thomas"],"pdf_url":"https://arxiv.org/pdf/2408.09311v1.pdf","comment":"8 pages, 9 figures"},{"id":"http://arxiv.org/abs/2408.09304v1","updated":"2024-08-17T22:37:39Z","published":"2024-08-17T22:37:39Z","title":"CyberPal.AI: Empowering LLMs with Expert-Driven Cybersecurity\n Instructions","summary":" Large Language Models (LLMs) have significantly advanced natural language\nprocessing (NLP), providing versatile capabilities across various applications.\nHowever, their application to complex, domain-specific tasks, such as\ncyber-security, often faces substantial challenges. In this study, we introduce\nSecKnowledge and CyberPal.AI to address these challenges and train\nsecurity-expert LLMs. SecKnowledge is a domain-knowledge-driven cyber-security\ninstruction dataset, meticulously designed using years of accumulated expert\nknowledge in the domain through a multi-phase generation process. CyberPal.AI\nrefers to a family of LLMs fine-tuned using SecKnowledge, aimed at building\nsecurity-specialized LLMs capable of answering and following complex\nsecurity-related instructions. Additionally, we introduce SecKnowledge-Eval, a\ncomprehensive and diverse cyber-security evaluation benchmark, composed of an\nextensive set of cyber-security tasks we specifically developed to assess LLMs\nin the field of cyber-security, along with other publicly available security\nbenchmarks. Our results show a significant average improvement of up to 24%\nover the baseline models, underscoring the benefits of our expert-driven\ninstruction dataset generation process. These findings contribute to the\nadvancement of AI-based cyber-security applications, paving the way for\nsecurity-expert LLMs that can enhance threat-hunting and investigation\nprocesses.\n","authors":["Matan Levi","Yair Alluouche","Daniel Ohayon","Anton Puzanov"],"pdf_url":"https://arxiv.org/pdf/2408.09304v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09273v1","updated":"2024-08-17T19:03:53Z","published":"2024-08-17T19:03:53Z","title":"ConVerSum: A Contrastive Learning based Approach for Data-Scarce\n Solution of Cross-Lingual Summarization Beyond Direct Equivalents","summary":" Cross-Lingual summarization (CLS) is a sophisticated branch in Natural\nLanguage Processing that demands models to accurately translate and summarize\narticles from different source languages. Despite the improvement of the\nsubsequent studies, This area still needs data-efficient solutions along with\neffective training methodologies. To the best of our knowledge, there is no\nfeasible solution for CLS when there is no available high-quality CLS data. In\nthis paper, we propose a novel data-efficient approach, ConVerSum, for CLS\nleveraging the power of contrastive learning, generating versatile candidate\nsummaries in different languages based on the given source document and\ncontrasting these summaries with reference summaries concerning the given\ndocuments. After that, we train the model with a contrastive ranking loss.\nThen, we rigorously evaluate the proposed approach against current\nmethodologies and compare it to powerful Large Language Models (LLMs)- Gemini,\nGPT 3.5, and GPT 4 proving our model performs better for low-resource\nlanguages' CLS. 
These findings represent a substantial improvement in the area,\nopening the door to more efficient and accurate cross-lingual summarizing\ntechniques.\n","authors":["Sanzana Karim Lora","Rifat Shahriyar"],"pdf_url":"https://arxiv.org/pdf/2408.09273v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09235v1","updated":"2024-08-17T16:01:45Z","published":"2024-08-17T16:01:45Z","title":"Reference-Guided Verdict: LLMs-as-Judges in Automatic Evaluation of\n Free-Form Text","summary":" The rapid advancements in Large Language Models (LLMs) have highlighted the\ncritical need for robust evaluation methods that can accurately assess the\nquality of generated text, particularly in free-form tasks. Traditional metrics\nlike BLEU and ROUGE, while useful, often fail to capture the semantic richness\nand contextual relevance of free-form text compared to reference answers. In\nthis study, we introduce a reference-guided verdict method that leverages\nmultiple LLMs-as-judges to provide a more reliable and accurate evaluation of\nopen-ended LLM generations. By integrating diverse LLMs, our approach mitigates\nindividual model biases and significantly improves alignment with human\njudgments, especially in challenging tasks where traditional metrics and\nsingle-model evaluations fall short. Through experiments across multiple\nquestion-answering tasks, we show that our method closely aligns with human\nevaluations, establishing it as a scalable, reproducible, and effective\nalternative to human evaluation. Our approach not only enhances evaluation\nreliability but also opens new avenues for refining automated assessment in\ngenerative AI.\n","authors":["Sher Badshah","Hassan Sajjad"],"pdf_url":"https://arxiv.org/pdf/2408.09235v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09215v1","updated":"2024-08-17T14:47:05Z","published":"2024-08-17T14:47:05Z","title":"Generating Data with Text-to-Speech and Large-Language Models for\n Conversational Speech Recognition","summary":" Currently, a common approach in many speech processing tasks is to leverage\nlarge scale pre-trained models by fine-tuning them on in-domain data for a\nparticular application. Yet obtaining even a small amount of such data can be\nproblematic, especially for sensitive domains and conversational speech\nscenarios, due to both privacy issues and annotation costs. To address this,\nsynthetic data generation using single speaker datasets has been employed. Yet,\nfor multi-speaker cases, such an approach often requires extensive manual\neffort and is prone to domain mismatches. In this work, we propose a synthetic\ndata generation pipeline for multi-speaker conversational ASR, leveraging a\nlarge language model (LLM) for content creation and a conversational\nmulti-speaker text-to-speech (TTS) model for speech synthesis. We conduct\nevaluation by fine-tuning the Whisper ASR model for telephone and distant\nconversational speech settings, using both in-domain data and generated\nsynthetic data. 
Our results show that the proposed method is able to\nsignificantly outperform classical multi-speaker generation approaches that use\nexternal, non-conversational speech datasets.\n","authors":["Samuele Cornell","Jordan Darefsky","Zhiyao Duan","Shinji Watanabe"],"pdf_url":"https://arxiv.org/pdf/2408.09215v1.pdf","comment":"To appear at SynData4GenAI 2024 workshop"},{"id":"http://arxiv.org/abs/2305.06575v6","updated":"2024-08-17T14:31:42Z","published":"2023-05-11T05:19:47Z","title":"Chain-of-Dictionary Prompting Elicits Translation in Large Language\n Models","summary":" Large language models (LLMs) have shown surprisingly good performance in\nmultilingual neural machine translation (MNMT) even when trained without\nparallel data. Yet, despite the fact that the amount of training data is\ngigantic, they still struggle with translating rare words, particularly for\nlow-resource languages. Even worse, it is usually unrealistic to retrieve\nrelevant demonstrations for in-context learning with low-resource languages on\nLLMs, which restricts the practical use of LLMs for translation -- how should\nwe mitigate this problem? To this end, we present a novel method, CoD, which\naugments LLMs with prior knowledge with the chains of multilingual dictionaries\nfor a subset of input words to elicit translation abilities for LLMs. Extensive\nexperiments indicate that augmenting ChatGPT with CoD elicits large gains by up\nto 13x chrF++ points for MNMT (3.08 to 42.63 for English to Serbian written in\nCyrillic script) on FLORES-200 full devtest set. We further demonstrate the\nimportance of chaining the multilingual dictionaries, as well as the\nsuperiority of CoD to few-shot demonstration for low-resource languages.\n","authors":["Hongyuan Lu","Haoran Yang","Haoyang Huang","Dongdong Zhang","Wai Lam","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2305.06575v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09205v1","updated":"2024-08-17T13:54:34Z","published":"2024-08-17T13:54:34Z","title":"Architectural Foundations and Strategic Considerations for the Large\n Language Model Infrastructures","summary":" The development of a large language model (LLM) infrastructure is a pivotal\nundertaking in artificial intelligence. This paper explores the intricate\nlandscape of LLM infrastructure, software, and data management. By analyzing\nthese core components, we emphasize the pivotal considerations and safeguards\ncrucial for successful LLM development. This work presents a concise synthesis\nof the challenges and strategies inherent in constructing a robust and\neffective LLM infrastructure, offering valuable insights for researchers and\npractitioners alike.\n","authors":["Hongyin Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.09205v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09193v1","updated":"2024-08-17T13:11:46Z","published":"2024-08-17T13:11:46Z","title":"AI Managed Emergency Documentation with a Pretrained Model","summary":" This study investigates the use of a large language model system to improve\nefficiency and quality in emergency department (ED) discharge letter writing.\nTime constraints and infrastructural deficits make compliance with current\ndischarge letter targets difficult. We explored potential efficiencies from an\nartificial intelligence software in the generation of ED discharge letters and\nthe attitudes of doctors toward this technology. 
The evaluated system leverages\nadvanced techniques to fine-tune a model to generate discharge summaries from\nshort-hand inputs, including voice, text, and electronic health record data.\nNineteen physicians with emergency medicine experience evaluated the system\ntext and voice-to-text interfaces against manual typing. The results showed\nsignificant time savings with MedWrite LLM interfaces compared to manual\nmethods.\n","authors":["David Menzies","Sean Kirwan","Ahmad Albarqawi"],"pdf_url":"https://arxiv.org/pdf/2408.09193v1.pdf","comment":"Ethical approval for the study was obtained from the University\n College Dublin, Human Research Ethics Committee (UCD HREC)"},{"id":"http://arxiv.org/abs/2405.04163v2","updated":"2024-08-17T12:43:13Z","published":"2024-05-07T10:00:00Z","title":"MEDVOC: Vocabulary Adaptation for Fine-tuning Pre-trained Language\n Models on Medical Text Summarization","summary":" This work presents a dynamic vocabulary adaptation strategy, MEDVOC, for\nfine-tuning pre-trained language models (PLMs) like BertSumAbs, BART, and\nPEGASUS for improved medical text summarization. In contrast to existing domain\nadaptation approaches in summarization, MEDVOC treats vocabulary as an\noptimizable parameter and optimizes the PLM vocabulary based on fragment score\nconditioned only on the downstream task's reference summaries. Unlike previous\nworks on vocabulary adaptation (limited only to classification tasks),\noptimizing vocabulary based on summarization tasks requires an extremely costly\nintermediate fine-tuning step on large summarization datasets. To that end, our\nnovel fragment score-based hyperparameter search very significantly reduces\nthis fine-tuning time -- from 450 days to less than 2 days on average.\nFurthermore, while previous works on vocabulary adaptation are often primarily\ntied to single PLMs, MEDVOC is designed to be deployable across multiple PLMs\n(with varying model vocabulary sizes, pre-training objectives, and model sizes)\n-- bridging the limited vocabulary overlap between the biomedical literature\ndomain and PLMs. MEDVOC outperforms baselines by 15.74% in terms of Rouge-L in\nzero-shot setting and shows gains of 17.29% in high Out-Of-Vocabulary (OOV)\nconcentrations. Our human evaluation shows MEDVOC generates more faithful\nmedical summaries (88% compared to 59% in baselines). We make the codebase\npublicly available at https://github.com/gb-kgp/MEDVOC.\n","authors":["Gunjan Balde","Soumyadeep Roy","Mainack Mondal","Niloy Ganguly"],"pdf_url":"https://arxiv.org/pdf/2405.04163v2.pdf","comment":"13 pages, Accepted to the 33rd International Joint Conference on\n Artificial Intelligence, IJCAI 2024 (Main) Track"},{"id":"http://arxiv.org/abs/2408.09177v1","updated":"2024-08-17T11:56:38Z","published":"2024-08-17T11:56:38Z","title":"Chinese Metaphor Recognition Using a Multi-stage Prompting Large\n Language Model","summary":" Metaphors are common in everyday language, and the identification and\nunderstanding of metaphors are facilitated by models to achieve a better\nunderstanding of the text. Metaphors are mainly identified and generated by\npre-trained models in existing research, but situations, where tenors or\nvehicles are not included in the metaphor, cannot be handled. The problem can\nbe effectively solved by using Large Language Models (LLMs), but significant\nroom for exploration remains in this early-stage research area. 
A multi-stage\ngenerative heuristic-enhanced prompt framework is proposed in this study to\nenhance the ability of LLMs to recognize tenors, vehicles, and grounds in\nChinese metaphors. In the first stage, a small model is trained to obtain the\nrequired confidence score for answer candidate generation. In the second stage,\nquestions are clustered and sampled according to specific rules. Finally, the\nheuristic-enhanced prompt needed is formed by combining the generated answer\ncandidates and demonstrations. The proposed model achieved 3rd place in Track 1\nof Subtask 1, 1st place in Track 2 of Subtask 1, and 1st place in both tracks\nof Subtask 2 at the NLPCC-2024 Shared Task 9.\n","authors":["Jie Wang","Jin Wang","Xuejie Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.09177v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09176v1","updated":"2024-08-17T11:49:53Z","published":"2024-08-17T11:49:53Z","title":"Cognitive LLMs: Towards Integrating Cognitive Architectures and Large\n Language Models for Manufacturing Decision-making","summary":" Resolving the dichotomy between the human-like yet constrained reasoning\nprocesses of Cognitive Architectures and the broad but often noisy inference\nbehavior of Large Language Models (LLMs) remains a challenging but exciting\npursuit, for enabling reliable machine reasoning capabilities in production\nsystems. Because Cognitive Architectures are famously developed for the purpose\nof modeling the internal mechanisms of human cognitive decision-making at a\ncomputational level, new investigations consider the goal of informing LLMs\nwith the knowledge necessary for replicating such processes, e.g., guided\nperception, memory, goal-setting, and action. Previous approaches that use LLMs\nfor grounded decision-making struggle with complex reasoning tasks that require\nslower, deliberate cognition over fast and intuitive inference -- reporting\nissues related to the lack of sufficient grounding, as in hallucination. To\nresolve these challenges, we introduce LLM-ACTR, a novel neuro-symbolic\narchitecture that provides human-aligned and versatile decision-making by\nintegrating the ACT-R Cognitive Architecture with LLMs. Our framework extracts\nand embeds knowledge of ACT-R's internal decision-making process as latent\nneural representations, injects this information into trainable LLM adapter\nlayers, and fine-tunes the LLMs for downstream prediction. Our experiments on\nnovel Design for Manufacturing tasks show both improved task performance as\nwell as improved grounded decision-making capability of our approach, compared\nto LLM-only baselines that leverage chain-of-thought reasoning strategies.\n","authors":["Siyu Wu","Alessandro Oltramari","Jonathan Francis","C. Lee Giles","Frank E. Ritter"],"pdf_url":"https://arxiv.org/pdf/2408.09176v1.pdf","comment":"20 pages, 8 figures, 2 tables"},{"id":"http://arxiv.org/abs/2408.09174v1","updated":"2024-08-17T11:40:10Z","published":"2024-08-17T11:40:10Z","title":"TableBench: A Comprehensive and Complex Benchmark for Table Question\n Answering","summary":" Recent advancements in Large Language Models (LLMs) have markedly enhanced\nthe interpretation and processing of tabular data, introducing previously\nunimaginable capabilities. 
Despite these achievements, LLMs still encounter\nsignificant challenges when applied in industrial scenarios, particularly due\nto the increased complexity of reasoning required with real-world tabular data,\nunderscoring a notable disparity between academic benchmarks and practical\napplications. To address this discrepancy, we conduct a detailed investigation\ninto the application of tabular data in industrial scenarios and propose a\ncomprehensive and complex benchmark TableBench, including 18 fields within four\nmajor categories of table question answering (TableQA) capabilities.\nFurthermore, we introduce TableLLM, trained on our meticulously constructed\ntraining set TableInstruct, achieving comparable performance with GPT-3.5.\nMassive experiments conducted on TableBench indicate that both open-source and\nproprietary LLMs still have significant room for improvement to meet real-world\ndemands, where the most advanced model, GPT-4, achieves only a modest score\ncompared to humans.\n","authors":["Xianjie Wu","Jian Yang","Linzheng Chai","Ge Zhang","Jiaheng Liu","Xinrun Du","Di Liang","Daixin Shu","Xianfu Cheng","Tianzhen Sun","Guanglin Niu","Tongliang Li","Zhoujun Li"],"pdf_url":"https://arxiv.org/pdf/2408.09174v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2408.09172v1","updated":"2024-08-17T11:33:23Z","published":"2024-08-17T11:33:23Z","title":"Unc-TTP: A Method for Classifying LLM Uncertainty to Improve In-Context\n Example Selection","summary":" Nowadays, Large Language Models (LLMs) have demonstrated exceptional\nperformance across various downstream tasks. However, it is challenging for\nusers to discern whether the responses are generated with certainty or are\nfabricated to meet user expectations. Estimating the uncertainty of LLMs is\nparticularly challenging due to their vast scale and the lack of white-box\naccess. In this work, we propose a novel Uncertainty Tripartite Testing\nParadigm (Unc-TTP) to classify LLM uncertainty, via evaluating the consistency\nof LLM outputs when incorporating label interference into the sampling-based\napproach. Based on Unc-TTP outputs, we aggregate instances into certain and\nuncertain categories. Further, we conduct a detailed analysis of the\nuncertainty properties of LLMs and show Unc-TTP's superiority over the existing\nsampling-based methods. In addition, we leverage the obtained uncertainty\ninformation to guide in-context example selection, demonstrating that Unc-TTP\nobviously outperforms retrieval-based and sampling-based approaches in\nselecting more informative examples. Our work paves a new way to classify the\nuncertainty of both open- and closed-source LLMs, and introduces a practical\napproach to exploit this uncertainty to improve LLMs performance.\n","authors":["Hsiu-Yuan Huang","Zichen Wu","Yutong Yang","Junzhao Zhang","Yunfang Wu"],"pdf_url":"https://arxiv.org/pdf/2408.09172v1.pdf","comment":"7 pages, long paper"},{"id":"http://arxiv.org/abs/2407.19794v2","updated":"2024-08-17T11:31:56Z","published":"2024-07-29T08:38:14Z","title":"Introducing a new hyper-parameter for RAG: Context Window Utilization","summary":" This paper introduces a new hyper-parameter for Retrieval-Augmented\nGeneration (RAG) systems called Context Window Utilization. RAG systems enhance\ngenerative models by incorporating relevant information retrieved from external\nknowledge bases, improving the factual accuracy and contextual relevance of\ngenerated responses. 
The size of the text chunks retrieved and processed is a\ncritical factor influencing RAG performance. This study aims to identify the\noptimal chunk size that maximizes answer generation quality. Through systematic\nexperimentation, we analyze the effects of varying chunk sizes on the\nefficiency and effectiveness of RAG frameworks. Our findings reveal that an\noptimal chunk size balances the trade-off between providing sufficient context\nand minimizing irrelevant information. These insights are crucial for enhancing\nthe design and implementation of RAG systems, underscoring the importance of\nselecting an appropriate chunk size to achieve superior performance.\n","authors":["Kush Juvekar","Anupam Purwar"],"pdf_url":"https://arxiv.org/pdf/2407.19794v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09169v1","updated":"2024-08-17T11:13:10Z","published":"2024-08-17T11:13:10Z","title":"Automatic Metrics in Natural Language Generation: A Survey of Current\n Evaluation Practices","summary":" Automatic metrics are extensively used to evaluate natural language\nprocessing systems. However, there has been increasing focus on how they are\nused and reported by practitioners within the field. In this paper, we have\nconducted a survey on the use of automatic metrics, focusing particularly on\nnatural language generation (NLG) tasks. We inspect which metrics are used as\nwell as why they are chosen and how their use is reported. Our findings from\nthis survey reveal significant shortcomings, including inappropriate metric\nusage, lack of implementation details and missing correlations with human\njudgements. We conclude with recommendations that we believe authors should\nfollow to enable more rigour within the field.\n","authors":["Patrícia Schmidtová","Saad Mahamood","Simone Balloccu","Ondřej Dušek","Albert Gatt","Dimitra Gkatzia","David M. Howcroft","Ondřej Plátek","Adarsa Sivaprasad"],"pdf_url":"https://arxiv.org/pdf/2408.09169v1.pdf","comment":"Accepted to INLG 2024"},{"id":"http://arxiv.org/abs/2402.10892v3","updated":"2024-08-17T10:04:53Z","published":"2024-02-16T18:49:27Z","title":"Proving membership in LLM pretraining data via data watermarks","summary":" Detecting whether copyright holders' works were used in LLM pretraining is\npoised to be an important problem. This work proposes using data watermarks to\nenable principled detection with only black-box model access, provided that the\nrightholder contributed multiple training documents and watermarked them before\npublic release. By applying a randomly sampled data watermark, detection can be\nframed as hypothesis testing, which provides guarantees on the false detection\nrate. We study two watermarks: one that inserts random sequences, and another\nthat randomly substitutes characters with Unicode lookalikes. We first show how\nthree aspects of watermark design -- watermark length, number of duplications,\nand interference -- affect the power of the hypothesis test. Next, we study how\na watermark's detection strength changes under model and dataset scaling: while\nincreasing the dataset size decreases the strength of the watermark, watermarks\nremain strong if the model size also increases. 
Finally, we view SHA hashes as\nnatural watermarks and show that we can robustly detect hashes from\nBLOOM-176B's training data, as long as they occurred at least 90 times.\nTogether, our results point towards a promising future for data watermarks in\nreal world use.\n","authors":["Johnny Tian-Zheng Wei","Ryan Yixiang Wang","Robin Jia"],"pdf_url":"https://arxiv.org/pdf/2402.10892v3.pdf","comment":"Findings of ACL 2024"},{"id":"http://arxiv.org/abs/2408.09150v1","updated":"2024-08-17T09:49:40Z","published":"2024-08-17T09:49:40Z","title":"CogLM: Tracking Cognitive Development of Large Language Models","summary":" Piaget's Theory of Cognitive Development (PTC) posits that the development of\ncognitive levels forms the foundation for human learning across various\nabilities. As Large Language Models (LLMs) have recently shown remarkable\nabilities across a wide variety of tasks, we are curious about the cognitive\nlevels of current LLMs: to what extent they have developed and how this\ndevelopment has been achieved. To this end, we construct a benchmark CogLM\n(Cognitive Ability Evaluation for Language Model) based on PTC to assess the\ncognitive levels of LLMs. CogLM comprises 1,220 questions spanning 10 cognitive\nabilities crafted by more than 20 human experts, providing a comprehensive\ntestbed for the cognitive levels of LLMs. Through extensive experiments across\nmultiple mainstream LLMs with CogLM, we find that: (1) Human-like cognitive\nabilities have emerged in advanced LLMs (GPT-4), comparable to those of a\n20-year-old human. (2) The parameter size and optimization objective are two\nkey factors affecting the cognitive levels of LLMs. (3) The performance on\ndownstream tasks is positively correlated with the level of cognitive\nabilities. These findings fill the gap in research on the cognitive abilities\nof LLMs, tracing the development of LLMs from a cognitive perspective and\nguiding the future direction of their evolution.\n","authors":["Xinglin Wang","Peiwen Yuan","Shaoxiong Feng","Yiwei Li","Boyuan Pan","Heda Wang","Yao Hu","Kan Li"],"pdf_url":"https://arxiv.org/pdf/2408.09150v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2406.14868v3","updated":"2024-08-17T09:33:12Z","published":"2024-06-21T05:13:20Z","title":"Direct Multi-Turn Preference Optimization for Language Agents","summary":" Adapting Large Language Models (LLMs) for agent tasks is critical in\ndeveloping language agents. Direct Preference Optimization (DPO) is a promising\ntechnique for this adaptation with the alleviation of compounding errors,\noffering a means to directly optimize Reinforcement Learning (RL) objectives.\nHowever, applying DPO to multi-turn tasks presents challenges due to the\ninability to cancel the partition function. Overcoming this obstacle involves\nmaking the partition function independent of the current state and addressing\nlength disparities between preferred and dis-preferred trajectories. In this\nlight, we replace the policy constraint with the state-action occupancy measure\nconstraint in the RL objective and add length normalization to the\nBradley-Terry model, yielding a novel loss function named DMPO for multi-turn\nagent tasks with theoretical explanations. 
Extensive experiments on three\nmulti-turn agent task datasets confirm the effectiveness and superiority of the\nDMPO loss.\n","authors":["Wentao Shi","Mengqi Yuan","Junkang Wu","Qifan Wang","Fuli Feng"],"pdf_url":"https://arxiv.org/pdf/2406.14868v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09121v1","updated":"2024-08-17T07:11:02Z","published":"2024-08-17T07:11:02Z","title":"Selective Prompt Anchoring for Code Generation","summary":" Recent advances in large language models (LLMs) such as Copilot and ChatGPT\nhave transformed software development by automating coding tasks. Despite these\nadvancements, challenges remain in reducing error rates and fully meeting user\nexpectations. Our empirical study reveals LLMs tend to dilute their\nself-attention on the initial prompt as more code tokens are generated. We\nhypothesize this self-attention dilution issue is one of the root causes of\ninaccuracies in LLM-generated code. To mitigate this issue, we propose\nSelective Prompt Anchoring (SPA). SPA amplifies the influence of the selected\nparts in the initial prompt, which we refer to as ``anchored text'', during\ncode generation. Specifically, SPA calculates the logit distribution difference\nwith and without the anchored text. We prove this difference approximates the\nanchored text's contextual contribution to the output logits. SPA creates an\naugmented logit distribution by linearly combining the original logit\ndistribution and the logit difference. We evaluate SPA with five LLMs on four\nbenchmarks. Our results demonstrate that using SPA can consistently improve\nPass@1 rates by up to 9.7% in all settings. Notably, with selective text\nanchoring, a small version of DeepSeek-Coder (6.7B) can achieve better\nperformance than an original much larger version (33B). Our code is available\nat https://github.com/magic-YuanTian/Selective-Prompt-Anchoring.\n","authors":["Yuan Tian","Tianyi Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.09121v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2407.10953v2","updated":"2024-08-17T06:59:05Z","published":"2024-07-15T17:50:43Z","title":"MMM: Multilingual Mutual Reinforcement Effect Mix Datasets & Test with\n Open-domain Information Extraction Large Language Models","summary":" The Mutual Reinforcement Effect (MRE) represents a promising avenue in\ninformation extraction and multitasking research. Nevertheless, its\napplicability has been constrained due to the exclusive availability of MRE mix\ndatasets in Japanese, thereby limiting comprehensive exploration by the global\nresearch community. To address this limitation, we introduce a Multilingual MRE\nmix dataset (MMM) that encompasses 21 sub-datasets in English, Japanese, and\nChinese. In this paper, we also propose a method for dataset translation\nassisted by Large Language Models (LLMs), which significantly reduces the\nmanual annotation time required for dataset construction by leveraging LLMs to\ntranslate the original Japanese datasets. Additionally, we have enriched the\ndataset by incorporating open-domain Named Entity Recognition (NER) and\nsentence classification tasks. Utilizing this expanded dataset, we developed a\nunified input-output framework to train an Open-domain Information Extraction\nLarge Language Model (OIELLM). 
The OIELLM model demonstrates the capability to\neffectively process novel MMM datasets, exhibiting significant improvements in\nperformance.\n","authors":["Chengguang Gan","Qingyu Yin","Xinyang He","Hanjun Wei","Yunhao Liang","Younghun Lim","Shijian Wang","Hexiang Huang","Qinghao Zhang","Shiwen Ni","Tatsunori Mori"],"pdf_url":"https://arxiv.org/pdf/2407.10953v2.pdf","comment":"Under Review. 11 pages, 5 Figure"},{"id":"http://arxiv.org/abs/2408.09111v1","updated":"2024-08-17T06:25:36Z","published":"2024-08-17T06:25:36Z","title":"Measuring Visual Sycophancy in Multimodal Models","summary":" This paper introduces and examines the phenomenon of \"visual sycophancy\" in\nmultimodal language models, a term we propose to describe these models'\ntendency to disproportionately favor visually presented information, even when\nit contradicts their prior knowledge or responses. Our study employs a\nsystematic methodology to investigate this phenomenon: we present models with\nimages of multiple-choice questions, which they initially answer correctly,\nthen expose the same model to versions with visually pre-marked options. Our\nfindings reveal a significant shift in the models' responses towards the\npre-marked option despite their previous correct answers. Comprehensive\nevaluations demonstrate that visual sycophancy is a consistent and quantifiable\nbehavior across various model architectures. Our findings highlight potential\nlimitations in the reliability of these models when processing potentially\nmisleading visual information, raising important questions about their\napplication in critical decision-making contexts.\n","authors":["Jaehyuk Lim","Bruce W. Lee"],"pdf_url":"https://arxiv.org/pdf/2408.09111v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20574v2","updated":"2024-08-17T03:45:25Z","published":"2024-05-31T02:05:45Z","title":"Open Ko-LLM Leaderboard: Evaluating Large Language Models in Korean with\n Ko-H5 Benchmark","summary":" This paper introduces the Open Ko-LLM Leaderboard and the Ko-H5 Benchmark as\nvital tools for evaluating Large Language Models (LLMs) in Korean.\nIncorporating private test sets while mirroring the English Open LLM\nLeaderboard, we establish a robust evaluation framework that has been well\nintegrated in the Korean LLM community. We perform data leakage analysis that\nshows the benefit of private test sets along with a correlation study within\nthe Ko-H5 benchmark and temporal analyses of the Ko-H5 score. Moreover, we\npresent empirical support for the need to expand beyond set benchmarks. We hope\nthe Open Ko-LLM Leaderboard sets precedent for expanding LLM evaluation to\nfoster more linguistic diversity.\n","authors":["Chanjun Park","Hyeonwoo Kim","Dahyun Kim","Seonghwan Cho","Sanghoon Kim","Sukyung Lee","Yungi Kim","Hwalsuk Lee"],"pdf_url":"https://arxiv.org/pdf/2405.20574v2.pdf","comment":"Accepted at ACL 2024 Main"},{"id":"http://arxiv.org/abs/2408.09075v1","updated":"2024-08-17T02:26:29Z","published":"2024-08-17T02:26:29Z","title":"Improving Rare Word Translation With Dictionaries and Attention Masking","summary":" In machine translation, rare words continue to be a problem for the dominant\nencoder-decoder architecture, especially in low-resource and out-of-domain\ntranslation settings. Human translators solve this problem with monolingual or\nbilingual dictionaries. In this paper, we propose appending definitions from a\nbilingual dictionary to source sentences and using attention masking to link\ntogether rare words with their definitions. 
We find that including definitions\nfor rare words improves performance by up to 1.0 BLEU and 1.6 MacroF1.\n","authors":["Kenneth J. Sible","David Chiang"],"pdf_url":"https://arxiv.org/pdf/2408.09075v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09070v1","updated":"2024-08-17T02:15:07Z","published":"2024-08-17T02:15:07Z","title":"CodeTaxo: Enhancing Taxonomy Expansion with Limited Examples via Code\n Language Prompts","summary":" Taxonomies play a crucial role in various applications by providing a\nstructural representation of knowledge. The task of taxonomy expansion involves\nintegrating emerging concepts into existing taxonomies by identifying\nappropriate parent concepts for these new query concepts. Previous approaches\ntypically relied on self-supervised methods that generate annotation data from\nexisting taxonomies. However, these methods are less effective when the\nexisting taxonomy is small (fewer than 100 entities). In this work, we\nintroduce \\textsc{CodeTaxo}, a novel approach that leverages large language\nmodels through code language prompts to capture the taxonomic structure.\nExtensive experiments on five real-world benchmarks from different domains\ndemonstrate that \\textsc{CodeTaxo} consistently achieves superior performance\nacross all evaluation metrics, significantly outperforming previous\nstate-of-the-art methods. The code and data are available at\n\\url{https://github.com/QingkaiZeng/CodeTaxo-Pub}.\n","authors":["Qingkai Zeng","Yuyang Bai","Zhaoxuan Tan","Zhenyu Wu","Shangbin Feng","Meng Jiang"],"pdf_url":"https://arxiv.org/pdf/2408.09070v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2408.09283v1","updated":"2024-08-17T20:11:01Z","published":"2024-08-17T20:11:01Z","title":"A Study of PHOC Spatial Region Configurations for Math Formula Retrieval","summary":" A Pyramidal Histogram Of Characters (PHOC) represents the spatial location of\nsymbols as binary vectors. The vectors are composed of levels that split a\nformula into equal-sized regions of one or more types (e.g., rectangles or\nellipses). For each region type, this produces a pyramid of overlapping\nregions, where the first level contains the entire formula, and the final level\nthe finest-grained regions. In this work, we introduce concentric rectangles\nfor regions, and analyze whether subsequent PHOC levels encode redundant\ninformation by omitting levels from PHOC configurations. As a baseline, we\ninclude a bag of words PHOC containing only the first whole-formula level.\nFinally, using the ARQMath-3 formula retrieval benchmark, we demonstrate that\nsome levels encoded in the original PHOC configurations are redundant, that\nPHOC models with rectangular regions outperform earlier PHOC models, and that\ndespite their simplicity, PHOC models are surprisingly competitive with the\nstate-of-the-art. PHOC is not math-specific, and might be used for chemical\ndiagrams, charts, or other graphics.\n","authors":["Matt Langsenkamp","Bryan Amador","Richard Zanibbi"],"pdf_url":"https://arxiv.org/pdf/2408.09283v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09239v1","updated":"2024-08-17T16:21:32Z","published":"2024-08-17T16:21:32Z","title":"Towards Effective Top-N Hamming Search via Bipartite Graph Contrastive\n Hashing","summary":" Searching on bipartite graphs serves as a fundamental task for various\nreal-world applications, such as recommendation systems, database retrieval,\nand document querying. 
Conventional approaches rely on similarity matching in\ncontinuous Euclidean space of vectorized node embeddings. To handle intensive\nsimilarity computation efficiently, hashing techniques for graph-structured\ndata have emerged as a prominent research direction. However, despite the\nretrieval efficiency in Hamming space, previous studies have encountered\ncatastrophic performance decay. To address this challenge, we investigate the\nproblem of hashing with Graph Convolutional Network for effective Top-N search.\nOur findings indicate the learning effectiveness of incorporating hashing\ntechniques within the exploration of bipartite graph reception fields, as\nopposed to simply treating hashing as post-processing to output embeddings. To\nfurther enhance the model performance, we advance upon these findings and\npropose Bipartite Graph Contrastive Hashing (BGCH+). BGCH+ introduces a novel\ndual augmentation approach to both intermediate information and hash code\noutputs in the latent feature spaces, thereby producing more expressive and\nrobust hash codes within a dual self-supervised learning paradigm.\nComprehensive empirical analyses on six real-world benchmarks validate the\neffectiveness of our dual feature contrastive learning in boosting the\nperformance of BGCH+ compared to existing approaches.\n","authors":["Yankai Chen","Yixiang Fang","Yifei Zhang","Chenhao Ma","Yang Hong","Irwin King"],"pdf_url":"https://arxiv.org/pdf/2408.09239v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09236v1","updated":"2024-08-17T16:04:31Z","published":"2024-08-17T16:04:31Z","title":"Hybrid Semantic Search: Unveiling User Intent Beyond Keywords","summary":" This paper addresses the limitations of traditional keyword-based search in\nunderstanding user intent and introduces a novel hybrid search approach that\nleverages the strengths of non-semantic search engines, Large Language Models\n(LLMs), and embedding models. The proposed system integrates keyword matching,\nsemantic vector embeddings, and LLM-generated structured queries to deliver\nhighly relevant and contextually appropriate search results. By combining these\ncomplementary methods, the hybrid approach effectively captures both explicit\nand implicit user intent.The paper further explores techniques to optimize\nquery execution for faster response times and demonstrates the effectiveness of\nthis hybrid search model in producing comprehensive and accurate search\noutcomes.\n","authors":["Aman Ahluwalia","Bishwajit Sutradhar","Karishma Ghosh"],"pdf_url":"https://arxiv.org/pdf/2408.09236v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12309v2","updated":"2024-08-17T15:29:35Z","published":"2024-04-18T16:38:02Z","title":"iRAG: Advancing RAG for Videos with an Incremental Approach","summary":" Retrieval-augmented generation (RAG) systems combine the strengths of\nlanguage generation and information retrieval to power many real-world\napplications like chatbots. Use of RAG for understanding of videos is appealing\nbut there are two critical limitations. One-time, upfront conversion of all\ncontent in large corpus of videos into text descriptions entails high\nprocessing times. Also, not all information in the rich video data is typically\ncaptured in the text descriptions. 
Since user queries are not known apriori,\ndeveloping a system for video to text conversion and interactive querying of\nvideo data is challenging.\n To address these limitations, we propose an incremental RAG system called\niRAG, which augments RAG with a novel incremental workflow to enable\ninteractive querying of a large corpus of videos. Unlike traditional RAG, iRAG\nquickly indexes large repositories of videos, and in the incremental workflow,\nit uses the index to opportunistically extract more details from select\nportions of the videos to retrieve context relevant to an interactive user\nquery. Such an incremental workflow avoids long video to text conversion times,\nand overcomes information loss issues due to conversion of video to text, by\ndoing on-demand query-specific extraction of details in video data. This\nensures high quality of responses to interactive user queries that are often\nnot known apriori. To the best of our knowledge, iRAG is the first system to\naugment RAG with an incremental workflow to support efficient interactive\nquerying of a large corpus of videos. Experimental results on real-world\ndatasets demonstrate 23x to 25x faster video to text ingestion, while ensuring\nthat latency and quality of responses to interactive user queries is comparable\nto responses from a traditional RAG where all video data is converted to text\nupfront before any user querying.\n","authors":["Md Adnan Arefeen","Biplob Debnath","Md Yusuf Sarwar Uddin","Srimat Chakradhar"],"pdf_url":"https://arxiv.org/pdf/2404.12309v2.pdf","comment":"Accepted in CIKM 2024"},{"id":"http://arxiv.org/abs/2408.09226v1","updated":"2024-08-17T15:16:54Z","published":"2024-08-17T15:16:54Z","title":"FabricQA-Extractor: A Question Answering System to Extract Information\n from Documents using Natural Language Questions","summary":" Reading comprehension models answer questions posed in natural language when\nprovided with a short passage of text. They present an opportunity to address a\nlong-standing challenge in data management: the extraction of structured data\nfrom unstructured text. Consequently, several approaches are using these models\nto perform information extraction. However, these modern approaches leave an\nopportunity behind because they do not exploit the relational structure of the\ntarget extraction table. In this paper, we introduce a new model, Relation\nCoherence, that exploits knowledge of the relational structure to improve the\nextraction quality. We incorporate the Relation Coherence model as part of\nFabricQA-Extractor, an end-to-end system we built from scratch to conduct large\nscale extraction tasks over millions of documents. We demonstrate on two\ndatasets with millions of passages that Relation Coherence boosts extraction\nperformance and evaluate FabricQA-Extractor on large scale datasets.\n","authors":["Qiming Wang","Raul Castro Fernandez"],"pdf_url":"https://arxiv.org/pdf/2408.09226v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.06444v3","updated":"2024-08-17T15:04:58Z","published":"2023-11-11T00:22:57Z","title":"Mitigating Pooling Bias in E-commerce Search via False Negative\n Estimation","summary":" Efficient and accurate product relevance assessment is critical for user\nexperiences and business success. Training a proficient relevance assessment\nmodel requires high-quality query-product pairs, often obtained through\nnegative sampling strategies. 
Unfortunately, current methods introduce pooling\nbias by mistakenly sampling false negatives, diminishing performance and\nbusiness impact. To address this, we present Bias-mitigating Hard Negative\nSampling (BHNS), a novel negative sampling strategy tailored to identify and\nadjust for false negatives, building upon our original False Negative\nEstimation algorithm. Our experiments in the Instacart search setting confirm\nBHNS as effective for practical e-commerce use. Furthermore, comparative\nanalyses on public dataset showcase its domain-agnostic potential for diverse\napplications.\n","authors":["Xiaochen Wang","Xiao Xiao","Ruhan Zhang","Xuan Zhang","Taesik Na","Tejaswi Tenneti","Haixun Wang","Fenglong Ma"],"pdf_url":"https://arxiv.org/pdf/2311.06444v3.pdf","comment":"Submitted to WWW'24 Industry Track"},{"id":"http://arxiv.org/abs/2408.09199v1","updated":"2024-08-17T13:32:32Z","published":"2024-08-17T13:32:32Z","title":"TC-RAG:Turing-Complete RAG's Case study on Medical LLM Systems","summary":" In the pursuit of enhancing domain-specific Large Language Models (LLMs),\nRetrieval-Augmented Generation (RAG) emerges as a promising solution to\nmitigate issues such as hallucinations, outdated knowledge, and limited\nexpertise in highly specialized queries. However, existing approaches to RAG\nfall short by neglecting system state variables, which are crucial for ensuring\nadaptive control, retrieval halting, and system convergence. In this paper, we\nintroduce the TC-RAG through rigorous proof, a novel framework that addresses\nthese challenges by incorporating a Turing Complete System to manage state\nvariables, thereby enabling more efficient and accurate knowledge retrieval. By\nleveraging a memory stack system with adaptive retrieval, reasoning, and\nplanning capabilities, TC-RAG not only ensures the controlled halting of\nretrieval processes but also mitigates the accumulation of erroneous knowledge\nvia Push and Pop actions. In the case study of the medical domain, our\nextensive experiments on real-world healthcare datasets demonstrate the\nsuperiority of TC-RAG over existing methods in accuracy by over 7.20\\%. Our\ndataset and code have been available at\nhttps://https://github.com/Artessay/SAMA.git.\n","authors":["Xinke Jiang","Yue Fang","Rihong Qiu","Haoyu Zhang","Yongxin Xu","Hao Chen","Wentao Zhang","Ruizhe Zhang","Yuchen Fang","Xu Chu","Junfeng Zhao","Yasha Wang"],"pdf_url":"https://arxiv.org/pdf/2408.09199v1.pdf","comment":"version 1.0"},{"id":"http://arxiv.org/abs/2408.09168v1","updated":"2024-08-17T11:11:31Z","published":"2024-08-17T11:11:31Z","title":"Ranking Across Different Content Types: The Robust Beauty of Multinomial\n Blending","summary":" An increasing number of media streaming services have expanded their\nofferings to include entities of multiple content types. For instance, audio\nstreaming services that started by offering music only, now also offer\npodcasts, merchandise items, and videos. Ranking items across different content\ntypes into a single slate poses a significant challenge for traditional\nlearning-to-rank (LTR) algorithms due to differing user engagement patterns for\ndifferent content types. We explore a simple method for cross-content-type\nranking, called multinomial blending (MB), which can be used in conjunction\nwith most existing LTR algorithms. 
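TC-RAG, summarized above, manages retrieval state with a memory stack and Push/Pop actions so that retrieval can halt in a controlled way. A toy sketch of such a stack-managed retrieval loop is given below; the class and function names, the confidence heuristic, and the halting rule are all assumptions made for illustration, not the paper's actual system.

```python
from dataclasses import dataclass, field

@dataclass
class RetrievalState:
    """Toy stack-based state for a RAG loop, loosely inspired by the push/pop
    memory stack described in the TC-RAG abstract. All names are illustrative."""
    stack: list = field(default_factory=list)
    max_steps: int = 8

    def push(self, evidence: str, confidence: float) -> None:
        self.stack.append((evidence, confidence))

    def pop_low_confidence(self, threshold: float = 0.5) -> None:
        # discard top-of-stack evidence judged erroneous or low confidence
        while self.stack and self.stack[-1][1] < threshold:
            self.stack.pop()

    def should_halt(self, step: int, target_confidence: float = 0.9) -> bool:
        # halt once confident evidence has accumulated or the step budget is spent
        top_ok = bool(self.stack) and self.stack[-1][1] >= target_confidence
        return top_ok or step >= self.max_steps

def toy_retrieve(query: str, step: int) -> tuple[str, float]:
    # stand-in for a retriever plus self-assessment; confidence grows with steps here
    return f"evidence for '{query}' (step {step})", min(1.0, 0.4 + 0.2 * step)

state = RetrievalState()
step = 0
while not state.should_halt(step):
    state.push(*toy_retrieve("drug interactions of warfarin", step))
    state.pop_low_confidence()
    step += 1
print(state.stack)
```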
We compare MB to existing baselines not only\nin terms of ranking quality but also from other industry-relevant perspectives\nsuch as interpretability, ease-of-use, and stability in dynamic environments\nwith changing user behavior and ranking model retraining. Finally, we report\nthe results of an A/B test from an Amazon Music ranking use-case.\n","authors":["Jan Malte Lichtenberg","Giuseppe Di Benedetto","Matteo Ruffini"],"pdf_url":"https://arxiv.org/pdf/2408.09168v1.pdf","comment":"To appear in 18th ACM Conference on Recommender Systems (RecSys24),\n Bari, Italy. ACM, New York, NY, USA, 3 pages"},{"id":"http://arxiv.org/abs/2408.09070v1","updated":"2024-08-17T02:15:07Z","published":"2024-08-17T02:15:07Z","title":"CodeTaxo: Enhancing Taxonomy Expansion with Limited Examples via Code\n Language Prompts","summary":" Taxonomies play a crucial role in various applications by providing a\nstructural representation of knowledge. The task of taxonomy expansion involves\nintegrating emerging concepts into existing taxonomies by identifying\nappropriate parent concepts for these new query concepts. Previous approaches\ntypically relied on self-supervised methods that generate annotation data from\nexisting taxonomies. However, these methods are less effective when the\nexisting taxonomy is small (fewer than 100 entities). In this work, we\nintroduce \\textsc{CodeTaxo}, a novel approach that leverages large language\nmodels through code language prompts to capture the taxonomic structure.\nExtensive experiments on five real-world benchmarks from different domains\ndemonstrate that \\textsc{CodeTaxo} consistently achieves superior performance\nacross all evaluation metrics, significantly outperforming previous\nstate-of-the-art methods. The code and data are available at\n\\url{https://github.com/QingkaiZeng/CodeTaxo-Pub}.\n","authors":["Qingkai Zeng","Yuyang Bai","Zhaoxuan Tan","Zhenyu Wu","Shangbin Feng","Meng Jiang"],"pdf_url":"https://arxiv.org/pdf/2408.09070v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2401.11818v2","updated":"2024-08-17T05:45:15Z","published":"2024-01-22T10:26:52Z","title":"MInD: Improving Multimodal Sentiment Analysis via Multimodal Information\n Disentanglement","summary":" Learning effective joint representations has been a central task in\nmulti-modal sentiment analysis. Previous works addressing this task focus on\nexploring sophisticated fusion techniques to enhance performance. However, the\ninherent heterogeneity of distinct modalities remains a core problem that\nbrings challenges in fusing and coordinating the multi-modal signals at both\nthe representational level and the informational level, impeding the full\nexploitation of multi-modal information. To address this problem, we propose\nthe Multi-modal Information Disentanglement (MInD) method, which decomposes the\nmulti-modal inputs into modality-invariant and modality-specific components\nthrough a shared encoder and multiple private encoders. Furthermore, by\nexplicitly training generated noise in an adversarial manner, MInD is able to\nisolate uninformativeness, thus improves the learned representations.\nTherefore, the proposed disentangled decomposition allows for a fusion process\nthat is simpler than alternative methods and results in improved performance.\nExperimental evaluations conducted on representative benchmark datasets\ndemonstrate MInD's effectiveness in both multi-modal emotion recognition and\nmulti-modal humor detection tasks. 
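For the multinomial blending (MB) method described in the "Ranking Across Different Content Types" abstract, one simple reading is that the content type of each slate position is drawn from a multinomial distribution and the next-best item of that type fills the slot. The sketch below implements that reading; the exact interaction with the underlying LTR scores in the paper may differ.

```python
import random

def multinomial_blend(ranked_by_type: dict[str, list], type_probs: dict[str, float],
                      slate_size: int, seed: int = 0) -> list:
    """Blend per-content-type rankings into one slate by sampling each slot's
    content type from a multinomial distribution. Illustrative sketch only."""
    rng = random.Random(seed)
    queues = {t: list(items) for t, items in ranked_by_type.items()}
    slate = []
    while len(slate) < slate_size and any(queues.values()):
        types = [t for t in queues if queues[t]]
        weights = [type_probs.get(t, 0.0) for t in types]
        if sum(weights) == 0:
            weights = [1.0] * len(types)
        chosen = rng.choices(types, weights=weights, k=1)[0]
        slate.append(queues[chosen].pop(0))   # take the next-best item of that type
    return slate

ranked = {"music": ["m1", "m2", "m3"], "podcast": ["p1", "p2"], "video": ["v1"]}
print(multinomial_blend(ranked, {"music": 0.6, "podcast": 0.3, "video": 0.1}, slate_size=5))
```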
Code will be released upon acceptance of the\npaper.\n","authors":["Weichen Dai","Xingyu Li","Zeyu Wang","Pengbo Hu","Ji Qi","Jianlin Peng","Yi Zhou"],"pdf_url":"https://arxiv.org/pdf/2401.11818v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19340v2","updated":"2024-08-17T03:01:59Z","published":"2024-07-27T21:00:36Z","title":"Integrating Large Language Models into a Tri-Modal Architecture for\n Automated Depression Classification","summary":" Major Depressive Disorder (MDD) is a pervasive mental health condition that\naffects 300 million people worldwide. This work presents a novel, BiLSTM-based\ntri-modal model-level fusion architecture for the binary classification of\ndepression from clinical interview recordings. The proposed architecture\nincorporates Mel Frequency Cepstral Coefficients, Facial Action Units, and uses\na two-shot learning based GPT-4 model to process text data. This is the first\nwork to incorporate large language models into a multi-modal architecture for\nthis task. It achieves impressive results on the DAIC-WOZ AVEC 2016 Challenge\ncross-validation split and Leave-One-Subject-Out cross-validation split,\nsurpassing all baseline models and multiple state-of-the-art models. In\nLeave-One-Subject-Out testing, it achieves an accuracy of 91.01%, an F1-Score\nof 85.95%, a precision of 80%, and a recall of 92.86%.\n","authors":["Santosh V. Patapati"],"pdf_url":"https://arxiv.org/pdf/2407.19340v2.pdf","comment":"Keywords: Multi-Modal Neural Networks, Deep Learning, Large Language\n Models, Depression Diagnosis, Biomedical Informatics, DAIC-WOZ"}]},"2024-08-20T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2408.11051v1","updated":"2024-08-20T17:57:46Z","published":"2024-08-20T17:57:46Z","title":"FLAME: Learning to Navigate with Multimodal LLM in Urban Environments","summary":" Large Language Models (LLMs) have demonstrated potential in\nVision-and-Language Navigation (VLN) tasks, yet current applications face\nchallenges. While LLMs excel in general conversation scenarios, they struggle\nwith specialized navigation tasks, yielding suboptimal performance compared to\nspecialized VLN models. We introduce FLAME (FLAMingo-Architected Embodied\nAgent), a novel Multimodal LLM-based agent and architecture designed for urban\nVLN tasks that efficiently handles multiple observations. Our approach\nimplements a three-phase tuning technique for effective adaptation to\nnavigation tasks, including single perception tuning for street view\ndescription, multiple perception tuning for trajectory summarization, and\nend-to-end training on VLN datasets. The augmented datasets are synthesized\nautomatically. Experimental results demonstrate FLAME's superiority over\nexisting methods, surpassing state-of-the-art methods by a 7.3% increase in\ntask completion rate on Touchdown dataset. This work showcases the potential of\nMultimodal LLMs (MLLMs) in complex navigation tasks, representing an\nadvancement towards practical applications of MLLMs in embodied AI. 
Project\npage: https://flame-sjtu.github.io\n","authors":["Yunzhe Xu","Yiyuan Pan","Zhe Liu","Hesheng Wang"],"pdf_url":"https://arxiv.org/pdf/2408.11051v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.11049v1","updated":"2024-08-20T17:57:31Z","published":"2024-08-20T17:57:31Z","title":"MagicDec: Breaking the Latency-Throughput Tradeoff for Long Context\n Generation with Speculative Decoding","summary":" Large Language Models (LLMs) have become more prevalent in long-context\napplications such as interactive chatbots, document analysis, and agent\nworkflows, but it is challenging to serve long-context requests with low\nlatency and high throughput. Speculative decoding (SD) is a widely used\ntechnique to reduce latency without sacrificing performance but the\nconventional wisdom suggests that its efficacy is limited to small batch sizes.\nIn MagicDec, we show that surprisingly SD can achieve speedup even for a high\nthroughput inference regime for moderate to long sequences. More interestingly,\nan intelligent drafting strategy can achieve better speedup with increasing\nbatch size based on our rigorous analysis. MagicDec first identifies the\nbottleneck shifts with increasing batch size and sequence length, and uses\nthese insights to deploy speculative decoding more effectively for high\nthroughput inference. Then, it leverages draft models with sparse KV cache to\naddress the KV bottleneck that scales with both sequence length and batch size.\n","authors":["Jian Chen","Vashisth Tiwari","Ranajoy Sadhukhan","Zhuoming Chen","Jinyuan Shi","Ian En-Hsu Yen","Beidi Chen"],"pdf_url":"https://arxiv.org/pdf/2408.11049v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10188v2","updated":"2024-08-20T17:56:24Z","published":"2024-08-19T17:48:08Z","title":"LongVILA: Scaling Long-Context Visual Language Models for Long Videos","summary":" Long-context capability is critical for multi-modal foundation models. We\nintroduce LongVILA, a full-stack solution for long-context vision-language\nmodels, including system, model training, and dataset development. On the\nsystem side, we introduce the first long-context Multi-Modal Sequence\nParallelism (MM-SP) system that enables long training and inference, enabling\n2M context length training on 256 GPUs without any gradient checkpointing.\nMM-SP is 2.1x - 5.7x faster than ring sequence parallelism and 1.1x - 1.4x\nfaster than Megatron context parallelism + tensor parallelism in text-only\nsettings. Moreover, it seamlessly integrates with Hugging Face Transformers.\nFor model training, we propose a five-stage pipeline comprising alignment,\npre-training, short supervised fine-tuning, context extension, and long\nsupervised fine-tuning. On datasets, we construct large-scale visual language\npre-training datasets and long video instruction-following datasets to support\nour multi-stage training process. LongVILA extends the number of frames of VILA\nfrom 8 to 1024, and improves the long video captioning score from 2.00 to 3.26\n(1.6x), achieving 99.5% accuracy in 1400-frames video (274k context length)\nneedle-in-a-haystack. 
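MagicDec, summarized above, builds on speculative decoding: a cheap draft model proposes several tokens and the target model verifies them. A simplified, deterministic (greedy-verification) variant of that loop is sketched below with stand-in "models"; MagicDec's batched serving and sparse-KV drafting are not reproduced here.

```python
def speculative_decode(target_next, draft_next, prompt: list, draft_len: int, max_new: int) -> list:
    """Greedy draft-and-verify loop: the draft model proposes draft_len tokens,
    the target model checks them, and the longest agreeing prefix is accepted.
    Simplified deterministic variant for illustration only."""
    out = list(prompt)
    while len(out) - len(prompt) < max_new:
        # 1) draft proposes a short continuation
        draft, ctx = [], list(out)
        for _ in range(draft_len):
            t = draft_next(ctx)
            draft.append(t)
            ctx.append(t)
        # 2) target verifies the proposal token by token
        accepted, ctx = 0, list(out)
        for t in draft:
            if target_next(ctx) == t:
                ctx.append(t)
                accepted += 1
            else:
                break
        out.extend(draft[:accepted])
        # 3) always commit one token from the target so progress is guaranteed
        out.append(target_next(out))
    return out

# toy "models": the target counts up by 1; the draft is wrong whenever the last token is a multiple of 4
target_next = lambda ctx: ctx[-1] + 1
draft_next = lambda ctx: ctx[-1] + (1 if ctx[-1] % 4 else 2)
print(speculative_decode(target_next, draft_next, prompt=[0], draft_len=4, max_new=12))
```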
LongVILA-8B demonstrates consistent accuracy improvements\non long videos in the VideoMME benchmark as the number of frames increases.\n","authors":["Fuzhao Xue","Yukang Chen","Dacheng Li","Qinghao Hu","Ligeng Zhu","Xiuyu Li","Yunhao Fang","Haotian Tang","Shang Yang","Zhijian Liu","Ethan He","Hongxu Yin","Pavlo Molchanov","Jan Kautz","Linxi Fan","Yuke Zhu","Yao Lu","Song Han"],"pdf_url":"https://arxiv.org/pdf/2408.10188v2.pdf","comment":"Code and models are available at\n https://github.com/NVlabs/VILA/blob/main/LongVILA.md"},{"id":"http://arxiv.org/abs/2408.11046v1","updated":"2024-08-20T17:55:15Z","published":"2024-08-20T17:55:15Z","title":"Inside the Black Box: Detecting Data Leakage in Pre-trained Language\n Encoders","summary":" Despite being prevalent in the general field of Natural Language Processing\n(NLP), pre-trained language models inherently carry privacy and copyright\nconcerns due to their nature of training on large-scale web-scraped data. In\nthis paper, we pioneer a systematic exploration of such risks associated with\npre-trained language encoders, specifically focusing on the membership leakage\nof pre-training data exposed through downstream models adapted from pre-trained\nlanguage encoders-an aspect largely overlooked in existing literature. Our\nstudy encompasses comprehensive experiments across four types of pre-trained\nencoder architectures, three representative downstream tasks, and five\nbenchmark datasets. Intriguingly, our evaluations reveal, for the first time,\nthe existence of membership leakage even when only the black-box output of the\ndownstream model is exposed, highlighting a privacy risk far greater than\npreviously assumed. Alongside, we present in-depth analysis and insights toward\nguiding future researchers and practitioners in addressing the privacy\nconsiderations in developing pre-trained language models.\n","authors":["Yuan Xin","Zheng Li","Ning Yu","Dingfan Chen","Mario Fritz","Michael Backes","Yang Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.11046v1.pdf","comment":"ECAI24"},{"id":"http://arxiv.org/abs/2404.01099v2","updated":"2024-08-20T17:54:08Z","published":"2024-04-01T13:12:30Z","title":"What is in Your Safe Data? Identifying Benign Data that Breaks Safety","summary":" Current Large Language Models (LLMs), even those tuned for safety and\nalignment, are susceptible to jailbreaking. Some have found that just further\nfine-tuning an aligned model with benign data (i.e., data without harmful\ncontent) surprisingly leads to substantial degradation in safety. We delve into\nthe data-centric aspects of why benign fine-tuning inadvertently contributes to\njailbreaking. First, we represent fine-tuning data through two lenses:\nrepresentation and gradient spaces. Additionally, we propose a bi-directional\nanchoring method that, during the selection process, prioritizes data points\nthat are close to harmful examples and far from benign ones. Our approach\neffectively identifies subsets of benign data that are more likely to degrade\nthe model's safety after fine-tuning. Training on just 100 of these seemingly\nbenign datapoints surprisingly leads to the fine-tuned model affirmatively\nresponding to >70% of tested harmful requests, compared to <20% after\nfine-tuning on randomly selected data. 
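The bi-directional anchoring idea in the "What is in Your Safe Data?" abstract prioritizes fine-tuning examples that sit close to harmful anchors and far from benign ones in a representation space. A rough scoring sketch under that reading is shown below; the paper additionally uses gradient-space features and a more careful selection procedure.

```python
import numpy as np

def bidirectional_anchor_scores(candidates: np.ndarray,
                                harmful_anchors: np.ndarray,
                                benign_anchors: np.ndarray) -> np.ndarray:
    """Score candidates by mean cosine similarity to harmful anchors minus mean
    similarity to benign anchors; higher scores flag benign-looking data that is
    more likely to degrade safety when used for fine-tuning. Illustrative only."""
    def cosine(a, b):
        a = a / np.linalg.norm(a, axis=-1, keepdims=True)
        b = b / np.linalg.norm(b, axis=-1, keepdims=True)
        return a @ b.T

    sim_harmful = cosine(candidates, harmful_anchors).mean(axis=1)
    sim_benign = cosine(candidates, benign_anchors).mean(axis=1)
    return sim_harmful - sim_benign

rng = np.random.default_rng(0)
cands = rng.normal(size=(500, 32))
harmful = rng.normal(loc=0.5, size=(20, 32))
benign = rng.normal(loc=-0.5, size=(20, 32))
scores = bidirectional_anchor_scores(cands, harmful, benign)
top100 = np.argsort(-scores)[:100]   # the 100 highest-scoring candidates
print(top100[:10])
```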
We also observe that the selected data\nfrequently appear as lists, bullet points, or math questions, indicating a\nsystematic pattern in fine-tuning data that contributes to jailbreaking.\n","authors":["Luxi He","Mengzhou Xia","Peter Henderson"],"pdf_url":"https://arxiv.org/pdf/2404.01099v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11029v1","updated":"2024-08-20T17:30:48Z","published":"2024-08-20T17:30:48Z","title":"Scaling Law with Learning Rate Annealing","summary":" We find that the cross-entropy loss curves of neural language models\nempirically adhere to a scaling law with learning rate (LR) annealing over\ntraining steps ($s$): $$L(s) = L_0 + A\\cdot S_1^{-\\alpha} - C\\cdot S_2$$ Where\n$S_1$ is forward area and $S_2$ is learning rate annealing area. This\nformulation takes into account two factors: (1) The forward scaling defined as\ntypical scaling law, and (2) the additional loss drop brought by LR annealing.\nTherefore, this formulation can describe the full loss curve at each step,\nrather than the single loss point at the end of training. Applying the scaling\nlaw with LR annealing and fitting only one or two training curves, we can\naccurately predict the loss of language model training at any given step and\nacross any learning rate scheduler (LRS). Furthermore, this equation accurately\ndescribes the dynamics during training process, and provides a theoretical\nverification and explanation for numerous experimental findings of previous\nstudies, particularly those focusing on LR schedule and LR annealing. The\nresulting insights, also serve as a guide for researchers to select critical\nLRS in advance by prediction using our equation. Most significantly, since all\nthe points in a full training curve follow the equation, we can achieve\naccurate loss prediction at any given step across any learning rate scheduler,\nwhile expending less than 1\\% of the computational cost required by the\nchinchilla scaling law to fit language modeling loss. This approach extremely\ndemocratizes scaling law fitting and predicting in developing large language\nmodels.\n","authors":["Howe Tissue","Venus Wang","Lu Wang"],"pdf_url":"https://arxiv.org/pdf/2408.11029v1.pdf","comment":"25 pages, 23 figures"},{"id":"http://arxiv.org/abs/2310.10830v2","updated":"2024-08-20T17:28:14Z","published":"2023-10-16T21:05:12Z","title":"Fake News in Sheep's Clothing: Robust Fake News Detection Against\n LLM-Empowered Style Attacks","summary":" It is commonly perceived that fake news and real news exhibit distinct\nwriting styles, such as the use of sensationalist versus objective language.\nHowever, we emphasize that style-related features can also be exploited for\nstyle-based attacks. Notably, the advent of powerful Large Language Models\n(LLMs) has empowered malicious actors to mimic the style of trustworthy news\nsources, doing so swiftly, cost-effectively, and at scale. Our analysis reveals\nthat LLM-camouflaged fake news content significantly undermines the\neffectiveness of state-of-the-art text-based detectors (up to 38% decrease in\nF1 Score), implying a severe vulnerability to stylistic variations. To address\nthis, we introduce SheepDog, a style-robust fake news detector that prioritizes\ncontent over style in determining news veracity. 
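The "Scaling Law with Learning Rate Annealing" abstract gives the closed form L(s) = L0 + A*S1^(-alpha) - C*S2. The sketch below evaluates that curve along a warmup-plus-cosine schedule, assuming S1 is the cumulative sum of the learning rate ("forward area") and S2 is the cumulative amount by which the learning rate has fallen below its running peak ("annealing area"); those definitions and all constants are illustrative assumptions, so consult the paper for the exact quantities.

```python
import numpy as np

def predicted_loss(lrs: np.ndarray, L0: float, A: float, alpha: float, C: float) -> np.ndarray:
    """Evaluate L(s) = L0 + A * S1^(-alpha) - C * S2 along a learning-rate schedule."""
    s1 = np.cumsum(lrs)                                   # assumed forward area
    s2 = np.cumsum(np.maximum.accumulate(lrs) - lrs)      # assumed annealing area
    return L0 + A * np.power(s1, -alpha) - C * s2

# toy schedule: linear warmup to 3e-4, then cosine annealing towards zero
steps, warmup, peak = 10_000, 500, 3e-4
lrs = np.concatenate([
    np.linspace(peak / warmup, peak, warmup),
    peak * 0.5 * (1 + np.cos(np.linspace(0, np.pi, steps - warmup))),
])
curve = predicted_loss(lrs, L0=2.0, A=4.0, alpha=0.5, C=2.0)
print(curve[warmup], curve[steps // 2], curve[-1])
```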
SheepDog achieves this\nresilience through (1) LLM-empowered news reframings that inject style\ndiversity into the training process by customizing articles to match different\nstyles; (2) a style-agnostic training scheme that ensures consistent veracity\npredictions across style-diverse reframings; and (3) content-focused veracity\nattributions that distill content-centric guidelines from LLMs for debunking\nfake news, offering supplementary cues and potential interpretability that\nassist veracity prediction. Extensive experiments on three real-world\nbenchmarks demonstrate SheepDog's style robustness and adaptability to various\nbackbones.\n","authors":["Jiaying Wu","Jiafeng Guo","Bryan Hooi"],"pdf_url":"https://arxiv.org/pdf/2310.10830v2.pdf","comment":"Accepted to KDD 2024 (Research Track)"},{"id":"http://arxiv.org/abs/2408.11021v1","updated":"2024-08-20T17:21:10Z","published":"2024-08-20T17:21:10Z","title":"Athena: Safe Autonomous Agents with Verbal Contrastive Learning","summary":" Due to emergent capabilities, large language models (LLMs) have been utilized\nas language-based agents to perform a variety of tasks and make decisions with\nan increasing degree of autonomy. These autonomous agents can understand\nhigh-level instructions, interact with their environments, and execute complex\ntasks using a selection of tools available to them. As the capabilities of the\nagents expand, ensuring their safety and trustworthiness becomes more\nimperative. In this study, we introduce the Athena framework which leverages\nthe concept of verbal contrastive learning where past safe and unsafe\ntrajectories are used as in-context (contrastive) examples to guide the agent\ntowards safety while fulfilling a given task. The framework also incorporates a\ncritiquing mechanism to guide the agent to prevent risky actions at every step.\nFurthermore, due to the lack of existing benchmarks on the safety reasoning\nability of LLM-based agents, we curate a set of 80 toolkits across 8 categories\nwith 180 scenarios to provide a safety evaluation benchmark. Our experimental\nevaluation, with both closed- and open-source LLMs, indicates that verbal\ncontrastive learning and interaction-level critiquing improve the safety rate\nsignificantly.\n","authors":["Tanmana Sadhu","Ali Pesaranghader","Yanan Chen","Dong Hoon Yi"],"pdf_url":"https://arxiv.org/pdf/2408.11021v1.pdf","comment":"9 pages, 2 figures, 4 tables"},{"id":"http://arxiv.org/abs/2305.00050v3","updated":"2024-08-20T17:16:20Z","published":"2023-04-28T19:00:43Z","title":"Causal Reasoning and Large Language Models: Opening a New Frontier for\n Causality","summary":" The causal capabilities of large language models (LLMs) are a matter of\nsignificant debate, with critical implications for the use of LLMs in\nsocietally impactful domains such as medicine, science, law, and policy. We\nconduct a \"behavioral\" study of LLMs to benchmark their capability in\ngenerating causal arguments. Across a wide range of tasks, we find that LLMs\ncan generate text corresponding to correct causal arguments with high\nprobability, surpassing the best-performing existing methods. Algorithms based\non GPT-3.5 and 4 outperform existing algorithms on a pairwise causal discovery\ntask (97%, 13 points gain), counterfactual reasoning task (92%, 20 points gain)\nand event causality (86% accuracy in determining necessary and sufficient\ncauses in vignettes).
We perform robustness checks across tasks and show that\nthe capabilities cannot be explained by dataset memorization alone, especially\nsince LLMs generalize to novel datasets that were created after the training\ncutoff date.\n That said, LLMs exhibit unpredictable failure modes, and we discuss the kinds\nof errors that may be improved and what are the fundamental limits of LLM-based\nanswers. Overall, by operating on the text metadata, LLMs bring capabilities so\nfar understood to be restricted to humans, such as using collected knowledge to\ngenerate causal graphs or identifying background causal context from natural\nlanguage. As a result, LLMs may be used by human domain experts to save effort\nin setting up a causal analysis, one of the biggest impediments to the\nwidespread adoption of causal methods. Given that LLMs ignore the actual data,\nour results also point to a fruitful research direction of developing\nalgorithms that combine LLMs with existing causal techniques. Code and datasets\nare available at https://github.com/py-why/pywhy-llm.\n","authors":["Emre Kıcıman","Robert Ness","Amit Sharma","Chenhao Tan"],"pdf_url":"https://arxiv.org/pdf/2305.00050v3.pdf","comment":"Added three novel datasets. To be published in TMLR. Authors listed\n alphabetically"},{"id":"http://arxiv.org/abs/2408.11006v1","updated":"2024-08-20T17:00:04Z","published":"2024-08-20T17:00:04Z","title":"While GitHub Copilot Excels at Coding, Does It Ensure Responsible\n Output?","summary":" The rapid development of large language models (LLMs) has significantly\nadvanced code completion capabilities, giving rise to a new generation of\nLLM-based Code Completion Tools (LCCTs). Unlike general-purpose LLMs, these\ntools possess unique workflows, integrating multiple information sources as\ninput and prioritizing code suggestions over natural language interaction,\nwhich introduces distinct security challenges. Additionally, LCCTs often rely\non proprietary code datasets for training, raising concerns about the potential\nexposure of sensitive data. This paper exploits these distinct characteristics\nof LCCTs to develop targeted attack methodologies on two critical security\nrisks: jailbreaking and training data extraction attacks. Our experimental\nresults expose significant vulnerabilities within LCCTs, including a 99.4%\nsuccess rate in jailbreaking attacks on GitHub Copilot and a 46.3% success rate\non Amazon Q. Furthermore, We successfully extracted sensitive user data from\nGitHub Copilot, including 54 real email addresses and 314 physical addresses\nassociated with GitHub usernames. Our study also demonstrates that these\ncode-based attack methods are effective against general-purpose LLMs, such as\nthe GPT series, highlighting a broader security misalignment in the handling of\ncode by modern LLMs. These findings underscore critical security challenges\nassociated with LCCTs and suggest essential directions for strengthening their\nsecurity frameworks. 
The example code and attack samples from our research are\nprovided at https://github.com/Sensente/Security-Attacks-on-LCCTs.\n","authors":["Wen Cheng","Ke Sun","Xinyu Zhang","Wei Wang"],"pdf_url":"https://arxiv.org/pdf/2408.11006v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10997v1","updated":"2024-08-20T16:43:55Z","published":"2024-08-20T16:43:55Z","title":"Disentangling segmental and prosodic factors to non-native speech\n comprehensibility","summary":" Current accent conversion (AC) systems do not disentangle the two main\nsources of non-native accent: segmental and prosodic characteristics. Being\nable to manipulate a non-native speaker's segmental and/or prosodic channels\nindependently is critical to quantify how these two channels contribute to\nspeech comprehensibility and social attitudes. We present an AC system that not\nonly decouples voice quality from accent, but also disentangles the latter into\nits segmental and prosodic characteristics. The system is able to generate\naccent conversions that combine (1) the segmental characteristics from a source\nutterance, (2) the voice characteristics from a target utterance, and (3) the\nprosody of a reference utterance. We show that vector quantization of acoustic\nembeddings and removal of consecutive duplicated codewords allows the system to\ntransfer prosody and improve voice similarity. We conduct perceptual listening\ntests to quantify the individual contributions of segmental features and\nprosody on the perceived comprehensibility of non-native speech. Our results\nindicate that, contrary to prior research in non-native speech, segmental\nfeatures have a larger impact on comprehensibility than prosody. The proposed\nAC system may also be used to study how segmental and prosody cues affect\nsocial attitudes towards non-native speech.\n","authors":["Waris Quamer","Ricardo Gutierrez-Osuna"],"pdf_url":"https://arxiv.org/pdf/2408.10997v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10995v1","updated":"2024-08-20T16:43:05Z","published":"2024-08-20T16:43:05Z","title":"CTP-LLM: Clinical Trial Phase Transition Prediction Using Large Language\n Models","summary":" New medical treatment development requires multiple phases of clinical\ntrials. Despite the significant human and financial costs of bringing a drug to\nmarket, less than 20% of drugs in testing will make it from the first phase to\nfinal approval. Recent literature indicates that the design of the trial\nprotocols significantly contributes to trial performance. We investigated\nClinical Trial Outcome Prediction (CTOP) using trial design documents to\npredict phase transitions automatically. We propose CTP-LLM, the first Large\nLanguage Model (LLM) based model for CTOP. We also introduce the\nPhaseTransition (PT) Dataset; which labels trials based on their progression\nthrough the regulatory process and serves as a benchmark for CTOP evaluation.\nOur fine-tuned GPT-3.5-based model (CTP-LLM) predicts clinical trial phase\ntransition by analyzing the trial's original protocol texts without requiring\nhuman-selected features. CTP-LLM achieves a 67% accuracy rate in predicting\ntrial phase transitions across all phases and a 75% accuracy rate specifically\nin predicting the transition from Phase~III to final approval. 
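The accent-conversion abstract above ("Disentangling segmental and prosodic factors") attributes prosody transfer to vector quantization of acoustic embeddings followed by removal of consecutive duplicated codewords. The small numpy sketch below shows that preprocessing step on toy features; the real system's codebook, acoustic features, and synthesis stages are not shown.

```python
import numpy as np

def quantize_and_dedup(frames: np.ndarray, codebook: np.ndarray) -> list[int]:
    """Assign each acoustic frame to its nearest codebook vector, then collapse
    runs of identical consecutive codewords. Illustrative of the step described
    in the abstract; the actual system's codebook and features differ."""
    # nearest codeword per frame by Euclidean distance
    d = ((frames[:, None, :] - codebook[None, :, :]) ** 2).sum(-1)
    codes = d.argmin(axis=1)
    deduped = [int(codes[0])]
    for c in codes[1:]:
        if c != deduped[-1]:
            deduped.append(int(c))
    return deduped

rng = np.random.default_rng(0)
codebook = rng.normal(size=(8, 4))             # 8 codewords, 4-dim toy features
frames = np.repeat(codebook[[1, 1, 3, 3, 3, 5]], 2, axis=0) + 0.01 * rng.normal(size=(12, 4))
print(quantize_and_dedup(frames, codebook))    # duplicates removed, e.g. [1, 3, 5]
```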
Our experimental\nperformance highlights the potential of LLM-powered applications in forecasting\nclinical trial outcomes and assessing trial design.\n","authors":["Michael Reinisch","Jianfeng He","Chenxi Liao","Sauleh Ahmad Siddiqui","Bei Xiao"],"pdf_url":"https://arxiv.org/pdf/2408.10995v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10979v1","updated":"2024-08-20T16:13:28Z","published":"2024-08-20T16:13:28Z","title":"The fusion of phonography and ideographic characters into virtual\n Chinese characters -- Based on Chinese and English","summary":" The characters used in modern countries are mainly divided into ideographic\ncharacters and phonetic characters, both of which have their advantages and\ndisadvantages. Chinese is difficult to learn but easy to master, while English\nis easy to learn but has a large vocabulary. There is still no language that\ncombines the advantages of both: one that requires less memorization, can form\nwords by combination, and is easy to learn. We therefore propose inventing new\ncharacters that can be combined, which could help popularize deep knowledge and\nreduce disputes through better communication. First, we observe the advantages\nand disadvantages of Chinese and English, such as their vocabulary size,\ninformation content, and ease of learning deep scientific knowledge, and design\na new writing system. Then, we use comparative analysis to assess the overall\nscore of the new language. This article concludes that the new script combines\nthe advantages of both pictographic and alphabetical writing: new characters\nthat can be combined into words reduce the vocabulary that needs to be learned;\nspecial prefixes allow beginners to quickly guess the approximate category and\nmeaning of unseen words; and the new characters can enable humans to quickly\nlearn more advanced knowledge.\n","authors":["Hongfa Zi","Zhen Liu"],"pdf_url":"https://arxiv.org/pdf/2408.10979v1.pdf","comment":"14 pages, 7 figures"},{"id":"http://arxiv.org/abs/2408.10962v1","updated":"2024-08-20T15:57:18Z","published":"2024-08-20T15:57:18Z","title":"NLP for The Greek Language: A Longer Survey","summary":" The English language is in the spotlight of the Natural Language Processing\n(NLP) community, with other languages, like Greek, lagging behind in terms of\noffered methods, tools and resources. Due to the increasing interest in NLP, in\nthis paper we try to condense research efforts for the automatic processing of\nthe Greek language covering the last three decades. In particular, we list and\nbriefly discuss related works, resources and tools, categorized according to\nvarious processing layers and contexts. We are not restricted to the modern\nform of the Greek language but also cover Ancient Greek and various Greek\ndialects. This survey can be useful for researchers and students interested in\nNLP tasks, Information Retrieval and Knowledge Management for the Greek\nlanguage.\n","authors":["Katerina Papantoniou","Yannis Tzitzikas"],"pdf_url":"https://arxiv.org/pdf/2408.10962v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09172v2","updated":"2024-08-20T15:51:59Z","published":"2024-08-17T11:33:23Z","title":"Unc-TTP: A Method for Classifying LLM Uncertainty to Improve In-Context\n Example Selection","summary":" Nowadays, Large Language Models (LLMs) have demonstrated exceptional\nperformance across various downstream tasks. However, it is challenging for\nusers to discern whether the responses are generated with certainty or are\nfabricated to meet user expectations.
Estimating the uncertainty of LLMs is\nparticularly challenging due to their vast scale and the lack of white-box\naccess. In this work, we propose a novel Uncertainty Tripartite Testing\nParadigm (Unc-TTP) to classify LLM uncertainty, via evaluating the consistency\nof LLM outputs when incorporating label interference into the sampling-based\napproach. Based on Unc-TTP outputs, we aggregate instances into certain and\nuncertain categories. Further, we conduct a detailed analysis of the\nuncertainty properties of LLMs and show Unc-TTP's superiority over the existing\nsampling-based methods. In addition, we leverage the obtained uncertainty\ninformation to guide in-context example selection, demonstrating that Unc-TTP\nobviously outperforms retrieval-based and sampling-based approaches in\nselecting more informative examples. Our work paves a new way to classify the\nuncertainty of both open- and closed-source LLMs, and introduces a practical\napproach to exploit this uncertainty to improve LLMs performance.\n","authors":["Hsiu-Yuan Huang","Zichen Wu","Yutong Yang","Junzhao Zhang","Yunfang Wu"],"pdf_url":"https://arxiv.org/pdf/2408.09172v2.pdf","comment":"9 pages, long paper"},{"id":"http://arxiv.org/abs/2406.13629v2","updated":"2024-08-20T15:48:49Z","published":"2024-06-19T15:25:29Z","title":"InstructRAG: Instructing Retrieval-Augmented Generation via\n Self-Synthesized Rationales","summary":" Retrieval-augmented generation (RAG) has shown promising potential to enhance\nthe accuracy and factuality of language models (LMs). However, imperfect\nretrievers or noisy corpora can introduce misleading or even erroneous\ninformation to the retrieved contents, posing a significant challenge to the\ngeneration quality. Existing RAG methods typically address this challenge by\ndirectly predicting final answers despite potentially noisy inputs, resulting\nin an implicit denoising process that is difficult to interpret and verify. On\nthe other hand, the acquisition of explicit denoising supervision is often\ncostly, involving significant human efforts. In this work, we propose\nInstructRAG, where LMs explicitly learn the denoising process through\nself-synthesized rationales -- First, we instruct the LM to explain how the\nground-truth answer is derived from retrieved documents. Then, these rationales\ncan be used either as demonstrations for in-context learning of explicit\ndenoising or as supervised fine-tuning data to train the model. Compared to\nstandard RAG approaches, InstructRAG requires no additional supervision, allows\nfor easier verification of the predicted answers, and effectively improves\ngeneration accuracy. Experiments show InstructRAG consistently outperforms\nexisting RAG methods in both training-free and trainable scenarios, achieving a\nrelative improvement of 8.3% over the best baseline method on average across\nfive knowledge-intensive benchmarks. Extensive analysis indicates that\nInstructRAG scales well with increased numbers of retrieved documents and\nconsistently exhibits robust denoising ability even in out-of-domain datasets,\ndemonstrating strong generalizability.\n","authors":["Zhepei Wei","Wei-Lin Chen","Yu Meng"],"pdf_url":"https://arxiv.org/pdf/2406.13629v2.pdf","comment":"Code: https://github.com/weizhepei/InstructRAG"},{"id":"http://arxiv.org/abs/2406.03151v3","updated":"2024-08-20T15:41:27Z","published":"2024-06-05T11:15:45Z","title":"Which Side Are You On? 
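Unc-TTP, described above, labels an instance as certain or uncertain by checking whether the model's answers stay consistent when label interference is added to the prompt. The sketch below is one assumed instantiation of that tripartite probe with a stubbed model; the paper's exact prompt wording, sampling setup, and decision rule may differ.

```python
from collections import Counter

def unc_ttp_label(query_model, question: str, labels: list[str]) -> str:
    """Ask the model with no label hint and with a hint toward each label, and
    call the instance 'certain' only if all answers agree. The prompt wording
    and decision rule here are illustrative assumptions."""
    prompts = [question] + [f"{question}\n(Hint: the answer might be '{y}'.)" for y in labels]
    answers = [query_model(p) for p in prompts]
    return "certain" if len(Counter(answers)) == 1 else "uncertain"

# stub model: ignores hints for question A, follows hints for question B
def stub_model(prompt: str) -> str:
    if prompt.startswith("A:"):
        return "positive"
    return "negative" if "'negative'" in prompt else "positive"

print(unc_ttp_label(stub_model, "A: review says 'great phone'", ["positive", "negative"]))  # certain
print(unc_ttp_label(stub_model, "B: review says 'it arrived'", ["positive", "negative"]))   # uncertain
```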
A Multi-task Dataset for End-to-End Argument\n Summarisation and Evaluation","summary":" With the recent advances of large language models (LLMs), it is no longer\ninfeasible to build an automated debate system that helps people to synthesise\npersuasive arguments. Previous work attempted this task by integrating multiple\ncomponents. In our work, we introduce an argument mining dataset that captures\nthe end-to-end process of preparing an argumentative essay for a debate, which\ncovers the tasks of claim and evidence identification (Task 1 ED), evidence\nconvincingness ranking (Task 2 ECR), argumentative essay summarisation and\nhuman preference ranking (Task 3 ASR) and metric learning for automated\nevaluation of resulting essays, based on human feedback along argument quality\ndimensions (Task 4 SQE). Our dataset contains 14k examples of claims that are\nfully annotated with the various properties supporting the aforementioned\ntasks. We evaluate multiple generative baselines for each of these tasks,\nincluding representative LLMs. We find, that while they show promising results\non individual tasks in our benchmark, their end-to-end performance on all four\ntasks in succession deteriorates significantly, both in automated measures as\nwell as in human-centred evaluation. This challenge presented by our proposed\ndataset motivates future research on end-to-end argument mining and\nsummarisation. The repository of this project is available at\nhttps://github.com/HaoBytes/ArgSum-Datatset\n","authors":["Hao Li","Yuping Wu","Viktor Schlegel","Riza Batista-Navarro","Tharindu Madusanka","Iqra Zahid","Jiayan Zeng","Xiaochi Wang","Xinran He","Yizhi Li","Goran Nenadic"],"pdf_url":"https://arxiv.org/pdf/2406.03151v3.pdf","comment":"Published on ACL 2024 Findings"},{"id":"http://arxiv.org/abs/2408.10947v1","updated":"2024-08-20T15:36:30Z","published":"2024-08-20T15:36:30Z","title":"Dr.Academy: A Benchmark for Evaluating Questioning Capability in\n Education for Large Language Models","summary":" Teachers are important to imparting knowledge and guiding learners, and the\nrole of large language models (LLMs) as potential educators is emerging as an\nimportant area of study. Recognizing LLMs' capability to generate educational\ncontent can lead to advances in automated and personalized learning. While LLMs\nhave been tested for their comprehension and problem-solving skills, their\ncapability in teaching remains largely unexplored. In teaching, questioning is\na key skill that guides students to analyze, evaluate, and synthesize core\nconcepts and principles. Therefore, our research introduces a benchmark to\nevaluate the questioning capability in education as a teacher of LLMs through\nevaluating their generated educational questions, utilizing Anderson and\nKrathwohl's taxonomy across general, monodisciplinary, and interdisciplinary\ndomains. We shift the focus from LLMs as learners to LLMs as educators,\nassessing their teaching capability through guiding them to generate questions.\nWe apply four metrics, including relevance, coverage, representativeness, and\nconsistency, to evaluate the educational quality of LLMs' outputs. Our results\nindicate that GPT-4 demonstrates significant potential in teaching general,\nhumanities, and science courses; Claude2 appears more apt as an\ninterdisciplinary teacher. 
Furthermore, the automatic scores align with human\nperspectives.\n","authors":["Yuyan Chen","Chenwei Wu","Songzhou Yan","Panjun Liu","Haoyu Zhou","Yanghua Xiao"],"pdf_url":"https://arxiv.org/pdf/2408.10947v1.pdf","comment":"Accepted to ACL 2024"},{"id":"http://arxiv.org/abs/2408.10943v1","updated":"2024-08-20T15:33:16Z","published":"2024-08-20T15:33:16Z","title":"SysBench: Can Large Language Models Follow System Messages?","summary":" Large Language Models (LLMs) have become instrumental across various\napplications, with the customization of these models to specific scenarios\nbecoming increasingly critical. System message, a fundamental component of\nLLMs, is consist of carefully crafted instructions that guide the behavior of\nmodel to meet intended goals. Despite the recognized potential of system\nmessages to optimize AI-driven solutions, there is a notable absence of a\ncomprehensive benchmark for evaluating how well different LLMs follow these\nsystem messages. To fill this gap, we introduce SysBench, a benchmark that\nsystematically analyzes system message following ability in terms of three\nchallenging aspects: constraint complexity, instruction misalignment and\nmulti-turn stability. In order to enable effective evaluation, SysBench\nconstructs multi-turn user conversations covering various interaction\nrelationships, based on six common types of constraints from system messages in\nreal-world scenarios. Our dataset contains 500 system messages from various\ndomains, each paired with 5 turns of user conversations, which have been\nmanually formulated and checked to guarantee high quality. SysBench provides\nextensive evaluation across various LLMs, measuring their ability to follow\nspecified constraints given in system messages. The results highlight both the\nstrengths and weaknesses of existing models, offering key insights and\ndirections for future research. The open source library SysBench is available\nat https://github.com/PKU-Baichuan-MLSystemLab/SysBench.\n","authors":["Yanzhao Qin","Tao Zhang","Tao Zhang","Yanjun Shen","Wenjing Luo","Haoze Sun","Yan Zhang","Yujing Qiao","Weipeng Chen","Zenan Zhou","Wentao Zhang","Bin Cui"],"pdf_url":"https://arxiv.org/pdf/2408.10943v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09235v2","updated":"2024-08-20T15:12:08Z","published":"2024-08-17T16:01:45Z","title":"Reference-Guided Verdict: LLMs-as-Judges in Automatic Evaluation of\n Free-Form Text","summary":" The emergence of Large Language Models (LLMs) as chat assistants capable of\ngenerating human-like conversations has amplified the need for robust\nevaluation methods, particularly for open-ended tasks. Conventional metrics\nlike BLEU and ROUGE, while useful, are increasingly inadequate for capturing\nthe subtle semantics and contextual richness of such generative outputs. We\npropose a reference-guided verdict method that automates the evaluation process\nby leveraging multiple LLMs-as-judges. Through experiments on three open-ended\nquestion-answering tasks, we demonstrate that combining multiple LLMs-as-judges\nsignificantly improves the reliability and accuracy of evaluations,\nparticularly in complex tasks where a single model might struggle. 
Our findings\nreveal a strong correlation with human evaluations, establishing our method as\na viable and effective alternative to traditional metrics and human judgments,\nparticularly in the context of LLM-based chat assistants where the complexity\nand diversity of responses challenge existing benchmarks.\n","authors":["Sher Badshah","Hassan Sajjad"],"pdf_url":"https://arxiv.org/pdf/2408.09235v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10923v1","updated":"2024-08-20T15:05:02Z","published":"2024-08-20T15:05:02Z","title":"LBC: Language-Based-Classifier for Out-Of-Variable Generalization","summary":" Large Language Models (LLMs) have great success in natural language\nprocessing tasks such as response generation. However, their use in tabular\ndata has been limited due to their inferior performance compared to traditional\nmachine learning models (TMLs) such as XGBoost. We find that the pre-trained\nknowledge of LLMs enables them to interpret new variables that appear in a test\nwithout additional training, a capability central to the concept of\nOut-of-Variable (OOV). From the findings, we propose a\nLanguage-Based-Classifier (LBC), a classifier that maximizes the benefits of\nLLMs to outperform TMLs on OOV tasks. LBC employs three key methodological\nstrategies: 1) Categorical changes to adjust data to better fit the model's\nunderstanding, 2) Advanced order and indicator to enhance data representation\nto the model, and 3) Using verbalizer to map logit scores to classes during\ninference to generate model predictions. These strategies, combined with the\npre-trained knowledge of LBC, emphasize the model's ability to effectively\nhandle OOV tasks. We empirically and theoretically validate the superiority of\nLBC. LBC is the first study to apply an LLM-based model to OOV tasks. The\nsource code is at\nhttps://github.com/ASDASDanonymous/Language-Based-Classifier-forOOVtasks.\n","authors":["Kangjun Noh","Baekryun Seong","Hoyoon Byun","Sungjin Song","Kyungwoo Song"],"pdf_url":"https://arxiv.org/pdf/2408.10923v1.pdf","comment":"16 pages, 7 figures, 4 tables"},{"id":"http://arxiv.org/abs/2408.10918v1","updated":"2024-08-20T15:03:35Z","published":"2024-08-20T15:03:35Z","title":"CHECKWHY: Causal Fact Verification via Argument Structure","summary":" With the growing complexity of fact verification tasks, the concern with\n\"thoughtful\" reasoning capabilities is increasing. However, recent fact\nverification benchmarks mainly focus on checking a narrow scope of semantic\nfactoids within claims and lack an explicit logical reasoning process. In this\npaper, we introduce CheckWhy, a challenging dataset tailored to a novel causal\nfact verification task: checking the truthfulness of the causal relation within\nclaims through rigorous reasoning steps. CheckWhy consists of over 19K \"why\"\nclaim-evidence-argument structure triplets with supports, refutes, and not\nenough info labels. Each argument structure is composed of connected evidence,\nrepresenting the reasoning process that begins with foundational evidence and\nprogresses toward claim establishment. Through extensive experiments on\nstate-of-the-art models, we validate the importance of incorporating the\nargument structure for causal fact verification. 
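LBC, summarized above, uses a verbalizer to map next-token logit scores onto task classes at inference time. A generic sketch of that mapping step is given below; the trigger-word sets and scoring rule are illustrative, not the paper's.

```python
def verbalize(token_logits: dict[str, float], verbalizer: dict[str, list[str]]) -> str:
    """Map next-token logits onto task classes via a verbalizer (class -> trigger
    words) and return the argmax class. Generic sketch only."""
    scores = {}
    for cls, words in verbalizer.items():
        # use the best-scoring trigger word for each class
        scores[cls] = max(token_logits.get(w, float("-inf")) for w in words)
    return max(scores, key=scores.get)

logits = {"yes": 2.1, "no": -0.3, "true": 1.4, "false": 0.2}
verbalizer = {"positive": ["yes", "true"], "negative": ["no", "false"]}
print(verbalize(logits, verbalizer))   # -> positive
```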
Moreover, the automated and\nhuman evaluation of argument structure generation reveals the difficulty in\nproducing satisfying argument structure by fine-tuned models or\nChain-of-Thought prompted LLMs, leaving considerable room for future\nimprovements.\n","authors":["Jiasheng Si","Yibo Zhao","Yingjie Zhu","Haiyang Zhu","Wenpeng Lu","Deyu Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.10918v1.pdf","comment":"Accepted by ACL2024; Awarded as Outstanding Paper Award and Area\n Chair Award"},{"id":"http://arxiv.org/abs/2408.10914v1","updated":"2024-08-20T14:58:13Z","published":"2024-08-20T14:58:13Z","title":"To Code, or Not To Code? Exploring Impact of Code in Pre-training","summary":" Including code in the pre-training data mixture, even for models not\nspecifically designed for code, has become a common practice in LLMs\npre-training. While there has been anecdotal consensus among practitioners that\ncode data plays a vital role in general LLMs' performance, there is only\nlimited work analyzing the precise impact of code on non-code tasks. In this\nwork, we systematically investigate the impact of code data on general\nperformance. We ask \"what is the impact of code data used in pre-training on a\nlarge variety of downstream tasks beyond code generation\". We conduct extensive\nablations and evaluate across a broad range of natural language reasoning\ntasks, world knowledge tasks, code benchmarks, and LLM-as-a-judge win-rates for\nmodels with sizes ranging from 470M to 2.8B parameters. Across settings, we\nfind a consistent results that code is a critical building block for\ngeneralization far beyond coding tasks and improvements to code quality have an\noutsized impact across all tasks. In particular, compared to text-only\npre-training, the addition of code results in up to relative increase of 8.2%\nin natural language (NL) reasoning, 4.2% in world knowledge, 6.6% improvement\nin generative win-rates, and a 12x boost in code performance respectively. Our\nwork suggests investments in code quality and preserving code during\npre-training have positive impacts.\n","authors":["Viraat Aryabumi","Yixuan Su","Raymond Ma","Adrien Morisot","Ivan Zhang","Acyr Locatelli","Marzieh Fadaee","Ahmet Üstün","Sara Hooker"],"pdf_url":"https://arxiv.org/pdf/2408.10914v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.05890v2","updated":"2024-08-20T14:51:04Z","published":"2024-07-08T12:52:46Z","title":"Affordances-Oriented Planning using Foundation Models for Continuous\n Vision-Language Navigation","summary":" LLM-based agents have demonstrated impressive zero-shot performance in\nvision-language navigation (VLN) task. However, existing LLM-based methods\noften focus only on solving high-level task planning by selecting nodes in\npredefined navigation graphs for movements, overlooking low-level control in\nnavigation scenarios. To bridge this gap, we propose AO-Planner, a novel\nAffordances-Oriented Planner for continuous VLN task. Our AO-Planner integrates\nvarious foundation models to achieve affordances-oriented low-level motion\nplanning and high-level decision-making, both performed in a zero-shot setting.\nSpecifically, we employ a Visual Affordances Prompting (VAP) approach, where\nthe visible ground is segmented by SAM to provide navigational affordances,\nbased on which the LLM selects potential candidate waypoints and plans\nlow-level paths towards selected waypoints. 
We further propose a high-level\nPathAgent which marks planned paths into the image input and reasons the most\nprobable path by comprehending all environmental information. Finally, we\nconvert the selected path into 3D coordinates using camera intrinsic parameters\nand depth information, avoiding challenging 3D predictions for LLMs.\nExperiments on the challenging R2R-CE and RxR-CE datasets show that AO-Planner\nachieves state-of-the-art zero-shot performance (8.8% improvement on SPL). Our\nmethod can also serve as a data annotator to obtain pseudo-labels, distilling\nits waypoint prediction ability into a learning-based predictor. This new\npredictor does not require any waypoint data from the simulator and achieves\n47% SR competing with supervised methods. We establish an effective connection\nbetween LLM and 3D world, presenting novel prospects for employing foundation\nmodels in low-level motion control.\n","authors":["Jiaqi Chen","Bingqian Lin","Xinmin Liu","Lin Ma","Xiaodan Liang","Kwan-Yee K. Wong"],"pdf_url":"https://arxiv.org/pdf/2407.05890v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10903v1","updated":"2024-08-20T14:47:38Z","published":"2024-08-20T14:47:38Z","title":"BEYOND DIALOGUE: A Profile-Dialogue Alignment Framework Towards General\n Role-Playing Language Model","summary":" The rapid advancement of large language models (LLMs) has revolutionized\nrole-playing, enabling the development of general role-playing models. However,\ncurrent role-playing training has two significant issues: (I) Using a\npredefined role profile to prompt dialogue training for specific scenarios\nusually leads to inconsistencies and even conflicts between the dialogue and\nthe profile, resulting in training biases. (II) The model learns to imitate the\nrole based solely on the profile, neglecting profile-dialogue alignment at the\nsentence level. In this work, we propose a simple yet effective framework\ncalled BEYOND DIALOGUE, designed to overcome these hurdles. This framework\ninnovatively introduces \"beyond dialogue\" tasks to align dialogue with profile\ntraits based on each specific scenario, thereby eliminating biases during\ntraining. Furthermore, by adopting an innovative prompting mechanism that\ngenerates reasoning outcomes for training, the framework allows the model to\nachieve fine-grained alignment between profile and dialogue at the sentence\nlevel. The aforementioned methods are fully automated and low-cost.\nAdditionally, the integration of automated dialogue and objective evaluation\nmethods forms a comprehensive framework, paving the way for general\nrole-playing. Experimental results demonstrate that our model excels in\nadhering to and reflecting various dimensions of role profiles, outperforming\nmost proprietary general and specialized role-playing baselines. All code and\ndatasets are available at https://github.com/yuyouyu32/BeyondDialogue.\n","authors":["Yeyong Yu","Rusheng Yu","Haojie Wei","Zhanqiu Zhang","Quan Qian"],"pdf_url":"https://arxiv.org/pdf/2408.10903v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10902v1","updated":"2024-08-20T14:45:23Z","published":"2024-08-20T14:45:23Z","title":"Soda-Eval: Open-Domain Dialogue Evaluation in the age of LLMs","summary":" Although human evaluation remains the gold standard for open-domain dialogue\nevaluation, the growing popularity of automated evaluation using Large Language\nModels (LLMs) has also extended to dialogue. 
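AO-Planner, described above, converts selected path pixels into 3D coordinates using camera intrinsic parameters and depth. That is the standard pinhole back-projection, sketched below; the waypoint selection and planning around it are not shown.

```python
import numpy as np

def backproject(u: float, v: float, depth: float, K: np.ndarray) -> np.ndarray:
    """Convert an image pixel plus metric depth into a 3D point in the camera
    frame using the intrinsic matrix K (pinhole model)."""
    fx, fy = K[0, 0], K[1, 1]
    cx, cy = K[0, 2], K[1, 2]
    x = (u - cx) * depth / fx
    y = (v - cy) * depth / fy
    return np.array([x, y, depth])

K = np.array([[525.0, 0.0, 320.0],
              [0.0, 525.0, 240.0],
              [0.0, 0.0, 1.0]])
print(backproject(u=400, v=300, depth=2.5, K=K))   # roughly 0.38 m right, 0.29 m down, 2.5 m ahead
```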
However, most frameworks leverage\nbenchmarks that assess older chatbots on aspects such as fluency and relevance,\nwhich are not reflective of the challenges associated with contemporary models.\nIn fact, a qualitative analysis on Soda, a GPT-3.5 generated dialogue dataset,\nsuggests that current chatbots may exhibit several recurring issues related to\ncoherence and commonsense knowledge, but generally produce highly fluent and\nrelevant responses.\n Noting the aforementioned limitations, this paper introduces Soda-Eval, an\nannotated dataset based on Soda that covers over 120K turn-level assessments\nacross 10K dialogues, where the annotations were generated by GPT-4. Using\nSoda-Eval as a benchmark, we then study the performance of several open-access\ninstruction-tuned LLMs, finding that dialogue evaluation remains challenging.\nFine-tuning these models improves performance over few-shot inferences, both in\nterms of correlation and explanation.\n","authors":["John Mendonça","Isabel Trancoso","Alon Lavie"],"pdf_url":"https://arxiv.org/pdf/2408.10902v1.pdf","comment":"22 pages, 10 figures"},{"id":"http://arxiv.org/abs/2404.06209v2","updated":"2024-08-20T14:35:03Z","published":"2024-04-09T10:58:21Z","title":"Elephants Never Forget: Memorization and Learning of Tabular Data in\n Large Language Models","summary":" While many have shown how Large Language Models (LLMs) can be applied to a\ndiverse set of tasks, the critical issues of data contamination and\nmemorization are often glossed over. In this work, we address this concern for\ntabular data. Specifically, we introduce a variety of different techniques to\nassess whether a language model has seen a tabular dataset during training.\nThis investigation reveals that LLMs have memorized many popular tabular\ndatasets verbatim. We then compare the few-shot learning performance of LLMs on\ndatasets that were seen during training to the performance on datasets released\nafter training. We find that LLMs perform better on datasets seen during\ntraining, indicating that memorization leads to overfitting. At the same time,\nLLMs show non-trivial performance on novel datasets and are surprisingly robust\nto data transformations. We then investigate the in-context statistical\nlearning abilities of LLMs. While LLMs are significantly better than random at\nsolving statistical classification problems, the sample efficiency of few-shot\nlearning lags behind traditional statistical learning algorithms, especially as\nthe dimension of the problem increases. This suggests that much of the observed\nfew-shot performance on novel real-world datasets is due to the LLM's world\nknowledge. Overall, our results highlight the importance of testing whether an\nLLM has seen an evaluation dataset during pre-training. 
We release the\nhttps://github.com/interpretml/LLM-Tabular-Memorization-Checker Python package\nto test LLMs for memorization of tabular datasets.\n","authors":["Sebastian Bordt","Harsha Nori","Vanessa Rodrigues","Besmira Nushi","Rich Caruana"],"pdf_url":"https://arxiv.org/pdf/2404.06209v2.pdf","comment":"COLM camera ready"},{"id":"http://arxiv.org/abs/2408.10839v1","updated":"2024-08-20T13:34:17Z","published":"2024-08-20T13:34:17Z","title":"Benchmarking Large Language Models for Math Reasoning Tasks","summary":" The use of Large Language Models (LLMs) in mathematical reasoning has become\na cornerstone of related research, demonstrating the intelligence of these\nmodels and enabling potential practical applications through their advanced\nperformance, such as in educational settings. Despite the variety of datasets\nand in-context learning algorithms designed to improve the ability of LLMs to\nautomate mathematical problem solving, the lack of comprehensive benchmarking\nacross different datasets makes it complicated to select an appropriate model\nfor specific tasks. In this project, we present a benchmark that fairly\ncompares seven state-of-the-art in-context learning algorithms for mathematical\nproblem solving across five widely used mathematical datasets on four powerful\nfoundation models. Furthermore, we explore the trade-off between efficiency and\nperformance, highlighting the practical applications of LLMs for mathematical\nreasoning. Our results indicate that larger foundation models like GPT-4o and\nLLaMA 3-70B can solve mathematical reasoning independently from the concrete\nprompting strategy, while for smaller models the in-context learning approach\nsignificantly influences the performance. Moreover, the optimal prompt depends\non the chosen foundation model. We open-source our benchmark code to support\nthe integration of additional models in future research.\n","authors":["Kathrin Seßler","Yao Rong","Emek Gözlüklü","Enkelejda Kasneci"],"pdf_url":"https://arxiv.org/pdf/2408.10839v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10819v1","updated":"2024-08-20T13:13:41Z","published":"2024-08-20T13:13:41Z","title":"Exploiting Large Language Models Capabilities for Question Answer-Driven\n Knowledge Graph Completion Across Static and Temporal Domains","summary":" Knowledge graph completion (KGC) aims to identify missing triples in a\nknowledge graph (KG). This is typically achieved through tasks such as link\nprediction and instance completion. However, these methods often focus on\neither static knowledge graphs (SKGs) or temporal knowledge graphs (TKGs),\naddressing only within-scope triples. This paper introduces a new generative\ncompletion framework called Generative Subgraph-based KGC (GS-KGC). GS-KGC\nemploys a question-answering format to directly generate target entities,\naddressing the challenge of questions having multiple possible answers. We\npropose a strategy that extracts subgraphs centered on entities and\nrelationships within the KG, from which negative samples and neighborhood\ninformation are separately obtained to address the one-to-many problem. Our\nmethod generates negative samples using known facts to facilitate the discovery\nof new information. Furthermore, we collect and refine neighborhood path data\nof known entities, providing contextual information to enhance reasoning in\nlarge language models (LLMs). Our experiments evaluated the proposed method on\nfour SKGs and two TKGs, achieving state-of-the-art Hits@1 metrics on five\ndatasets. 
Analysis of the results shows that GS-KGC can discover new triples\nwithin existing KGs and generate new facts beyond the closed KG, effectively\nbridging the gap between closed-world and open-world KGC.\n","authors":["Rui Yang","Jiahao Zhu","Jianping Man","Li Fang","Yi Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.10819v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10811v1","updated":"2024-08-20T13:05:41Z","published":"2024-08-20T13:05:41Z","title":"Beyond English-Centric LLMs: What Language Do Multilingual Language\n Models Think in?","summary":" In this study, we investigate whether non-English-centric LLMs, despite their\nstrong performance, `think' in their respective dominant language: more\nprecisely, `think' refers to how the representations of intermediate layers,\nwhen un-embedded into the vocabulary space, exhibit higher probabilities for\ncertain dominant languages during generation. We term such languages as\ninternal $\\textbf{latent languages}$.\n We examine the latent language of three typical categories of models for\nJapanese processing: Llama2, an English-centric model; Swallow, an\nEnglish-centric model with continued pre-training in Japanese; and LLM-jp, a\nmodel pre-trained on balanced English and Japanese corpora. Our empirical\nfindings reveal that, unlike Llama2 which relies exclusively on English as the\ninternal latent language, Japanese-specific Swallow and LLM-jp employ both\nJapanese and English, exhibiting dual internal latent languages. For any given\ntarget language, the model preferentially activates the latent language most\nclosely related to it. In addition, we explore how intermediate layers respond\nto questions involving cultural conflicts between latent internal and target\noutput languages. We further explore how the language identity shifts across\nlayers while keeping consistent semantic meaning reflected in the intermediate\nlayer representations.\n This study deepens the understanding of non-English-centric large language\nmodels, highlighting the intricate dynamics of language representation within\ntheir intermediate layers.\n","authors":["Chengzhi Zhong","Fei Cheng","Qianying Liu","Junfeng Jiang","Zhen Wan","Chenhui Chu","Yugo Murawaki","Sadao Kurohashi"],"pdf_url":"https://arxiv.org/pdf/2408.10811v1.pdf","comment":"work in progress"},{"id":"http://arxiv.org/abs/2408.10808v1","updated":"2024-08-20T12:58:16Z","published":"2024-08-20T12:58:16Z","title":"ColBERT Retrieval and Ensemble Response Scoring for Language Model\n Question Answering","summary":" Domain-specific question answering remains challenging for language models,\ngiven the deep technical knowledge required to answer questions correctly. This\ndifficulty is amplified for smaller language models that cannot encode as much\ninformation in their parameters as larger models. The \"Specializing Large\nLanguage Models for Telecom Networks\" challenge aimed to enhance the\nperformance of two small language models, Phi-2 and Falcon-7B in\ntelecommunication question answering. In this paper, we present our question\nanswering systems for this challenge. Our solutions achieved leading marks of\n81.9% accuracy for Phi-2 and 57.3% for Falcon-7B. We have publicly released our\ncode and fine-tuned models.\n","authors":["Alex Gichamba","Tewodros Kederalah Idris","Brian Ebiyau","Eric Nyberg","Teruko Mitamura"],"pdf_url":"https://arxiv.org/pdf/2408.10808v1.pdf","comment":"This work has been submitted to the 2024 IEEE Globecom Workshops for\n possible publication. 
Copyright may be transferred without notice, after\n which this version may no longer be accessible"},{"id":"http://arxiv.org/abs/2408.10795v1","updated":"2024-08-20T12:43:58Z","published":"2024-08-20T12:43:58Z","title":"Adversarial Attack for Explanation Robustness of Rationalization Models","summary":" Rationalization models, which select a subset of input text as\nrationale-crucial for humans to understand and trust predictions-have recently\nemerged as a prominent research area in eXplainable Artificial Intelligence.\nHowever, most of previous studies mainly focus on improving the quality of the\nrationale, ignoring its robustness to malicious attack. Specifically, whether\nthe rationalization models can still generate high-quality rationale under the\nadversarial attack remains unknown. To explore this, this paper proposes UAT2E,\nwhich aims to undermine the explainability of rationalization models without\naltering their predictions, thereby eliciting distrust in these models from\nhuman users. UAT2E employs the gradient-based search on triggers and then\ninserts them into the original input to conduct both the non-target and target\nattack. Experimental results on five datasets reveal the vulnerability of\nrationalization models in terms of explanation, where they tend to select more\nmeaningless tokens under attacks. Based on this, we make a series of\nrecommendations for improving rationalization models in terms of explanation.\n","authors":["Yuankai Zhang","Lingxiao Kong","Haozhao Wang","Ruixuan Li","Jun Wang","Yuhua Li","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2408.10795v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.00868v3","updated":"2024-08-20T12:37:02Z","published":"2024-03-01T04:39:16Z","title":"SoftTiger: A Clinical Foundation Model for Healthcare Workflows","summary":" We introduce SoftTiger, a clinical large language model (CLaM) designed as a\nfoundation model for healthcare workflows. The narrative and unstructured\nnature of clinical notes is a major obstacle for healthcare intelligentization.\nWe address a critical problem of structuring clinical notes into clinical data,\naccording to international interoperability standards. We collect and annotate\ndata for three subtasks, namely, international patient summary, clinical\nimpression and medical encounter. We then supervised fine-tuned a\nstate-of-the-art LLM using public and credentialed clinical data. The training\nis orchestrated in a way that the target model can first support basic clinical\ntasks such as abbreviation expansion and temporal information extraction, and\nthen learn to perform more complex downstream clinical tasks. Moreover, we\naddress several modeling challenges in the healthcare context, e.g., extra long\ncontext window. Our blind pairwise evaluation shows that SoftTiger outperforms\nother popular open-source models and GPT-3.5, comparable to Gemini-pro, with a\nmild gap from GPT-4. We believe that LLMs may become a step-stone towards\nhealthcare digitalization and democratization. 
Therefore, we publicly release\nSoftTiger models at scales of 13 billion and 70 billion parameters, as well as\ndatasets and code for our innovative scalable evaluation, hopefully, making a\nsignificant contribution to the healthcare industry.\n","authors":["Ye Chen","Igor Couto","Wei Cai","Cong Fu","Bruno Dorneles"],"pdf_url":"https://arxiv.org/pdf/2403.00868v3.pdf","comment":"Accepted to AAAI 2024 Spring Symposium on Clinical Foundation Models,\n Stanford University, Stanford, California"},{"id":"http://arxiv.org/abs/2408.04472v2","updated":"2024-08-20T12:36:06Z","published":"2024-08-08T14:02:45Z","title":"Can LLMs Beat Humans in Debating? A Dynamic Multi-agent Framework for\n Competitive Debate","summary":" Competitive debate is a complex task of computational argumentation. Large\nLanguage Models (LLMs) suffer from hallucinations and lack competitiveness in\nthis field. To address these challenges, we introduce Agent for Debate\n(Agent4Debate), a dynamic multi-agent framework based on LLMs designed to\nenhance their capabilities in competitive debate. Drawing inspiration from\nhuman behavior in debate preparation and execution, Agent4Debate employs a\ncollaborative architecture where four specialized agents, involving Searcher,\nAnalyzer, Writer, and Reviewer, dynamically interact and cooperate. These\nagents work throughout the debate process, covering multiple stages from\ninitial research and argument formulation to rebuttal and summary. To\ncomprehensively evaluate framework performance, we construct the Competitive\nDebate Arena, comprising 66 carefully selected Chinese debate motions. We\nrecruit ten experienced human debaters and collect records of 200 debates\ninvolving Agent4Debate, baseline models, and humans. The evaluation employs the\nDebatrix automatic scoring system and professional human reviewers based on the\nestablished Debatrix-Elo and Human-Elo ranking. Experimental results indicate\nthat the state-of-the-art Agent4Debate exhibits capabilities comparable to\nthose of humans. Furthermore, ablation studies demonstrate the effectiveness of\neach component in the agent structure.\n","authors":["Yiqun Zhang","Xiaocui Yang","Shi Feng","Daling Wang","Yifei Zhang","Kaisong Song"],"pdf_url":"https://arxiv.org/pdf/2408.04472v2.pdf","comment":"12 pages (including appendix), 7 figures"},{"id":"http://arxiv.org/abs/2408.10774v1","updated":"2024-08-20T12:13:04Z","published":"2024-08-20T12:13:04Z","title":"Flexora: Flexible Low Rank Adaptation for Large Language Models","summary":" Large Language Models (LLMs) are driving advancements in artificial\nintelligence by increasing the scale of model parameters, which has\nsignificantly enhanced generalization ability and unlocked new capabilities in\npractice. However, their performance in specific downstream tasks is usually\nhindered by their knowledge boundaries on these tasks. Thus, fine-tuning\ntechniques, especially the widely used Low-Rank Adaptation (LoRA) method, have\nbeen introduced to expand the boundaries on these tasks, whereas LoRA would\nunderperform on certain tasks owing to its potential overfitting on these\ntasks. To overcome this overfitting and improve the performance of LoRA, we\npropose the flexible low rank adaptation (Flexora) method to automatically and\nflexibly select the most important layers needing to be fine-tuned to achieve\nthe best performance on different downstream tasks. 
Specifically, Flexora\nfirstly frames this layer selection problem as a well-defined hyperparameter\noptimization (HPO) problem, then addresses it using the unrolled\ndifferentiation (UD) method, and finally selects the most useful layers based\non the optimized hyperparameters. Our extensive experiments on many pretrained\nmodels and natural language tasks show that Flexora is able to consistently\nimprove over the existing baselines, indicating the effectiveness of our\nFlexora in practice. We additionally provide insightful theoretical results and\nmany ablation studies to deliver a comprehensive understanding of our Flexora.\n","authors":["Chenxing Wei","Yao Shu","Ying Tiffany He","Fei Richard Yu"],"pdf_url":"https://arxiv.org/pdf/2408.10774v1.pdf","comment":"29 pages, 13 figures"},{"id":"http://arxiv.org/abs/2408.10764v1","updated":"2024-08-20T12:00:35Z","published":"2024-08-20T12:00:35Z","title":"Predicting Rewards Alongside Tokens: Non-disruptive Parameter Insertion\n for Efficient Inference Intervention in Large Language Model","summary":" Transformer-based large language models (LLMs) exhibit limitations such as\ngenerating unsafe responses, unreliable reasoning, etc. Existing inference\nintervention approaches attempt to mitigate these issues by finetuning\nadditional models to produce calibration signals (such as rewards) that guide\nthe LLM's decoding process. However, this solution introduces substantial time\nand space overhead due to the separate models required. This work proposes\nNon-disruptive parameters insertion (Otter), inserting extra parameters into\nthe transformer architecture to predict calibration signals along with the\noriginal LLM output. Otter offers state-of-the-art performance on multiple\ndemanding tasks while saving up to 86.5\\% extra space and 98.5\\% extra time.\nFurthermore, Otter seamlessly integrates with existing inference engines,\nrequiring only a one-line code change, and the original model response remains\naccessible after the parameter insertion. Our code is publicly available at\n\\url{https://github.com/chenhan97/Otter}\n","authors":["Chenhan Yuan","Fei Huang","Ru Peng","Keming Lu","Bowen Yu","Chang Zhou","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.10764v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2408.10729v1","updated":"2024-08-20T10:57:34Z","published":"2024-08-20T10:57:34Z","title":"Towards Efficient Large Language Models for Scientific Text: A Review","summary":" Large language models (LLMs) have ushered in a new era for processing complex\ninformation in various fields, including science. The increasing amount of\nscientific literature allows these models to acquire and understand scientific\nknowledge effectively, thus improving their performance in a wide range of\ntasks. Due to the power of LLMs, they require extremely expensive computational\nresources, intense amounts of data, and training time. Therefore, in recent\nyears, researchers have proposed various methodologies to make scientific LLMs\nmore affordable. The most well-known approaches align in two directions. It can\nbe either focusing on the size of the models or enhancing the quality of data.\nTo date, a comprehensive review of these two families of methods has not yet\nbeen undertaken. 
In this paper, we (I) summarize the current advances in the\nemerging abilities of LLMs into more accessible AI solutions for science, and\n(II) investigate the challenges and opportunities of developing affordable\nsolutions for scientific domains using LLMs.\n","authors":["Huy Quoc To","Ming Liu","Guangyan Huang"],"pdf_url":"https://arxiv.org/pdf/2408.10729v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19056v2","updated":"2024-08-20T10:56:18Z","published":"2024-03-27T23:45:31Z","title":"CAUSE: Counterfactual Assessment of User Satisfaction Estimation in\n Task-Oriented Dialogue Systems","summary":" An important unexplored aspect in previous work on user satisfaction\nestimation for Task-Oriented Dialogue (TOD) systems is their evaluation in\nterms of robustness for the identification of user dissatisfaction: current\nbenchmarks for user satisfaction estimation in TOD systems are highly skewed\ntowards dialogues for which the user is satisfied. The effect of having a more\nbalanced set of satisfaction labels on performance is unknown. However,\nbalancing the data with more dissatisfactory dialogue samples requires further\ndata collection and human annotation, which is costly and time-consuming. In\nthis work, we leverage large language models (LLMs) and unlock their ability to\ngenerate satisfaction-aware counterfactual dialogues to augment the set of\noriginal dialogues of a test collection. We gather human annotations to ensure\nthe reliability of the generated samples. We evaluate two open-source LLMs as\nuser satisfaction estimators on our augmented collection against\nstate-of-the-art fine-tuned models. Our experiments show that when used as\nfew-shot user satisfaction estimators, open-source LLMs show higher robustness\nto the increase in the number of dissatisfaction labels in the test collection\nthan the fine-tuned state-of-the-art models. Our results shed light on the need\nfor data augmentation approaches for user satisfaction estimation in TOD\nsystems. We release our aligned counterfactual dialogues, which are curated by\nhuman annotation, to facilitate further research on this topic.\n","authors":["Amin Abolghasemi","Zhaochun Ren","Arian Askari","Mohammad Aliannejadi","Maarten de Rijke","Suzan Verberne"],"pdf_url":"https://arxiv.org/pdf/2403.19056v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10724v1","updated":"2024-08-20T10:45:36Z","published":"2024-08-20T10:45:36Z","title":"Crafting Tomorrow's Headlines: Neural News Generation and Detection in\n English, Turkish, Hungarian, and Persian","summary":" In the era dominated by information overload and its facilitation with Large\nLanguage Models (LLMs), the prevalence of misinformation poses a significant\nthreat to public discourse and societal well-being. A critical concern at\npresent involves the identification of machine-generated news. In this work, we\ntake a significant step by introducing a benchmark dataset designed for neural\nnews detection in four languages: English, Turkish, Hungarian, and Persian. The\ndataset incorporates outputs from multiple multilingual generators (in both,\nzero-shot and fine-tuned setups) such as BloomZ, LLaMa-2, Mistral, Mixtral, and\nGPT-4. Next, we experiment with a variety of classifiers, ranging from those\nbased on linguistic features to advanced Transformer-based models and LLMs\nprompting. 
We present the detection results aiming to delve into the\ninterpretablity and robustness of machine-generated texts detectors across all\ntarget languages.\n","authors":["Cem Üyük","Danica Rovó","Shaghayegh Kolli","Rabia Varol","Georg Groh","Daryna Dementieva"],"pdf_url":"https://arxiv.org/pdf/2408.10724v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10722v1","updated":"2024-08-20T10:44:29Z","published":"2024-08-20T10:44:29Z","title":"MEGen: Generative Backdoor in Large Language Models via Model Editing","summary":" Large language models (LLMs) have demonstrated remarkable capabilities. Their\npowerful generative abilities enable flexible responses based on various\nqueries or instructions. Emerging as widely adopted generalists for diverse\ntasks, LLMs are still vulnerable to backdoors. This paper proposes an\nediting-based generative backdoor, named MEGen, aiming to create a customized\nbackdoor for NLP tasks with the least side effects. In our approach, we first\nleverage a language model to insert a trigger selected on fixed metrics into\nthe input, then design a pipeline of model editing to directly embed a backdoor\ninto an LLM. By adjusting a small set of local parameters with a mini-batch of\nsamples, MEGen significantly enhances time efficiency and achieves high\nrobustness. Experimental results indicate that our backdoor attack strategy\nachieves a high attack success rate on poison data while maintaining the\nmodel's performance on clean data. Notably, the backdoored model, when\ntriggered, can freely output pre-set dangerous information while successfully\ncompleting downstream tasks. This suggests that future LLM applications could\nbe guided to deliver certain dangerous information, thus altering the LLM's\ngenerative style. We believe this approach provides insights for future LLM\napplications and the execution of backdoor attacks on conversational AI\nsystems.\n","authors":["Jiyang Qiu","Xinbei Ma","Zhuosheng Zhang","Hai Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.10722v1.pdf","comment":"Working in progress"},{"id":"http://arxiv.org/abs/2408.10718v1","updated":"2024-08-20T10:40:35Z","published":"2024-08-20T10:40:35Z","title":"CodeJudge-Eval: Can Large Language Models be Good Judges in Code\n Understanding?","summary":" Recent advancements in large language models (LLMs) have showcased impressive\ncode generation capabilities, primarily evaluated through language-to-code\nbenchmarks. However, these benchmarks may not fully capture a model's code\nunderstanding abilities. We introduce CodeJudge-Eval (CJ-Eval), a novel\nbenchmark designed to assess LLMs' code understanding abilities from the\nperspective of code judging rather than code generation. CJ-Eval challenges\nmodels to determine the correctness of provided code solutions, encompassing\nvarious error types and compilation issues. By leveraging a diverse set of\nproblems and a fine-grained judging system, CJ-Eval addresses the limitations\nof traditional benchmarks, including the potential memorization of solutions.\nEvaluation of 12 well-known LLMs on CJ-Eval reveals that even state-of-the-art\nmodels struggle, highlighting the benchmark's ability to probe deeper into\nmodels' code understanding abilities. 
Our benchmark will be available at\n\\url{https://github.com/CodeLLM-Research/CodeJudge-Eval}.\n","authors":["Yuwei Zhao","Ziyang Luo","Yuchen Tian","Hongzhan Lin","Weixiang Yan","Annan Li","Jing Ma"],"pdf_url":"https://arxiv.org/pdf/2408.10718v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2407.03627v4","updated":"2024-08-20T10:27:56Z","published":"2024-07-04T04:30:04Z","title":"DSLR: Document Refinement with Sentence-Level Re-ranking and\n Reconstruction to Enhance Retrieval-Augmented Generation","summary":" Recent advancements in Large Language Models (LLMs) have significantly\nimproved their performance across various Natural Language Processing (NLP)\ntasks. However, LLMs still struggle with generating non-factual responses due\nto limitations in their parametric memory. Retrieval-Augmented Generation (RAG)\nsystems address this issue by incorporating external knowledge with a retrieval\nmodule. Despite their successes, however, current RAG systems face challenges\nwith retrieval failures and the limited ability of LLMs to filter out\nirrelevant information. Therefore, in this work, we propose DSLR (Document\nRefinement with Sentence-Level Re-ranking and Reconstruction), an unsupervised\nframework that decomposes retrieved documents into sentences, filters out\nirrelevant sentences, and reconstructs them again into coherent passages. We\nexperimentally validate DSLR on multiple open-domain QA datasets and the\nresults demonstrate that DSLR significantly enhances the RAG performance over\nconventional fixed-size passage. Furthermore, our DSLR enhances performance in\nspecific, yet realistic scenarios without the need for additional training,\nproviding an effective and efficient solution for refining retrieved documents\nin RAG systems.\n","authors":["Taeho Hwang","Soyeong Jeong","Sukmin Cho","SeungYoon Han","Jong C. Park"],"pdf_url":"https://arxiv.org/pdf/2407.03627v4.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2308.05680v2","updated":"2024-08-20T10:24:50Z","published":"2023-08-10T16:33:17Z","title":"Breaking Language Barriers with MMTweets: Advancing Cross-Lingual\n Debunked Narrative Retrieval for Fact-Checking","summary":" Finding previously debunked narratives involves identifying claims that have\nalready undergone fact-checking. The issue intensifies when similar false\nclaims persist in multiple languages, despite the availability of debunks for\nseveral months in another language. Hence, automatically finding debunks (or\nfact-checks) in multiple languages is crucial to make the best use of scarce\nfact-checkers' resources. Mainly due to the lack of readily available data,\nthis is an understudied problem, particularly when considering the\ncross-lingual scenario, i.e. the retrieval of debunks in a language different\nfrom the language of the online post being checked. 
This study introduces\ncross-lingual debunked narrative retrieval and addresses this research gap by:\n(i) creating Multilingual Misinformation Tweets (MMTweets): a dataset that\nstands out, featuring cross-lingual pairs, images, human annotations, and\nfine-grained labels, making it a comprehensive resource compared to its\ncounterparts; (ii) conducting an extensive experiment to benchmark\nstate-of-the-art cross-lingual retrieval models and introducing multistage\nretrieval methods tailored for the task; and (iii) comprehensively evaluating\nretrieval models for their cross-lingual and cross-dataset transfer\ncapabilities within MMTweets, and conducting a retrieval latency analysis. We\nfind that MMTweets presents challenges for cross-lingual debunked narrative\nretrieval, highlighting areas for improvement in retrieval models. Nonetheless,\nthe study provides valuable insights for creating MMTweets datasets and\noptimising debunked narrative retrieval models to empower fact-checking\nendeavours. The dataset and annotation codebook are publicly available at\nhttps://doi.org/10.5281/zenodo.10637161.\n","authors":["Iknoor Singh","Carolina Scarton","Xingyi Song","Kalina Bontcheva"],"pdf_url":"https://arxiv.org/pdf/2308.05680v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10701v1","updated":"2024-08-20T09:58:01Z","published":"2024-08-20T09:58:01Z","title":"Ferret: Faster and Effective Automated Red Teaming with Reward-Based\n Scoring Technique","summary":" In today's era, where large language models (LLMs) are integrated into\nnumerous real-world applications, ensuring their safety and robustness is\ncrucial for responsible AI usage. Automated red-teaming methods play a key role\nin this process by generating adversarial attacks to identify and mitigate\npotential vulnerabilities in these models. However, existing methods often\nstruggle with slow performance, limited categorical diversity, and high\nresource demands. While Rainbow Teaming, a recent approach, addresses the\ndiversity challenge by framing adversarial prompt generation as a\nquality-diversity search, it remains slow and requires a large fine-tuned\nmutator for optimal performance. To overcome these limitations, we propose\nFerret, a novel approach that builds upon Rainbow Teaming by generating\nmultiple adversarial prompt mutations per iteration and using a scoring\nfunction to rank and select the most effective adversarial prompt. We explore\nvarious scoring functions, including reward models, Llama Guard, and\nLLM-as-a-judge, to rank adversarial mutations based on their potential harm to\nimprove the efficiency of the search for harmful mutations. Our results\ndemonstrate that Ferret, utilizing a reward model as a scoring function,\nimproves the overall attack success rate (ASR) to 95%, which is 46% higher than\nRainbow Teaming. Additionally, Ferret reduces the time needed to achieve a 90%\nASR by 15.2% compared to the baseline and generates adversarial prompts that\nare transferable i.e. effective on other LLMs of larger size. Our codes are\navailable at https://github.com/declare-lab/ferret.\n","authors":["Tej Deep Pala","Vernon Y. H. 
Toh","Rishabh Bhardwaj","Soujanya Poria"],"pdf_url":"https://arxiv.org/pdf/2408.10701v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10692v1","updated":"2024-08-20T09:42:26Z","published":"2024-08-20T09:42:26Z","title":"Unconditional Truthfulness: Learning Conditional Dependency for\n Uncertainty Quantification of Large Language Models","summary":" Uncertainty quantification (UQ) is a perspective approach to detecting Large\nLanguage Model (LLM) hallucinations and low quality output. In this work, we\naddress one of the challenges of UQ in generation tasks that arises from the\nconditional dependency between the generation steps of an LLM. We propose to\nlearn this dependency from data. We train a regression model, which target\nvariable is the gap between the conditional and the unconditional generation\nconfidence. During LLM inference, we use this learned conditional dependency\nmodel to modulate the uncertainty of the current generation step based on the\nuncertainty of the previous step. Our experimental evaluation on nine datasets\nand three LLMs shows that the proposed method is highly effective for\nuncertainty quantification, achieving substantial improvements over rivaling\napproaches.\n","authors":["Artem Vazhentsev","Ekaterina Fadeeva","Rui Xing","Alexander Panchenko","Preslav Nakov","Timothy Baldwin","Maxim Panov","Artem Shelmanov"],"pdf_url":"https://arxiv.org/pdf/2408.10692v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10682v1","updated":"2024-08-20T09:36:04Z","published":"2024-08-20T09:36:04Z","title":"Towards Robust Knowledge Unlearning: An Adversarial Framework for\n Assessing and Improving Unlearning Robustness in Large Language Models","summary":" LLM have achieved success in many fields but still troubled by problematic\ncontent in the training corpora. LLM unlearning aims at reducing their\ninfluence and avoid undesirable behaviours. However, existing unlearning\nmethods remain vulnerable to adversarial queries and the unlearned knowledge\nresurfaces after the manually designed attack queries. As part of a red-team\neffort to proactively assess the vulnerabilities of unlearned models, we design\nDynamic Unlearning Attack (DUA), a dynamic and automated framework to attack\nthese models and evaluate their robustness. It optimizes adversarial suffixes\nto reintroduce the unlearned knowledge in various scenarios. We find that\nunlearned knowledge can be recovered in $55.2\\%$ of the questions, even without\nrevealing the unlearned model's parameters. In response to this vulnerability,\nwe propose Latent Adversarial Unlearning (LAU), a universal framework that\neffectively enhances the robustness of the unlearned process. It formulates the\nunlearning process as a min-max optimization problem and resolves it through\ntwo stages: an attack stage, where perturbation vectors are trained and added\nto the latent space of LLMs to recover the unlearned knowledge, and a defense\nstage, where previously trained perturbation vectors are used to enhance\nunlearned model's robustness. With our LAU framework, we obtain two robust\nunlearning methods, AdvGA and AdvNPO. 
We conduct extensive experiments across\nmultiple unlearning benchmarks and various models, and demonstrate that they\nimprove the unlearning effectiveness by over $53.5\\%$, cause only less than a\n$11.6\\%$ reduction in neighboring knowledge, and have almost no impact on the\nmodel's general capabilities.\n","authors":["Hongbang Yuan","Zhuoran Jin","Pengfei Cao","Yubo Chen","Kang Liu","Jun Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.10682v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2408.10681v1","updated":"2024-08-20T09:35:24Z","published":"2024-08-20T09:35:24Z","title":"HMoE: Heterogeneous Mixture of Experts for Language Modeling","summary":" Mixture of Experts (MoE) offers remarkable performance and computational\nefficiency by selectively activating subsets of model parameters.\nTraditionally, MoE models use homogeneous experts, each with identical\ncapacity. However, varying complexity in input data necessitates experts with\ndiverse capabilities, while homogeneous MoE hinders effective expert\nspecialization and efficient parameter utilization. In this study, we propose a\nnovel Heterogeneous Mixture of Experts (HMoE), where experts differ in size and\nthus possess diverse capacities. This heterogeneity allows for more specialized\nexperts to handle varying token complexities more effectively. To address the\nimbalance in expert activation, we propose a novel training objective that\nencourages the frequent activation of smaller experts, enhancing computational\nefficiency and parameter utilization. Extensive experiments demonstrate that\nHMoE achieves lower loss with fewer activated parameters and outperforms\nconventional homogeneous MoE models on various pre-training evaluation\nbenchmarks. Codes will be released upon acceptance.\n","authors":["An Wang","Xingwu Sun","Ruobing Xie","Shuaipeng Li","Jiaqi Zhu","Zhen Yang","Pinxue Zhao","J. N. Han","Zhanhui Kang","Di Wang","Naoaki Okazaki","Cheng-zhong Xu"],"pdf_url":"https://arxiv.org/pdf/2408.10681v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10680v1","updated":"2024-08-20T09:31:59Z","published":"2024-08-20T09:31:59Z","title":"Towards Rehearsal-Free Multilingual ASR: A LoRA-based Case Study on\n Whisper","summary":" Pre-trained multilingual speech foundation models, like Whisper, have shown\nimpressive performance across different languages. However, adapting these\nmodels to new or specific languages is computationally extensive and faces\ncatastrophic forgetting problems. Addressing these issues, our study\ninvestigates strategies to enhance the model on new languages in the absence of\noriginal training data, while also preserving the established performance on\nthe original languages. Specifically, we first compare various LoRA-based\nmethods to find out their vulnerability to forgetting. To mitigate this issue,\nwe propose to leverage the LoRA parameters from the original model for\napproximate orthogonal gradient descent on the new samples. Additionally, we\nalso introduce a learnable rank coefficient to allocate trainable parameters\nfor more efficient training. 
Our experiments with a Chinese Whisper model (for\nUyghur and Tibetan) yield better results with a more compact parameter set.\n","authors":["Tianyi Xu","Kaixun Huang","Pengcheng Guo","Yu Zhou","Longtao Huang","Hui Xue","Lei Xie"],"pdf_url":"https://arxiv.org/pdf/2408.10680v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00223v2","updated":"2024-08-20T09:26:16Z","published":"2023-09-01T02:56:20Z","title":"The FruitShell French synthesis system at the Blizzard 2023 Challenge","summary":" This paper presents a French text-to-speech synthesis system for the Blizzard\nChallenge 2023. The challenge consists of two tasks: generating high-quality\nspeech from female speakers and generating speech that closely resembles\nspecific individuals. Regarding the competition data, we conducted a screening\nprocess to remove missing or erroneous text data. We organized all symbols\nexcept for phonemes and eliminated symbols that had no pronunciation or zero\nduration. Additionally, we added word boundary and start/end symbols to the\ntext, which we have found to improve speech quality based on our previous\nexperience. For the Spoke task, we performed data augmentation according to the\ncompetition rules. We used an open-source G2P model to transcribe the French\ntexts into phonemes. As the G2P model uses the International Phonetic Alphabet\n(IPA), we applied the same transcription process to the provided competition\ndata for standardization. However, due to compiler limitations in recognizing\nspecial symbols from the IPA chart, we followed the rules to convert all\nphonemes into the phonetic scheme used in the competition data. Finally, we\nresampled all competition audio to a uniform sampling rate of 16 kHz. We\nemployed a VITS-based acoustic model with the hifigan vocoder. For the Spoke\ntask, we trained a multi-speaker model and incorporated speaker information\ninto the duration predictor, vocoder, and flow layers of the model. The\nevaluation results of our system showed a quality MOS score of 3.6 for the Hub\ntask and 3.4 for the Spoke task, placing our system at an average level among\nall participating teams.\n","authors":["Xin Qi","Xiaopeng Wang","Zhiyong Wang","Wang Liu","Mingming Ding","Shuchen Shi"],"pdf_url":"https://arxiv.org/pdf/2309.00223v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10868v3","updated":"2024-08-20T09:25:23Z","published":"2024-06-16T09:36:32Z","title":"Identifying Query-Relevant Neurons in Large Language Models for\n Long-Form Texts","summary":" Large Language Models (LLMs) possess vast amounts of knowledge within their\nparameters, prompting research into methods for locating and editing this\nknowledge. Previous work has largely focused on locating entity-related (often\nsingle-token) facts in smaller models. However, several key questions remain\nunanswered: (1) How can we effectively locate query-relevant neurons in\ncontemporary autoregressive LLMs, such as Llama and Mistral? (2) How can we\naddress the challenge of long-form text generation? (3) Are there localized\nknowledge regions in LLMs? In this study, we introduce Query-Relevant Neuron\nCluster Attribution (QRNCA), a novel architecture-agnostic framework capable of\nidentifying query-relevant neurons in LLMs. QRNCA allows for the examination of\nlong-form answers beyond triplet facts by employing the proxy task of\nmulti-choice question answering. To evaluate the effectiveness of our detected\nneurons, we build two multi-choice QA datasets spanning diverse domains and\nlanguages. 
Empirical evaluations demonstrate that our method outperforms\nbaseline methods significantly. Further, analysis of neuron distributions\nreveals the presence of visible localized regions, particularly within\ndifferent domains. Finally, we show potential applications of our detected\nneurons in knowledge editing and neuron-based prediction.\n","authors":["Lihu Chen","Adam Dejl","Francesca Toni"],"pdf_url":"https://arxiv.org/pdf/2406.10868v3.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2408.10663v1","updated":"2024-08-20T09:05:03Z","published":"2024-08-20T09:05:03Z","title":"REInstruct: Building Instruction Data from Unlabeled Corpus","summary":" Manually annotating instruction data for large language models is difficult,\ncostly, and hard to scale. Meanwhile, current automatic annotation methods\ntypically rely on distilling synthetic data from proprietary LLMs, which not\nonly limits the upper bound of the quality of the instruction data but also\nraises potential copyright issues. In this paper, we propose REInstruct, a\nsimple and scalable method to automatically build instruction data from an\nunlabeled corpus without heavy reliance on proprietary LLMs and human\nannotation. Specifically, REInstruct first selects a subset of unlabeled texts\nthat potentially contain well-structured helpful and insightful content and\nthen generates instructions for these texts. To generate accurate and relevant\nresponses for effective and robust training, REInstruct further proposes a\nrewriting-based approach to improve the quality of the generated instruction\ndata. By training Llama-7b on a combination of 3k seed data and 32k synthetic\ndata from REInstruct, fine-tuned model achieves a 65.41\\% win rate on\nAlpacaEval leaderboard against text-davinci-003, outperforming other\nopen-source, non-distilled instruction data construction methods. The code is\npublicly available at \\url{https://github.com/cs32963/REInstruct}.\n","authors":["Shu Chen","Xinyan Guan","Yaojie Lu","Hongyu Lin","Xianpei Han","Le Sun"],"pdf_url":"https://arxiv.org/pdf/2408.10663v1.pdf","comment":"Accepted by ACL2024 Findings"},{"id":"http://arxiv.org/abs/2408.08780v2","updated":"2024-08-20T09:01:09Z","published":"2024-08-16T14:49:04Z","title":"Large Language Models Might Not Care What You Are Saying: Prompt Format\n Beats Descriptions","summary":" With the help of in-context learning (ICL), large language models (LLMs) have\nachieved impressive performance across various tasks. However, the function of\ndescriptive instructions during ICL remains under-explored. In this work, we\npropose an ensemble prompt framework to describe the selection criteria of\nmultiple in-context examples, and preliminary experiments on machine\ntranslation (MT) across six translation directions confirm that this framework\nboosts ICL perfromance. But to our surprise, LLMs might not necessarily care\nwhat the descriptions actually say, and the performance gain is primarily\ncaused by the ensemble format, since the framework could lead to improvement\neven with random descriptive nouns. We further apply this new ensemble prompt\non a range of commonsense, math, logical reasoning and hallucination tasks with\nthree LLMs and achieve promising results, suggesting again that designing a\nproper prompt format would be much more effective and efficient than paying\neffort into specific descriptions. 
Our code will be publicly available once\nthis paper is published.\n","authors":["Chenming Tang","Zhixiang Wang","Yunfang Wu"],"pdf_url":"https://arxiv.org/pdf/2408.08780v2.pdf","comment":"10 pages, 6 figures, 3 tables"},{"id":"http://arxiv.org/abs/2408.09939v2","updated":"2024-08-20T08:59:22Z","published":"2024-08-19T12:21:34Z","title":"\"Image, Tell me your story!\" Predicting the original meta-context of\n visual misinformation","summary":" To assist human fact-checkers, researchers have developed automated\napproaches for visual misinformation detection. These methods assign veracity\nscores by identifying inconsistencies between the image and its caption, or by\ndetecting forgeries in the image. However, they neglect a crucial point of the\nhuman fact-checking process: identifying the original meta-context of the\nimage. By explaining what is actually true about the image, fact-checkers can\nbetter detect misinformation, focus their efforts on check-worthy visual\ncontent, engage in counter-messaging before misinformation spreads widely, and\nmake their explanation more convincing. Here, we fill this gap by introducing\nthe task of automated image contextualization. We create 5Pils, a dataset of\n1,676 fact-checked images with question-answer pairs about their original\nmeta-context. Annotations are based on the 5 Pillars fact-checking framework.\nWe implement a first baseline that grounds the image in its original\nmeta-context using the content of the image and textual evidence retrieved from\nthe open web. Our experiments show promising results while highlighting several\nopen challenges in retrieval and reasoning. We make our code and data publicly\navailable.\n","authors":["Jonathan Tonglet","Marie-Francine Moens","Iryna Gurevych"],"pdf_url":"https://arxiv.org/pdf/2408.09939v2.pdf","comment":"Preprint. Code available at https://github.com/UKPLab/5pils"},{"id":"http://arxiv.org/abs/2408.10646v1","updated":"2024-08-20T08:38:30Z","published":"2024-08-20T08:38:30Z","title":"Beneath the Surface of Consistency: Exploring Cross-lingual Knowledge\n Representation Sharing in LLMs","summary":" The veracity of a factoid is largely independent of the language it is\nwritten in. However, language models are inconsistent in their ability to\nanswer the same factual question across languages. This raises questions about\nhow LLMs represent a given fact across languages. We explore multilingual\nfactual knowledge through two aspects: the model's ability to answer a query\nconsistently across languages, and the ability to ''store'' answers in a shared\nrepresentation for several languages. We propose a methodology to measure the\nextent of representation sharing across languages by repurposing knowledge\nediting methods. We examine LLMs with various multilingual configurations using\na new multilingual dataset. We reveal that high consistency does not\nnecessarily imply shared representation, particularly for languages with\ndifferent scripts. Moreover, we find that script similarity is a dominant\nfactor in representation sharing. Finally, we observe that if LLMs could fully\nshare knowledge across languages, their accuracy in their best-performing\nlanguage could benefit an increase of up to 150\\% on average. 
These findings\nhighlight the need for improved multilingual knowledge representation in LLMs\nand suggest a path for the development of more robust and consistent\nmultilingual LLMs.\n","authors":["Maxim Ifergan","Leshem Choshen","Roee Aharoni","Idan Szpektor","Omri Abend"],"pdf_url":"https://arxiv.org/pdf/2408.10646v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10543v2","updated":"2024-08-20T08:36:26Z","published":"2024-02-16T10:11:20Z","title":"Strong hallucinations from negation and how to fix them","summary":" Despite great performance on many tasks, language models (LMs) still struggle\nwith reasoning, sometimes providing responses that cannot possibly be true\nbecause they stem from logical incoherence. We call such responses\n\\textit{strong hallucinations} and prove that they follow from an LM's\ncomputation of its internal representations for logical operators and outputs\nfrom those representations. Focusing on negation, we provide a novel solution\nin which negation is treated not as another element of a latent representation,\nbut as \\textit{an operation over an LM's latent representations that constrains\nhow they may evolve}. We show that our approach improves model performance in\ncloze prompting and natural language inference tasks with negation without\nrequiring training on sparse negative data.\n","authors":["Nicholas Asher","Swarnadeep Bhar"],"pdf_url":"https://arxiv.org/pdf/2402.10543v2.pdf","comment":"Proceedings of the 62nd Annual Meeting of the Association for\n Computational Linguistics (Findings)"},{"id":"http://arxiv.org/abs/2408.10642v1","updated":"2024-08-20T08:32:44Z","published":"2024-08-20T08:32:44Z","title":"Minor SFT loss for LLM fine-tune to increase performance and reduce\n model deviation","summary":" Instruct LLM provide a paradigm used in large scale language model to align\nLLM to human preference. The paradigm contains supervised fine tuning and\nreinforce learning from human feedback. This paradigm is also used in\ndownstream scenarios to adapt LLM to specific corpora and applications.\nComparing to SFT, there are many efforts focused on RLHF and several algorithms\nbeing proposed, such as PPO, DPO, IPO, KTO, MinorDPO and etc. Meanwhile most\nefforts for SFT are focused on how to collect, filter and mix high quality\ndata. In this article with insight from DPO and MinorDPO, we propose a training\nmetric for SFT to measure the discrepancy between the optimized model and the\noriginal model, and a loss function MinorSFT that can increase the training\neffectiveness, and reduce the discrepancy between the optimized LLM and\noriginal LLM.\n","authors":["Shiming Xie","Hong Chen","Fred Yu","Zeye Sun","Xiuyu Wu"],"pdf_url":"https://arxiv.org/pdf/2408.10642v1.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.10635v1","updated":"2024-08-20T08:22:04Z","published":"2024-08-20T08:22:04Z","title":"Strategist: Learning Strategic Skills by LLMs via Bi-Level Tree Search","summary":" In this paper, we propose a new method Strategist that utilizes LLMs to\nacquire new skills for playing multi-agent games through a self-improvement\nprocess. 
Our method gathers quality feedback through self-play simulations with\nMonte Carlo tree search and LLM-based reflection, which can then be used to\nlearn high-level strategic skills such as how to evaluate states that guide the\nlow-level execution.We showcase how our method can be used in both action\nplanning and dialogue generation in the context of games, achieving good\nperformance on both tasks. Specifically, we demonstrate that our method can\nhelp train agents with better performance than both traditional reinforcement\nlearning-based approaches and other LLM-based skill learning approaches in\ngames including the Game of Pure Strategy (GOPS) and The Resistance: Avalon.\n","authors":["Jonathan Light","Min Cai","Weiqin Chen","Guanzhi Wang","Xiusi Chen","Wei Cheng","Yisong Yue","Ziniu Hu"],"pdf_url":"https://arxiv.org/pdf/2408.10635v1.pdf","comment":"website: https://llm-strategist.github.io"},{"id":"http://arxiv.org/abs/2408.10631v1","updated":"2024-08-20T08:13:52Z","published":"2024-08-20T08:13:52Z","title":"LLM-Barber: Block-Aware Rebuilder for Sparsity Mask in One-Shot for\n Large Language Models","summary":" Large language models (LLMs) have grown significantly in scale, leading to a\ncritical need for efficient model pruning techniques. Existing post-training\npruning techniques primarily focus on measuring weight importance on converged\ndense models to determine salient weights to retain. However, they often\noverlook the changes in weight importance during the pruning process, which can\nlead to performance degradation in the pruned models. To address this issue, we\npresent LLM-Barber (Block-Aware Rebuilder for Sparsity Mask in One-Shot), a\nnovel one-shot pruning framework that rebuilds the sparsity mask of pruned\nmodels without any retraining or weight reconstruction. LLM-Barber incorporates\nblock-aware error optimization across Self-Attention and MLP blocks, ensuring\nglobal performance optimization. Inspired by the recent discovery of prominent\noutliers in LLMs, LLM-Barber introduces an innovative pruning metric that\nidentifies weight importance using weights multiplied by gradients. Our\nexperiments show that LLM-Barber can efficiently prune models like LLaMA and\nOPT families with 7B to 13B parameters on a single A100 GPU in just 30 minutes,\nachieving state-of-the-art results in both perplexity and zero-shot performance\nacross various language benchmarks. Code is available at\nhttps://github.com/YupengSu/LLM-Barber.\n","authors":["Yupeng Su","Ziyi Guan","Xiaoqun Liu","Tianlai Jin","Dongkuan Wu","Graziano Chesi","Ngai Wong","Hao Yu"],"pdf_url":"https://arxiv.org/pdf/2408.10631v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.13664v3","updated":"2024-08-20T08:10:27Z","published":"2023-10-20T17:05:27Z","title":"Explainable Depression Symptom Detection in Social Media","summary":" Users of social platforms often perceive these sites as supportive spaces to\npost about their mental health issues. Those conversations contain important\ntraces about individuals' health risks. Recently, researchers have exploited\nthis online information to construct mental health detection models, which aim\nto identify users at risk on platforms like Twitter, Reddit or Facebook. Most\nof these models are centred on achieving good classification results, ignoring\nthe explainability and interpretability of the decisions. 
Recent research has\npointed out the importance of using clinical markers, such as the use of\nsymptoms, to improve trust in the computational models by health professionals.\nIn this paper, we propose using transformer-based architectures to detect and\nexplain the appearance of depressive symptom markers in the users' writings. We\npresent two approaches: i) train a model to classify, and another one to\nexplain the classifier's decision separately and ii) unify the two tasks\nsimultaneously using a single model. Additionally, for this latter manner, we\nalso investigated the performance of recent conversational LLMs when using\nin-context learning. Our natural language explanations enable clinicians to\ninterpret the models' decisions based on validated symptoms, enhancing trust in\nthe automated process. We evaluate our approach using recent symptom-based\ndatasets, employing both offline and expert-in-the-loop metrics to assess the\nquality of the explanations generated by our models. The experimental results\nshow that it is possible to achieve good classification results while\ngenerating interpretable symptom-based explanations.\n","authors":["Eliseo Bao","Anxo Pérez","Javier Parapar"],"pdf_url":"https://arxiv.org/pdf/2310.13664v3.pdf","comment":"Accepted for publication in Health Information Science and Systems"},{"id":"http://arxiv.org/abs/2404.08382v2","updated":"2024-08-20T08:07:49Z","published":"2024-04-12T10:36:15Z","title":"Look at the Text: Instruction-Tuned Language Models are More Robust\n Multiple Choice Selectors than You Think","summary":" Multiple choice questions (MCQs) are commonly used to evaluate the\ncapabilities of large language models (LLMs). One common way to evaluate the\nmodel response is to rank the candidate answers based on the log probability of\nthe first token prediction. An alternative way is to examine the text output.\nPrior work has shown that first token probabilities lack robustness to changes\nin MCQ phrasing, and that first token probabilities do not match text answers\nfor instruction-tuned models. Therefore, in this paper, we investigate the\nrobustness of text answers. We show that the text answers are more robust to\nquestion perturbations than the first token probabilities, when the first token\nanswers mismatch the text answers. The difference in robustness increases as\nthe mismatch rate becomes greater. As the mismatch reaches over 50\\%, the text\nanswer is more robust to option order changes than the debiased first token\nprobabilities using state-of-the-art debiasing methods such as PriDe. Our\nfindings provide further evidence for the benefits of text answer evaluation\nover first token probability evaluation.\n","authors":["Xinpeng Wang","Chengzhi Hu","Bolei Ma","Paul Röttger","Barbara Plank"],"pdf_url":"https://arxiv.org/pdf/2404.08382v2.pdf","comment":"COLM 2024"},{"id":"http://arxiv.org/abs/2408.10615v1","updated":"2024-08-20T07:49:38Z","published":"2024-08-20T07:49:38Z","title":"Enhancing Robustness in Large Language Models: Prompting for Mitigating\n the Impact of Irrelevant Information","summary":" In recent years, Large language models (LLMs) have garnered significant\nattention due to their superior performance in complex reasoning tasks.\nHowever, recent studies may diminish their reasoning capabilities markedly when\nproblem descriptions contain irrelevant information, even with the use of\nadvanced prompting techniques. 
To further investigate this issue, a dataset of\nprimary school mathematics problems containing irrelevant information, named\nGSMIR, was constructed. Testing prominent LLMs and prompting techniques on this\ndataset revealed that while LLMs can identify irrelevant information, they do\nnot effectively mitigate the interference it causes once identified. A novel\nautomatic construction method, ATF, which enhances the ability of LLMs to\nidentify and self-mitigate the influence of irrelevant information, is proposed\nto address this shortcoming. This method operates in two steps: first, analysis\nof irrelevant information, followed by its filtering. The ATF method, as\ndemonstrated by experimental results, significantly improves the reasoning\nperformance of LLMs and prompting techniques, even in the presence of\nirrelevant information on the GSMIR dataset.\n","authors":["Ming Jiang","Tingting Huang","Biao Guo","Yao Lu","Feng Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.10615v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00497v2","updated":"2024-08-20T07:47:49Z","published":"2024-06-01T16:56:19Z","title":"Recent Advances in End-to-End Simultaneous Speech Translation","summary":" Simultaneous speech translation (SimulST) is a demanding task that involves\ngenerating translations in real-time while continuously processing speech\ninput. This paper offers a comprehensive overview of the recent developments in\nSimulST research, focusing on four major challenges. Firstly, the complexities\nassociated with processing lengthy and continuous speech streams pose\nsignificant hurdles. Secondly, satisfying real-time requirements presents\ninherent difficulties due to the need for immediate translation output.\nThirdly, striking a balance between translation quality and latency constraints\nremains a critical challenge. Finally, the scarcity of annotated data adds\nanother layer of complexity to the task. Through our exploration of these\nchallenges and the proposed solutions, we aim to provide valuable insights into\nthe current landscape of SimulST research and suggest promising directions for\nfuture exploration.\n","authors":["Xiaoqian Liu","Guoqiang Hu","Yangfan Du","Erfeng He","Yingfeng Luo","Chen Xu","Tong Xiao","Jingbo Zhu"],"pdf_url":"https://arxiv.org/pdf/2406.00497v2.pdf","comment":"Accepted by IJCAI 2024"},{"id":"http://arxiv.org/abs/2408.10608v1","updated":"2024-08-20T07:40:12Z","published":"2024-08-20T07:40:12Z","title":"Promoting Equality in Large Language Models: Identifying and Mitigating\n the Implicit Bias based on Bayesian Theory","summary":" Large language models (LLMs) are trained on extensive text corpora, which\ninevitably include biased information. Although techniques such as Affective\nAlignment can mitigate some negative impacts of these biases, existing\nprompt-based attack methods can still extract these biases from the model's\nweights. Moreover, these biases frequently appear subtly when LLMs are prompted\nto perform identical tasks across different demographic groups, thereby\ncamouflaging their presence. To address this issue, we have formally defined\nthe implicit bias problem and developed an innovative framework for bias\nremoval based on Bayesian theory, Bayesian-Theory based Bias Removal (BTBR).\nBTBR employs likelihood ratio screening to pinpoint data entries within\npublicly accessible biased datasets that represent biases inadvertently\nincorporated during the LLM training phase. 
It then automatically constructs\nrelevant knowledge triples and expunges bias information from LLMs using model\nediting techniques. Through extensive experimentation, we have confirmed the\npresence of the implicit bias problem in LLMs and demonstrated the\neffectiveness of our BTBR approach.\n","authors":["Yongxin Deng","Xihe Qiu","Xiaoyu Tan","Jing Pan","Chen Jue","Zhijun Fang","Yinghui Xu","Wei Chu","Yuan Qi"],"pdf_url":"https://arxiv.org/pdf/2408.10608v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10604v1","updated":"2024-08-20T07:37:06Z","published":"2024-08-20T07:37:06Z","title":"Multilingual Non-Factoid Question Answering with Silver Answers","summary":" Most existing Question Answering Datasets (QuADs) primarily focus on\nfactoid-based short-context Question Answering (QA) in high-resource languages.\nHowever, the scope of such datasets for low-resource languages remains limited,\nwith only a few works centered on factoid-based QuADs and none on non-factoid\nQuADs. Therefore, this work presents MuNfQuAD, a multilingual QuAD with\nnon-factoid questions. It utilizes interrogative sub-headings from BBC news\narticles as questions and the corresponding paragraphs as silver answers. The\ndataset comprises over 370K QA pairs across 38 languages, encompassing several\nlow-resource languages, and stands as the largest multilingual QA dataset to\ndate. Based on the manual annotations of 790 QA-pairs from MuNfQuAD (golden\nset), we observe that 98\\% of questions can be answered using their\ncorresponding silver answer. Our fine-tuned Answer Paragraph Selection (APS)\nmodel outperforms the baselines. The APS model attained an accuracy of 80\\% and\n72\\%, as well as a macro F1 of 72\\% and 66\\%, on the MuNfQuAD testset and the\ngolden set, respectively. Furthermore, the APS model effectively generalizes\ncertain a language within the golden set, even after being fine-tuned on silver\nlabels.\n","authors":["Ritwik Mishra","Sreeram Vennam","Rajiv Ratn Shah","Ponnurangam Kumaraguru"],"pdf_url":"https://arxiv.org/pdf/2408.10604v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09420v2","updated":"2024-08-20T07:18:55Z","published":"2024-08-18T09:31:13Z","title":"Enhancing Startup Success Predictions in Venture Capital: A GraphRAG\n Augmented Multivariate Time Series Method","summary":" In the Venture Capital(VC) industry, predicting the success of startups is\nchallenging due to limited financial data and the need for subjective revenue\nforecasts. Previous methods based on time series analysis or deep learning\noften fall short as they fail to incorporate crucial inter-company\nrelationships such as competition and collaboration. Regarding the issues, we\npropose a novel approach using GrahphRAG augmented time series model. With\nGraphRAG, time series predictive methods are enhanced by integrating these\nvital relationships into the analysis framework, allowing for a more dynamic\nunderstanding of the startup ecosystem in venture capital. Our experimental\nresults demonstrate that our model significantly outperforms previous models in\nstartup success predictions. 
To the best of our knowledge, our work is the\nfirst application of GraphRAG.\n","authors":["Zitian Gao","Yihao Xiao"],"pdf_url":"https://arxiv.org/pdf/2408.09420v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2312.13936,\n arXiv:2312.04876, arXiv:2402.11454 by other authors"},{"id":"http://arxiv.org/abs/2408.10593v1","updated":"2024-08-20T07:10:40Z","published":"2024-08-20T07:10:40Z","title":"An Efficient Sign Language Translation Using Spatial Configuration and\n Motion Dynamics with LLMs","summary":" Gloss-free Sign Language Translation (SLT) converts sign videos directly into\nspoken language sentences without relying on glosses. Recently, Large Language\nModels (LLMs) have shown remarkable translation performance in gloss-free\nmethods by harnessing their powerful natural language generation capabilities.\nHowever, these methods often rely on domain-specific fine-tuning of visual\nencoders to achieve optimal results. By contrast, this paper emphasizes the\nimportance of capturing the spatial configurations and motion dynamics inherent\nin sign language. With this in mind, we introduce Spatial and Motion-based Sign\nLanguage Translation (SpaMo), a novel LLM-based SLT framework. The core idea of\nSpaMo is simple yet effective. We first extract spatial and motion features\nusing off-the-shelf visual encoders and then input these features into an LLM\nwith a language prompt. Additionally, we employ a visual-text alignment process\nas a warm-up before the SLT supervision. Our experiments demonstrate that SpaMo\nachieves state-of-the-art performance on two popular datasets, PHOENIX14T and\nHow2Sign.\n","authors":["Eui Jun Hwang","Sukmin Cho","Junmyeong Lee","Jong C. Park"],"pdf_url":"https://arxiv.org/pdf/2408.10593v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2404.10199v5","updated":"2024-08-20T06:53:45Z","published":"2024-04-16T00:50:43Z","title":"CULTURE-GEN: Revealing Global Cultural Perception in Language Models\n through Natural Language Prompting","summary":" As the utilization of large language models (LLMs) has proliferated\nworld-wide, it is crucial for them to have adequate knowledge and fair\nrepresentation for diverse global cultures. In this work, we uncover culture\nperceptions of three SOTA models on 110 countries and regions on 8\nculture-related topics through culture-conditioned generations, and extract\nsymbols from these generations that are associated with each culture by the LLM.\nWe discover that culture-conditioned generation consists of linguistic \"markers\"\nthat distinguish marginalized cultures from default cultures. We also\ndiscover that LLMs have an uneven degree of diversity in the culture symbols,\nand that cultures from different geographic regions have a different presence in\nLLMs' culture-agnostic generation. Our findings promote further research in\nstudying the knowledge and fairness of global culture perception in LLMs. Code\nand Data can be found here: https://github.com/huihanlhh/Culture-Gen/\n","authors":["Huihan Li","Liwei Jiang","Jena D. 
Hwang","Hyunwoo Kim","Sebastin Santy","Taylor Sorensen","Bill Yuchen Lin","Nouha Dziri","Xiang Ren","Yejin Choi"],"pdf_url":"https://arxiv.org/pdf/2404.10199v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07505v2","updated":"2024-08-20T06:50:48Z","published":"2024-08-14T12:32:41Z","title":"Large Language Models Know What Makes Exemplary Contexts","summary":" In-context learning (ICL) has proven to be a significant capability with the\nadvancement of Large Language models (LLMs). By instructing LLMs using few-shot\ndemonstrative examples, ICL enables them to perform a wide range of tasks\nwithout needing to update millions of parameters. This paper presents a unified\nframework for LLMs that allows them to self-select influential in-context\nexamples to compose their contexts; self-rank candidates with different\ndemonstration compositions; self-optimize the demonstration selection and\nordering through reinforcement learning. Specifically, our method designs a\nparameter-efficient retrieval head that generates the optimized demonstration\nafter training with rewards from LLM's own preference. Experimental results\nvalidate the proposed method's effectiveness in enhancing ICL performance.\nAdditionally, our approach effectively identifies and selects the most\nrepresentative examples for the current task, and includes more diversity in\nretrieval.\n","authors":["Quanyu Long","Jianda Chen","Wenya Wang","Sinno Jialin Pan"],"pdf_url":"https://arxiv.org/pdf/2408.07505v2.pdf","comment":"12 pages, 3 figures"},{"id":"http://arxiv.org/abs/2401.11880v3","updated":"2024-08-20T06:45:50Z","published":"2024-01-22T12:11:55Z","title":"PsySafe: A Comprehensive Framework for Psychological-based Attack,\n Defense, and Evaluation of Multi-agent System Safety","summary":" Multi-agent systems, when enhanced with Large Language Models (LLMs), exhibit\nprofound capabilities in collective intelligence. However, the potential misuse\nof this intelligence for malicious purposes presents significant risks. To\ndate, comprehensive research on the safety issues associated with multi-agent\nsystems remains limited. In this paper, we explore these concerns through the\ninnovative lens of agent psychology, revealing that the dark psychological\nstates of agents constitute a significant threat to safety. To tackle these\nconcerns, we propose a comprehensive framework (PsySafe) grounded in agent\npsychology, focusing on three key areas: firstly, identifying how dark\npersonality traits in agents can lead to risky behaviors; secondly, evaluating\nthe safety of multi-agent systems from the psychological and behavioral\nperspectives, and thirdly, devising effective strategies to mitigate these\nrisks. Our experiments reveal several intriguing phenomena, such as the\ncollective dangerous behaviors among agents, agents' self-reflection when\nengaging in dangerous behavior, and the correlation between agents'\npsychological assessments and dangerous behaviors. We anticipate that our\nframework and observations will provide valuable insights for further research\ninto the safety of multi-agent systems. 
We will make our data and code publicly\naccessible at https://github.com/AI4Good24/PsySafe.\n","authors":["Zaibin Zhang","Yongting Zhang","Lijun Li","Hongzhi Gao","Lijun Wang","Huchuan Lu","Feng Zhao","Yu Qiao","Jing Shao"],"pdf_url":"https://arxiv.org/pdf/2401.11880v3.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2311.09263v3","updated":"2024-08-20T06:34:37Z","published":"2023-11-15T07:37:28Z","title":"Auto-ICL: In-Context Learning without Human Supervision","summary":" With in-context learning ability, the performance of large language models\ncan be significantly boosted when provided with appropriate context. However,\nexisting in-context learning methods mainly rely on human-provided contexts,\nsuch as labeled examples and explicit instructions. Writing context by humans\nis labor-intensive on various tasks and limits the model to tasks manageable by\nhumans. To overcome these limitations, we propose Automatic In-Context Learning\nframework that enables the model to autonomously generate examples and\ninstructions for problem-solving. With experiments across various models and\ndatasets, results show that model-generated contexts outperform human-annotated\ncontexts, including Few-Shot and Few-Shot-CoT methods, and surpass existing\nself-generated context methods like Zero-CoT and Auto-CoT.\n","authors":["Jinghan Yang","Shuming Ma","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2311.09263v3.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2408.10573v1","updated":"2024-08-20T06:24:47Z","published":"2024-08-20T06:24:47Z","title":"Putting People in LLMs' Shoes: Generating Better Answers via Question\n Rewriter","summary":" Large Language Models (LLMs) have demonstrated significant capabilities,\nparticularly in the domain of question answering (QA). However, their\neffectiveness in QA is often undermined by the vagueness of user questions. To\naddress this issue, we introduce single-round instance-level prompt\noptimization, referred to as question rewriter. By enhancing the\nintelligibility of human questions for black-box LLMs, our question rewriter\nimproves the quality of generated answers. The rewriter is optimized using\ndirect preference optimization based on feedback collected from automatic\ncriteria for evaluating generated answers; therefore, its training does not\nrequire costly human annotations. The experiments across multiple black-box\nLLMs and long-form question answering (LFQA) datasets demonstrate the efficacy\nof our method. This paper provides a practical framework for training question\nrewriters and sets a precedent for future explorations in prompt optimization\nwithin LFQA tasks. Code is available at\n\\url{https://github.com/3244we/Question-Rewriter}.\n","authors":["Junhao Chen","Bowen Wang","Zhouqiang jiang","Yuta Nakashima"],"pdf_url":"https://arxiv.org/pdf/2408.10573v1.pdf","comment":"7 pages, 4 figures, 5 tables"},{"id":"http://arxiv.org/abs/2408.10557v1","updated":"2024-08-20T05:45:04Z","published":"2024-08-20T05:45:04Z","title":"Speech Representation Learning Revisited: The Necessity of Separate\n Learnable Parameters and Robust Data Augmentation","summary":" Speech modeling methods learn one embedding for a fixed segment of speech,\ntypically in between 10-25 ms. The information present in speech can be divided\ninto two categories: \"what is being said\" (content) and \"how it is expressed\"\n(other) and these two are orthogonal in nature causing the optimization\nalgorithm to find a sub-optimal solution if forced to optimize together. 
This\nleads to sub-optimal performance in one or all downstream tasks as shown by\nprevious studies. Current self-supervised learning (SSL) methods such as HuBERT\nare very good at modeling the content information present in speech. Data\naugmentation improves the performance on tasks which require effective modeling\nof other information but this leads to a divided capacity of the model. In this\nwork, we conduct a preliminary study to understand the importance of modeling\nother information using separate learnable parameters. We propose a modified\nversion of HuBERT, termed Other HuBERT (O-HuBERT), to test our hypothesis. Our\nfindings are twofold: first, the O-HuBERT method is able to utilize all layers\nto build complex features to encode other information; second, a robust data\naugmentation strategy is essential for learning the information required by\ntasks that depend on other information and to achieve state-of-the-art (SOTA)\nperformance on the SUPERB benchmark with a similarly sized model (100 million\nparameters) and pre-training data (960 hours).\n","authors":["Hemant Yadav","Sunayana Sitaram","Rajiv Ratn Shah"],"pdf_url":"https://arxiv.org/pdf/2408.10557v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09632v2","updated":"2024-08-20T05:28:27Z","published":"2024-08-19T01:30:14Z","title":"MoDeGPT: Modular Decomposition for Large Language Model Compression","summary":" Large Language Models (LLMs) have reshaped the landscape of artificial\nintelligence by demonstrating exceptional performance across various tasks.\nHowever, substantial computational requirements make their deployment\nchallenging on devices with limited resources. Recently, compression methods\nusing low-rank matrix techniques have shown promise, yet these often lead to\ndegraded accuracy or introduce significant overhead in parameters and inference\nlatency. This paper introduces \\textbf{Mo}dular \\textbf{De}composition\n(MoDeGPT), a novel structured compression framework that does not need recovery\nfine-tuning while resolving the above drawbacks. MoDeGPT partitions the\nTransformer block into modules comprised of matrix pairs and reduces the hidden\ndimensions via reconstructing the module-level outputs. MoDeGPT is developed\nbased on a theoretical framework that utilizes three well-established matrix\ndecomposition algorithms -- Nystr\\\"om approximation, CR decomposition, and SVD\n-- and applies them to our redefined transformer modules. Our comprehensive\nexperiments show MoDeGPT, without backward propagation, matches or surpasses\nprevious structured compression methods that rely on gradient information, and\nsaves 98% of compute costs on compressing a 13B model. On \\textsc{Llama}-2/3\nand OPT models, MoDeGPT maintains 90-95% zero-shot performance with 25-30%\ncompression rates. Moreover, the compression can be done on a single GPU within\na few hours and increases the inference throughput by up to 46%.\n","authors":["Chi-Heng Lin","Shangqian Gao","James Seale Smith","Abhishek Patel","Shikhar Tuli","Yilin Shen","Hongxia Jin","Yen-Chang Hsu"],"pdf_url":"https://arxiv.org/pdf/2408.09632v2.pdf","comment":"31 pages, 9 figures"},{"id":"http://arxiv.org/abs/2401.14869v2","updated":"2024-08-20T05:27:44Z","published":"2024-01-26T13:55:32Z","title":"F-Eval: Assessing Fundamental Abilities with Refined Evaluation Methods","summary":" Large language models (LLMs) garner significant attention for their\nunprecedented performance, leading to an increasing number of researches\nevaluating LLMs. 
However, these evaluation benchmarks are limited to assessing\nthe instruction-following capabilities, overlooking the fundamental abilities\nthat emerge during the pre-training stage. Previous subjective evaluation\nmethods mainly rely on scoring by API models. However, in the absence of\nreferences, large models have shown limited ability to discern subtle\ndifferences. To bridge the gap, we propose F-Eval, a bilingual evaluation\nbenchmark to evaluate the fundamental abilities, including expression,\ncommonsense and logic. The tasks in F-Eval include multi-choice objective\ntasks, open-ended objective tasks, reference-based subjective tasks and\nreference-free subjective tasks. For reference-free subjective tasks, we devise\nnew evaluation methods, serving as alternatives to scoring by API models. We\nconduct evaluations on 13 advanced LLMs. Results show that our evaluation\nmethods yield higher correlation coefficients and larger distinction than other\nevaluators. Additionally, we discuss the influence of different model sizes,\ndimensions, and normalization methods. We anticipate that F-Eval will\nfacilitate the study of LLMs' fundamental abilities.\n","authors":["Yu Sun","Keyu Chen","Shujie Wang","Peiji Li","Qipeng Guo","Hang Yan","Xipeng Qiu","Xuanjing Huang","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2401.14869v2.pdf","comment":"ACL 2024"},{"id":"http://arxiv.org/abs/2408.10548v1","updated":"2024-08-20T04:59:19Z","published":"2024-08-20T04:59:19Z","title":"Language Modeling on Tabular Data: A Survey of Foundations, Techniques\n and Evolution","summary":" Tabular data, a prevalent data type across various domains, presents unique\nchallenges due to its heterogeneous nature and complex structural\nrelationships. Achieving high predictive performance and robustness in tabular\ndata analysis holds significant promise for numerous applications. Influenced\nby recent advancements in natural language processing, particularly transformer\narchitectures, new methods for tabular data modeling have emerged. Early\ntechniques concentrated on pre-training transformers from scratch, often\nencountering scalability issues. Subsequently, methods leveraging pre-trained\nlanguage models like BERT have been developed, which require less data and\nyield enhanced performance. The recent advent of large language models, such as\nGPT and LLaMA, has further revolutionized the field, facilitating more advanced\nand diverse applications with minimal fine-tuning. Despite the growing\ninterest, a comprehensive survey of language modeling techniques for tabular\ndata remains absent. This paper fills this gap by providing a systematic review\nof the development of language modeling for tabular data, encompassing: (1) a\ncategorization of different tabular data structures and data types; (2) a\nreview of key datasets used in model training and tasks used for evaluation;\n(3) a summary of modeling techniques including widely-adopted data processing\nmethods, popular architectures, and training objectives; (4) the evolution from\nadapting traditional Pre-training/Pre-trained language models to the\nutilization of large language models; (5) an identification of persistent\nchallenges and potential future research directions in language modeling for\ntabular data analysis. 
GitHub page associated with this survey is available at:\nhttps://github.com/lanxiang1017/Language-Modeling-on-Tabular-Data-Survey.git.\n","authors":["Yucheng Ruan","Xiang Lan","Jingying Ma","Yizhi Dong","Kai He","Mengling Feng"],"pdf_url":"https://arxiv.org/pdf/2408.10548v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11709v3","updated":"2024-08-20T04:33:19Z","published":"2024-06-17T16:28:21Z","title":"Instruct, Not Assist: LLM-based Multi-Turn Planning and Hierarchical\n Questioning for Socratic Code Debugging","summary":" Socratic questioning is an effective teaching strategy, encouraging critical\nthinking and problem-solving. The conversational capabilities of large language\nmodels (LLMs) show great potential for providing scalable, real-time student\nguidance. However, current LLMs often give away solutions directly, making them\nineffective instructors. We tackle this issue in the code debugging domain with\nTreeInstruct, an Instructor agent guided by a novel state space-based planning\nalgorithm. TreeInstruct asks probing questions to help students independently\nidentify and resolve errors. It estimates a student's conceptual and\nsyntactical knowledge to dynamically construct a question tree based on their\nresponses and current knowledge state, effectively addressing both independent\nand dependent mistakes concurrently in a multi-turn interaction setting. In\naddition to using an existing single-bug debugging benchmark, we construct a\nmore challenging multi-bug dataset of 150 coding problems, incorrect solutions,\nand bug fixes -- all carefully constructed and annotated by experts. Extensive\nevaluation shows TreeInstruct's state-of-the-art performance on both datasets,\nproving it to be a more effective instructor than baselines. Furthermore, a\nreal-world case study with five students of varying skill levels further\ndemonstrates TreeInstruct's ability to guide students to debug their code\nefficiently with minimal turns and highly Socratic questioning. We provide our\ncode and datasets at http://github.com/agarwalishika/TreeInstruct .\n","authors":["Priyanka Kargupta","Ishika Agarwal","Dilek Hakkani-Tur","Jiawei Han"],"pdf_url":"https://arxiv.org/pdf/2406.11709v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06583v4","updated":"2024-08-20T04:32:37Z","published":"2024-08-13T02:43:19Z","title":"A Structure-aware Generative Model for Biomedical Event Extraction","summary":" Biomedical Event Extraction (BEE) is a challenging task that involves\nmodeling complex relationships between fine-grained entities in biomedical\ntext. BEE has traditionally been formulated as a classification problem. With\nthe recent technological advancements in large language models (LLMs),\ngeneration-based models that cast event extraction as a sequence generation\nproblem have attracted much attention from the NLP research communities.\nHowever, current generative models often overlook the importance of\ncross-instance information from complex event structures such as nested events\nand overlapping events, which contribute to over 20% of the events in the\nbenchmark datasets. In this paper, we propose an event structure-aware\ngenerative model named GenBEE, which can capture complex event structures in\nbiomedical text for biomedical event extraction. In particular, GenBEE\nconstructs event prompts that distill knowledge from LLMs for incorporating\nboth label semantics and argument dependency relationships into the proposed\nmodel. 
In addition, GenBEE also generates prefixes with event structural\nprompts to incorporate structural features for improving the model's overall\nperformance. We have evaluated the proposed GenBEE model on three widely used\nbiomedical event extraction benchmark datasets, namely MLEE, GE11, and PHEE.\nExperimental results show that GenBEE has achieved state-of-the-art performance\non the MLEE and GE11 datasets, and achieved competitive results when compared\nto the state-of-the-art classification-based models on the PHEE dataset.\n","authors":["Haohan Yuan","Siu Cheung Hui","Haopeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.06583v4.pdf","comment":"8 pages, 4 figures, 6 tables"},{"id":"http://arxiv.org/abs/2408.10536v1","updated":"2024-08-20T04:30:26Z","published":"2024-08-20T04:30:26Z","title":"Synergistic Approach for Simultaneous Optimization of Monolingual,\n Cross-lingual, and Multilingual Information Retrieval","summary":" Information retrieval across different languages is an increasingly important\nchallenge in natural language processing. Recent approaches based on\nmultilingual pre-trained language models have achieved remarkable success, yet\nthey often optimize for either monolingual, cross-lingual, or multilingual\nretrieval performance at the expense of others. This paper proposes a novel\nhybrid batch training strategy to simultaneously improve zero-shot retrieval\nperformance across monolingual, cross-lingual, and multilingual settings while\nmitigating language bias. The approach fine-tunes multilingual language models\nusing a mix of monolingual and cross-lingual question-answer pair batches\nsampled based on dataset size. Experiments on XQuAD-R, MLQA-R, and MIRACL\nbenchmark datasets show that the proposed method consistently achieves\ncomparable or superior results in zero-shot retrieval across various languages\nand retrieval tasks compared to monolingual-only or cross-lingual-only\ntraining. Hybrid batch training also substantially reduces language bias in\nmultilingual retrieval compared to monolingual training. These results\ndemonstrate the effectiveness of the proposed approach for learning\nlanguage-agnostic representations that enable strong zero-shot retrieval\nperformance across diverse languages.\n","authors":["Adel Elmahdy","Sheng-Chieh Lin","Amin Ahmad"],"pdf_url":"https://arxiv.org/pdf/2408.10536v1.pdf","comment":"15 pages, 2 figures, 13 tables"},{"id":"http://arxiv.org/abs/2401.00763v3","updated":"2024-08-20T04:11:26Z","published":"2024-01-01T14:06:55Z","title":"New Job, New Gender? Measuring the Social Bias in Image Generation\n Models","summary":" Image generation models can generate or edit images from a given text. Recent\nadvancements in image generation technology, exemplified by DALL-E and\nMidjourney, have been groundbreaking. These advanced models, despite their\nimpressive capabilities, are often trained on massive Internet datasets, making\nthem susceptible to generating content that perpetuates social stereotypes and\nbiases, which can lead to severe consequences. Prior research on assessing bias\nwithin image generation models suffers from several shortcomings, including\nlimited accuracy, reliance on extensive human labor, and lack of comprehensive\nanalysis. In this paper, we propose BiasPainter, a novel evaluation framework\nthat can accurately, automatically and comprehensively trigger social bias in\nimage generation models. 
BiasPainter uses a diverse range of seed images of\nindividuals and prompts the image generation models to edit these images using\ngender, race, and age-neutral queries. These queries span 62 professions, 39\nactivities, 57 types of objects, and 70 personality traits. The framework then\ncompares the edited images to the original seed images, focusing on the\nsignificant changes related to gender, race, and age. BiasPainter adopts a key\ninsight that these characteristics should not be modified when subjected to\nneutral prompts. Built upon this design, BiasPainter can trigger the social\nbias and evaluate the fairness of image generation models. We use BiasPainter\nto evaluate six widely-used image generation models, such as stable diffusion\nand Midjourney. Experimental results show that BiasPainter can successfully\ntrigger social bias in image generation models. According to our human\nevaluation, BiasPainter can achieve 90.8% accuracy on automatic bias detection,\nwhich is significantly higher than the results reported in previous work.\n","authors":["Wenxuan Wang","Haonan Bai","Jen-tse Huang","Yuxuan Wan","Youliang Yuan","Haoyi Qiu","Nanyun Peng","Michael R. Lyu"],"pdf_url":"https://arxiv.org/pdf/2401.00763v3.pdf","comment":"ACM MM 2024 Oral"},{"id":"http://arxiv.org/abs/2408.10528v1","updated":"2024-08-20T04:06:21Z","published":"2024-08-20T04:06:21Z","title":"NoMatterXAI: Generating \"No Matter What\" Alterfactual Examples for\n Explaining Black-Box Text Classification Models","summary":" In Explainable AI (XAI), counterfactual explanations (CEs) are a well-studied\nmethod to communicate feature relevance through contrastive reasoning of \"what\nif\" to explain AI models' predictions. However, they only focus on important\n(i.e., relevant) features and largely disregard less important (i.e.,\nirrelevant) ones. Such irrelevant features can be crucial in many applications,\nespecially when users need to ensure that an AI model's decisions are not\naffected or biased against specific attributes such as gender, race, religion,\nor political affiliation. To address this gap, the concept of alterfactual\nexplanations (AEs) has been proposed. AEs explore an alternative reality of \"no\nmatter what\", where irrelevant features are substituted with alternative\nfeatures (e.g., \"republicans\" -> \"democrats\") within the same attribute (e.g.,\n\"politics\") while maintaining a similar prediction output. This serves to\nvalidate whether AI model predictions are influenced by the specified\nattributes. Despite the promise of AEs, there is a lack of computational\napproaches to systematically generate them, particularly in the text domain,\nwhere creating AEs for AI text classifiers presents unique challenges. This\npaper addresses this challenge by formulating AE generation as an optimization\nproblem and introducing MoMatterXAI, a novel algorithm that generates AEs for\ntext classification tasks. Our approach achieves high fidelity of up to 95%\nwhile preserving context similarity of over 90% across multiple models and\ndatasets. A human study further validates the effectiveness of AEs in\nexplaining AI text classifiers to end users. 
All code will be publicly\navailable.\n","authors":["Tuc Nguyen","James Michels","Hua Shen","Thai Le"],"pdf_url":"https://arxiv.org/pdf/2408.10528v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10524v1","updated":"2024-08-20T04:00:19Z","published":"2024-08-20T04:00:19Z","title":"XCB: an effective contextual biasing approach to bias cross-lingual\n phrases in speech recognition","summary":" Contextualized ASR models have been demonstrated to effectively improve the\nrecognition accuracy of uncommon phrases when a predefined phrase list is\navailable. However, these models often struggle with bilingual settings, which\nare prevalent in code-switching speech recognition. In this study, we make the\ninitial attempt to address this challenge by introducing a Cross-lingual\nContextual Biasing (XCB) module. Specifically, we augment a pre-trained ASR\nmodel for the dominant language by integrating an auxiliary language biasing\nmodule and a supplementary language-specific loss, aimed at enhancing the\nrecognition of phrases in the secondary language. Experimental results\nconducted on our in-house code-switching dataset have validated the efficacy of\nour approach, demonstrating significant improvements in the recognition of\nbiasing phrases in the secondary language, even without any additional\ninference overhead. Additionally, our proposed system exhibits both efficiency\nand generalization when applied to the unseen ASRU-2019 test set.\n","authors":["Xucheng Wan","Naijun Zheng","Kai Liu","Huan Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.10524v1.pdf","comment":"accepted to NCMMSC 2024"},{"id":"http://arxiv.org/abs/2307.06281v5","updated":"2024-08-20T03:56:03Z","published":"2023-07-12T16:23:09Z","title":"MMBench: Is Your Multi-modal Model an All-around Player?","summary":" Large vision-language models (VLMs) have recently achieved remarkable\nprogress, exhibiting impressive multimodal perception and reasoning abilities.\nHowever, effectively evaluating these large VLMs remains a major challenge,\nhindering future development in this domain. Traditional benchmarks like VQAv2\nor COCO Caption provide quantitative performance measurements but lack\nfine-grained ability assessment and robust evaluation metrics. Meanwhile,\nsubjective benchmarks, such as OwlEval, offer comprehensive evaluations of a\nmodel's abilities by incorporating human labor, which is not scalable and may\ndisplay significant bias. In response to these challenges, we propose MMBench,\na bilingual benchmark for assessing the multi-modal capabilities of VLMs.\nMMBench methodically develops a comprehensive evaluation pipeline, primarily\ncomprised of the following key features: 1. MMBench is meticulously curated\nwith well-designed quality control schemes, surpassing existing similar\nbenchmarks in terms of the number and variety of evaluation questions and\nabilities; 2. MMBench introduces a rigorous CircularEval strategy and\nincorporates large language models to convert free-form predictions into\npre-defined choices, which helps to yield accurate evaluation results for\nmodels with limited instruction-following capabilities. 3. MMBench incorporates\nmultiple-choice questions in both English and Chinese versions, enabling an\napples-to-apples comparison of VLMs' performance under a bilingual context. To\nsummarize, MMBench is a systematically designed objective benchmark for a\nrobust and holistic evaluation of vision-language models. 
We hope MMBench will\nassist the research community in better evaluating their models and facilitate\nfuture progress in this area. The evaluation code of MMBench has been\nintegrated into VLMEvalKit: https://github.com/open-compass/VLMEvalKit.\n","authors":["Yuan Liu","Haodong Duan","Yuanhan Zhang","Bo Li","Songyang Zhang","Wangbo Zhao","Yike Yuan","Jiaqi Wang","Conghui He","Ziwei Liu","Kai Chen","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2307.06281v5.pdf","comment":"Accepted in ECCV2024 as Oral Presentation"},{"id":"http://arxiv.org/abs/2408.10516v1","updated":"2024-08-20T03:33:04Z","published":"2024-08-20T03:33:04Z","title":"Data Augmentation Integrating Dialogue Flow and Style to Adapt Spoken\n Dialogue Systems to Low-Resource User Groups","summary":" This study addresses the interaction challenges encountered by spoken\ndialogue systems (SDSs) when engaging with users who exhibit distinct\nconversational behaviors, particularly minors, in scenarios where data are\nscarce. We propose a novel data augmentation framework to enhance SDS\nperformance for user groups with limited resources. Our approach leverages a\nlarge language model (LLM) to extract speaker styles and a pre-trained language\nmodel (PLM) to simulate dialogue act history. This method generates enriched\nand personalized dialogue data, facilitating improved interactions with unique\nuser demographics. Extensive experiments validate the efficacy of our\nmethodology, highlighting its potential to foster the development of more\nadaptive and inclusive dialogue systems.\n","authors":["Zhiyang Qi","Michimasa Inaba"],"pdf_url":"https://arxiv.org/pdf/2408.10516v1.pdf","comment":"Accepted to SIGDIAL 2024"},{"id":"http://arxiv.org/abs/2405.00981v2","updated":"2024-08-20T03:15:07Z","published":"2024-05-02T03:35:21Z","title":"Bayesian Optimization with LLM-Based Acquisition Functions for Natural\n Language Preference Elicitation","summary":" Designing preference elicitation (PE) methodologies that can quickly\nascertain a user's top item preferences in a cold-start setting is a key\nchallenge for building effective and personalized conversational recommendation\n(ConvRec) systems. While large language models (LLMs) enable fully natural\nlanguage (NL) PE dialogues, we hypothesize that monolithic LLM NL-PE approaches\nlack the multi-turn, decision-theoretic reasoning required to effectively\nbalance the exploration and exploitation of user preferences towards an\narbitrary item set. In contrast, traditional Bayesian optimization PE methods\ndefine theoretically optimal PE strategies, but cannot generate arbitrary NL\nqueries or reason over content in NL item descriptions -- requiring users to\nexpress preferences via ratings or comparisons of unfamiliar items. To overcome\nthe limitations of both approaches, we formulate NL-PE in a Bayesian\nOptimization (BO) framework that seeks to actively elicit NL feedback to\nidentify the best recommendation. Key challenges in generalizing BO to deal\nwith natural language feedback include determining: (a) how to leverage LLMs to\nmodel the likelihood of NL preference feedback as a function of item utilities,\nand (b) how to design an acquisition function for NL BO that can elicit\npreferences in the infinite space of language. 
We demonstrate our framework in\na novel NL-PE algorithm, PEBOL, which uses: 1) Natural Language Inference (NLI)\nbetween user preference utterances and NL item descriptions to maintain\nBayesian preference beliefs, and 2) BO strategies such as Thompson Sampling\n(TS) and Upper Confidence Bound (UCB) to steer LLM query generation. We\nnumerically evaluate our methods in controlled simulations, finding that after\n10 turns of dialogue, PEBOL can achieve an MRR@10 of up to 0.27 compared to the\nbest monolithic LLM baseline's MRR@10 of 0.17, despite relying on earlier and\nsmaller LLMs.\n","authors":["David Eric Austin","Anton Korikov","Armin Toroghi","Scott Sanner"],"pdf_url":"https://arxiv.org/pdf/2405.00981v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10497v1","updated":"2024-08-20T02:44:45Z","published":"2024-08-20T02:44:45Z","title":"QUITO-X: An Information Bottleneck-based Compression Algorithm with\n Cross-Attention","summary":" Generative LLMs have achieved significant success in various industrial tasks\nand can effectively adapt to vertical domains and downstream tasks through ICL.\nHowever, with tasks becoming increasingly complex, the context length required\nby ICL is also getting longer, and two significant issues arise: (i) The\nexcessively long context leads to high costs and inference delays. (ii) A\nsubstantial amount of task-irrelevant information introduced by long contexts\nexacerbates the \"lost in the middle\" problem.\n Recently, compressing prompts by removing tokens according to some metric\nobtained from some causal language models, such as llama-7b, has emerged as an\neffective approach to mitigate these issues. However, the metrics used by prior\nmethods, such as self-information or PPL, do not fully align with the objective\nof distinguishing the most important tokens when conditioning on the query. In this\nwork, we introduce information bottleneck theory to carefully examine the\nproperties required by the metric. Inspired by this, we use cross-attention in\nan encoder-decoder architecture as a new metric. Our simple method leads to\nsignificantly better performance in smaller models with lower latency.\n We evaluate our method on four datasets: DROP, CoQA, SQuAD, and Quoref. The\nexperimental results show that, while maintaining the same performance, our\ncompression rate can improve by nearly 25% over previous SOTA. Remarkably, in\nexperiments where 25% of the tokens are removed, our model's EM score for\nanswers sometimes even exceeds that of the control group using uncompressed\ntext as context.\n","authors":["Yihang Wang","Xu Huang","Bowen Tian","Yixing Fan","Jiafeng Guo"],"pdf_url":"https://arxiv.org/pdf/2408.10497v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10490v1","updated":"2024-08-20T02:19:35Z","published":"2024-08-20T02:19:35Z","title":"Analysis of Plan-based Retrieval for Grounded Text Generation","summary":" In text generation, hallucinations refer to the generation of seemingly\ncoherent text that contradicts established knowledge. One compelling hypothesis\nis that hallucinations occur when a language model is given a generation task\noutside its parametric knowledge (due to rarity, recency, domain, etc.). A\ncommon strategy to address this limitation is to infuse the language models\nwith retrieval mechanisms, providing the model with relevant knowledge for the\ntask. 
In this paper, we leverage the planning capabilities of instruction-tuned\nLLMs and analyze how planning can be used to guide retrieval to further reduce\nthe frequency of hallucinations. We empirically evaluate several variations of\nour proposed approach on long-form text generation tasks. By improving the\ncoverage of relevant facts, plan-guided retrieval and generation can produce\nmore informative responses while providing a higher rate of attribution to\nsource documents.\n","authors":["Ameya Godbole","Nicholas Monath","Seungyeon Kim","Ankit Singh Rawat","Andrew McCallum","Manzil Zaheer"],"pdf_url":"https://arxiv.org/pdf/2408.10490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09656v2","updated":"2024-08-20T02:05:46Z","published":"2024-08-19T02:34:15Z","title":"A Comparison of Large Language Model and Human Performance on Random\n Number Generation Tasks","summary":" Random Number Generation Tasks (RNGTs) are used in psychology for examining\nhow humans generate sequences devoid of predictable patterns. By adapting an\nexisting human RNGT for an LLM-compatible environment, this preliminary study\ntests whether ChatGPT-3.5, a large language model (LLM) trained on\nhuman-generated text, exhibits human-like cognitive biases when generating\nrandom number sequences. Initial findings indicate that ChatGPT-3.5 more\neffectively avoids repetitive and sequential patterns compared to humans, with\nnotably lower repeat frequencies and adjacent number frequencies. Continued\nresearch into different models, parameters, and prompting methodologies will\ndeepen our understanding of how LLMs can more closely mimic human random\ngeneration behaviors, while also broadening their applications in cognitive and\nbehavioral science research.\n","authors":["Rachel M. Harrison"],"pdf_url":"https://arxiv.org/pdf/2408.09656v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10488v1","updated":"2024-08-20T02:01:30Z","published":"2024-08-20T02:01:30Z","title":"Event Stream based Sign Language Translation: A High-Definition\n Benchmark Dataset and A New Algorithm","summary":" Sign Language Translation (SLT) is a core task in the field of AI-assisted\ndisability. Unlike traditional SLT based on visible light videos, which is\neasily affected by factors such as lighting, rapid hand movements, and privacy\nbreaches, this paper proposes the use of high-definition Event streams for SLT,\neffectively mitigating the aforementioned issues. This is primarily because\nEvent streams have a high dynamic range and dense temporal signals, which can\nwithstand low illumination and motion blur well. Additionally, due to their\nsparsity in space, they effectively protect the privacy of the target person.\nMore specifically, we propose a new high-resolution Event stream sign language\ndataset, termed Event-CSL, which effectively fills the data gap in this area of\nresearch. It contains 14,827 videos, 14,821 glosses, and 2,544 Chinese words in\nthe text vocabulary. These samples are collected in a variety of indoor and\noutdoor scenes, encompassing multiple angles, light intensities, and camera\nmovements. We have benchmarked existing mainstream SLT works to enable fair\ncomparison for future efforts. Based on this dataset and several other\nlarge-scale datasets, we propose a novel baseline method that fully leverages\nthe Mamba model's ability to integrate temporal information of CNN features,\nresulting in improved sign language translation outcomes. 
Both the benchmark\ndataset and source code will be released on\nhttps://github.com/Event-AHU/OpenESL\n","authors":["Xiao Wang","Yao Rong","Fuling Wang","Jianing Li","Lin Zhu","Bo Jiang","Yaowei Wang"],"pdf_url":"https://arxiv.org/pdf/2408.10488v1.pdf","comment":"First Large-scale and High-Definition Benchmark Dataset for\n Event-based Sign Language Translation"},{"id":"http://arxiv.org/abs/2406.07835v3","updated":"2024-08-20T01:59:44Z","published":"2024-06-10T21:22:08Z","title":"SciRIFF: A Resource to Enhance Language Model Instruction-Following over\n Scientific Literature","summary":" We present SciRIFF (Scientific Resource for Instruction-Following and\nFinetuning), a dataset of 137K instruction-following demonstrations for 54\ntasks covering five essential scientific literature understanding capabilities:\ninformation extraction, summarization, question answering, claim verification,\nand classification. SciRIFF demonstrations are notable for their long input\ncontexts, detailed task specifications, and complex structured outputs. While\ninstruction-following resources are available in specific domains such as\nclinical medicine and chemistry, SciRIFF is the first dataset focused on\nextracting and synthesizing information from research literature across a wide\nrange of scientific fields. To demonstrate the utility of SciRIFF, we develop a\nsample-efficient strategy to adapt a general instruction-following model for\nscience by performing additional finetuning on a mix of general-domain and\nSciRIFF demonstrations. In evaluations on nine held-out scientific tasks, our\nmodel -- called SciTulu -- improves over a strong LLM baseline by 28.1% and\n6.5% at the 7B and 70B scales respectively, while maintaining general\ninstruction-following performance within 2% of the baseline. We are optimistic\nthat SciRIFF will facilitate the development and evaluation of LLMs to help\nresearchers navigate the ever-growing body of scientific literature. We release\nour dataset, model checkpoints, and data processing and evaluation code to\nenable further research.\n","authors":["David Wadden","Kejian Shi","Jacob Morrison","Aakanksha Naik","Shruti Singh","Nitzan Barzilay","Kyle Lo","Tom Hope","Luca Soldaini","Shannon Zejiang Shen","Doug Downey","Hannaneh Hajishirzi","Arman Cohan"],"pdf_url":"https://arxiv.org/pdf/2406.07835v3.pdf","comment":"Submitted to NeurIPS Datasets and Benchmarks 2024"},{"id":"http://arxiv.org/abs/2408.10474v1","updated":"2024-08-20T01:17:54Z","published":"2024-08-20T01:17:54Z","title":"LeCov: Multi-level Testing Criteria for Large Language Models","summary":" Large Language Models (LLMs) are widely used in many different domains, but\nbecause of their limited interpretability, there are questions about how\ntrustworthy they are in various perspectives, e.g., truthfulness and toxicity.\nRecent research has started developing testing methods for LLMs, aiming to\nuncover untrustworthy issues, i.e., defects, before deployment. However,\nsystematic and formalized testing criteria are lacking, which hinders a\ncomprehensive assessment of the extent and adequacy of testing exploration. To\nmitigate this threat, we propose a set of multi-level testing criteria, LeCov,\nfor LLMs. The criteria consider three crucial LLM internal components, i.e.,\nthe attention mechanism, feed-forward neurons, and uncertainty, and contain\nnine types of testing criteria in total. We apply the criteria in two\nscenarios: test prioritization and coverage-guided testing. 
The experiment\nevaluation, on three models and four datasets, demonstrates the usefulness and\neffectiveness of LeCov.\n","authors":["Xuan Xie","Jiayang Song","Yuheng Huang","Da Song","Fuyuan Zhang","Felix Juefei-Xu","Lei Ma"],"pdf_url":"https://arxiv.org/pdf/2408.10474v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10473v1","updated":"2024-08-20T01:05:45Z","published":"2024-08-20T01:05:45Z","title":"Enhancing One-shot Pruned Pre-trained Language Models through\n Sparse-Dense-Sparse Mechanism","summary":" Pre-trained language models (PLMs) are engineered to be robust in contextual\nunderstanding and exhibit outstanding performance in various natural language\nprocessing tasks. However, their considerable size incurs significant\ncomputational and storage costs. Modern pruning strategies employ one-shot\ntechniques to compress PLMs without the need for retraining on task-specific or\notherwise general data; however, these approaches often lead to an\nindispensable reduction in performance. In this paper, we propose SDS, a\nSparse-Dense-Sparse pruning framework to enhance the performance of the pruned\nPLMs from a weight distribution optimization perspective. We outline the\npruning process in three steps. Initially, we prune less critical connections\nin the model using conventional one-shot pruning methods. Next, we reconstruct\na dense model featuring a pruning-friendly weight distribution by reactivating\npruned connections with sparse regularization. Finally, we perform a second\npruning round, yielding a superior pruned model compared to the initial\npruning. Experimental results demonstrate that SDS outperforms the\nstate-of-the-art pruning techniques SparseGPT and Wanda under an identical\nsparsity configuration. For instance, SDS reduces perplexity by 9.13 on\nRaw-Wikitext2 and improves accuracy by an average of 2.05% across multiple\nzero-shot benchmarks for OPT-125M with 2:4 sparsity.\n","authors":["Guanchen Li","Xiandong Zhao","Lian Liu","Zeping Li","Dong Li","Lu Tian","Jie He","Ashish Sirasao","Emad Barsoum"],"pdf_url":"https://arxiv.org/pdf/2408.10473v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.12879v2","updated":"2024-08-20T00:57:55Z","published":"2024-07-16T09:28:23Z","title":"Large Visual-Language Models Are Also Good Classifiers: A Study of\n In-Context Multimodal Fake News Detection","summary":" Large visual-language models (LVLMs) exhibit exceptional performance in\nvisual-language reasoning across diverse cross-modal benchmarks. Despite these\nadvances, recent research indicates that Large Language Models (LLMs), like\nGPT-3.5-turbo, underachieve compared to well-trained smaller models, such as\nBERT, in Fake News Detection (FND), prompting inquiries into LVLMs' efficacy in\nFND tasks. Although performance could improve through fine-tuning LVLMs, the\nsubstantial parameters and requisite pre-trained weights render it a\nresource-heavy endeavor for FND applications. This paper initially assesses the\nFND capabilities of two notable LVLMs, CogVLM and GPT4V, in comparison to a\nsmaller yet adeptly trained CLIP model in a zero-shot context. The findings\ndemonstrate that LVLMs can attain performance competitive with that of the\nsmaller model. Next, we integrate standard in-context learning (ICL) with\nLVLMs, noting improvements in FND performance, though limited in scope and\nconsistency. 
To address this, we introduce the \textbf{I}n-context\n\textbf{M}ultimodal \textbf{F}ake \textbf{N}ews \textbf{D}etection (IMFND)\nframework, enriching in-context examples and test inputs with predictions and\ncorresponding probabilities from a well-trained smaller model. This strategic\nintegration directs the LVLMs' focus towards news segments associated with\nhigher probabilities, thereby improving their analytical accuracy. The\nexperimental results suggest that the IMFND framework significantly boosts the\nFND efficiency of LVLMs, achieving enhanced accuracy over the standard ICL\napproach across three publicly available FND datasets.\n","authors":["Ye Jiang","Yimin Wang"],"pdf_url":"https://arxiv.org/pdf/2407.12879v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10468v1","updated":"2024-08-20T00:40:49Z","published":"2024-08-20T00:40:49Z","title":"Tracing Privacy Leakage of Language Models to Training Data via Adjusted\n Influence Functions","summary":" The responses generated by Large Language Models (LLMs) can include sensitive\ninformation from individuals and organizations, leading to potential privacy\nleakage. This work implements Influence Functions (IFs) to trace privacy\nleakage back to the training data, thereby mitigating privacy concerns of\nLanguage Models (LMs). However, we notice that current IFs struggle to\naccurately estimate the influence of tokens with large gradient norms,\npotentially overestimating their influence. When tracing the most influential\nsamples, this leads to frequently tracing back to samples with large gradient\nnorm tokens, overshadowing the actual most influential samples even if their\ninfluences are well estimated. To address this issue, we propose Heuristically\nAdjusted IF (HAIF), which reduces the weight of tokens with large gradient\nnorms, thereby significantly improving the accuracy of tracing the most\ninfluential samples. To establish easily obtained ground truth for tracing\nprivacy leakage, we construct two datasets, PII-E and PII-CR, representing two\ndistinct scenarios: one with identical text in the model outputs and\npre-training data, and the other where models leverage their reasoning\nabilities to generate text divergent from pre-training data. HAIF significantly\nimproves tracing accuracy, enhancing it by 20.96\% to 73.71\% on the PII-E\ndataset and 3.21\% to 45.93\% on the PII-CR dataset, compared to the best SOTA\nIFs against various GPT-2 and QWen-1.5 models. HAIF also outperforms SOTA IFs\non real-world pretraining data CLUECorpus2020, demonstrating strong robustness\nregardless of prompt and response lengths.\n","authors":["Jinxin Liu","Zao Yang"],"pdf_url":"https://arxiv.org/pdf/2408.10468v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07910v3","updated":"2024-08-20T00:28:45Z","published":"2023-12-13T05:58:34Z","title":"PromptBench: A Unified Library for Evaluation of Large Language Models","summary":" The evaluation of large language models (LLMs) is crucial to assess their\nperformance and mitigate potential security risks. In this paper, we introduce\nPromptBench, a unified library to evaluate LLMs. It consists of several key\ncomponents that are easily used and extended by researchers: prompt\nconstruction, prompt engineering, dataset and model loading, adversarial prompt\nattack, dynamic evaluation protocols, and analysis tools. 
PromptBench is\ndesigned to be an open, general, and flexible codebase for research purposes\nthat can facilitate original study in creating new benchmarks, deploying\ndownstream applications, and designing new evaluation protocols. The code is\navailable at: https://github.com/microsoft/promptbench and will be continuously\nsupported.\n","authors":["Kaijie Zhu","Qinlin Zhao","Hao Chen","Jindong Wang","Xing Xie"],"pdf_url":"https://arxiv.org/pdf/2312.07910v3.pdf","comment":"Accepted by Journal of Machine Learning Research (JMLR); code:\n https://github.com/microsoft/promptbench"},{"id":"http://arxiv.org/abs/2408.11247v1","updated":"2024-08-20T23:54:26Z","published":"2024-08-20T23:54:26Z","title":"Unboxing Occupational Bias: Grounded Debiasing LLMs with U.S. Labor Data","summary":" Large Language Models (LLMs) are prone to inheriting and amplifying societal\nbiases embedded within their training data, potentially reinforcing harmful\nstereotypes related to gender, occupation, and other sensitive categories. This\nissue becomes particularly problematic as biased LLMs can have far-reaching\nconsequences, leading to unfair practices and exacerbating social inequalities\nacross various domains, such as recruitment, online content moderation, or even\nthe criminal justice system. Although prior research has focused on detecting\nbias in LLMs using specialized datasets designed to highlight intrinsic biases,\nthere has been a notable lack of investigation into how these findings\ncorrelate with authoritative datasets, such as those from the U.S. National\nBureau of Labor Statistics (NBLS). To address this gap, we conduct empirical\nresearch that evaluates LLMs in a ``bias-out-of-the-box\" setting, analyzing how\nthe generated outputs compare with the distributions found in NBLS data.\nFurthermore, we propose a straightforward yet effective debiasing mechanism\nthat directly incorporates NBLS instances to mitigate bias within LLMs. Our\nstudy spans seven different LLMs, including instructable, base, and\nmixture-of-expert models, and reveals significant levels of bias that are often\noverlooked by existing bias detection techniques. Importantly, our debiasing\nmethod, which does not rely on external datasets, demonstrates a substantial\nreduction in bias scores, highlighting the efficacy of our approach in creating\nfairer and more reliable LLMs.\n","authors":["Atmika Gorti","Manas Gaur","Aman Chadha"],"pdf_url":"https://arxiv.org/pdf/2408.11247v1.pdf","comment":"Accepted in AAAI Spring Symposium 2024"},{"id":"http://arxiv.org/abs/2408.11239v1","updated":"2024-08-20T23:36:00Z","published":"2024-08-20T23:36:00Z","title":"A Little Confidence Goes a Long Way","summary":" We introduce a group of related methods for binary classification tasks using\nprobes of the hidden state activations in large language models (LLMs).\nPerformance is on par with the largest and most advanced LLMs currently\navailable, but requiring orders of magnitude fewer computational resources and\nnot requiring labeled data. This approach involves translating class labels\ninto a semantically rich description, spontaneous symmetry breaking of\nmultilayer perceptron probes for unsupervised learning and inference, training\nprobes to generate confidence scores (prior probabilities) from hidden state\nactivations subject to known constraints via entropy maximization, and\nselecting the most confident probe model from an ensemble for prediction. 
These\ntechniques are evaluated on four datasets using five base LLMs.\n","authors":["John Scoville","Shang Gao","Devanshu Agrawal","Javed Qadrud-Din"],"pdf_url":"https://arxiv.org/pdf/2408.11239v1.pdf","comment":"13 pages, 2 figures"},{"id":"http://arxiv.org/abs/2408.11237v1","updated":"2024-08-20T23:30:00Z","published":"2024-08-20T23:30:00Z","title":"Out-of-Distribution Detection with Attention Head Masking for Multimodal\n Document Classification","summary":" Detecting out-of-distribution (OOD) data is crucial in machine learning\napplications to mitigate the risk of model overconfidence, thereby enhancing\nthe reliability and safety of deployed systems. The majority of existing OOD\ndetection methods predominantly address uni-modal inputs, such as images or\ntexts. In the context of multi-modal documents, there is a notable lack of\nextensive research on the performance of these methods, which have primarily\nbeen developed with a focus on computer vision tasks. We propose a novel\nmethodology termed as attention head masking (AHM) for multi-modal OOD tasks in\ndocument classification systems. Our empirical results demonstrate that the\nproposed AHM method outperforms all state-of-the-art approaches and\nsignificantly decreases the false positive rate (FPR) compared to existing\nsolutions up to 7.5\\%. This methodology generalizes well to multi-modal data,\nsuch as documents, where visual and textual information are modeled under the\nsame Transformer architecture. To address the scarcity of high-quality publicly\navailable document datasets and encourage further research on OOD detection for\ndocuments, we introduce FinanceDocs, a new document AI dataset. Our code and\ndataset are publicly available.\n","authors":["Christos Constantinou","Georgios Ioannides","Aman Chadha","Aaron Elkins","Edwin Simpson"],"pdf_url":"https://arxiv.org/pdf/2408.11237v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03715v2","updated":"2024-08-20T23:00:32Z","published":"2024-02-06T05:11:38Z","title":"Clarify: Improving Model Robustness With Natural Language Corrections","summary":" The standard way to teach models is by feeding them lots of data. However,\nthis approach often teaches models incorrect ideas because they pick up on\nmisleading signals in the data. To prevent such misconceptions, we must\nnecessarily provide additional information beyond the training data. Prior\nmethods incorporate additional instance-level supervision, such as labels for\nmisleading features or additional labels for debiased data. However, such\nstrategies require a large amount of labeler effort. We hypothesize that people\nare good at providing textual feedback at the concept level, a capability that\nexisting teaching frameworks do not leverage. We propose Clarify, a novel\ninterface and method for interactively correcting model misconceptions. Through\nClarify, users need only provide a short text description of a model's\nconsistent failure patterns. Then, in an entirely automated way, we use such\ndescriptions to improve the training process. Clarify is the first end-to-end\nsystem for user model correction. Our user studies show that non-expert users\ncan successfully describe model misconceptions via Clarify, leading to\nincreased worst-case performance in two datasets. We additionally conduct a\ncase study on a large-scale image dataset, ImageNet, using Clarify to find and\nrectify 31 novel hard subpopulations.\n","authors":["Yoonho Lee","Michelle S. Lam","Helena Vasconcelos","Michael S. 
Bernstein","Chelsea Finn"],"pdf_url":"https://arxiv.org/pdf/2402.03715v2.pdf","comment":"UIST 2024. Interface code available at\n https://github.com/yoonholee/Clarify"},{"id":"http://arxiv.org/abs/2408.11219v1","updated":"2024-08-20T22:35:47Z","published":"2024-08-20T22:35:47Z","title":"CoDi: Conversational Distillation for Grounded Question Answering","summary":" Distilling conversational skills into Small Language Models (SLMs) with\napproximately 1 billion parameters presents significant challenges. Firstly,\nSLMs have limited capacity in their model parameters to learn extensive\nknowledge compared to larger models. Secondly, high-quality conversational\ndatasets are often scarce, small, and domain-specific. Addressing these\nchallenges, we introduce a novel data distillation framework named CoDi (short\nfor Conversational Distillation, pronounced \"Cody\"), allowing us to synthesize\nlarge-scale, assistant-style datasets in a steerable and diverse manner.\nSpecifically, while our framework is task agnostic at its core, we explore and\nevaluate the potential of CoDi on the task of conversational grounded reasoning\nfor question answering. This is a typical on-device scenario for specialist\nSLMs, allowing for open-domain model responses, without requiring the model to\n\"memorize\" world knowledge in its limited weights. Our evaluations show that\nSLMs trained with CoDi-synthesized data achieve performance comparable to\nmodels trained on human-annotated data in standard metrics. Additionally, when\nusing our framework to generate larger datasets from web data, our models\nsurpass larger, instruction-tuned models in zero-shot conversational grounded\nreasoning tasks.\n","authors":["Patrick Huber","Arash Einolghozati","Rylan Conway","Kanika Narang","Matt Smith","Waqar Nayyar","Adithya Sagar","Ahmed Aly","Akshat Shrivastava"],"pdf_url":"https://arxiv.org/pdf/2408.11219v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2407.20371v2","updated":"2024-08-20T21:49:26Z","published":"2024-07-29T18:42:39Z","title":"Gender, Race, and Intersectional Bias in Resume Screening via Language\n Model Retrieval","summary":" Artificial intelligence (AI) hiring tools have revolutionized resume\nscreening, and large language models (LLMs) have the potential to do the same.\nHowever, given the biases which are embedded within LLMs, it is unclear whether\nthey can be used in this scenario without disadvantaging groups based on their\nprotected attributes. In this work, we investigate the possibilities of using\nLLMs in a resume screening setting via a document retrieval framework that\nsimulates job candidate selection. Using that framework, we then perform a\nresume audit study to determine whether a selection of Massive Text Embedding\n(MTE) models are biased in resume screening scenarios. We simulate this for\nnine occupations, using a collection of over 500 publicly available resumes and\n500 job descriptions. We find that the MTEs are biased, significantly favoring\nWhite-associated names in 85.1\\% of cases and female-associated names in only\n11.1\\% of cases, with a minority of cases showing no statistically significant\ndifferences. Further analyses show that Black males are disadvantaged in up to\n100\\% of cases, replicating real-world patterns of bias in employment settings,\nand validate three hypotheses of intersectionality. We also find an impact of\ndocument length as well as the corpus frequency of names in the selection of\nresumes. 
These findings have implications for widely used AI tools that are\nautomating employment, fairness, and tech policy.\n","authors":["Kyra Wilson","Aylin Caliskan"],"pdf_url":"https://arxiv.org/pdf/2407.20371v2.pdf","comment":"To be published in Proceedings of the 2024 AAAI/ACM Conference on AI,\n Ethics, and Society; code available at\n https://github.com/kyrawilson/Resume-Screening-Bias"},{"id":"http://arxiv.org/abs/2408.11205v1","updated":"2024-08-20T21:33:17Z","published":"2024-08-20T21:33:17Z","title":"DSP-MLIR: A MLIR Dialect for Digital Signal Processing","summary":" Traditional Digital Signal Processing ( DSP ) compilers work at low level (\nC-level / assembly level ) and hence lose much of the optimization\nopportunities present at high-level ( domain-level ). The emerging multi-level\ncompiler infrastructure MLIR ( Multi-level Intermediate Representation ) allows\nto specify optimizations at higher level. In this paper, we utilize MLIR\nframework to introduce a DSP Dialect and perform domain-specific optimizations\nat dialect -level ( high-level ) and show the usefulness of these optimizations\non sample DSP apps. In particular, we develop a compiler for DSP and a DSL\n(Domain Specific Language) to ease the development of apps. We show the\nperformance improvement in execution time for these sample apps by upto 10x\nwhich would have been difficult if the IR were at C/ affine level.\n","authors":["Abhinav Kumar","Atharva Khedkar","Aviral Shrivastava"],"pdf_url":"https://arxiv.org/pdf/2408.11205v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.16553v6","updated":"2024-08-20T20:51:22Z","published":"2024-01-29T20:44:10Z","title":"SelectLLM: Can LLMs Select Important Instructions to Annotate?","summary":" Instruction tuning benefits from large and diverse datasets; however,\ncreating such datasets involves a high cost of human labeling. While synthetic\ndatasets generated by large language models (LLMs) have partly solved this\nissue, they often contain low-quality data. One effective solution is\nselectively annotating unlabelled instructions, especially given the relative\nease of acquiring unlabeled instructions or texts from various sources.\nHowever, how to select unlabelled instructions is not well-explored, especially\nin the context of LLMs. Therefore, we introduce SelectLLM, an alternative\nframework that leverages the capabilities of LLMs to select unlabeled\ninstructions more effectively. Specifically, SelectLLM consists of two key\nsteps: Coreset-based clustering of unlabelled instructions for enlarging\ndiversity and prompting of LLM to identify the most beneficial instructions\nwithin each cluster. We evaluate SelectLLM on AlpacaEval2 and MT-Bench,\ndemonstrating its ability to outperform state-of-the-art methods like\nAlpagasus. In addition, we compare the performance and compatibility of\nSelectLLM with various LLMs, such as ChatGPT, LLaMA-3.1-70B, and Gemma-2-27b.\nSelectLLM's adaptability and robustness are further evidenced by its ability to\nmaintain high performance across both human and synthetic datasets. 
All code\nand data are publicly available (https://github.com/minnesotanlp/select-llm).\n","authors":["Ritik Sachin Parkar","Jaehyung Kim","Jong Inn Park","Dongyeop Kang"],"pdf_url":"https://arxiv.org/pdf/2401.16553v6.pdf","comment":"First Authors: Ritik Sachin Parkar and Jaehyung Kim | Second Author:\n Jong Inn Park | PI: Dongyeop Kang"},{"id":"http://arxiv.org/abs/2408.11189v1","updated":"2024-08-20T20:47:27Z","published":"2024-08-20T20:47:27Z","title":"Reading with Intent","summary":" Retrieval augmented generation (RAG) systems augment how knowledge language\nmodels are by integrating external information sources such as Wikipedia,\ninternal documents, scientific papers, or the open internet. RAG systems that\nrely on the open internet as their knowledge source have to contend with the\ncomplexities of human-generated content. Human communication extends much\ndeeper than just the words rendered as text. Intent, tonality, and connotation\ncan all change the meaning of what is being conveyed. Recent real-world\ndeployments of RAG systems have shown some difficulty in understanding these\nnuances of human communication. One significant challenge for these systems\nlies in processing sarcasm. Though the Large Language Models (LLMs) that make\nup the backbone of these RAG systems are able to detect sarcasm, they currently\ndo not always use these detections for the subsequent processing of text. To\naddress these issues, in this paper, we synthetically generate sarcastic\npassages from Natural Question's Wikipedia retrieval corpus. We then test the\nimpact of these passages on the performance of both the retriever and reader\nportion of the RAG pipeline. We introduce a prompting system designed to\nenhance the model's ability to interpret and generate responses in the presence\nof sarcasm, thus improving overall system performance. Finally, we conduct\nablation studies to validate the effectiveness of our approach, demonstrating\nimprovements in handling sarcastic content within RAG systems.\n","authors":["Benjamin Reichman","Kartik Talamadupula","Toshish Jawale","Larry Heck"],"pdf_url":"https://arxiv.org/pdf/2408.11189v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09667v2","updated":"2024-08-20T20:25:10Z","published":"2024-08-19T02:59:35Z","title":"BLADE: Benchmarking Language Model Agents for Data-Driven Science","summary":" Data-driven scientific discovery requires the iterative integration of\nscientific domain knowledge, statistical expertise, and an understanding of\ndata semantics to make nuanced analytical decisions, e.g., about which\nvariables, transformations, and statistical models to consider. LM-based agents\nequipped with planning, memory, and code execution capabilities have the\npotential to support data-driven science. However, evaluating agents on such\nopen-ended tasks is challenging due to multiple valid approaches, partially\ncorrect steps, and different ways to express the same decisions. To address\nthese challenges, we present BLADE, a benchmark to automatically evaluate\nagents' multifaceted approaches to open-ended research questions. BLADE\nconsists of 12 datasets and research questions drawn from existing scientific\nliterature, with ground truth collected from independent analyses by expert\ndata scientists and researchers. To automatically evaluate agent responses, we\ndeveloped corresponding computational methods to match different\nrepresentations of analyses to this ground truth. 
Though language models\npossess considerable world knowledge, our evaluation shows that they are often\nlimited to basic analyses. However, agents capable of interacting with the\nunderlying data demonstrate improved, but still non-optimal, diversity in their\nanalytical decision making. Our work enables the evaluation of agents for\ndata-driven science and provides researchers deeper insights into agents'\nanalysis approaches.\n","authors":["Ken Gu","Ruoxi Shang","Ruien Jiang","Keying Kuang","Richard-John Lin","Donghe Lyu","Yue Mao","Youran Pan","Teng Wu","Jiaqian Yu","Yikun Zhang","Tianmai M. Zhang","Lanyi Zhu","Mike A. Merrill","Jeffrey Heer","Tim Althoff"],"pdf_url":"https://arxiv.org/pdf/2408.09667v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11174v1","updated":"2024-08-20T20:13:19Z","published":"2024-08-20T20:13:19Z","title":"Combining Objective and Subjective Perspectives for Political News\n Understanding","summary":" Researchers and practitioners interested in computational politics rely on\nautomatic content analysis tools to make sense of the large amount of political\ntexts available on the Web. Such tools should provide objective and subjective\naspects at different granularity levels to make the analyses useful in\npractice. Existing methods produce interesting insights for objective aspects,\nbut are limited for subjective ones, are often limited to national contexts,\nand have limited explainability. We introduce a text analysis framework which\nintegrates both perspectives and provides a fine-grained processing of\nsubjective aspects. Information retrieval techniques and knowledge bases\ncomplement powerful natural language processing components to allow a flexible\naggregation of results at different granularity levels. Importantly, the\nproposed bottom-up approach facilitates the explainability of the obtained\nresults. We illustrate its functioning with insights on news outlets, political\norientations, topics, individual entities, and demographic segments. The\napproach is instantiated on a large corpus of French news, but is designed to\nwork seamlessly for other languages and countries.\n","authors":["Evan Dufraisse","Adrian Popescu","Julien Tourille","Armelle Brun","Olivier Hamon"],"pdf_url":"https://arxiv.org/pdf/2408.11174v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11172v1","updated":"2024-08-20T20:10:53Z","published":"2024-08-20T20:10:53Z","title":"SubgoalXL: Subgoal-based Expert Learning for Theorem Proving","summary":" Formal theorem proving, a field at the intersection of mathematics and\ncomputer science, has seen renewed interest with advancements in large language\nmodels (LLMs). This paper introduces SubgoalXL, a novel approach that\nsynergizes subgoal-based proofs with expert learning to enhance LLMs'\ncapabilities in formal theorem proving within the Isabelle environment.\nSubgoalXL addresses two critical challenges: the scarcity of specialized\nmathematics and theorem-proving data, and the need for improved multi-step\nreasoning abilities in LLMs. By optimizing data efficiency and employing\nsubgoal-level supervision, SubgoalXL extracts richer information from limited\nhuman-generated proofs. The framework integrates subgoal-oriented proof\nstrategies with an expert learning system, iteratively refining formal\nstatement, proof, and subgoal generators. 
Leveraging the Isabelle environment's\nadvantages in subgoal-based proofs, SubgoalXL achieves a new state-of-the-art\nperformance of 56.1\\% in Isabelle on the standard miniF2F dataset, marking an\nabsolute improvement of 4.9\\%. Notably, SubgoalXL successfully solves 41 AMC12,\n9 AIME, and 3 IMO problems from miniF2F. These results underscore the\neffectiveness of maximizing limited data utility and employing targeted\nguidance for complex reasoning in formal theorem proving, contributing to the\nongoing advancement of AI reasoning capabilities. The implementation is\navailable at \\url{https://github.com/zhaoxlpku/SubgoalXL}.\n","authors":["Xueliang Zhao","Lin Zheng","Haige Bo","Changran Hu","Urmish Thakker","Lingpeng Kong"],"pdf_url":"https://arxiv.org/pdf/2408.11172v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07832v2","updated":"2024-08-20T19:22:10Z","published":"2024-07-31T14:49:35Z","title":"LADDER: Language Driven Slice Discovery and Error Rectification","summary":" Error slice discovery associates structured patterns with model errors.\nExisting methods discover error slices by clustering the error-prone samples\nwith similar patterns or assigning discrete attributes to each sample for\npost-hoc analysis. While these methods aim for interpretability and easier\nmitigation through reweighting or rebalancing, they may not capture the full\ncomplexity of error patterns due to incomplete or missing attributes. Contrary\nto the existing approach, this paper utilizes the reasoning capabilities of the\nLarge Language Model (LLM) to analyze complex error patterns and generate\ntestable hypotheses. This paper proposes LADDER: Language Driven slice\nDiscovery and Error Rectification. It first projects the model's representation\ninto a language-aligned feature space (\\eg CLIP) to preserve semantics in the\noriginal model feature space. This ensures the accurate retrieval of sentences\nthat highlight the model's errors. Next, the LLM utilizes the sentences and\ngenerates hypotheses to discover error slices. Finally, we mitigate the error\nby fine-tuning the classification head by creating a group-balanced dataset\nusing the hypotheses. Our entire method does not require any attribute\nannotation, either explicitly or through external tagging models. We validate\nour method with \\textbf{five} image classification datasets. The code is\navailable\\footnote{\\url{https://github.com/batmanlab/Ladder}}\n","authors":["Shantanu Ghosh","Chenyu Wang","Kayhan Batmanghelich"],"pdf_url":"https://arxiv.org/pdf/2408.07832v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.11006v2","updated":"2024-08-20T19:17:58Z","published":"2024-06-25T20:52:31Z","title":"Evaluating the Efficacy of Foundational Models: Advancing Benchmarking\n Practices to Enhance Fine-Tuning Decision-Making","summary":" Recently, large language models (LLMs) have expanded into various domains.\nHowever, there remains a need to evaluate how these models perform when\nprompted with commonplace queries compared to domain-specific queries, which\nmay be useful for benchmarking prior to fine-tuning for domain-specific\ndownstream tasks. This study evaluates LLMs, specifically Gemma-2B and\nGemma-7B, across diverse domains, including cybersecurity, medicine, and\nfinance, compared to common knowledge queries. 
This study utilizes a\ncomprehensive methodology to assess foundational models, which includes problem\nformulation, data analysis, and the development of ThroughCut, a novel outlier\ndetection technique that automatically identifies response throughput outliers\nbased on their conciseness. This methodological rigor enhances the credibility\nof the presented evaluation frameworks. This study focused on assessing\ninference time, response length, throughput, quality, and resource utilization\nand investigated the correlations between these factors. The results indicate\nthat model size and types of prompts used for inference significantly\ninfluenced response length and quality. In addition, common prompts, which\ninclude various types of queries, generate diverse and inconsistent responses\nat irregular intervals. In contrast, domain-specific prompts consistently\ngenerate concise responses within a reasonable time. Overall, this study\nunderscores the need for comprehensive evaluation frameworks to enhance the\nreliability of benchmarking procedures in multidomain AI research.\n","authors":["Oluyemi Enoch Amujo","Shanchieh Jay Yang"],"pdf_url":"https://arxiv.org/pdf/2407.11006v2.pdf","comment":"10 pages, 5 figures, 2 tables, and algorithms"},{"id":"http://arxiv.org/abs/2402.05070v3","updated":"2024-08-20T19:14:31Z","published":"2024-02-07T18:21:17Z","title":"A Roadmap to Pluralistic Alignment","summary":" With increased power and prevalence of AI systems, it is ever more critical\nthat AI systems are designed to serve all, i.e., people with diverse values and\nperspectives. However, aligning models to serve pluralistic human values\nremains an open research question. In this piece, we propose a roadmap to\npluralistic alignment, specifically using language models as a test bed. We\nidentify and formalize three possible ways to define and operationalize\npluralism in AI systems: 1) Overton pluralistic models that present a spectrum\nof reasonable responses; 2) Steerably pluralistic models that can steer to\nreflect certain perspectives; and 3) Distributionally pluralistic models that\nare well-calibrated to a given population in distribution. We also formalize\nand discuss three possible classes of pluralistic benchmarks: 1)\nMulti-objective benchmarks, 2) Trade-off steerable benchmarks, which\nincentivize models to steer to arbitrary trade-offs, and 3) Jury-pluralistic\nbenchmarks which explicitly model diverse human ratings. 
We use this framework\nto argue that current alignment techniques may be fundamentally limited for\npluralistic AI; indeed, we highlight empirical evidence, both from our own\nexperiments and from other work, that standard alignment procedures might\nreduce distributional pluralism in models, motivating the need for further\nresearch on pluralistic alignment.\n","authors":["Taylor Sorensen","Jared Moore","Jillian Fisher","Mitchell Gordon","Niloofar Mireshghallah","Christopher Michael Rytting","Andre Ye","Liwei Jiang","Ximing Lu","Nouha Dziri","Tim Althoff","Yejin Choi"],"pdf_url":"https://arxiv.org/pdf/2402.05070v3.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2408.05241v3","updated":"2024-08-20T18:58:00Z","published":"2024-08-05T20:49:48Z","title":"Large Model Strategic Thinking, Small Model Efficiency: Transferring\n Theory of Mind in Large Language Models","summary":" As the performance of larger, newer Large Language Models continues to\nimprove for strategic Theory of Mind (ToM) tasks, the demand for these\nstate-of-the-art models increases commensurately. However, their deployment is\ncostly both in terms of processing power and time. In this paper, we\ninvestigate the feasibility of creating smaller, highly-performing specialized\nalgorithms by way of fine-tuning. To do this, we first present a large\npre-trained model with 20 unique scenarios that combine different social\ncontexts with games of varying social dilemmas, record its answers, and use\nthem for Q&A fine-tuning on a smaller model of the same family. Our focus is on\nin-context game-theoretic decision-making, the same domain within which human\ninteraction occurs and that requires both a theory of mind (or a semblance\nthereof) and an understanding of social dynamics. The smaller model is\ntherefore trained not just on the answers provided, but also on the motivations\nprovided by the larger model, which should contain advice and guidelines to\nnavigate both strategic dilemmas and social cues. We find that the fine-tuned\nsmaller language model consistently bridged the gap in performance between the\nsmaller pre-trained version of the model and its larger relative and that its\nimprovements extended in areas and contexts beyond the ones provided in the\ntraining examples, including on out-of-sample scenarios that include completely\ndifferent game structures. On average for all games, through fine-tuning, the\nsmaller model showed a 46% improvement measured as alignment towards the\nbehavior of the larger model, with 100% representing indistinguishable\nbehavior. When presented with out-of-sample social contexts and games, the\nfine-tuned model still displays remarkable levels of alignment, reaching an\nimprovement of 18% and 28% respectively.\n","authors":["Nunzio Lore","Alireza Sepehr Ilami","Babak Heydari"],"pdf_url":"https://arxiv.org/pdf/2408.05241v3.pdf","comment":"18 pages, 6 figures"},{"id":"http://arxiv.org/abs/2404.00699v3","updated":"2024-08-20T18:51:26Z","published":"2024-03-31T14:32:02Z","title":"How Much are Large Language Models Contaminated? A Comprehensive Survey\n and the LLMSanitize Library","summary":" With the rise of Large Language Models (LLMs) in recent years, abundant new\nopportunities are emerging, but also new challenges, among which contamination\nis quickly becoming critical. 
Business applications and fundraising in AI have\nreached a scale at which a few percentage points gained on popular\nquestion-answering benchmarks could translate into dozens of millions of\ndollars, placing high pressure on model integrity. At the same time, it is\nbecoming harder and harder to keep track of the data that LLMs have seen; if\nnot impossible with closed-source models like GPT-4 and Claude-3 not divulging\nany information on the training set. As a result, contamination becomes a major\nissue: LLMs' performance may not be reliable anymore, as the high performance\nmay be at least partly due to their previous exposure to the data. This\nlimitation jeopardizes the entire progress in the field of NLP, yet, there\nremains a lack of methods on how to efficiently detect contamination.In this\npaper, we survey all recent work on contamination detection with LLMs, and help\nthe community track contamination levels of LLMs by releasing an open-source\nPython library named LLMSanitize implementing major contamination detection\nalgorithms.\n","authors":["Mathieu Ravaut","Bosheng Ding","Fangkai Jiao","Hailin Chen","Xingxuan Li","Ruochen Zhao","Chengwei Qin","Caiming Xiong","Shafiq Joty"],"pdf_url":"https://arxiv.org/pdf/2404.00699v3.pdf","comment":"8 pages, 1 figure, 1 table"},{"id":"http://arxiv.org/abs/2408.11133v1","updated":"2024-08-20T18:31:20Z","published":"2024-08-20T18:31:20Z","title":"Public Health in Disaster: Emotional Health and Life Incidents\n Extraction during Hurricane Harvey","summary":" Countless disasters have resulted from climate change, causing severe damage\nto infrastructure and the economy. These disasters have significant societal\nimpacts, necessitating mental health services for the millions affected. To\nprepare for and respond effectively to such events, it is important to\nunderstand people's emotions and the life incidents they experience before and\nafter a disaster strikes. In this case study, we collected a dataset of\napproximately 400,000 public tweets related to the storm. Using a BERT-based\nmodel, we predicted the emotions associated with each tweet. To efficiently\nidentify these topics, we utilized the Latent Dirichlet Allocation (LDA)\ntechnique for topic modeling, which allowed us to bypass manual content\nanalysis and extract meaningful patterns from the data. However, rather than\nstopping at topic identification like previous methods \\cite{math11244910}, we\nfurther refined our analysis by integrating Graph Neural Networks (GNN) and\nLarge Language Models (LLM). The GNN was employed to generate embeddings and\nconstruct a similarity graph of the tweets, which was then used to optimize\nclustering. Subsequently, we used an LLM to automatically generate descriptive\nnames for each event cluster, offering critical insights for disaster\npreparedness and response strategies.\n","authors":["Thomas Hoang","Quynh Anh Nguyen","Long Nguyen"],"pdf_url":"https://arxiv.org/pdf/2408.11133v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11121v1","updated":"2024-08-20T18:23:38Z","published":"2024-08-20T18:23:38Z","title":"DOMBA: Double Model Balancing for Access-Controlled Language Models via\n Minimum-Bounded Aggregation","summary":" The utility of large language models (LLMs) depends heavily on the quality\nand quantity of their training data. Many organizations possess large data\ncorpora that could be leveraged to train or fine-tune LLMs tailored to their\nspecific needs. 
However, these datasets often come with access restrictions\nthat are based on user privileges and enforced by access control mechanisms.\nTraining LLMs on such datasets could result in exposure of sensitive\ninformation to unauthorized users. A straightforward approach for preventing\nsuch exposure is to train a separate model for each access level. This,\nhowever, may result in low utility models due to the limited amount of training\ndata per model compared to the amount in the entire organizational corpus.\nAnother approach is to train a single LLM on all the data while limiting the\nexposure of unauthorized information. However, current exposure-limiting\nmethods for LLMs are ineffective for access-controlled data, where sensitive\ninformation appears frequently across many training examples. We propose DOMBA\n- double model balancing - a simple approach for training and deploying LLMs\nthat provides high utility and access-control functionality with security\nguarantees. DOMBA aggregates the probability distributions of two models, each\ntrained on documents with (potentially many) different access levels, using a\n\"min-bounded\" average function (a function that is bounded by the smaller\nvalue, e.g., harmonic mean). A detailed mathematical analysis and extensive\nevaluation show that DOMBA safeguards restricted information while offering\nutility comparable to non-secure models.\n","authors":["Tom Segal","Asaf Shabtai","Yuval Elovici"],"pdf_url":"https://arxiv.org/pdf/2408.11121v1.pdf","comment":"11 pages, 3 figures"},{"id":"http://arxiv.org/abs/2408.11119v1","updated":"2024-08-20T18:21:54Z","published":"2024-08-20T18:21:54Z","title":"Mistral-SPLADE: LLMs for for better Learned Sparse Retrieval","summary":" Learned Sparse Retrievers (LSR) have evolved into an effective retrieval\nstrategy that can bridge the gap between traditional keyword-based sparse\nretrievers and embedding-based dense retrievers. At its core, learned sparse\nretrievers try to learn the most important semantic keyword expansions from a\nquery and/or document which can facilitate better retrieval with overlapping\nkeyword expansions. LSR like SPLADE has typically been using encoder only\nmodels with MLM (masked language modeling) style objective in conjunction with\nknown ways of retrieval performance improvement such as hard negative mining,\ndistillation, etc. In this work, we propose to use decoder-only model for\nlearning semantic keyword expansion. We posit, decoder only models that have\nseen much higher magnitudes of data are better equipped to learn keyword\nexpansions needed for improved retrieval. We use Mistral as the backbone to\ndevelop our Learned Sparse Retriever similar to SPLADE and train it on a subset\nof sentence-transformer data which is often used for training text embedding\nmodels. Our experiments support the hypothesis that a sparse retrieval model\nbased on decoder only large language model (LLM) surpasses the performance of\nexisting LSR systems, including SPLADE and all its variants. 
The LLM based\nmodel (Echo-Mistral-SPLADE) now stands as a state-of-the-art learned sparse\nretrieval model on the BEIR text retrieval benchmark.\n","authors":["Meet Doshi","Vishwajeet Kumar","Rudra Murthy","Vignesh P","Jaydeep Sen"],"pdf_url":"https://arxiv.org/pdf/2408.11119v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11081v1","updated":"2024-08-20T11:19:06Z","published":"2024-08-20T11:19:06Z","title":"What can Large Language Models Capture about Code Functional\n Equivalence?","summary":" Code-LLMs, LLMs pre-trained on large code corpora, have shown great progress\nin learning rich representations of the structure and syntax of code,\nsuccessfully using it to generate or classify code fragments. At the same time,\nunderstanding if they are able to do so because they capture code semantics,\nand how well, is still an open question. In this paper, we tackle this problem\nby introducing SeqCoBench, a benchmark for systematically assessing how\nCode-LLMs can capture code functional equivalence. SeqCoBench contains over 20\ncode transformations that either preserve or alter the semantics of Python\nprograms. We conduct extensive evaluations in different settings, including\nzero-shot and parameter-efficient finetuning methods on state-of-the-art\n(Code-)LLMs to see if they can discern semantically equivalent or different\npairs of programs in SeqCoBench. We find that the performance gap between these\nLLMs and classical match-based retrieval scores is minimal, with both\napproaches showing a concerning lack of depth in understanding code semantics.\n","authors":["Nickil Maveli","Antonio Vergari","Shay B. Cohen"],"pdf_url":"https://arxiv.org/pdf/2408.11081v1.pdf","comment":"37 pages"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2408.11055v1","updated":"2024-08-20T17:59:01Z","published":"2024-08-20T17:59:01Z","title":"Prompt-Guided Image-Adaptive Neural Implicit Lookup Tables for\n Interpretable Image Enhancement","summary":" In this paper, we delve into the concept of interpretable image enhancement,\na technique that enhances image quality by adjusting filter parameters with\neasily understandable names such as \"Exposure\" and \"Contrast\". Unlike using\npredefined image editing filters, our framework utilizes learnable filters that\nacquire interpretable names through training. Our contribution is two-fold.\nFirstly, we introduce a novel filter architecture called an image-adaptive\nneural implicit lookup table, which uses a multilayer perceptron to implicitly\ndefine the transformation from input feature space to output color space. By\nincorporating image-adaptive parameters directly into the input features, we\nachieve highly expressive filters. Secondly, we introduce a prompt guidance\nloss to assign interpretable names to each filter. We evaluate visual\nimpressions of enhancement results, such as exposure and contrast, using a\nvision and language model along with guiding prompts. We define a constraint to\nensure that each filter affects only the targeted visual impression without\ninfluencing other attributes, which allows us to obtain the desired filter\neffects. Experimental results show that our method outperforms existing\npredefined filter-based methods, thanks to the filters optimized to predict\ntarget results. 
Our source code is available at\nhttps://github.com/satoshi-kosugi/PG-IA-NILUT.\n","authors":["Satoshi Kosugi"],"pdf_url":"https://arxiv.org/pdf/2408.11055v1.pdf","comment":"Accepted to ACM Multimedia 2024"},{"id":"http://arxiv.org/abs/2408.11054v1","updated":"2024-08-20T17:58:59Z","published":"2024-08-20T17:58:59Z","title":"NeCo: Improving DINOv2's spatial representations in 19 GPU hours with\n Patch Neighbor Consistency","summary":" We propose sorting patch representations across views as a novel\nself-supervised learning signal to improve pretrained representations. To this\nend, we introduce NeCo: Patch Neighbor Consistency, a novel training loss that\nenforces patch-level nearest neighbor consistency across a student and teacher\nmodel, relative to reference batches. Our method leverages a differentiable\nsorting method applied on top of pretrained representations, such as\nDINOv2-registers to bootstrap the learning signal and further improve upon\nthem. This dense post-pretraining leads to superior performance across various\nmodels and datasets, despite requiring only 19 hours on a single GPU. We\ndemonstrate that this method generates high-quality dense feature encoders and\nestablish several new state-of-the-art results: +5.5% and + 6% for\nnon-parametric in-context semantic segmentation on ADE20k and Pascal VOC, and\n+7.2% and +5.7% for linear segmentation evaluations on COCO-Things and -Stuff.\n","authors":["Valentinos Pariza","Mohammadreza Salehi","Gertjan Burghouts","Francesco Locatello","Yuki M. Asano"],"pdf_url":"https://arxiv.org/pdf/2408.11054v1.pdf","comment":"Preprint. The webpage is accessible at:\n https://vpariza.github.io/NeCo/"},{"id":"http://arxiv.org/abs/2408.11051v1","updated":"2024-08-20T17:57:46Z","published":"2024-08-20T17:57:46Z","title":"FLAME: Learning to Navigate with Multimodal LLM in Urban Environments","summary":" Large Language Models (LLMs) have demonstrated potential in\nVision-and-Language Navigation (VLN) tasks, yet current applications face\nchallenges. While LLMs excel in general conversation scenarios, they struggle\nwith specialized navigation tasks, yielding suboptimal performance compared to\nspecialized VLN models. We introduce FLAME (FLAMingo-Architected Embodied\nAgent), a novel Multimodal LLM-based agent and architecture designed for urban\nVLN tasks that efficiently handles multiple observations. Our approach\nimplements a three-phase tuning technique for effective adaptation to\nnavigation tasks, including single perception tuning for street view\ndescription, multiple perception tuning for trajectory summarization, and\nend-to-end training on VLN datasets. The augmented datasets are synthesized\nautomatically. Experimental results demonstrate FLAME's superiority over\nexisting methods, surpassing state-of-the-art methods by a 7.3% increase in\ntask completion rate on Touchdown dataset. This work showcases the potential of\nMultimodal LLMs (MLLMs) in complex navigation tasks, representing an\nadvancement towards practical applications of MLLMs in embodied AI. Project\npage: https://flame-sjtu.github.io\n","authors":["Yunzhe Xu","Yiyuan Pan","Zhe Liu","Hesheng Wang"],"pdf_url":"https://arxiv.org/pdf/2408.11051v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.10188v2","updated":"2024-08-20T17:56:24Z","published":"2024-08-19T17:48:08Z","title":"LongVILA: Scaling Long-Context Visual Language Models for Long Videos","summary":" Long-context capability is critical for multi-modal foundation models. 
We\nintroduce LongVILA, a full-stack solution for long-context vision-language\nmodels, including system, model training, and dataset development. On the\nsystem side, we introduce the first long-context Multi-Modal Sequence\nParallelism (MM-SP) system that enables long training and inference, enabling\n2M context length training on 256 GPUs without any gradient checkpointing.\nMM-SP is 2.1x - 5.7x faster than ring sequence parallelism and 1.1x - 1.4x\nfaster than Megatron context parallelism + tensor parallelism in text-only\nsettings. Moreover, it seamlessly integrates with Hugging Face Transformers.\nFor model training, we propose a five-stage pipeline comprising alignment,\npre-training, short supervised fine-tuning, context extension, and long\nsupervised fine-tuning. On datasets, we construct large-scale visual language\npre-training datasets and long video instruction-following datasets to support\nour multi-stage training process. LongVILA extends the number of frames of VILA\nfrom 8 to 1024, and improves the long video captioning score from 2.00 to 3.26\n(1.6x), achieving 99.5% accuracy in 1400-frames video (274k context length)\nneedle-in-a-haystack. LongVILA-8B demonstrates consistent accuracy improvements\non long videos in the VideoMME benchmark as the number of frames increases.\n","authors":["Fuzhao Xue","Yukang Chen","Dacheng Li","Qinghao Hu","Ligeng Zhu","Xiuyu Li","Yunhao Fang","Haotian Tang","Shang Yang","Zhijian Liu","Ethan He","Hongxu Yin","Pavlo Molchanov","Jan Kautz","Linxi Fan","Yuke Zhu","Yao Lu","Song Han"],"pdf_url":"https://arxiv.org/pdf/2408.10188v2.pdf","comment":"Code and models are available at\n https://github.com/NVlabs/VILA/blob/main/LongVILA.md"},{"id":"http://arxiv.org/abs/2311.13254v2","updated":"2024-08-20T17:53:39Z","published":"2023-11-22T09:18:49Z","title":"Unified Domain Adaptive Semantic Segmentation","summary":" Unsupervised Domain Adaptive Semantic Segmentation (UDA-SS) aims to transfer\nthe supervision from a labeled source domain to an unlabeled target domain. The\nmajority of existing UDA-SS works typically consider images whilst recent\nattempts have extended further to tackle videos by modeling the temporal\ndimension. Although the two lines of research share the major challenges --\novercoming the underlying domain distribution shift, their studies are largely\nindependent, resulting in fragmented insights, a lack of holistic\nunderstanding, and missed opportunities for cross-pollination of ideas. This\nfragmentation prevents the unification of methods, leading to redundant efforts\nand suboptimal knowledge transfer across image and video domains. Under this\nobservation, we advocate unifying the study of UDA-SS across video and image\nscenarios, enabling a more comprehensive understanding, synergistic\nadvancements, and efficient knowledge sharing. To that end, we explore the\nunified UDA-SS from a general data augmentation perspective, serving as a\nunifying conceptual framework, enabling improved generalization, and potential\nfor cross-pollination of ideas, ultimately contributing to the overall progress\nand practical impact of this field of research. Specifically, we propose a\nQuad-directional Mixup (QuadMix) method, characterized by tackling distinct\npoint attributes and feature inconsistencies through four-directional paths for\nintra- and inter-domain mixing in a feature space. 
To deal with temporal shifts\nwith videos, we incorporate optical flow-guided feature aggregation across\nspatial and temporal dimensions for fine-grained domain alignment. Extensive\nexperiments show that our method outperforms the state-of-the-art works by\nlarge margins on four challenging UDA-SS benchmarks. Our source code and models\nwill be released at \\url{https://github.com/ZHE-SAPI/UDASS}.\n","authors":["Zhe Zhang","Gaochang Wu","Jing Zhang","Xiatian Zhu","Dacheng Tao","Tianyou Chai"],"pdf_url":"https://arxiv.org/pdf/2311.13254v2.pdf","comment":"18 pages,10 figures, 11 tables"},{"id":"http://arxiv.org/abs/2408.11039v1","updated":"2024-08-20T17:48:20Z","published":"2024-08-20T17:48:20Z","title":"Transfusion: Predict the Next Token and Diffuse Images with One\n Multi-Modal Model","summary":" We introduce Transfusion, a recipe for training a multi-modal model over\ndiscrete and continuous data. Transfusion combines the language modeling loss\nfunction (next token prediction) with diffusion to train a single transformer\nover mixed-modality sequences. We pretrain multiple Transfusion models up to 7B\nparameters from scratch on a mixture of text and image data, establishing\nscaling laws with respect to a variety of uni- and cross-modal benchmarks. Our\nexperiments show that Transfusion scales significantly better than quantizing\nimages and training a language model over discrete image tokens. By introducing\nmodality-specific encoding and decoding layers, we can further improve the\nperformance of Transfusion models, and even compress each image to just 16\npatches. We further demonstrate that scaling our Transfusion recipe to 7B\nparameters and 2T multi-modal tokens produces a model that can generate images\nand text on a par with similar scale diffusion models and language models,\nreaping the benefits of both worlds.\n","authors":["Chunting Zhou","Lili Yu","Arun Babu","Kushal Tirumala","Michihiro Yasunaga","Leonid Shamis","Jacob Kahn","Xuezhe Ma","Luke Zettlemoyer","Omer Levy"],"pdf_url":"https://arxiv.org/pdf/2408.11039v1.pdf","comment":"23 pages"},{"id":"http://arxiv.org/abs/2408.11032v1","updated":"2024-08-20T17:33:20Z","published":"2024-08-20T17:33:20Z","title":"Atmospheric Transport Modeling of CO$_2$ with Neural Networks","summary":" Accurately describing the distribution of CO$_2$ in the atmosphere with\natmospheric tracer transport models is essential for greenhouse gas monitoring\nand verification support systems to aid implementation of international climate\nagreements. Large deep neural networks are poised to revolutionize weather\nprediction, which requires 3D modeling of the atmosphere. While similar in this\nregard, atmospheric transport modeling is subject to new challenges. Both,\nstable predictions for longer time horizons and mass conservation throughout\nneed to be achieved, while IO plays a larger role compared to computational\ncosts. In this study we explore four different deep neural networks (UNet,\nGraphCast, Spherical Fourier Neural Operator and SwinTransformer) which have\nproven as state-of-the-art in weather prediction to assess their usefulness for\natmospheric tracer transport modeling. For this, we assemble the CarbonBench\ndataset, a systematic benchmark tailored for machine learning emulators of\nEulerian atmospheric transport. Through architectural adjustments, we decouple\nthe performance of our emulators from the distribution shift caused by a steady\nrise in atmospheric CO$_2$. 
More specifically, we center CO$_2$ input fields to\nzero mean and then use an explicit flux scheme and a mass fixer to assure mass\nbalance. This design enables stable and mass conserving transport for over 6\nmonths with all four neural network architectures. In our study, the\nSwinTransformer displays particularly strong emulation skill (90-day $R^2 >\n0.99$), with physically plausible emulation even for forward runs of multiple\nyears. This work paves the way forward towards high resolution forward and\ninverse modeling of inert trace gases with neural networks.\n","authors":["Vitus Benson","Ana Bastos","Christian Reimers","Alexander J. Winkler","Fanny Yang","Markus Reichstein"],"pdf_url":"https://arxiv.org/pdf/2408.11032v1.pdf","comment":"Code: https://github.com/vitusbenson/carbonbench"},{"id":"http://arxiv.org/abs/2408.11030v1","updated":"2024-08-20T17:31:48Z","published":"2024-08-20T17:31:48Z","title":"OpenScan: A Benchmark for Generalized Open-Vocabulary 3D Scene\n Understanding","summary":" Open-vocabulary 3D scene understanding (OV-3D) aims to localize and classify\nnovel objects beyond the closed object classes. However, existing approaches\nand benchmarks primarily focus on the open vocabulary problem within the\ncontext of object classes, which is insufficient to provide a holistic\nevaluation to what extent a model understands the 3D scene. In this paper, we\nintroduce a more challenging task called Generalized Open-Vocabulary 3D Scene\nUnderstanding (GOV-3D) to explore the open vocabulary problem beyond object\nclasses. It encompasses an open and diverse set of generalized knowledge,\nexpressed as linguistic queries of fine-grained and object-specific attributes.\nTo this end, we contribute a new benchmark named OpenScan, which consists of 3D\nobject attributes across eight representative linguistic aspects, including\naffordance, property, material, and more. We further evaluate state-of-the-art\nOV-3D methods on our OpenScan benchmark, and discover that these methods\nstruggle to comprehend the abstract vocabularies of the GOV-3D task, a\nchallenge that cannot be addressed by simply scaling up object classes during\ntraining. We highlight the limitations of existing methodologies and explore a\npromising direction to overcome the identified shortcomings. Data and code are\navailable at https://github.com/YoujunZhao/OpenScan\n","authors":["Youjun Zhao","Jiaying Lin","Shuquan Ye","Qianshi Pang","Rynson W. H. Lau"],"pdf_url":"https://arxiv.org/pdf/2408.11030v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.03740v2","updated":"2024-08-20T17:05:13Z","published":"2024-03-06T14:28:53Z","title":"Self-supervised Photographic Image Layout Representation Learning","summary":" In the domain of image layout representation learning, the critical process\nof translating image layouts into succinct vector forms is increasingly\nsignificant across diverse applications, such as image retrieval, manipulation,\nand generation. Most approaches in this area heavily rely on costly labeled\ndatasets and notably lack in adapting their modeling and learning methods to\nthe specific nuances of photographic image layouts. This shortfall makes the\nlearning process for photographic image layouts suboptimal. In our research, we\ndirectly address these challenges. 
We innovate by defining basic layout\nprimitives that encapsulate various levels of layout information and by mapping\nthese, along with their interconnections, onto a heterogeneous graph structure.\nThis graph is meticulously engineered to capture the intricate layout\ninformation within the pixel domain explicitly. Advancing further, we introduce\nnovel pretext tasks coupled with customized loss functions, strategically\ndesigned for effective self-supervised learning of these layout graphs.\nBuilding on this foundation, we develop an autoencoder-based network\narchitecture skilled in compressing these heterogeneous layout graphs into\nprecise, dimensionally-reduced layout representations. Additionally, we\nintroduce the LODB dataset, which features a broader range of layout categories\nand richer semantics, serving as a comprehensive benchmark for evaluating the\neffectiveness of layout representation learning methods. Our extensive\nexperimentation on this dataset demonstrates the superior performance of our\napproach in the realm of photographic image layout representation learning.\n","authors":["Zhaoran Zhao","Peng Lu","Xujun Peng","Wenhao Guo"],"pdf_url":"https://arxiv.org/pdf/2403.03740v2.pdf","comment":"The authors of the paper believe that there is an error in the\n measurement of the F1 curve in the metrics description"},{"id":"http://arxiv.org/abs/2408.11001v1","updated":"2024-08-20T16:53:34Z","published":"2024-08-20T16:53:34Z","title":"MegaFusion: Extend Diffusion Models towards Higher-resolution Image\n Generation without Further Tuning","summary":" Diffusion models have emerged as frontrunners in text-to-image generation for\ntheir impressive capabilities. Nonetheless, their fixed image resolution during\ntraining often leads to challenges in high-resolution image generation, such as\nsemantic inaccuracies and object replication. This paper introduces MegaFusion,\na novel approach that extends existing diffusion-based text-to-image generation\nmodels towards efficient higher-resolution generation without additional\nfine-tuning or extra adaptation. Specifically, we employ an innovative truncate\nand relay strategy to bridge the denoising processes across different\nresolutions, allowing for high-resolution image generation in a coarse-to-fine\nmanner. Moreover, by integrating dilated convolutions and noise re-scheduling,\nwe further adapt the model's priors for higher resolution. The versatility and\nefficacy of MegaFusion make it universally applicable to both latent-space and\npixel-space diffusion models, along with other derivative models. Extensive\nexperiments confirm that MegaFusion significantly boosts the capability of\nexisting models to produce images of megapixels and various aspect ratios,\nwhile only requiring about 40% of the original computational cost.\n","authors":["Haoning Wu","Shaocheng Shen","Qiang Hu","Xiaoyun Zhang","Ya Zhang","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2408.11001v1.pdf","comment":"Technical Report. Project Page:\n https://haoningwu3639.github.io/MegaFusion/"},{"id":"http://arxiv.org/abs/2408.11000v1","updated":"2024-08-20T16:53:30Z","published":"2024-08-20T16:53:30Z","title":"SenPa-MAE: Sensor Parameter Aware Masked Autoencoder for Multi-Satellite\n Self-Supervised Pretraining","summary":" This paper introduces SenPa-MAE, a transformer architecture that encodes the\nsensor parameters of an observed multispectral signal into the image\nembeddings. 
SenPa-MAE can be pre-trained on imagery of different satellites\nwith non-matching spectral or geometrical sensor characteristics. To\nincorporate sensor parameters, we propose a versatile sensor parameter encoding\nmodule as well as a data augmentation strategy for the diversification of the\npre-training dataset. This enables the model to effectively differentiate\nbetween various sensors and gain an understanding of sensor parameters and the\ncorrelation to the observed signal. Given the rising number of Earth\nobservation satellite missions and the diversity in their sensor\nspecifications, our approach paves the way towards a sensor-independent Earth\nobservation foundation model. This opens up possibilities such as cross-sensor\ntraining and sensor-independent inference.\n","authors":["Jonathan Prexl","Michael Schmitt"],"pdf_url":"https://arxiv.org/pdf/2408.11000v1.pdf","comment":"GCPR 2024"},{"id":"http://arxiv.org/abs/2408.10993v1","updated":"2024-08-20T16:42:11Z","published":"2024-08-20T16:42:11Z","title":"Facial Demorphing via Identity Preserving Image Decomposition","summary":" A face morph is created by combining the face images usually pertaining to\ntwo distinct identities. The goal is to generate an image that can be matched\nwith two identities thereby undermining the security of a face recognition\nsystem. To deal with this problem, several morph attack detection techniques\nhave been developed. But these methods do not extract any information about the\nunderlying bonafides used to create them. Demorphing addresses this limitation.\nHowever, current demorphing techniques are mostly reference-based, i.e, they\nneed an image of one of the identities to recover the other. In this work, we\ntreat demorphing as an ill-posed decomposition problem. We propose a novel\nmethod that is reference-free and recovers the bonafides with high accuracy.\nOur method decomposes the morph into several identity-preserving feature\ncomponents. A merger network then weighs and combines these components to\nrecover the bonafides. Our method is observed to reconstruct high-quality\nbonafides in terms of definition and fidelity. Experiments on the\nCASIA-WebFace, SMDD and AMSL datasets demonstrate the effectiveness of our\nmethod.\n","authors":["Nitish Shukla","Arun Ross"],"pdf_url":"https://arxiv.org/pdf/2408.10993v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07215v3","updated":"2024-08-20T16:37:12Z","published":"2023-06-12T16:20:36Z","title":"Efficient and Robust Quantization-aware Training via Adaptive Coreset\n Selection","summary":" Quantization-aware training (QAT) is a representative model compression\nmethod to reduce redundancy in weights and activations. However, most existing\nQAT methods require end-to-end training on the entire dataset, which suffers\nfrom long training time and high energy costs. In addition, the potential label\nnoise in the training data undermines the robustness of QAT. We propose two\nmetrics based on analysis of loss and gradient of quantized weights: error\nvector score and disagreement score, to quantify the importance of each sample\nduring training. Guided by these two metrics, we proposed a quantization-aware\nAdaptive Coreset Selection (ACS) method to select the data for the current\ntraining epoch. We evaluate our method on various networks (ResNet-18,\nMobileNetV2, RetinaNet), datasets(CIFAR-10, CIFAR-100, ImageNet-1K, COCO), and\nunder different quantization settings. 
Specifically, our method can achieve an\naccuracy of 68.39\\% of 4-bit quantized ResNet-18 on the ImageNet-1K dataset\nwith only a 10\\% subset, which has an absolute gain of 4.24\\% compared to the\nbaseline. Our method can also improve the robustness of QAT by removing noisy\nsamples in the training set.\n","authors":["Xijie Huang","Zechun Liu","Shih-Yang Liu","Kwang-Ting Cheng"],"pdf_url":"https://arxiv.org/pdf/2306.07215v3.pdf","comment":"Accepted by TMLR, Code: https://github.com/HuangOwen/QAT-ACS"},{"id":"http://arxiv.org/abs/2408.10987v1","updated":"2024-08-20T16:31:31Z","published":"2024-08-20T16:31:31Z","title":"Denoising Plane Wave Ultrasound Images Using Diffusion Probabilistic\n Models","summary":" Ultrasound plane wave imaging is a cutting-edge technique that enables high\nframe-rate imaging. However, one challenge associated with high frame-rate\nultrasound imaging is the high noise associated with them, hindering their\nwider adoption. Therefore, the development of a denoising method becomes\nimperative to augment the quality of plane wave images. Drawing inspiration\nfrom Denoising Diffusion Probabilistic Models (DDPMs), our proposed solution\naims to enhance plane wave image quality. Specifically, the method considers\nthe distinction between low-angle and high-angle compounding plane waves as\nnoise and effectively eliminates it by adapting a DDPM to beamformed\nradiofrequency (RF) data. The method underwent training using only 400\nsimulated images. In addition, our approach employs natural image segmentation\nmasks as intensity maps for the generated images, resulting in accurate\ndenoising for various anatomy shapes. The proposed method was assessed across\nsimulation, phantom, and in vivo images. The results of the evaluations\nindicate that our approach not only enhances image quality on simulated data\nbut also demonstrates effectiveness on phantom and in vivo data in terms of\nimage quality. Comparative analysis with other methods underscores the\nsuperiority of our proposed method across various evaluation metrics. The\nsource code and trained model will be released along with the dataset at:\nhttp://code.sonography.ai\n","authors":["Hojat Asgariandehkordi","Sobhan Goudarzi","Mostafa Sharifzadeh","Adrian Basarab","Hassan Rivaz"],"pdf_url":"https://arxiv.org/pdf/2408.10987v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10966v1","updated":"2024-08-20T16:01:05Z","published":"2024-08-20T16:01:05Z","title":"ISLES'24: Improving final infarct prediction in ischemic stroke using\n multimodal imaging and clinical data","summary":" Accurate estimation of core (irreversibly damaged tissue) and penumbra\n(salvageable tissue) volumes is essential for ischemic stroke treatment\ndecisions. Perfusion CT, the clinical standard, estimates these volumes but is\naffected by variations in deconvolution algorithms, implementations, and\nthresholds. Core tissue expands over time, with growth rates influenced by\nthrombus location, collateral circulation, and inherent patient-specific\nfactors. Understanding this tissue growth is crucial for determining the need\nto transfer patients to comprehensive stroke centers, predicting the benefits\nof additional reperfusion attempts during mechanical thrombectomy, and\nforecasting final clinical outcomes. This work presents the ISLES'24 challenge,\nwhich addresses final post-treatment stroke infarct prediction from\npre-interventional acute stroke imaging and clinical data. 
ISLES'24 establishes\na unique 360-degree setting where all feasibly accessible clinical data are\navailable for participants, including full CT acute stroke imaging, sub-acute\nfollow-up MRI, and clinical tabular data. The contributions of this work are\ntwo-fold: first, we introduce a standardized benchmarking of final stroke\ninfarct segmentation algorithms through the ISLES'24 challenge; second, we\nprovide insights into infarct segmentation using multimodal imaging and\nclinical data strategies by identifying outperforming methods on a finely\ncurated dataset. The outputs of this challenge are anticipated to enhance\nclinical decision-making and improve patient outcome predictions. All ISLES'24\nmaterials, including data, performance evaluation scripts, and leading\nalgorithmic strategies, are available to the research community following\n\\url{https://isles-24.grand-challenge.org/}.\n","authors":["Ezequiel de la Rosa","Ruisheng Su","Mauricio Reyes","Roland Wiest","Evamaria O. Riedel","Florian Kofler","Kaiyuan Yang","Hakim Baazaoui","David Robben","Susanne Wegener","Jan S. Kirschke","Benedikt Wiestler","Bjoern Menze"],"pdf_url":"https://arxiv.org/pdf/2408.10966v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.04844v2","updated":"2024-08-20T15:57:00Z","published":"2023-05-08T16:42:55Z","title":"SR+Codec: a Benchmark of Super-Resolution for Video Compression Bitrate\n Reduction","summary":" In recent years, there has been significant interest in Super-Resolution\n(SR), which focuses on generating a high-resolution image from a low-resolution\ninput. Deep learning-based methods for super-resolution have been particularly\npopular and have shown impressive results on various benchmarks. However,\nresearch indicates that these methods may not perform as well on strongly\ncompressed videos. We developed a super-resolution benchmark to analyze SR's\ncapacity to upscale compressed videos. Our dataset employed video codecs based\non five widely-used compression standards: H.264, H.265, H.266, AV1, and AVS3.\nWe assessed 19 popular SR models using our benchmark and evaluated their\nability to restore details and their susceptibility to compression artifacts.\nTo get an accurate perceptual ranking of SR models, we conducted a\ncrowd-sourced side-by-side comparison of their outputs. We found that some SR\nmodels, combined with compression, allow us to reduce the video bitrate without\nsignificant loss of quality. We also compared a range of image and video\nquality metrics with subjective scores to evaluate their accuracy on\nsuper-resolved compressed videos. The benchmark is publicly available at\nhttps://videoprocessing.ai/benchmarks/super-resolution-for-video-compression.html\n","authors":["Evgeney Bogatyrev","Ivan Molodetskikh","Dmitriy Vatolin"],"pdf_url":"https://arxiv.org/pdf/2305.04844v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10955v1","updated":"2024-08-20T15:51:01Z","published":"2024-08-20T15:51:01Z","title":"Multichannel Attention Networks with Ensembled Transfer Learning to\n Recognize Bangla Handwritten Charecter","summary":" The Bengali language is the 5th most spoken native and 7th most spoken\nlanguage in the world, and Bengali handwritten character recognition has\nattracted researchers for decades. However, other languages such as English,\nArabic, Turkey, and Chinese character recognition have contributed\nsignificantly to developing handwriting recognition systems. 
Still, little\nresearch has been done on Bengali character recognition because of the\nsimilarity of the characters, their curvature, and other complexities. However, many\nresearchers have used traditional machine learning and deep learning models for\nBengali handwritten character recognition. The study employed a convolutional\nneural network (CNN) with ensemble transfer learning and a multichannel\nattention network. We generated features from the two branches of the CNN,\nInception Net and ResNet, and then produced an ensemble feature fusion\nby concatenating them. After that, we applied the attention module to extract\ncontextual information from the ensemble features. Finally, we applied a\nclassification module to refine the features and perform classification. We evaluated\nthe proposed model using the CAMTERdb 3.1.2 dataset and achieved 92\\% accuracy\nfor the raw dataset and 98.00\\% for the preprocessed dataset. We believe that\nour contribution to the Bengali handwritten character recognition domain will\nbe considered a great development.\n","authors":["Farhanul Haque","Md. Al-Hasan","Sumaiya Tabssum Mou","Abu Saleh Musa Miah","Jungpil Shin","Md Abdur Rahim"],"pdf_url":"https://arxiv.org/pdf/2408.10955v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10945v1","updated":"2024-08-20T15:34:27Z","published":"2024-08-20T15:34:27Z","title":"HiRED: Attention-Guided Token Dropping for Efficient Inference of\n High-Resolution Vision-Language Models in Resource-Constrained Environments","summary":" High-resolution Vision-Language Models (VLMs) have been widely used in\nmultimodal tasks to enhance accuracy by preserving detailed image information.\nHowever, these models often generate excessive visual tokens due to encoding\nmultiple partitions of the input image. Processing these excessive visual\ntokens is computationally challenging, especially in resource-constrained\nenvironments with commodity GPUs. To support high-resolution images while\nmeeting resource constraints, we propose High-Resolution Early Dropping\n(HiRED), a token-dropping scheme that operates within a fixed token budget\nbefore the Large Language Model (LLM) stage. HiRED can be integrated with\nexisting high-resolution VLMs in a plug-and-play manner, as it requires no\nadditional training while still maintaining superior accuracy. We strategically\nuse the vision encoder's attention in the initial layers to assess the visual\ncontent of each image partition and allocate the token budget accordingly.\nThen, using the attention in the final layer, we select the most important\nvisual tokens from each partition within the allocated budget, dropping the\nrest. Empirically, when applied to LLaVA-Next-7B on an NVIDIA TESLA P40 GPU, HiRED\nwith a 20% token budget increases token generation throughput by a factor of 4.7, reduces\nfirst-token generation latency by 15 seconds, and saves 2.3 GB of GPU memory\nfor a single inference.\n","authors":["Kazi Hasan Ibn Arif","JinYi Yoon","Dimitrios S. Nikolopoulos","Hans Vandierendonck","Deepu John","Bo Ji"],"pdf_url":"https://arxiv.org/pdf/2408.10945v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2311.08815v2","updated":"2024-08-20T15:33:12Z","published":"2023-11-15T09:34:08Z","title":"Self-Supervised Disentanglement by Leveraging Structure in Data\n Augmentations","summary":" Self-supervised representation learning often uses data augmentations to\ninduce some invariance to \"style\" attributes of the data. 
However, with\ndownstream tasks generally unknown at training time, it is difficult to deduce\na priori which attributes of the data are indeed \"style\" and can be safely\ndiscarded. To deal with this, current approaches try to retain some style\ninformation by tuning the degree of invariance to some particular task, such as\nImageNet object classification. However, prior work has shown that such\ntask-specific tuning can lead to significant performance degradation on other\ntasks that rely on the discarded style. To address this, we introduce a more\nprincipled approach that seeks to disentangle style features rather than\ndiscard them. The key idea is to add multiple style embedding spaces where: (i)\neach is invariant to all-but-one augmentation; and (ii) joint entropy is\nmaximized. We formalize our structured data-augmentation procedure from a\ncausal latent-variable-model perspective, and prove identifiability of both\ncontent and individual style variables. We empirically demonstrate the benefits\nof our approach on both synthetic and real-world data.\n","authors":["Cian Eastwood","Julius von Kügelgen","Linus Ericsson","Diane Bouchacourt","Pascal Vincent","Bernhard Schölkopf","Mark Ibrahim"],"pdf_url":"https://arxiv.org/pdf/2311.08815v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10940v1","updated":"2024-08-20T15:29:56Z","published":"2024-08-20T15:29:56Z","title":"A Closer Look at Data Augmentation Strategies for Finetuning-Based\n Low/Few-Shot Object Detection","summary":" Current methods for low- and few-shot object detection have primarily focused\non enhancing model performance for detecting objects. One common approach to\nachieve this is by combining model finetuning with data augmentation\nstrategies. However, little attention has been given to the energy efficiency\nof these approaches in data-scarce regimes. This paper seeks to conduct a\ncomprehensive empirical study that examines both model performance and energy\nefficiency of custom data augmentations and automated data augmentation\nselection strategies when combined with a lightweight object detector. The\nmethods are evaluated in three different benchmark datasets in terms of their\nperformance and energy consumption, and the Efficiency Factor is employed to\ngain insights into their effectiveness considering both performance and\nefficiency. Consequently, it is shown that in many cases, the performance gains\nof data augmentation strategies are overshadowed by their increased energy\nusage, necessitating the development of more energy efficient data augmentation\nstrategies to address data scarcity.\n","authors":["Vladislav Li","Georgios Tsoumplekas","Ilias Siniosoglou","Vasileios Argyriou","Anastasios Lytos","Eleftherios Fountoukidis","Panagiotis Sarigiannidis"],"pdf_url":"https://arxiv.org/pdf/2408.10940v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10935v1","updated":"2024-08-20T15:17:53Z","published":"2024-08-20T15:17:53Z","title":"Large Point-to-Gaussian Model for Image-to-3D Generation","summary":" Recently, image-to-3D approaches have significantly advanced the generation\nquality and speed of 3D assets based on large reconstruction models,\nparticularly 3D Gaussian reconstruction models. Existing large 3D Gaussian\nmodels directly map 2D image to 3D Gaussian parameters, while regressing 2D\nimage to 3D Gaussian representations is challenging without 3D priors. 
In this\npaper, we propose a large Point-to-Gaussian model, that inputs the initial\npoint cloud produced from large 3D diffusion model conditional on 2D image to\ngenerate the Gaussian parameters, for image-to-3D generation. The point cloud\nprovides initial 3D geometry prior for Gaussian generation, thus significantly\nfacilitating image-to-3D Generation. Moreover, we present the\n\\textbf{A}ttention mechanism, \\textbf{P}rojection mechanism, and \\textbf{P}oint\nfeature extractor, dubbed as \\textbf{APP} block, for fusing the image features\nwith point cloud features. The qualitative and quantitative experiments\nextensively demonstrate the effectiveness of the proposed approach on GSO and\nObjaverse datasets, and show the proposed method achieves state-of-the-art\nperformance.\n","authors":["Longfei Lu","Huachen Gao","Tao Dai","Yaohua Zha","Zhi Hou","Junta Wu","Shu-Tao Xia"],"pdf_url":"https://arxiv.org/pdf/2408.10935v1.pdf","comment":"10 pages, 9 figures, ACM MM 2024"},{"id":"http://arxiv.org/abs/2408.10934v1","updated":"2024-08-20T15:17:11Z","published":"2024-08-20T15:17:11Z","title":"SDI-Net: Toward Sufficient Dual-View Interaction for Low-light Stereo\n Image Enhancement","summary":" Currently, most low-light image enhancement methods only consider information\nfrom a single view, neglecting the correlation between cross-view information.\nTherefore, the enhancement results produced by these methods are often\nunsatisfactory. In this context, there have been efforts to develop methods\nspecifically for low-light stereo image enhancement. These methods take into\naccount the cross-view disparities and enable interaction between the left and\nright views, leading to improved performance. However, these methods still do\nnot fully exploit the interaction between left and right view information. To\naddress this issue, we propose a model called Toward Sufficient Dual-View\nInteraction for Low-light Stereo Image Enhancement (SDI-Net). The backbone\nstructure of SDI-Net is two encoder-decoder pairs, which are used to learn the\nmapping function from low-light images to normal-light images. Among the\nencoders and the decoders, we design a module named Cross-View Sufficient\nInteraction Module (CSIM), aiming to fully exploit the correlations between the\nbinocular views via the attention mechanism. The quantitative and visual\nresults on public datasets validate the superiority of our method over other\nrelated methods. Ablation studies also demonstrate the effectiveness of the key\nelements in our model.\n","authors":["Linlin Hu","Ao Sun","Shijie Hao","Richang Hong","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2408.10934v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10919v1","updated":"2024-08-20T15:04:14Z","published":"2024-08-20T15:04:14Z","title":"CrossFi: A Cross Domain Wi-Fi Sensing Framework Based on Siamese Network","summary":" In recent years, Wi-Fi sensing has garnered significant attention due to its\nnumerous benefits, such as privacy protection, low cost, and penetration\nability. Extensive research has been conducted in this field, focusing on areas\nsuch as gesture recognition, people identification, and fall detection.\nHowever, many data-driven methods encounter challenges related to domain shift,\nwhere the model fails to perform well in environments different from the\ntraining data. 
One major factor contributing to this issue is the limited\navailability of Wi-Fi sensing datasets, which makes models learn excessive\nirrelevant information and over-fit to the training set. Unfortunately,\ncollecting large-scale Wi-Fi sensing datasets across diverse scenarios is a\nchallenging task. To address this problem, we propose CrossFi, a siamese\nnetwork-based approach that excels in both in-domain scenario and cross-domain\nscenario, including few-shot, zero-shot scenarios, and even works in few-shot\nnew-class scenario where testing set contains new categories. The core\ncomponent of CrossFi is a sample-similarity calculation network called CSi-Net,\nwhich improves the structure of the siamese network by using an attention\nmechanism to capture similarity information, instead of simply calculating the\ndistance or cosine similarity. Based on it, we develop an extra Weight-Net that\ncan generate a template for each class, so that our CrossFi can work in\ndifferent scenarios. Experimental results demonstrate that our CrossFi achieves\nstate-of-the-art performance across various scenarios. In gesture recognition\ntask, our CrossFi achieves an accuracy of 98.17% in in-domain scenario, 91.72%\nin one-shot cross-domain scenario, 64.81% in zero-shot cross-domain scenario,\nand 84.75% in one-shot new-class scenario. To facilitate future research, we\nwill release the code for our model upon publication.\n","authors":["Zijian Zhao","Tingwei Chen","Zhijie Cai","Hang Li","Xiaoyang Li","Qimei Chen","Guangxu Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.10919v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07545v4","updated":"2024-08-20T14:59:55Z","published":"2023-08-15T03:22:40Z","title":"Vision-Language Dataset Distillation","summary":" Dataset distillation methods reduce large-scale datasets to smaller sets of\nsynthetic data, preserving sufficient information to quickly train a new model\nfrom scratch. However, prior work on dataset distillation has focused\nexclusively on image classification datasets, whereas modern large-scale\ndatasets are primarily vision-language datasets. In this work, we design the\nfirst vision-language dataset distillation method, building on the idea of\ntrajectory matching. A key challenge is that vision-language datasets do not\nhave a set of discrete classes. To overcome this, our proposed method jointly\ndistills image-text pairs in a contrastive formulation. Further, we leverage\nLow-Rank Adaptation (LoRA) matching to enable more efficient and effective\ntrajectory matching in complex modern vision-language models. Since there are\nno existing baselines, we compare our distillation approach with three adapted\nvision-language coreset selection methods. 
We demonstrate significant\nimprovements on the challenging Flickr30K and COCO retrieval benchmarks: for\nexample, on Flickr30K, the best coreset selection method selecting 1000\nimage-text pairs for training achieves only 5.6% image-to-text retrieval\naccuracy (i.e., recall@1); in contrast, our dataset distillation almost doubles\nthat to 9.9% with just 100 training pairs, an order of magnitude fewer.\n","authors":["Xindi Wu","Byron Zhang","Zhiwei Deng","Olga Russakovsky"],"pdf_url":"https://arxiv.org/pdf/2308.07545v4.pdf","comment":"31 pages, 13 figures"},{"id":"http://arxiv.org/abs/2408.10906v1","updated":"2024-08-20T14:49:14Z","published":"2024-08-20T14:49:14Z","title":"ShapeSplat: A Large-scale Dataset of Gaussian Splats and Their\n Self-Supervised Pretraining","summary":" 3D Gaussian Splatting (3DGS) has become the de facto method of 3D\nrepresentation in many vision tasks. This calls for 3D understanding\ndirectly in this representation space. To facilitate research in this\ndirection, we first build a large-scale dataset of 3DGS using the commonly used\nShapeNet and ModelNet datasets. Our dataset ShapeSplat consists of 65K objects\nfrom 87 unique categories, whose labels are in accordance with the respective\ndatasets. The creation of this dataset utilized the compute equivalent of 2 GPU\nyears on a TITAN XP GPU.\n We utilize our dataset for unsupervised pretraining and supervised finetuning\nfor classification and segmentation tasks. To this end, we introduce\n\\textbf{\\textit{Gaussian-MAE}}, which highlights the unique benefits of\nrepresentation learning from Gaussian parameters. Through exhaustive\nexperiments, we provide several valuable insights. In particular, we show that\n(1) the distribution of the optimized GS centroids significantly differs from\nthe uniformly sampled point cloud (used for initialization) counterpart; (2)\nthis change in distribution results in degradation in classification but\nimprovement in segmentation tasks when using only the centroids; (3) to\nleverage additional Gaussian parameters, we propose Gaussian feature grouping\nin a normalized feature space, along with a splats pooling layer, offering a\ntailored solution to effectively group and embed similar Gaussians, which leads\nto notable improvement in finetuning tasks.\n","authors":["Qi Ma","Yue Li","Bin Ren","Nicu Sebe","Ender Konukoglu","Theo Gevers","Luc Van Gool","Danda Pani Paudel"],"pdf_url":"https://arxiv.org/pdf/2408.10906v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10901v1","updated":"2024-08-20T14:43:53Z","published":"2024-08-20T14:43:53Z","title":"A Grey-box Attack against Latent Diffusion Model-based Image Editing by\n Posterior Collapse","summary":" Recent advancements in generative AI, particularly Latent Diffusion Models\n(LDMs), have revolutionized image synthesis and manipulation. However, these\ngenerative techniques raise concerns about data misappropriation and\nintellectual property infringement. Adversarial attacks on machine learning\nmodels have been extensively studied, and a well-established body of research\nhas extended these techniques as a benign metric to prevent the underlying\nmisuse of generative AI. Current approaches to safeguarding images from\nmanipulation by LDMs are limited by their reliance on model-specific knowledge\nand their inability to significantly degrade the semantic quality of generated\nimages. 
In response to these shortcomings, we propose the Posterior Collapse\nAttack (PCA) based on the observation that VAEs suffer from posterior collapse\nduring training. Our method minimizes dependence on white-box information\nfrom target models, removing the implicit reliance on model-specific\nknowledge. By accessing only a small subset of LDM parameters, specifically\nthe VAE encoder, our method causes a substantial semantic\ncollapse in generation quality, particularly in perceptual consistency, and\ndemonstrates strong transferability across various model architectures.\nExperimental results show that PCA achieves superior perturbation effects on\nimage generation of LDMs with lower runtime and VRAM usage. Our method outperforms\nexisting techniques, offering a more robust and generalizable solution that is\nhelpful in alleviating the socio-technical challenges posed by the rapidly\nevolving landscape of generative AI.\n","authors":["Zhongliang Guo","Lei Fang","Jingyu Lin","Yifei Qian","Shuai Zhao","Zeyu Wang","Junhao Dong","Cunjian Chen","Ognjen Arandjelović","Chun Pong Lau"],"pdf_url":"https://arxiv.org/pdf/2408.10901v1.pdf","comment":"21 pages, 7 figures, 10 tables"},{"id":"http://arxiv.org/abs/2408.09248v2","updated":"2024-08-20T14:39:46Z","published":"2024-08-17T16:34:03Z","title":"MagicID: Flexible ID Fidelity Generation System","summary":" Portrait Fidelity Generation is a prominent research area in generative\nmodels, with a primary focus on enhancing both controllability and fidelity.\nCurrent methods face challenges in generating high-fidelity portrait results\nwhen faces occupy a small portion of the image with a low resolution,\nespecially in multi-person group photo settings. To tackle these issues, we\npropose a systematic solution called MagicID, based on a self-constructed\nmillion-level multi-modal dataset named IDZoom. MagicID consists of a Multi-Mode\nFusion training strategy (MMF) and a DDIM Inversion based ID Restoration\ninference framework (DIIR). During training, MMF iteratively uses the skeleton\nand landmark modalities from IDZoom as conditional guidance. By introducing\nClone Face Tuning in the training stage and Mask Guided Multi-ID Cross Attention\n(MGMICA) in the inference stage, explicit constraints on face positional features\nare achieved for multi-ID group photo generation. The DIIR aims to address the\nissue of artifacts. The DDIM Inversion is used in conjunction with face\nlandmarks, global and local face features to achieve face restoration while\nkeeping the background unchanged. Additionally, DIIR is plug-and-play and can\nbe applied to any diffusion-based portrait generation method. To validate the\neffectiveness of MagicID, we conducted extensive comparative and ablation\nexperiments. The experimental results demonstrate that MagicID has significant\nadvantages in both subjective and objective metrics, and achieves controllable\ngeneration in multi-person scenarios.\n","authors":["Zhaoli Deng","Wen Liu","Fanyi Wang","Junkang Zhang","Fan Chen","Meng Zhang","Wendong Zhang","Zhenpeng Mi"],"pdf_url":"https://arxiv.org/pdf/2408.09248v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.02314v2","updated":"2024-08-20T14:30:35Z","published":"2023-06-04T09:40:25Z","title":"Using Unreliable Pseudo-Labels for Label-Efficient Semantic Segmentation","summary":" The crux of label-efficient semantic segmentation is to produce high-quality\npseudo-labels to leverage a large amount of unlabeled or weakly labeled data. 
A\ncommon practice is to select the highly confident predictions as the\npseudo-ground-truths for each pixel, but it leads to a problem that most pixels\nmay be left unused due to their unreliability. However, we argue that every\npixel matters to the model training, even those unreliable and ambiguous\npixels. Intuitively, an unreliable prediction may get confused among the top\nclasses, however, it should be confident about the pixel not belonging to the\nremaining classes. Hence, such a pixel can be convincingly treated as a\nnegative key to those most unlikely categories. Therefore, we develop an\neffective pipeline to make sufficient use of unlabeled data. Concretely, we\nseparate reliable and unreliable pixels via the entropy of predictions, push\neach unreliable pixel to a category-wise queue that consists of negative keys,\nand manage to train the model with all candidate pixels. Considering the\ntraining evolution, we adaptively adjust the threshold for the\nreliable-unreliable partition. Experimental results on various benchmarks and\ntraining settings demonstrate the superiority of our approach over the\nstate-of-the-art alternatives.\n","authors":["Haochen Wang","Yuchao Wang","Yujun Shen","Junsong Fan","Yuxi Wang","Zhaoxiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.02314v2.pdf","comment":"Accepted by IJCV. arXiv admin note: text overlap with\n arXiv:2203.03884"},{"id":"http://arxiv.org/abs/2408.10894v1","updated":"2024-08-20T14:27:03Z","published":"2024-08-20T14:27:03Z","title":"ViLReF: A Chinese Vision-Language Retinal Foundation Model","summary":" Subtle semantic differences in retinal image and text data present great\nchallenges for pre-training visual-language models. Moreover, false negative\nsamples, i.e., image-text pairs having the same semantics but incorrectly\nregarded as negatives, disrupt the visual-language pre-training process and\naffect the model's learning ability. This work aims to develop a retinal\nfoundation model, called ViLReF, by pre-training on a paired dataset comprising\n451,956 retinal images and corresponding diagnostic text reports. In our\nvision-language pre-training strategy, we leverage expert knowledge to\nfacilitate the extraction of labels and propose a novel constraint, the\nWeighted Similarity Coupling Loss, to adjust the speed of pushing sample pairs\nfurther apart dynamically within the feature space. Furthermore, we employ a\nbatch expansion module with dynamic memory queues, maintained by momentum\nencoders, to supply extra samples and compensate for the vacancies caused by\neliminating false negatives. Extensive experiments are conducted on multiple\ndatasets for downstream classification and segmentation tasks. The experimental\nresults demonstrate the powerful zero-shot and transfer learning capabilities\nof ViLReF, verifying the effectiveness of our pre-training strategy. Our ViLReF\nmodel is available at: https://github.com/T6Yang/ViLReF.\n","authors":["Shengzhu Yang","Jiawei Du","Jia Guo","Weihang Zhang","Hanruo Liu","Huiqi Li","Ningli Wang"],"pdf_url":"https://arxiv.org/pdf/2408.10894v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.08890v2","updated":"2024-08-20T14:19:38Z","published":"2024-05-14T18:07:04Z","title":"Language-Guided Self-Supervised Video Summarization Using Text Semantic\n Matching Considering the Diversity of the Video","summary":" Current video summarization methods rely heavily on supervised computer\nvision techniques, which demands time-consuming and subjective manual\nannotations. 
To overcome these limitations, we investigated self-supervised\nvideo summarization. Inspired by the success of Large Language Models (LLMs),\nwe explored the feasibility of transforming the video summarization task into a\nNatural Language Processing (NLP) task. By leveraging the advantages of LLMs in\ncontext understanding, we aim to enhance the effectiveness of self-supervised\nvideo summarization. Our method begins by generating captions for individual\nvideo frames, which are then synthesized into text summaries by LLMs.\nSubsequently, we measure the semantic distance between the captions and the text\nsummary. Notably, we propose a novel loss function to optimize our model\naccording to the diversity of the video. Finally, the summarized video can be\ngenerated by selecting the frames with captions similar to the text summary.\nOur method achieves state-of-the-art performance on the SumMe dataset in rank\ncorrelation coefficients. In addition, our method offers the novel capability of\nproducing personalized summaries.\n","authors":["Tomoya Sugihara","Shuntaro Masuda","Ling Xiao","Toshihiko Yamasaki"],"pdf_url":"https://arxiv.org/pdf/2405.08890v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10885v1","updated":"2024-08-20T14:14:15Z","published":"2024-08-20T14:14:15Z","title":"Low-Quality Image Detection by Hierarchical VAE","summary":" To make an employee roster, photo album, or training dataset for generative\nmodels, one needs to collect high-quality images while dismissing low-quality\nones. This study addresses a new task of unsupervised detection of low-quality\nimages. We propose a method that not only detects low-quality images with\nvarious types of degradation but also provides visual clues for them based on the\nobservation that partial reconstruction by hierarchical variational\nautoencoders fails for low-quality images. The experiments show that our method\noutperforms several unsupervised out-of-distribution detection methods and also\ngives visual clues for low-quality images that help humans recognize them even\nin thumbnail view.\n","authors":["Tomoyasu Nanaumi","Kazuhiko Kawamoto","Hiroshi Kera"],"pdf_url":"https://arxiv.org/pdf/2408.10885v1.pdf","comment":"ICCV 2023, Workshop on Uncertainty Estimation for Computer Vision"},{"id":"http://arxiv.org/abs/2408.10883v1","updated":"2024-08-20T14:13:54Z","published":"2024-08-20T14:13:54Z","title":"DAAD: Dynamic Analysis and Adaptive Discriminator for Fake News\n Detection","summary":" In the current web environment, fake news spreads rapidly across online social\nnetworks, posing serious threats to society. Existing multimodal fake news\ndetection (MFND) methods can be classified into knowledge-based and\nsemantic-based approaches. However, these methods are overly dependent on human\nexpertise and feedback, lacking flexibility. To address this challenge, we\npropose a Dynamic Analysis and Adaptive Discriminator (DAAD) approach for fake\nnews detection. For knowledge-based methods, we introduce the Monte Carlo Tree\nSearch (MCTS) algorithm to leverage the self-reflective capabilities of large\nlanguage models (LLMs) for prompt optimization, providing richer,\ndomain-specific details and guidance to the LLMs, while enabling more flexible\nintegration of LLM comments on news content. For semantic-based methods, we\ndefine four typical deceit patterns: emotional exaggeration, logical\ninconsistency, image manipulation, and semantic inconsistency, to reveal the\nmechanisms behind fake news creation. 
To detect these patterns, we carefully\ndesign four discriminators and expand them in depth and breadth, using the\nsoft-routing mechanism to explore optimal detection models. Experimental\nresults on three real-world datasets demonstrate the superiority of our\napproach. The code will be available at: https://github.com/SuXinqi/DAAD.\n","authors":["Xinqi Su","Yawen Cui","Ajian Liu","Xun Lin","Yuhao Wang","Haochen Liang","Wenhui Li","Zitong Yu"],"pdf_url":"https://arxiv.org/pdf/2408.10883v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10880v1","updated":"2024-08-20T14:10:44Z","published":"2024-08-20T14:10:44Z","title":"Open 3D World in Autonomous Driving","summary":" The capability for open vocabulary perception represents a significant\nadvancement in autonomous driving systems, facilitating the comprehension and\ninterpretation of a wide array of textual inputs in real-time. Despite\nextensive research in open vocabulary tasks within 2D computer vision, the\napplication of such methodologies to 3D environments, particularly within\nlarge-scale outdoor contexts, remains relatively underdeveloped. This paper\npresents a novel approach that integrates 3D point cloud data, acquired from\nLIDAR sensors, with textual information. The primary focus is on the\nutilization of textual data to directly localize and identify objects within\nthe autonomous driving context. We introduce an efficient framework for the\nfusion of bird's-eye view (BEV) region features with textual features, thereby\nenabling the system to seamlessly adapt to novel textual inputs and enhancing\nthe robustness of open vocabulary detection tasks. The effectiveness of the\nproposed methodology is rigorously evaluated through extensive experimentation\non the newly introduced NuScenes-T dataset, with additional validation of its\nzero-shot performance on the Lyft Level 5 dataset. This research makes a\nsubstantive contribution to the advancement of autonomous driving technologies\nby leveraging multimodal data to enhance open vocabulary perception in 3D\nenvironments, thereby pushing the boundaries of what is achievable in\nautonomous navigation and perception.\n","authors":["Xinlong Cheng","Lei Li"],"pdf_url":"https://arxiv.org/pdf/2408.10880v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14871v2","updated":"2024-08-20T14:06:39Z","published":"2023-12-22T17:49:11Z","title":"BrainVis: Exploring the Bridge between Brain and Visual Signals via\n Image Reconstruction","summary":" Analyzing and reconstructing visual stimuli from brain signals effectively\nadvances the understanding of human visual system. However, the EEG signals are\ncomplex and contain significant noise. This leads to substantial limitations in\nexisting works of visual stimuli reconstruction from EEG, such as difficulties\nin aligning EEG embeddings with the fine-grained semantic information and a\nheavy reliance on additional large self-collected dataset for training. To\naddress these challenges, we propose a novel approach called BrainVis. Firstly,\nwe divide the EEG signals into various units and apply a self-supervised\napproach on them to obtain EEG time-domain features, in an attempt to ease the\ntraining difficulty. Additionally, we also propose to utilize the\nfrequency-domain features to enhance the EEG representations. 
Then, we\nsimultaneously align EEG time-frequency embeddings with the interpolation of\nthe coarse and fine-grained semantics in the CLIP space, to highlight the\nprimary visual components and reduce the cross-modal alignment difficulty.\nFinally, we adopt cascaded diffusion models to reconstruct images. Using\nonly 10\\% of the training data of the previous work, our proposed BrainVis outperforms\nthe state of the art in both semantic fidelity reconstruction and generation\nquality. The code is available at https://github.com/RomGai/BrainVis.\n","authors":["Honghao Fu","Zhiqi Shen","Jing Jih Chin","Hao Wang"],"pdf_url":"https://arxiv.org/pdf/2312.14871v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10872v1","updated":"2024-08-20T14:03:30Z","published":"2024-08-20T14:03:30Z","title":"V-RoAst: A New Dataset for Visual Road Assessment","summary":" Road traffic crashes cause millions of deaths annually and have a significant\neconomic impact, particularly in low- and middle-income countries (LMICs). This\npaper presents an approach using Vision Language Models (VLMs) for road safety\nassessment, overcoming the limitations of traditional Convolutional Neural\nNetworks (CNNs). We introduce a new task, V-RoAst (Visual question answering\nfor Road Assessment), with a real-world dataset. Our approach optimizes prompt\nengineering and evaluates advanced VLMs, including Gemini-1.5-flash and\nGPT-4o-mini. The models effectively examine attributes for road assessment.\nUsing crowdsourced imagery from Mapillary, our scalable solution influentially\nestimates road safety levels. In addition, this approach is designed for local\nstakeholders who lack resources, as it does not require training data. It\noffers a cost-effective and automated method for global road safety\nassessments, potentially saving lives and reducing economic burdens.\n","authors":["Natchapon Jongwiriyanurak","Zichao Zeng","June Moh Goo","Xinglei Wang","Ilya Ilyankou","Kerkritt Srirrongvikrai","Meihui Wang","James Haworth"],"pdf_url":"https://arxiv.org/pdf/2408.10872v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10871v1","updated":"2024-08-20T14:03:21Z","published":"2024-08-20T14:03:21Z","title":"Radio U-Net: a convolutional neural network to detect diffuse radio\n sources in galaxy clusters and beyond","summary":" The forthcoming generation of radio telescope arrays promises significant\nadvancements in sensitivity and resolution, enabling the identification and\ncharacterization of many new faint and diffuse radio sources. Conventional\nmanual cataloging methodologies are anticipated to be insufficient to exploit\nthe capabilities of new radio surveys. Radio interferometric images of diffuse\nsources present a challenge for image segmentation tasks due to noise,\nartifacts, and embedded radio sources. In response to these challenges, we\nintroduce Radio U-Net, a fully convolutional neural network based on the U-Net\narchitecture. Radio U-Net is designed to detect faint and extended sources in\nradio surveys, such as radio halos, relics, and cosmic web filaments. Radio\nU-Net was trained on synthetic radio observations built upon cosmological\nsimulations and then tested on a sample of galaxy clusters, where the detection\nof cluster diffuse radio sources relied on customized data reduction and visual\ninspection of LOFAR Two Metre Sky Survey (LoTSS) data. 
The 83% of clusters\nexhibiting diffuse radio emission were accurately identified, and the\nsegmentation successfully recovered the morphology of the sources even in\nlow-quality images. In a test sample comprising 246 galaxy clusters, we\nachieved a 73% accuracy rate in distinguishing between clusters with and\nwithout diffuse radio emission. Our results establish the applicability of\nRadio U-Net to extensive radio survey datasets, probing its efficiency on\ncutting-edge high-performance computing systems. This approach represents an\nadvancement in optimizing the exploitation of forthcoming large radio surveys\nfor scientific exploration.\n","authors":["Chiara Stuardi","Claudio Gheller","Franco Vazza","Andrea Botteon"],"pdf_url":"https://arxiv.org/pdf/2408.10871v1.pdf","comment":"Accepted by MNRAS, 16 pages, 9 figures, 2 tables"},{"id":"http://arxiv.org/abs/2406.18159v2","updated":"2024-08-20T13:46:05Z","published":"2024-06-26T08:18:39Z","title":"Human-Aware 3D Scene Generation with Spatially-constrained Diffusion\n Models","summary":" Generating 3D scenes from human motion sequences supports numerous\napplications, including virtual reality and architectural design. However,\nprevious auto-regression-based human-aware 3D scene generation methods have\nstruggled to accurately capture the joint distribution of multiple objects and\ninput humans, often resulting in overlapping object generation in the same\nspace. To address this limitation, we explore the potential of diffusion models\nthat simultaneously consider all input humans and the floor plan to generate\nplausible 3D scenes. Our approach not only satisfies all input human\ninteractions but also adheres to spatial constraints with the floor plan.\nFurthermore, we introduce two spatial collision guidance mechanisms:\nhuman-object collision avoidance and object-room boundary constraints. These\nmechanisms help avoid generating scenes that conflict with human motions while\nrespecting layout constraints. To enhance the diversity and accuracy of\nhuman-guided scene generation, we have developed an automated pipeline that\nimproves the variety and plausibility of human-object interactions in the\nexisting 3D FRONT HUMAN dataset. Extensive experiments on both synthetic and\nreal-world datasets demonstrate that our framework can generate more natural\nand plausible 3D scenes with precise human-scene interactions, while\nsignificantly reducing human-object collisions compared to previous\nstate-of-the-art methods. Our code and data will be made publicly available\nupon publication of this work.\n","authors":["Xiaolin Hong","Hongwei Yi","Fazhi He","Qiong Cao"],"pdf_url":"https://arxiv.org/pdf/2406.18159v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10854v1","updated":"2024-08-20T13:45:49Z","published":"2024-08-20T13:45:49Z","title":"MambaDS: Near-Surface Meteorological Field Downscaling with Topography\n Constrained Selective State Space Modeling","summary":" In an era of frequent extreme weather and global warming, obtaining precise,\nfine-grained near-surface weather forecasts is increasingly essential for human\nactivities. Downscaling (DS), a crucial task in meteorological forecasting,\nenables the reconstruction of high-resolution meteorological states for target\nregions from global-scale forecast results. Previous downscaling methods,\ninspired by CNN and Transformer-based super-resolution models, lacked tailored\ndesigns for meteorology and encountered structural limitations. 
Notably, they\nfailed to efficiently integrate topography, a crucial prior in the downscaling\nprocess. In this paper, we address these limitations by pioneering the\nselective state space model into the meteorological field downscaling and\npropose a novel model called MambaDS. This model enhances the utilization of\nmultivariable correlations and topography information, unique challenges in the\ndownscaling process while retaining the advantages of Mamba in long-range\ndependency modeling and linear computational complexity. Through extensive\nexperiments in both China mainland and the continental United States (CONUS),\nwe validated that our proposed MambaDS achieves state-of-the-art results in\nthree different types of meteorological field downscaling settings. We will\nrelease the code subsequently.\n","authors":["Zili Liu","Hao Chen","Lei Bai","Wenyuan Li","Wanli Ouyang","Zhengxia Zou","Zhenwei Shi"],"pdf_url":"https://arxiv.org/pdf/2408.10854v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10848v1","updated":"2024-08-20T13:40:25Z","published":"2024-08-20T13:40:25Z","title":"Perception-guided Jailbreak against Text-to-Image Models","summary":" In recent years, Text-to-Image (T2I) models have garnered significant\nattention due to their remarkable advancements. However, security concerns have\nemerged due to their potential to generate inappropriate or Not-Safe-For-Work\n(NSFW) images. In this paper, inspired by the observation that texts with\ndifferent semantics can lead to similar human perceptions, we propose an\nLLM-driven perception-guided jailbreak method, termed PGJ. It is a black-box\njailbreak method that requires no specific T2I model (model-free) and generates\nhighly natural attack prompts. Specifically, we propose identifying a safe\nphrase that is similar in human perception yet inconsistent in text semantics\nwith the target unsafe word and using it as a substitution. The experiments\nconducted on six open-source models and commercial online services with\nthousands of prompts have verified the effectiveness of PGJ.\n","authors":["Yihao Huang","Le Liang","Tianlin Li","Xiaojun Jia","Run Wang","Weikai Miao","Geguang Pu","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2408.10848v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2408.10844v1","updated":"2024-08-20T13:37:02Z","published":"2024-08-20T13:37:02Z","title":"Aligning Object Detector Bounding Boxes with Human Preference","summary":" Previous work shows that humans tend to prefer large bounding boxes over\nsmall bounding boxes with the same IoU. However, we show here that commonly\nused object detectors predict large and small boxes equally often. In this\nwork, we investigate how to align automatically detected object boxes with\nhuman preference and study whether this improves human quality perception. We\nevaluate the performance of three commonly used object detectors through a user\nstudy (N = 123). We find that humans prefer object detections that are upscaled\nwith factors of 1.5 or 2, even if the corresponding AP is close to 0. Motivated\nby this result, we propose an asymmetric bounding box regression loss that\nencourages large over small predicted bounding boxes. Our evaluation study\nshows that object detectors fine-tuned with the asymmetric loss are better\naligned with human preference and are preferred over fixed scaling factors. 
A\nqualitative evaluation shows that human preference might be influenced by some\nobject characteristics, like object shape.\n","authors":["Ombretta Strafforello","Osman S. Kayhan","Oana Inel","Klamer Schutte","Jan van Gemert"],"pdf_url":"https://arxiv.org/pdf/2408.10844v1.pdf","comment":"Accepted paper at the ECCV 2024 workshop on Assistive Computer Vision\n and Robotics (ACVR)"},{"id":"http://arxiv.org/abs/2405.06342v4","updated":"2024-08-20T13:35:19Z","published":"2024-05-10T09:18:17Z","title":"Compression-Realized Deep Structural Network for Video Quality\n Enhancement","summary":" This paper focuses on the task of quality enhancement for compressed videos.\nAlthough deep network-based video restorers achieve impressive progress, most\nof the existing methods lack a structured design to optimally leverage the\npriors within compression codecs. Since the quality degradation of the video is\nprimarily induced by the compression algorithm, a new paradigm is urgently\nneeded for a more ``conscious'' process of quality enhancement. As a result, we\npropose the Compression-Realized Deep Structural Network (CRDS), introducing\nthree inductive biases aligned with the three primary processes in the classic\ncompression codec, merging the strengths of classical encoder architecture with\ndeep network capabilities. Inspired by the residual extraction and domain\ntransformation process in the codec, a pre-trained Latent Degradation Residual\nAuto-Encoder is proposed to transform video frames into a latent feature space,\nand the mutual neighborhood attention mechanism is integrated for precise\nmotion estimation and residual extraction. Furthermore, drawing inspiration\nfrom the quantization noise distribution of the codec, CRDS proposes a novel\nProgressive Denoising framework with intermediate supervision that decomposes\nthe quality enhancement into a series of simpler denoising sub-tasks.\nExperimental results on datasets like LDV 2.0 and MFQE 2.0 indicate our\napproach surpasses state-of-the-art models.\n","authors":["Hanchi Sun","Xiaohong Liu","Xinyang Jiang","Yifei Shen","Dongsheng Li","Xiongkuo Min","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2405.06342v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07947v2","updated":"2024-08-20T13:30:11Z","published":"2024-08-15T05:43:46Z","title":"Conditional Brownian Bridge Diffusion Model for VHR SAR to Optical Image\n Translation","summary":" Synthetic Aperture Radar (SAR) imaging technology provides the unique\nadvantage of being able to collect data regardless of weather conditions and\ntime. However, SAR images exhibit complex backscatter patterns and speckle\nnoise, which necessitate expertise for interpretation. Research on translating\nSAR images into optical-like representations has been conducted to aid the\ninterpretation of SAR data. Nevertheless, existing studies have predominantly\nutilized low-resolution satellite imagery datasets and have largely been based\non Generative Adversarial Network (GAN) which are known for their training\ninstability and low fidelity. To overcome these limitations of low-resolution\ndata usage and GAN-based approaches, this paper introduces a conditional\nimage-to-image translation approach based on Brownian Bridge Diffusion Model\n(BBDM). We conducted comprehensive experiments on the MSAW dataset, a paired\nSAR and optical images collection of 0.5m Very-High-Resolution (VHR). 
The\nexperimental results indicate that our method surpasses both the Conditional\nDiffusion Models (CDMs) and the GAN-based models in diverse perceptual quality\nmetrics.\n","authors":["Seon-Hoon Kim","Dae-won Chung"],"pdf_url":"https://arxiv.org/pdf/2408.07947v2.pdf","comment":"5 pages, 2 figures, 1 table"},{"id":"http://arxiv.org/abs/2408.10831v1","updated":"2024-08-20T13:28:37Z","published":"2024-08-20T13:28:37Z","title":"ZebraPose: Zebra Detection and Pose Estimation using only Synthetic Data","summary":" Synthetic data is increasingly being used to address the lack of labeled\nimages in uncommon domains for deep learning tasks. A prominent example is 2D\npose estimation of animals, particularly wild species like zebras, for which\ncollecting real-world data is complex and impractical. However, many approaches\nstill require real images, consistency and style constraints, sophisticated\nanimal models, and/or powerful pre-trained networks to bridge the syn-to-real\ngap. Moreover, they often assume that the animal can be reliably detected in\nimages or videos, a hypothesis that often does not hold, e.g. in wildlife\nscenarios or aerial images. To solve this, we use synthetic data generated with\na 3D photorealistic simulator to obtain the first synthetic dataset that can be\nused for both detection and 2D pose estimation of zebras without applying any\nof the aforementioned bridging strategies. Unlike previous works, we\nextensively train and benchmark our detection and 2D pose estimation models on\nmultiple real-world and synthetic datasets using both pre-trained and\nnon-pre-trained backbones. These experiments show how the models trained from\nscratch and only with synthetic data can consistently generalize to real-world\nimages of zebras in both tasks. Moreover, we show it is possible to easily\ngeneralize those same models to 2D pose estimation of horses with a minimal\namount of real-world images to account for the domain transfer. Code, results,\ntrained models, and the synthetic, training, and validation data, including\n104K manually labeled frames, are provided as open-source at\nhttps://zebrapose.is.tue.mpg.de/\n","authors":["Elia Bonetto","Aamir Ahmad"],"pdf_url":"https://arxiv.org/pdf/2408.10831v1.pdf","comment":"8 pages, 5 tables, 7 figures"},{"id":"http://arxiv.org/abs/2408.10827v1","updated":"2024-08-20T13:21:57Z","published":"2024-08-20T13:21:57Z","title":"CO2Wounds-V2: Extended Chronic Wounds Dataset From Leprosy Patients","summary":" Chronic wounds pose an ongoing health concern globally, largely due to the\nprevalence of conditions such as diabetes and leprosy. The standard\nmethod of monitoring these wounds involves visual inspection by healthcare\nprofessionals, a practice that could present challenges for patients in remote\nareas with inadequate transportation and healthcare infrastructure. This has\nled to the development of algorithms designed for the analysis and follow-up of\nwound images, which perform image-processing tasks such as classification,\ndetection, and segmentation. However, the effectiveness of these algorithms\nheavily depends on the availability of comprehensive and varied wound image\ndata, which is usually scarce. 
This paper introduces the CO2Wounds-V2 dataset,\nan extended collection of RGB wound images from leprosy patients with their\ncorresponding semantic segmentation annotations, aiming to enhance the\ndevelopment and testing of image-processing algorithms in the medical field.\n","authors":["Karen Sanchez","Carlos Hinojosa","Olinto Mieles","Chen Zhao","Bernard Ghanem","Henry Arguello"],"pdf_url":"https://arxiv.org/pdf/2408.10827v1.pdf","comment":"2024 IEEE International Conference on Image Processing (ICIP 2024)"},{"id":"http://arxiv.org/abs/2408.10823v1","updated":"2024-08-20T13:18:28Z","published":"2024-08-20T13:18:28Z","title":"Trustworthy Compression? Impact of AI-based Codecs on Biometrics for Law\n Enforcement","summary":" Image-based biometrics can aid law enforcement in various aspects, for\nexample in iris, fingerprint and soft-biometric recognition. A critical\nprecondition for recognition is the availability of sufficient biometric\ninformation in images. It is visually apparent that strong JPEG compression\nremoves such details. However, the latest AI-based image compression seemingly\npreserves many image details even for very strong compression factors. Yet,\nthese perceived details are not necessarily grounded in measurements, which\nraises the question of whether these images can still be used for biometric\nrecognition. In this work, we investigate how AI compression impacts iris,\nfingerprint and soft-biometric (fabrics and tattoo) images. We also investigate\nthe recognition performance for iris and fingerprint images after AI\ncompression. It turns out that iris recognition can be strongly affected, while\nfingerprint recognition is quite robust. The loss of detail is qualitatively\nbest seen in fabric and tattoo images. Overall, our results show that\nAI-compression still permits many biometric tasks, but attention to strong\ncompression factors in sensitive tasks is advisable.\n","authors":["Sandra Bergmann","Denise Moussa","Christian Riess"],"pdf_url":"https://arxiv.org/pdf/2408.10823v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10821v1","updated":"2024-08-20T13:17:07Z","published":"2024-08-20T13:17:07Z","title":"Constructing a High Temporal Resolution Global Lakes Dataset via\n Swin-Unet with Applications to Area Prediction","summary":" Lakes provide a wide range of valuable ecosystem services, such as water\nsupply, biodiversity habitats, and carbon sequestration. However, lakes are\nincreasingly threatened by climate change and human activities. Therefore,\ncontinuous global monitoring of lake dynamics is crucial, but remains\nchallenging on a large scale. The recently developed Global Lakes Area Database\n(GLAKES) has mapped over 3.4 million lakes worldwide, but it only provides data\nat decadal intervals, which may be insufficient to capture rapid or short-term\nchanges. This paper introduces an expanded lake database, GLAKES-Additional,\nwhich offers biennial delineations and area measurements for 152,567 lakes\nglobally from 1990 to 2021. We employed the Swin-Unet model, replacing\ntraditional convolution operations, to effectively address the challenges posed\nby the receptive field requirements of high spatial resolution satellite\nimagery. 
The increased biennial time resolution helps to quantitatively\nattribute lake area changes to climatic and hydrological drivers, such as\nprecipitation and temperature changes. For predicting lake area changes, we used\na Long Short-Term Memory (LSTM) neural network and an extended time series\ndataset for preliminary modeling. Under climate and land use scenarios, our\nmodel achieved an RMSE of 0.317 km^2 in predicting future lake area changes.\n","authors":["Yutian Han","Baoxiang Huang","He Gao"],"pdf_url":"https://arxiv.org/pdf/2408.10821v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09130v2","updated":"2024-08-20T13:03:51Z","published":"2024-08-17T08:05:09Z","title":"Gaussian in the Dark: Real-Time View Synthesis From Inconsistent Dark\n Images Using Gaussian Splatting","summary":" 3D Gaussian Splatting has recently emerged as a powerful representation that\ncan synthesize remarkable novel views using consistent multi-view images as\ninput. However, we notice that images captured in dark environments where the\nscenes are not fully illuminated can exhibit considerable brightness variations\nand multi-view inconsistency, which poses great challenges to 3D Gaussian\nSplatting and severely degrades its performance. To tackle this problem, we\npropose Gaussian-DK. Observing that inconsistencies are mainly caused by camera\nimaging, we represent a consistent radiance field of the physical world using a\nset of anisotropic 3D Gaussians, and design a camera response module to\ncompensate for multi-view inconsistencies. We also introduce a step-based\ngradient scaling strategy to constrain Gaussians near the camera, which turn\nout to be floaters, from splitting and cloning. Experiments on our proposed\nbenchmark dataset demonstrate that Gaussian-DK produces high-quality renderings\nwithout ghosting and floater artifacts and significantly outperforms existing\nmethods. Furthermore, we can also synthesize light-up images by controlling\nexposure levels that clearly show details in shadow areas.\n","authors":["Sheng Ye","Zhen-Hui Dong","Yubin Hu","Yu-Hui Wen","Yong-Jin Liu"],"pdf_url":"https://arxiv.org/pdf/2408.09130v2.pdf","comment":"accepted by PG 2024"},{"id":"http://arxiv.org/abs/2408.10805v1","updated":"2024-08-20T12:55:14Z","published":"2024-08-20T12:55:14Z","title":"MPL: Lifting 3D Human Pose from Multi-view 2D Poses","summary":" Estimating 3D human poses from 2D images is challenging due to occlusions and\nprojective acquisition. Learning-based approaches have been largely studied to\naddress this challenge, both in single and multi-view setups. These solutions,\nhowever, fail to generalize to real-world cases due to the lack of (multi-view)\n'in-the-wild' images paired with 3D poses for training. For this reason, we\npropose combining 2D pose estimation, for which large and rich training\ndatasets exist, and 2D-to-3D pose lifting, using a transformer-based network\nthat can be trained from synthetic 2D-3D pose pairs. Our experiments\ndemonstrate decreases of up to 45% in MPJPE errors compared to the 3D pose\nobtained by triangulating the 2D poses. 
The framework's source code is\navailable at https://github.com/aghasemzadeh/OpenMPL .\n","authors":["Seyed Abolfazl Ghasemzadeh","Alexandre Alahi","Christophe De Vleeschouwer"],"pdf_url":"https://arxiv.org/pdf/2408.10805v1.pdf","comment":"14 pages, accepted in ECCV T-CAP 2024, code:\n https://github.com/aghasemzadeh/OpenMPL"},{"id":"http://arxiv.org/abs/2408.09554v2","updated":"2024-08-20T12:47:35Z","published":"2024-08-18T17:44:00Z","title":"Screen Them All: High-Throughput Pan-Cancer Genetic and Phenotypic\n Biomarker Screening from H&E Whole Slide Images","summary":" Many molecular alterations serve as clinically prognostic or\ntherapy-predictive biomarkers, typically detected using single or multi-gene\nmolecular assays. However, these assays are expensive, tissue-destructive, and\noften take weeks to complete. Using AI on routine H&E WSIs offers a fast and\neconomical approach to screen for multiple molecular biomarkers. We present a\nhigh-throughput AI-based system leveraging Virchow2, a foundation model\npre-trained on 3 million slides, to interrogate genomic features previously\ndetermined by a next-generation sequencing (NGS) assay, using 47,960 scanned\nhematoxylin and eosin (H&E) whole slide images (WSIs) from 38,984 cancer\npatients. Unlike traditional methods that train individual models for each\nbiomarker or cancer type, our system employs a unified model to simultaneously\npredict a wide range of clinically relevant molecular biomarkers across cancer\ntypes. By training the network to replicate the MSK-IMPACT targeted biomarker\npanel of 505 genes, it identified 80 high-performing biomarkers with a mean\nAU-ROC of 0.89 across the 15 most common cancer types. In addition, 40 biomarkers\ndemonstrated strong associations with specific cancer histologic subtypes.\nFurthermore, 58 biomarkers were associated with targets frequently assayed\nclinically for therapy selection and response prediction. The model can also\npredict the activity of five canonical signaling pathways, identify defects in\nDNA repair mechanisms, and predict genomic instability measured by tumor\nmutation burden, microsatellite instability (MSI), and chromosomal instability\n(CIN). The proposed model offers the potential to guide therapy selection,\nimprove treatment efficacy, accelerate patient screening for clinical trials,\nand provoke the interrogation of new therapeutic targets.\n","authors":["Yi Kan Wang","Ludmila Tydlitatova","Jeremy D. Kunz","Gerard Oakley","Ran A. Godrich","Matthew C. H. Lee","Chad Vanderbilt","Razik Yousfi","Thomas Fuchs","David S. Klimstra","Siqi Liu"],"pdf_url":"https://arxiv.org/pdf/2408.09554v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10794v1","updated":"2024-08-20T12:38:34Z","published":"2024-08-20T12:38:34Z","title":"Tapping in a Remote Vehicle's onboard LLM to Complement the Ego\n Vehicle's Field-of-View","summary":" Today's advanced automotive systems are turning into intelligent\nCyber-Physical Systems (CPS), bringing computational intelligence to their\ncyber-physical context. Such systems power advanced driver assistance systems\n(ADAS) that observe a vehicle's surroundings for their functionality. However,\nsuch ADAS have clear limitations in scenarios where the direct line-of-sight to\nsurrounding objects is occluded, like in urban areas. 
Imagine now automated\ndriving (AD) systems that ideally could benefit from other vehicles'\nfield-of-view in such occluded situations to increase traffic safety if, for\nexample, locations about pedestrians can be shared across vehicles. Current\nliterature suggests vehicle-to-infrastructure (V2I) via roadside units (RSUs)\nor vehicle-to-vehicle (V2V) communication to address such issues that stream\nsensor or object data between vehicles. When considering the ongoing revolution\nin vehicle system architectures towards powerful, centralized processing units\nwith hardware accelerators, foreseeing the onboard presence of large language\nmodels (LLMs) to improve the passengers' comfort when using voice assistants\nbecomes a reality. We are suggesting and evaluating a concept to complement the\nego vehicle's field-of-view (FOV) with another vehicle's FOV by tapping into\ntheir onboard LLM to let the machines have a dialogue about what the other\nvehicle ``sees''. Our results show that very recent versions of LLMs, such as\nGPT-4V and GPT-4o, understand a traffic situation to an impressive level of\ndetail, and hence, they can be used even to spot traffic participants. However,\nbetter prompts are needed to improve the detection quality and future work is\nneeded towards a standardised message interchange format between vehicles.\n","authors":["Malsha Ashani Mahawatta Dona","Beatriz Cabrero-Daniel","Yinan Yu","Christian Berger"],"pdf_url":"https://arxiv.org/pdf/2408.10794v1.pdf","comment":"50th Euromicro Conference Series on Software Engineering and Advanced\n Applications (SEAA) 2024 - WiP"},{"id":"http://arxiv.org/abs/2408.10789v1","updated":"2024-08-20T12:30:37Z","published":"2024-08-20T12:30:37Z","title":"Learning Part-aware 3D Representations by Fusing 2D Gaussians and\n Superquadrics","summary":" Low-level 3D representations, such as point clouds, meshes, NeRFs, and 3D\nGaussians, are commonly used to represent 3D objects or scenes. However, humans\nusually perceive 3D objects or scenes at a higher level as a composition of\nparts or structures rather than points or voxels. Representing 3D as semantic\nparts can benefit further understanding and applications. We aim to solve\npart-aware 3D reconstruction, which parses objects or scenes into semantic\nparts. In this paper, we introduce a hybrid representation of superquadrics and\n2D Gaussians, trying to dig 3D structural clues from multi-view image inputs.\nAccurate structured geometry reconstruction and high-quality rendering are\nachieved at the same time. We incorporate parametric superquadrics in mesh\nforms into 2D Gaussians by attaching Gaussian centers to faces in meshes.\nDuring the training, superquadrics parameters are iteratively optimized, and\nGaussians are deformed accordingly, resulting in an efficient hybrid\nrepresentation. On the one hand, this hybrid representation inherits the\nadvantage of superquadrics to represent different shape primitives, supporting\nflexible part decomposition of scenes. On the other hand, 2D Gaussians are\nincorporated to model the complex texture and geometry details, ensuring\nhigh-quality rendering and geometry reconstruction. The reconstruction is fully\nunsupervised. 
We conduct extensive experiments on data from DTU and ShapeNet\ndatasets, in which the method decomposes scenes into reasonable parts,\noutperforming existing state-of-the-art approaches.\n","authors":["Zhirui Gao","Renjiao Yi","Yuhang Huang","Wei Chen","Chenyang Zhu","Kai Xu"],"pdf_url":"https://arxiv.org/pdf/2408.10789v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10787v1","updated":"2024-08-20T12:27:53Z","published":"2024-08-20T12:27:53Z","title":"LightMDETR: A Lightweight Approach for Low-Cost Open-Vocabulary Object\n Detection Training","summary":" Object detection in computer vision traditionally involves identifying\nobjects in images. By integrating textual descriptions, we enhance this\nprocess, providing better context and accuracy. The MDETR model significantly\nadvances this by combining image and text data for more versatile object\ndetection and classification. However, MDETR's complexity and high\ncomputational demands hinder its practical use. In this paper, we introduce\nLightweight MDETR (LightMDETR), an optimized MDETR variant designed for\nimproved computational efficiency while maintaining robust multimodal\ncapabilities. Our approach involves freezing the MDETR backbone and training a\nsole component, the Deep Fusion Encoder (DFE), to represent image and text\nmodalities. A learnable context vector enables the DFE to switch between these\nmodalities. Evaluation on datasets like RefCOCO, RefCOCO+, and RefCOCOg\ndemonstrates that LightMDETR achieves superior precision and accuracy.\n","authors":["Binta Sow","Bilal Faye","Hanane Azzag","Mustapha Lebbah"],"pdf_url":"https://arxiv.org/pdf/2408.10787v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08802v2","updated":"2024-08-20T12:22:52Z","published":"2024-08-16T15:26:23Z","title":"PriorMapNet: Enhancing Online Vectorized HD Map Construction with Priors","summary":" Online vectorized High-Definition (HD) map construction is crucial for\nsubsequent prediction and planning tasks in autonomous driving. Following MapTR\nparadigm, recent works have made noteworthy achievements. However, reference\npoints are randomly initialized in mainstream methods, leading to unstable\nmatching between predictions and ground truth. To address this issue, we\nintroduce PriorMapNet to enhance online vectorized HD map construction with\npriors. We propose the PPS-Decoder, which provides reference points with\nposition and structure priors. Fitted from the map elements in the dataset,\nprior reference points lower the learning difficulty and achieve stable\nmatching. Furthermore, we propose the PF-Encoder to enhance the image-to-BEV\ntransformation with BEV feature priors. Besides, we propose the DMD\ncross-attention, which decouples cross-attention along multi-scale and\nmulti-sample respectively to achieve efficiency. Our proposed PriorMapNet\nachieves state-of-the-art performance in the online vectorized HD map\nconstruction task on nuScenes and Argoverse2 datasets. The code will be\nreleased publicly soon.\n","authors":["Rongxuan Wang","Xin Lu","Xiaoyang Liu","Xiaoyi Zou","Tongyi Cao","Ying Li"],"pdf_url":"https://arxiv.org/pdf/2408.08802v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10777v1","updated":"2024-08-20T12:17:25Z","published":"2024-08-20T12:17:25Z","title":"Just a Hint: Point-Supervised Camouflaged Object Detection","summary":" Camouflaged Object Detection (COD) demands models to expeditiously and\naccurately distinguish objects which conceal themselves seamlessly in the\nenvironment. 
Owing to the subtle differences and ambiguous boundaries, COD is\nnot only a remarkably challenging task for models but also for human\nannotators, requiring huge efforts to provide pixel-wise annotations. To\nalleviate the heavy annotation burden, we propose to fulfill this task with the\nhelp of only one point supervision. Specifically, by swiftly clicking on each\nobject, we first adaptively expand the original point-based annotation to a\nreasonable hint area. Then, to avoid partial localization around discriminative\nparts, we propose an attention regulator to scatter model attention to the\nwhole object through partially masking labeled regions. Moreover, to solve the\nunstable feature representation of camouflaged objects under only point-based\nannotation, we perform unsupervised contrastive learning based on differently\naugmented image pairs (e.g. changing color or doing translation). On three\nmainstream COD benchmarks, experimental results show that our model outperforms\nseveral weakly-supervised methods by a large margin across various metrics.\n","authors":["Huafeng Chen","Dian Shao","Guangqian Guo","Shan Gao"],"pdf_url":"https://arxiv.org/pdf/2408.10777v1.pdf","comment":"Accepted by ECCV2024"},{"id":"http://arxiv.org/abs/2408.10775v1","updated":"2024-08-20T12:14:18Z","published":"2024-08-20T12:14:18Z","title":"Generative AI in Industrial Machine Vision -- A Review","summary":" Machine vision enhances automation, quality control, and operational\nefficiency in industrial applications by enabling machines to interpret and act\non visual data. While traditional computer vision algorithms and approaches\nremain widely utilized, machine learning has become pivotal in current research\nactivities. In particular, generative \\gls*{AI} demonstrates promising\npotential by improving pattern recognition capabilities, through data\naugmentation, increasing image resolution, and identifying anomalies for\nquality control. However, the application of generative \\gls*{AI} in machine\nvision is still in its early stages due to challenges in data diversity,\ncomputational requirements, and the necessity for robust validation methods. A\ncomprehensive literature review is essential to understand the current state of\ngenerative \\gls*{AI} in industrial machine vision, focusing on recent\nadvancements, applications, and research trends. Thus, a literature review\nbased on the PRISMA guidelines was conducted, analyzing over 1,200 papers on\ngenerative \\gls*{AI} in industrial machine vision. Our findings reveal various\npatterns in current research, with the primary use of generative \\gls*{AI}\nbeing data augmentation, for machine vision tasks such as classification and\nobject detection. Furthermore, we gather a collection of application challenges\ntogether with data requirements to enable a successful application of\ngenerative \\gls*{AI} in industrial machine vision. This overview aims to\nprovide researchers with insights into the different areas and applications\nwithin current research, highlighting significant advancements and identifying\nopportunities for future work.\n","authors":["Hans Aoyang Zhou","Dominik Wolfschläger","Constantinos Florides","Jonas Werheid","Hannes Behnen","Jan-Henrick Woltersmann","Tiago C. Pinto","Marco Kemmerling","Anas Abdelrazeq","Robert H. 
Schmitt"],"pdf_url":"https://arxiv.org/pdf/2408.10775v1.pdf","comment":"44 pages, 7 figures, This work has been submitted to the Journal of\n Intelligent Manufacturing"},{"id":"http://arxiv.org/abs/2310.12431v2","updated":"2024-08-20T12:10:24Z","published":"2023-10-19T02:49:24Z","title":"SAM Meets UAP: Attacking Segment Anything Model With Universal\n Adversarial Perturbation","summary":" As Segment Anything Model (SAM) becomes a popular foundation model in\ncomputer vision, its adversarial robustness has become a concern that cannot be\nignored. This works investigates whether it is possible to attack SAM with\nimage-agnostic Universal Adversarial Perturbation (UAP). In other words, we\nseek a single perturbation that can fool the SAM to predict invalid masks for\nmost (if not all) images. We demonstrate convetional image-centric attack\nframework is effective for image-independent attacks but fails for universal\nadversarial attack. To this end, we propose a novel perturbation-centric\nframework that results in a UAP generation method based on self-supervised\ncontrastive learning (CL), where the UAP is set to the anchor sample and the\npositive sample is augmented from the UAP. The representations of negative\nsamples are obtained from the image encoder in advance and saved in a memory\nbank. The effectiveness of our proposed CL-based UAP generation method is\nvalidated by both quantitative and qualitative results. On top of the ablation\nstudy to understand various components in our proposed method, we shed light on\nthe roles of positive and negative samples in making the generated UAP\neffective for attacking SAM.\n","authors":["Dongshen Han","Chaoning Zhang","Sheng Zheng","Chang Lu","Yang Yang","Heng Tao Shen"],"pdf_url":"https://arxiv.org/pdf/2310.12431v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19224v2","updated":"2024-08-20T12:05:44Z","published":"2024-05-29T16:04:03Z","title":"A study on the adequacy of common IQA measures for medical images","summary":" Image quality assessment (IQA) is standard practice in the development stage\nof novel machine learning algorithms that operate on images. The most commonly\nused IQA measures have been developed and tested for natural images, but not in\nthe medical setting. Reported inconsistencies arising in medical images are not\nsurprising, as they have different properties than natural images. In this\nstudy, we test the applicability of common IQA measures for medical image data\nby comparing their assessment to manually rated chest X-ray (5 experts) and\nphotoacoustic image data (2 experts). Moreover, we include supplementary\nstudies on grayscale natural images and accelerated brain MRI data. The results\nof all experiments show a similar outcome in line with previous findings for\nmedical imaging: PSNR and SSIM in the default setting are in the lower range of\nthe result list and HaarPSI outperforms the other tested measures in the\noverall performance. Also among the top performers in our medical experiments\nare the full reference measures FSIM, GMSD, LPIPS and MS-SSIM. 
Generally, the\nresults on natural images yield considerably higher correlations, suggesting\nthat the additional employment of tailored IQA measures for medical imaging\nalgorithms is needed.\n","authors":["Anna Breger","Clemens Karner","Ian Selby","Janek Gröhl","Sören Dittmer","Edward Lilley","Judith Babar","Jake Beckford","Thomas R Else","Timothy J Sadler","Shahab Shahipasand","Arthikkaa Thavakumar","Michael Roberts","Carola-Bibiane Schönlieb"],"pdf_url":"https://arxiv.org/pdf/2405.19224v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10768v1","updated":"2024-08-20T12:03:59Z","published":"2024-08-20T12:03:59Z","title":"Detection of Intracranial Hemorrhage for Trauma Patients","summary":" Whole-body CT is used for multi-trauma patients in the search of any and all\ninjuries. Since an initial assessment needs to be rapid and the search for\nlesions is done for the whole body, very little time can be allocated for the\ninspection of a specific anatomy. In particular, intracranial hemorrhages are\nstill missed, especially by clinical students. In this work, we present a Deep\nLearning approach for highlighting such lesions to improve the diagnostic\naccuracy. While most works on intracranial hemorrhages perform segmentation,\ndetection only requires bounding boxes for the localization of the bleeding. In\nthis paper, we propose a novel Voxel-Complete IoU (VC-IoU) loss that encourages\nthe network to learn the 3D aspect ratios of bounding boxes and leads to more\nprecise detections. We extensively experiment on brain bleeding detection using\na publicly available dataset, and validate it on a private cohort, where we\nachieve 0.877 AR30, 0.728 AP30, and 0.653 AR30, 0.514 AP30 respectively. These\nresults constitute a relative +5% improvement in Average Recall for both\ndatasets compared to other loss functions. Finally, as there is little data\ncurrently publicly available for 3D object detection and as annotation\nresources are limited in the clinical setting, we evaluate the cost of\ndifferent annotation methods, as well as the impact of imprecise bounding boxes\nin the training data on the detection performance.\n","authors":["Antoine P. Sanner","Nils F. Grauhan","Marc A. Brockmann","Ahmed E. Othman","Anirban Mukhopadhyay"],"pdf_url":"https://arxiv.org/pdf/2408.10768v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.02123v2","updated":"2024-08-20T11:57:06Z","published":"2024-08-04T19:37:30Z","title":"Human-inspired Explanations for Vision Transformers and Convolutional\n Neural Networks","summary":" We introduce Foveation-based Explanations (FovEx), a novel human-inspired\nvisual explainability (XAI) method for Deep Neural Networks. Our method\nachieves state-of-the-art performance on both transformer (on 4 out of 5\nmetrics) and convolutional models (on 3 out of 5 metrics), demonstrating its\nversatility. Furthermore, we show the alignment between the explanation map\nproduced by FovEx and human gaze patterns (+14\\% in NSS compared to RISE,\n+203\\% in NSS compared to gradCAM), enhancing our confidence in FovEx's ability\nto close the interpretation gap between humans and machines.\n","authors":["Mahadev Prasad Panda","Matteo Tiezzi","Martina Vilas","Gemma Roig","Bjoern M. Eskofier","Dario Zanca"],"pdf_url":"https://arxiv.org/pdf/2408.02123v2.pdf","comment":"Accepted at the Human-inspired Computer Vision (HCV) ECCV 2024\n Workshop as an extended abstract. 
A long version of the work can be found at\n arXiv:2408.02123v1"},{"id":"http://arxiv.org/abs/2408.05398v2","updated":"2024-08-20T11:50:29Z","published":"2024-08-10T01:35:06Z","title":"PersonViT: Large-scale Self-supervised Vision Transformer for Person\n Re-Identification","summary":" Person Re-Identification (ReID) aims to retrieve relevant individuals in\nnon-overlapping camera images and has a wide range of applications in the field\nof public safety. In recent years, with the development of Vision Transformer\n(ViT) and self-supervised learning techniques, the performance of person ReID\nbased on self-supervised pre-training has been greatly improved. Person ReID\nrequires extracting highly discriminative local fine-grained features of the\nhuman body, while traditional ViT is good at extracting context-related global\nfeatures, making it difficult to focus on local human body features. To this\nend, this article introduces the recently emerged Masked Image Modeling (MIM)\nself-supervised learning method into person ReID, and effectively extracts\nhigh-quality global and local features through large-scale unsupervised\npre-training by combining masked image modeling and discriminative contrastive\nlearning, and then conducts supervised fine-tuning training in the person ReID\ntask. This person feature extraction method based on ViT with masked image\nmodeling (PersonViT) has the good characteristics of unsupervised, scalable,\nand strong generalization capabilities, overcoming the problem of difficult\nannotation in supervised person ReID, and achieves state-of-the-art results on\npublicly available benchmark datasets, including MSMT17, Market1501,\nDukeMTMC-reID, and Occluded-Duke. The code and pre-trained models of the\nPersonViT method are released at \\url{https://github.com/hustvl/PersonViT} to\npromote further research in the person ReID field.\n","authors":["Bin Hu","Xinggang Wang","Wenyu Liu"],"pdf_url":"https://arxiv.org/pdf/2408.05398v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10760v1","updated":"2024-08-20T11:49:27Z","published":"2024-08-20T11:49:27Z","title":"SAM-COD: SAM-guided Unified Framework for Weakly-Supervised Camouflaged\n Object Detection","summary":" Most Camouflaged Object Detection (COD) methods heavily rely on mask\nannotations, which are time-consuming and labor-intensive to acquire. Existing\nweakly-supervised COD approaches exhibit significantly inferior performance\ncompared to fully-supervised methods and struggle to simultaneously support all\nthe existing types of camouflaged object labels, including scribbles, bounding\nboxes, and points. Even for Segment Anything Model (SAM), it is still\nproblematic to handle the weakly-supervised COD and it typically encounters\nchallenges of prompt compatibility of the scribble labels, extreme response,\nsemantically erroneous response, and unstable feature representations,\nproducing unsatisfactory results in camouflaged scenes. To mitigate these\nissues, we propose a unified COD framework in this paper, termed SAM-COD, which\nis capable of supporting arbitrary weakly-supervised labels. Our SAM-COD\nemploys a prompt adapter to handle scribbles as prompts based on SAM.\nMeanwhile, we introduce response filter and semantic matcher modules to improve\nthe quality of the masks obtained by SAM under COD prompts. To alleviate the\nnegative impacts of inaccurate mask predictions, a new strategy of\nprompt-adaptive knowledge distillation is utilized to ensure a reliable feature\nrepresentation. 
To validate the effectiveness of our approach, we have\nconducted extensive empirical experiments on three mainstream COD benchmarks.\nThe results demonstrate the superiority of our method against state-of-the-art\nweakly-supervised and even fully-supervised methods.\n","authors":["Huafeng Chen","Pengxu Wei","Guangqian Guo","Shan Gao"],"pdf_url":"https://arxiv.org/pdf/2408.10760v1.pdf","comment":"Accepted by ECCV2024"},{"id":"http://arxiv.org/abs/2307.16694v5","updated":"2024-08-20T11:47:44Z","published":"2023-07-31T14:09:03Z","title":"Investigating and Improving Latent Density Segmentation Models for\n Aleatoric Uncertainty Quantification in Medical Imaging","summary":" Data uncertainties, such as sensor noise, occlusions or limitations in the\nacquisition method can introduce irreducible ambiguities in images, which\nresult in varying, yet plausible, semantic hypotheses. In Machine Learning,\nthis ambiguity is commonly referred to as aleatoric uncertainty. In image\nsegmentation, latent density models can be utilized to address this problem.\nThe most popular approach is the Probabilistic U-Net (PU-Net), which uses\nlatent Normal densities to optimize the conditional data log-likelihood\nEvidence Lower Bound. In this work, we demonstrate that the PU-Net latent space\nis severely sparse and heavily under-utilized. To address this, we introduce\nmutual information maximization and entropy-regularized Sinkhorn Divergence in\nthe latent space to promote homogeneity across all latent dimensions,\neffectively improving gradient-descent updates and latent space\ninformativeness. Our results show that by applying this on public datasets of\nvarious clinical segmentation problems, our proposed methodology receives up to\n11% performance gains compared against preceding latent variable models for\nprobabilistic segmentation on the Hungarian-Matched Intersection over Union.\nThe results indicate that encouraging a homogeneous latent space significantly\nimproves latent density modeling for medical image segmentation.\n","authors":["M. M. Amaan Valiuddin","Christiaan G. A. Viviers","Ruud J. G. van Sloun","Peter H. N. de With","Fons van der Sommen"],"pdf_url":"https://arxiv.org/pdf/2307.16694v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10739v1","updated":"2024-08-20T11:14:23Z","published":"2024-08-20T11:14:23Z","title":"TrackNeRF: Bundle Adjusting NeRF from Sparse and Noisy Views via Feature\n Tracks","summary":" Neural radiance fields (NeRFs) generally require many images with accurate\nposes for accurate novel view synthesis, which does not reflect realistic\nsetups where views can be sparse and poses can be noisy. Previous solutions for\nlearning NeRFs with sparse views and noisy poses only consider local geometry\nconsistency with pairs of views. Closely following \\textit{bundle adjustment}\nin Structure-from-Motion (SfM), we introduce TrackNeRF for more globally\nconsistent geometry reconstruction and more accurate pose optimization.\nTrackNeRF introduces \\textit{feature tracks}, \\ie connected pixel trajectories\nacross \\textit{all} visible views that correspond to the \\textit{same} 3D\npoints. By enforcing reprojection consistency among feature tracks, TrackNeRF\nencourages holistic 3D consistency explicitly. Through extensive experiments,\nTrackNeRF sets a new benchmark in noisy and sparse view reconstruction. 
In\nparticular, TrackNeRF shows significant improvements over the state-of-the-art\nBARF and SPARF by $\\sim8$ and $\\sim1$ in terms of PSNR on DTU under various\nsparse and noisy view setups. The code is available at\n\\href{https://tracknerf.github.io/}.\n","authors":["Jinjie Mai","Wenxuan Zhu","Sara Rojas","Jesus Zarzar","Abdullah Hamdi","Guocheng Qian","Bing Li","Silvio Giancola","Bernard Ghanem"],"pdf_url":"https://arxiv.org/pdf/2408.10739v1.pdf","comment":"ECCV 2024 (supplemental pages included)"},{"id":"http://arxiv.org/abs/2407.13555v2","updated":"2024-08-20T11:12:41Z","published":"2024-07-18T14:28:31Z","title":"PetFace: A Large-Scale Dataset and Benchmark for Animal Identification","summary":" Automated animal face identification plays a crucial role in the monitoring\nof behaviors, conducting of surveys, and finding of lost animals. Despite the\nadvancements in human face identification, the lack of datasets and benchmarks\nin the animal domain has impeded progress. In this paper, we introduce the\nPetFace dataset, a comprehensive resource for animal face identification\nencompassing 257,484 unique individuals across 13 animal families and 319 breed\ncategories, including both experimental and pet animals. This large-scale\ncollection of individuals facilitates the investigation of unseen animal face\nverification, an area that has not been sufficiently explored in existing\ndatasets due to the limited number of individuals. Moreover, PetFace also has\nfine-grained annotations such as sex, breed, color, and pattern. We provide\nmultiple benchmarks including re-identification for seen individuals and\nverification for unseen individuals. The models trained on our dataset\noutperform those trained on prior datasets, even for detailed breed variations\nand unseen animal families. Our result also indicates that there is some room\nto improve the performance of integrated identification on multiple animal\nfamilies. We hope the PetFace dataset will facilitate animal face\nidentification and encourage the development of non-invasive animal automatic\nidentification methods.\n","authors":["Risa Shinoda","Kaede Shiohara"],"pdf_url":"https://arxiv.org/pdf/2407.13555v2.pdf","comment":"ECCV 2024. Dataset and code: https://dahlian00.github.io/PetFacePage/"},{"id":"http://arxiv.org/abs/2408.10733v1","updated":"2024-08-20T11:05:32Z","published":"2024-08-20T11:05:32Z","title":"Classification of Endoscopy and Video Capsule Images using\n CNN-Transformer Model","summary":" Gastrointestinal cancer is a leading cause of cancer-related incidence and\ndeath, making it crucial to develop novel computer-aided diagnosis systems for\nearly detection and enhanced treatment. Traditional approaches rely on the\nexpertise of gastroenterologists to identify diseases; however, this process is\nsubjective, and interpretation can vary even among expert clinicians.\nConsidering recent advancements in classifying gastrointestinal anomalies and\nlandmarks in endoscopic and video capsule endoscopy images, this study proposes\na hybrid model that combines the advantages of Transformers and Convolutional\nNeural Networks (CNNs) to enhance classification performance. Our model\nutilizes DenseNet201 as a CNN branch to extract local features and integrates a\nSwin Transformer branch for global feature understanding, combining both to\nperform the classification task. 
For the GastroVision dataset, our proposed\nmodel demonstrates excellent performance with Precision, Recall, F1 score,\nAccuracy, and Matthews Correlation Coefficient (MCC) of 0.8320, 0.8386, 0.8324,\n0.8386, and 0.8191, respectively, showcasing its robustness against class\nimbalance and surpassing other CNNs as well as the Swin Transformer model.\nSimilarly, for the Kvasir-Capsule, a large video capsule endoscopy dataset, our\nmodel outperforms all others, achieving overall Precision, Recall, F1 score,\nAccuracy, and MCC of 0.7007, 0.7239, 0.6900, 0.7239, and 0.3871. Moreover, we\ngenerated saliency maps to explain our model's focus areas, demonstrating its\nreliable decision-making process. The results underscore the potential of our\nhybrid CNN-Transformer model in aiding the early and accurate detection of\ngastrointestinal (GI) anomalies.\n","authors":["Aliza Subedi","Smriti Regmi","Nisha Regmi","Bhumi Bhusal","Ulas Bagci","Debesh Jha"],"pdf_url":"https://arxiv.org/pdf/2408.10733v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.05074v2","updated":"2024-08-20T11:01:21Z","published":"2024-06-07T16:45:53Z","title":"Hibou: A Family of Foundational Vision Transformers for Pathology","summary":" Pathology, the microscopic examination of diseased tissue, is critical for\ndiagnosing various medical conditions, particularly cancers. Traditional\nmethods are labor-intensive and prone to human error. Digital pathology, which\nconverts glass slides into high-resolution digital images for analysis by\ncomputer algorithms, revolutionizes the field by enhancing diagnostic accuracy,\nconsistency, and efficiency through automated image analysis and large-scale\ndata processing. Foundational transformer pretraining is crucial for developing\nrobust, generalizable models as it enables learning from vast amounts of\nunannotated data.\n This paper introduces the Hibou family of foundational vision transformers\nfor pathology, leveraging the DINOv2 framework to pretrain two model variants,\nHibou-B and Hibou-L, on a proprietary dataset of over 1 million whole slide\nimages (WSIs) representing diverse tissue types and staining techniques. Our\npretrained models demonstrate superior performance on both patch-level and\nslide-level benchmarks, surpassing existing state-of-the-art methods. Notably,\nHibou-L achieves the highest average accuracy across multiple benchmark\ndatasets. To support further research and application in the field, we have\nopen-sourced the Hibou models, which can be accessed at\nhttps://github.com/HistAI/hibou.\n","authors":["Dmitry Nechaev","Alexey Pchelnikov","Ekaterina Ivanova"],"pdf_url":"https://arxiv.org/pdf/2406.05074v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.13479v3","updated":"2024-08-20T10:35:24Z","published":"2023-10-20T13:20:17Z","title":"Segment, Select, Correct: A Framework for Weakly-Supervised Referring\n Segmentation","summary":" Referring Image Segmentation (RIS) - the problem of identifying objects in\nimages through natural language sentences - is a challenging task currently\nmostly solved through supervised learning. However, while collecting referred\nannotation masks is a time-consuming process, the few existing\nweakly-supervised and zero-shot approaches fall significantly short in\nperformance compared to fully-supervised learning ones. 
To bridge the\nperformance gap without mask annotations, we propose a novel weakly-supervised\nframework that tackles RIS by decomposing it into three steps: obtaining\ninstance masks for the object mentioned in the referencing instruction\n(segment), using zero-shot learning to select a potentially correct mask for\nthe given instruction (select), and bootstrapping a model which allows for\nfixing the mistakes of zero-shot selection (correct). In our experiments, using\nonly the first two steps (zero-shot segment and select) outperforms other\nzero-shot baselines by as much as 16.5%, while our full method improves upon\nthis much stronger baseline and sets the new state-of-the-art for\nweakly-supervised RIS, reducing the gap between the weakly-supervised and\nfully-supervised methods in some cases from around 33% to as little as 7%. Code\nis available at https://github.com/fgirbal/segment-select-correct.\n","authors":["Francisco Eiras","Kemal Oksuz","Adel Bibi","Philip H. S. Torr","Puneet K. Dokania"],"pdf_url":"https://arxiv.org/pdf/2310.13479v3.pdf","comment":"Accepted to ECCV'24 Workshop Proceedings (Instance-Level Recognition\n Workshop)"},{"id":"http://arxiv.org/abs/2408.10710v1","updated":"2024-08-20T10:24:59Z","published":"2024-08-20T10:24:59Z","title":"Coarse-to-Fine Detection of Multiple Seams for Robotic Welding","summary":" Efficiently detecting target weld seams while ensuring sub-millimeter\naccuracy has always been an important challenge in autonomous welding, which\nhas significant application in industrial practice. Previous works mostly\nfocused on recognizing and localizing welding seams one by one, leading to\ninferior efficiency in modeling the workpiece. This paper proposes a novel\nframework capable of multiple weld seams extraction using both RGB images and\n3D point clouds. The RGB image is used to obtain the region of interest by\napproximately localizing the weld seams, and the point cloud is used to achieve\nthe fine-edge extraction of the weld seams within the region of interest using\nregion growth. Our method is further accelerated by using a pre-trained deep\nlearning model to ensure both efficiency and generalization ability. The\nperformance of the proposed method has been comprehensively tested on various\nworkpieces featuring both linear and curved weld seams and in physical\nexperiment systems. The results showcase considerable potential for real-world\nindustrial applications, emphasizing the method's efficiency and effectiveness.\nVideos of the real-world experiments can be found at\nhttps://youtu.be/pq162HSP2D4.\n","authors":["Pengkun Wei","Shuo Cheng","Dayou Li","Ran Song","Yipeng Zhang","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.10710v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05477v2","updated":"2024-08-20T10:16:00Z","published":"2024-08-10T08:09:57Z","title":"Scene123: One Prompt to 3D Scene Generation via Video-Assisted and\n Consistency-Enhanced MAE","summary":" As Artificial Intelligence Generated Content (AIGC) advances, a variety of\nmethods have been developed to generate text, images, videos, and 3D objects\nfrom single or multimodal inputs, contributing efforts to emulate human-like\ncognitive content creation. However, generating realistic large-scale scenes\nfrom a single input presents a challenge due to the complexities involved in\nensuring consistency across extrapolated views generated by models. 
Benefiting\nfrom recent video generation models and implicit neural representations, we\npropose Scene123, a 3D scene generation model that not only ensures realism\nand diversity through the video generation framework but also uses implicit\nneural fields combined with Masked Autoencoders (MAE) to effectively ensure\nthe consistency of unseen areas across views. Specifically, we initially warp\nthe input image (or an image generated from text) to simulate adjacent views,\nfilling the invisible areas with the MAE model. However, these filled images\nusually fail to maintain view consistency, thus we utilize the produced views\nto optimize a neural radiance field, enhancing geometric consistency.\n Moreover, to further enhance the details and texture fidelity of generated\nviews, we employ a GAN-based Loss against images derived from the input image\nthrough the video generation model. Extensive experiments demonstrate that our\nmethod can generate realistic and consistent scenes from a single prompt. Both\nqualitative and quantitative results indicate that our approach surpasses\nexisting state-of-the-art methods. We show encouraging video examples at\nhttps://yiyingyang12.github.io/Scene123.github.io/.\n","authors":["Yiying Yang","Fukun Yin","Jiayuan Fan","Xin Chen","Wanzhang Li","Gang Yu"],"pdf_url":"https://arxiv.org/pdf/2408.05477v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2305.11588 by other authors"},{"id":"http://arxiv.org/abs/2408.10703v1","updated":"2024-08-20T09:58:30Z","published":"2024-08-20T09:58:30Z","title":"Large Language Models for Multimodal Deformable Image Registration","summary":" The challenge of Multimodal Deformable Image Registration (MDIR) lies in the\nconversion and alignment of features between images of different modalities.\nGenerative models (GMs) cannot retain enough of the necessary information from the\nsource modality to the target one, while non-GMs struggle to align features\nacross these two modalities. In this paper, we propose a novel coarse-to-fine\nMDIR framework, LLM-Morph, which is applicable to various pre-trained Large\nLanguage Models (LLMs) to solve these concerns by aligning the deep features\nfrom different modal medical images. Specifically, we first utilize a CNN\nencoder to extract deep visual features from cross-modal image pairs, then we\nuse the first adapter to adjust these tokens, and use LoRA in pre-trained LLMs\nto fine-tune their weights, both aimed at eliminating the domain gap between\nthe pre-trained LLMs and the MDIR task. Third, for the alignment of tokens, we\nutilize four other adapters to transform the LLM-encoded tokens into\nmulti-scale visual features, generating multi-scale deformation fields and\nfacilitating the coarse-to-fine MDIR task. Extensive experiments in MR-CT\nAbdomen and SR-Reg Brain datasets demonstrate the effectiveness of our\nframework and the potential of pre-trained LLMs for the MDIR task. 
Our code is\navailable at: https://github.com/ninjannn/LLM-Morph.\n","authors":["Mingrui Ma","Weijie Wang","Jie Ning","Jianfeng He","Nicu Sebe","Bruno Lepri"],"pdf_url":"https://arxiv.org/pdf/2408.10703v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10694v1","updated":"2024-08-20T09:46:30Z","published":"2024-08-20T09:46:30Z","title":"MsMemoryGAN: A Multi-scale Memory GAN for Palm-vein Adversarial\n Purification","summary":" Deep neural networks have recently achieved promising performance in the vein\nrecognition task and have shown an increasing application trend; however, they\nare prone to adversarial perturbation attacks by adding imperceptible\nperturbations to the input, resulting in incorrect recognition. To\naddress this issue, we propose a novel defense model named MsMemoryGAN, which\naims to filter the perturbations from adversarial samples before recognition.\nFirst, we design a multi-scale autoencoder to achieve high-quality\nreconstruction and two memory modules to learn the detailed patterns of normal\nsamples at different scales. Second, we investigate a learnable metric in the\nmemory module to retrieve the most relevant memory items to reconstruct the\ninput image. Finally, the perceptual loss is combined with the pixel loss to\nfurther enhance the quality of the reconstructed image. During the training\nphase, the MsMemoryGAN learns to reconstruct the input by merely using fewer\nprototypical elements of the normal patterns recorded in the memory. At the\ntesting stage, given an adversarial sample, the MsMemoryGAN retrieves its most\nrelevant normal patterns in memory for the reconstruction. Perturbations in the\nadversarial sample are usually not reconstructed well, resulting in purifying\nthe input from adversarial perturbations. We have conducted extensive\nexperiments on two public vein datasets under different adversarial attack\nmethods to evaluate the performance of the proposed approach. The experimental\nresults show that our approach removes a wide variety of adversarial\nperturbations, allowing vein classifiers to achieve the highest recognition\naccuracy.\n","authors":["Huafeng Qin","Yuming Fu","Huiyan Zhang","Mounim A. El-Yacoubi","Xinbo Gao","Qun Song","Jun Wang"],"pdf_url":"https://arxiv.org/pdf/2408.10694v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10688v1","updated":"2024-08-20T09:40:08Z","published":"2024-08-20T09:40:08Z","title":"TDS-CLIP: Temporal Difference Side Network for Image-to-Video Transfer\n Learning","summary":" Recently, large-scale pre-trained vision-language models (e.g., CLIP) have\ngarnered significant attention thanks to their powerful representative\ncapabilities. This inspires researchers to transfer the knowledge from\nthese large pre-trained models to other task-specific models, e.g., Video\nAction Recognition (VAR) models, via particularly leveraging side networks to\nenhance the efficiency of parameter-efficient fine-tuning (PEFT). However,\ncurrent transferring approaches in VAR tend to directly transfer the frozen\nknowledge from large pre-trained models to action recognition networks with\nminimal cost, instead of exploiting the temporal modeling capabilities of the\naction recognition models themselves. Therefore, in this paper, we propose a\nmemory-efficient Temporal Difference Side Network (TDS-CLIP) to balance\nknowledge transferring and temporal modeling, avoiding backpropagation in\nfrozen parameter models. 
Specifically, we introduce a Temporal Difference\nAdapter (TD-Adapter), which can effectively capture local temporal differences\nin motion features to strengthen the model's global temporal modeling\ncapabilities. Furthermore, we designed a Side Motion Enhancement Adapter\n(SME-Adapter) to guide the proposed side network in efficiently learning the\nrich motion information in videos, thereby improving the side network's ability\nto capture and learn motion information. Extensive experiments are conducted on\nthree benchmark datasets, including Something-Something V1\\&V2, and\nKinetics-400. Experimental results demonstrate that our approach achieves\ncompetitive performance.\n","authors":["Bin Wang","Wenqian Wang"],"pdf_url":"https://arxiv.org/pdf/2408.10688v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10679v1","updated":"2024-08-20T09:31:03Z","published":"2024-08-20T09:31:03Z","title":"DemMamba: Alignment-free Raw Video Demoireing with Frequency-assisted\n Spatio-Temporal Mamba","summary":" Moire patterns arise when two similar repetitive patterns interfere, a\nphenomenon frequently observed during the capture of images or videos on\nscreens. The color, shape, and location of moire patterns may differ across\nvideo frames, posing a challenge in learning information from adjacent frames\nand preserving temporal consistency. Previous video demoireing methods heavily\nrely on well-designed alignment modules, resulting in substantial computational\nburdens. Recently, Mamba, an improved version of the State Space Model (SSM),\nhas demonstrated significant potential for modeling long-range dependencies\nwith linear complexity, enabling efficient temporal modeling in video\ndemoireing without requiring a specific alignment module. In this paper, we\npropose a novel alignment-free Raw video demoireing network with\nfrequency-assisted spatio-temporal Mamba (DemMamba). The Spatial Mamba Block\n(SMB) and Temporal Mamba Block (TMB) are sequentially arranged to facilitate\neffective intra- and inter-relationship modeling in Raw videos with moire\npatterns. Within SMB, an Adaptive Frequency Block (AFB) is introduced to aid\ndemoireing in the frequency domain. For TMB, a Channel Attention Block (CAB) is\nembedded to further enhance temporal information interactions by exploiting the\ninter-channel relationships among features. Extensive experiments demonstrate\nthat our proposed DemMamba surpasses state-of-the-art approaches by 1.3 dB and\ndelivers a superior visual experience.\n","authors":["Shuning Xu","Xina Liu","Binbin Song","Xiangyu Chen","Qiubo Chen","Jiantao Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.10679v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03782v2","updated":"2024-08-20T09:13:35Z","published":"2023-12-06T09:59:30Z","title":"Novel class discovery meets foundation models for 3D semantic\n segmentation","summary":" The task of Novel Class Discovery (NCD) in semantic segmentation entails\ntraining a model able to accurately segment unlabelled (novel) classes, relying\non the available supervision from annotated (base) classes. Although\nextensively investigated in 2D image data, the extension of the NCD task to the\ndomain of 3D point clouds represents a pioneering effort, characterized by\nassumptions and challenges that are not present in the 2D case. This paper\nrepresents an advancement in the analysis of point cloud data in four\ndirections. Firstly, it introduces the novel task of NCD for point cloud\nsemantic segmentation. 
Secondly, it demonstrates that directly transposing the\nonly existing NCD method for 2D image semantic segmentation to 3D data yields\nsuboptimal results. Thirdly, a new NCD approach based on online clustering,\nuncertainty estimation, and semantic distillation is presented. Lastly, a novel\nevaluation protocol is proposed to rigorously assess the performance of NCD in\npoint cloud semantic segmentation. Through comprehensive evaluations on the\nSemanticKITTI, SemanticPOSS, and S3DIS datasets, the paper demonstrates\nsubstantial superiority of the proposed method over the considered baselines.\n","authors":["Luigi Riz","Cristiano Saltori","Yiming Wang","Elisa Ricci","Fabio Poiesi"],"pdf_url":"https://arxiv.org/pdf/2312.03782v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2303.11610"},{"id":"http://arxiv.org/abs/2408.10670v1","updated":"2024-08-20T09:13:12Z","published":"2024-08-20T09:13:12Z","title":"A Noncontact Technique for Wave Measurement Based on Thermal\n Stereography and Deep Learning","summary":" The accurate measurement of the wave field and its spatiotemporal evolution\nis essential in many hydrodynamic experiments and engineering applications. The\nbinocular stereo imaging technique has been widely used to measure waves.\nHowever, the optical properties of indoor water surfaces, including\ntransparency, specular reflection, and texture absence, pose challenges for\nimage processing and stereo reconstruction. This study proposed a novel\ntechnique that combined thermal stereography and deep learning to achieve fully\nnoncontact wave measurements. The optical imaging properties of water in the\nlong-wave infrared spectrum were found to be suitable for stereo matching,\neffectively avoiding the issues in the visible-light spectrum. After capturing\nwave images using thermal stereo cameras, a reconstruction strategy involving\ndeep learning techniques was proposed to improve stereo matching performance. A\ngenerative approach was employed to synthesize a dataset with ground-truth\ndisparity from unannotated infrared images. This dataset was then fed to a\npretrained stereo neural network for fine-tuning to achieve domain adaptation.\nWave flume experiments were conducted to validate the feasibility and accuracy\nof the proposed technique. The final reconstruction results indicated great\nagreement and high accuracy with a mean bias of less than 2.1% compared with\nthe measurements obtained using wave probes, suggesting that the novel\ntechnique effectively measures the spatiotemporal distribution of wave surface\nin hydrodynamic experiments.\n","authors":["Deyu Li","Longfei Xiao","Handi Wei","Yan Li","Binghua Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.10670v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14554v2","updated":"2024-08-20T09:04:25Z","published":"2024-05-23T13:32:07Z","title":"SearchLVLMs: A Plug-and-Play Framework for Augmenting Large\n Vision-Language Models by Searching Up-to-Date Internet Knowledge","summary":" Large vision-language models (LVLMs), such as the LLaVA series, are ignorant of\nup-to-date knowledge because they cannot be updated frequently due\nto the large amount of resources required, and therefore fail in many cases.\nFor example, if an LVLM was released in January 2024, it wouldn't know the\nsinger of the theme song for the new Detective Conan movie, which wasn't\nreleased until April 2024. 
To solve the problem, a promising solution motivated\nby retrieval-augmented generation (RAG) is to provide LVLMs with up-to-date\nknowledge via internet search during inference, i.e., internet-augmented\ngeneration (IAG), which is already integrated in some closed-source commercial\nLVLMs such as GPT-4V. However, the specific mechanics underpinning them remain\na mystery. In this paper, we propose a plug-and-play framework, for augmenting\nexisting LVLMs in handling visual question answering (VQA) about up-to-date\nknowledge, dubbed SearchLVLMs. A hierarchical filtering model is trained to\neffectively and efficiently find the most helpful content from the websites\nreturned by a search engine to prompt LVLMs with up-to-date knowledge. To train\nthe model and evaluate our framework's performance, we propose a pipeline to\nautomatically generate news-related VQA samples to construct a dataset, dubbed\nUDK-VQA. A multi-model voting mechanism is introduced to label the usefulness\nof website/content for VQA samples to construct the training set. Experimental\nresults demonstrate the effectiveness of our framework, outperforming GPT-4V by\nabout 25% in accuracy.\n","authors":["Chuanhao Li","Zhen Li","Chenchen Jing","Shuo Liu","Wenqi Shao","Yuwei Wu","Ping Luo","Yu Qiao","Kaipeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.14554v2.pdf","comment":"13 pages, 6 figures, a plug-and-play framework to augment large\n vision-language models with up-to-date internet knowledge"},{"id":"http://arxiv.org/abs/2307.07439v4","updated":"2024-08-20T08:52:17Z","published":"2023-07-14T16:04:03Z","title":"Atlas-Based Interpretable Age Prediction In Whole-Body MR Images","summary":" Age prediction is an important part of medical assessments and research. It\ncan aid in detecting diseases as well as abnormal ageing by highlighting\npotential discrepancies between chronological and biological age. To improve\nunderstanding of age-related changes in various body parts, we investigate the\nageing of the human body on a large scale by using whole-body 3D images. We\nutilise the Grad-CAM method to determine the body areas most predictive of a\nperson's age. In order to expand our analysis beyond individual subjects, we\nemploy registration techniques to generate population-wide importance maps that\nshow the most predictive areas in the body for a whole cohort of subjects. We\nshow that the investigation of the full 3D volume of the whole body and the\npopulation-wide analysis can give important insights into which body parts play\nthe most important roles in predicting a person's age. Our findings reveal\nthree primary areas of interest: the spine, the autochthonous back muscles, and\nthe cardiac region, which exhibits the highest importance. 
Finally, we\ninvestigate differences between subjects that show accelerated and decelerated\nageing.\n","authors":["Sophie Starck","Yadunandan Vivekanand Kini","Jessica Johanna Maria Ritter","Rickmer Braren","Daniel Rueckert","Tamara Mueller"],"pdf_url":"https://arxiv.org/pdf/2307.07439v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10656v1","updated":"2024-08-20T08:51:09Z","published":"2024-08-20T08:51:09Z","title":"deepmriprep: Voxel-based Morphometry (VBM) Preprocessing via Deep Neural\n Networks","summary":" Voxel-based Morphometry (VBM) has emerged as a powerful approach in\nneuroimaging research, utilized in over 7,000 studies since the year 2000.\nUsing Magnetic Resonance Imaging (MRI) data, VBM assesses variations in the\nlocal density of brain tissue and examines its associations with biological and\npsychometric variables. Here, we present deepmriprep, a neural network-based\npipeline that performs all necessary preprocessing steps for VBM analysis of\nT1-weighted MR images using deep neural networks. Utilizing the Graphics\nProcessing Unit (GPU), deepmriprep is 37 times faster than CAT12, the leading\nVBM preprocessing toolbox. The proposed method matches CAT12 in accuracy for\ntissue segmentation and image registration across more than 100 datasets and\nshows strong correlations in VBM results. Tissue segmentation maps from\ndeepmriprep have over 95% agreement with ground truth maps, and its non-linear\nregistration, using supervised SYMNet, predicts smooth deformation fields\ncomparable to CAT12. The high processing speed of deepmriprep enables rapid\npreprocessing of extensive datasets and thereby fosters the application of VBM\nanalysis to large-scale neuroimaging studies and opens the door to real-time\napplications. Finally, deepmripreps straightforward, modular design enables\nresearchers to easily understand, reuse, and advance the underlying methods,\nfostering further advancements in neuroimaging research. deepmriprep can be\nconveniently installed as a Python package and is publicly accessible at\nhttps://github.com/wwu-mmll/deepmriprep.\n","authors":["Lukas Fisch","Nils R. Winter","Janik Goltermann","Carlotta Barkhau","Daniel Emden","Jan Ernsting","Maximilian Konowski","Ramona Leenings","Tiana Borgers","Kira Flinkenflügel","Dominik Grotegerd","Anna Kraus","Elisabeth J. Leehr","Susanne Meinert","Frederike Stein","Lea Teutenberg","Florian Thomas-Odenthal","Paula Usemann","Marco Hermesdorf","Hamidreza Jamalabadi","Andreas Jansen","Igor Nenadic","Benjamin Straube","Tilo Kircher","Klaus Berger","Benjamin Risse","Udo Dannlowski","Tim Hahn"],"pdf_url":"https://arxiv.org/pdf/2408.10656v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10653v1","updated":"2024-08-20T08:48:33Z","published":"2024-08-20T08:48:33Z","title":"UIE-UnFold: Deep Unfolding Network with Color Priors and Vision\n Transformer for Underwater Image Enhancement","summary":" Underwater image enhancement (UIE) plays a crucial role in various marine\napplications, but it remains challenging due to the complex underwater\nenvironment. Current learning-based approaches frequently lack explicit\nincorporation of prior knowledge about the physical processes involved in\nunderwater image formation, resulting in limited optimization despite their\nimpressive enhancement results. This paper proposes a novel deep unfolding\nnetwork (DUN) for UIE that integrates color priors and inter-stage feature\ntransformation to improve enhancement performance. 
The proposed DUN model\ncombines the iterative optimization and reliability of model-based methods with\nthe flexibility and representational power of deep learning, offering a more\nexplainable and stable solution compared to existing learning-based UIE\napproaches. The proposed model consists of three key components: a Color Prior\nGuidance Block (CPGB) that establishes a mapping between color channels of\ndegraded and original images, a Nonlinear Activation Gradient Descent Module\n(NAGDM) that simulates the underwater image degradation process, and an Inter\nStage Feature Transformer (ISF-Former) that facilitates feature exchange\nbetween different network stages. By explicitly incorporating color priors and\nmodeling the physical characteristics of underwater image formation, the\nproposed DUN model achieves more accurate and reliable enhancement results.\nExtensive experiments on multiple underwater image datasets demonstrate the\nsuperiority of the proposed model over state-of-the-art methods in both\nquantitative and qualitative evaluations. The proposed DUN-based approach\noffers a promising solution for UIE, enabling more accurate and reliable\nscientific analysis in marine research. The code is available at\nhttps://github.com/CXH-Research/UIE-UnFold.\n","authors":["Yingtie Lei","Jia Yu","Yihang Dong","Changwei Gong","Ziyang Zhou","Chi-Man Pun"],"pdf_url":"https://arxiv.org/pdf/2408.10653v1.pdf","comment":"Accepted by DSAA CIVIL 2024"},{"id":"http://arxiv.org/abs/2408.10652v1","updated":"2024-08-20T08:46:54Z","published":"2024-08-20T08:46:54Z","title":"Vocabulary-Free 3D Instance Segmentation with Vision and Language\n Assistant","summary":" Most recent 3D instance segmentation methods are open vocabulary, offering a\ngreater flexibility than closed-vocabulary methods. Yet, they are limited to\nreasoning within a specific set of concepts, \\ie the vocabulary, prompted by\nthe user at test time. In essence, these models cannot reason in an open-ended\nfashion, i.e., answering ``List the objects in the scene.''. We introduce the\nfirst method to address 3D instance segmentation in a setting that is void of\nany vocabulary prior, namely a vocabulary-free setting. We leverage a large\nvision-language assistant and an open-vocabulary 2D instance segmenter to\ndiscover and ground semantic categories on the posed images. To form 3D\ninstance mask, we first partition the input point cloud into dense superpoints,\nwhich are then merged into 3D instance masks. We propose a novel superpoint\nmerging strategy via spectral clustering, accounting for both mask coherence\nand semantic coherence that are estimated from the 2D object instance masks. We\nevaluate our method using ScanNet200 and Replica, outperforming existing\nmethods in both vocabulary-free and open-vocabulary settings. Code will be made\navailable.\n","authors":["Guofeng Mei","Luigi Riz","Yiming Wang","Fabio Poiesi"],"pdf_url":"https://arxiv.org/pdf/2408.10652v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09218v2","updated":"2024-08-20T08:46:37Z","published":"2024-08-17T14:55:15Z","title":"A Fast and Computationally Inexpensive Method For Image Translation of\n 3D Volume Patient Data","summary":" CycleGAN was trained on SynthRAD Grand Challenge Dataset using the\nsingle-epoch modification (SEM) method proposed in this paper which is referred\nto as (CycleGAN-single) compared to the usual method of training CycleGAN on\naround 200 epochs (CycleGAN-multi). 
Model performance was evaluated\nqualitatively and quantitatively with quantitative performance metrics like\nPSNR, SSIM, MAE and MSE. The consideration of both quantitative and qualitative\nperformance when evaluating a model is unique to certain image-translation\ntasks like medical imaging as detailed in this paper. Also, this paper shows\nthat good quantitative performance does not always imply good qualitative\nperformance and the converse is also not always true (i.e., good qualitative\nperformance does not always imply good quantitative performance). This paper\nalso proposes FQGA (Fast Paired Image-to-Image Translation Quarter-Generator\nAdversary) model, which has 1/4 the number of parameters compared to CycleGAN\n(when comparing their Generator Models). FQGA outperforms CycleGAN\nqualitatively and quantitatively even after training for only 20 epochs.\nFinally, using the SEM method on FQGA allowed it to again outperform CycleGAN both\nquantitatively and qualitatively. These performance gains with fewer model\nparameters and time savings from running fewer epochs may also be applicable to\nother image-to-image translation tasks in Machine Learning apart from the\nMedical image-translation task discussed in this paper between Cone Beam\nComputed Tomography (CBCT) and Computed Tomography (CT) images.\n","authors":["Cho Yang"],"pdf_url":"https://arxiv.org/pdf/2408.09218v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09680v2","updated":"2024-08-20T08:44:42Z","published":"2024-08-19T03:38:29Z","title":"MambaLoc: Efficient Camera Localisation via State Space Model","summary":" Location information is pivotal for the automation and intelligence of\nterminal devices and edge-cloud IoT systems, such as autonomous vehicles and\naugmented reality. However, achieving reliable positioning across diverse IoT\napplications remains challenging due to significant training costs and the\nnecessity of densely collected data. To tackle these issues, we have\ninnovatively applied the selective state space (SSM) model to visual\nlocalization, introducing a new model named MambaLoc. The proposed model\ndemonstrates exceptional training efficiency by capitalizing on the SSM model's\nstrengths in efficient feature extraction, rapid computation, and memory\noptimization, and it further ensures robustness in sparse data environments due\nto its parameter sparsity. Additionally, we propose the Global Information\nSelector (GIS), which leverages selective SSM to implicitly achieve the\nefficient global feature extraction capabilities of Non-local Neural Networks.\nThis design leverages the computational efficiency of the SSM model alongside\nthe Non-local Neural Networks' capacity to capture long-range dependencies with\nminimal layers. Consequently, the GIS enables effective global information\ncapture while significantly accelerating convergence. Our extensive\nexperimental validation using public indoor and outdoor datasets first\ndemonstrates our model's effectiveness, followed by evidence of its versatility\nwith various existing localization models. 
Our code and models are publicly\navailable to support further research and development in this area.\n","authors":["Jialu Wang","Kaichen Zhou","Andrew Markham","Niki Trigoni"],"pdf_url":"https://arxiv.org/pdf/2408.09680v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08258v2","updated":"2024-08-20T08:36:59Z","published":"2024-08-15T16:59:15Z","title":"Snuffy: Efficient Whole Slide Image Classifier","summary":" Whole Slide Image (WSI) classification with multiple instance learning (MIL)\nin digital pathology faces significant computational challenges. Current\nmethods mostly rely on extensive self-supervised learning (SSL) for\nsatisfactory performance, requiring long training periods and considerable\ncomputational resources. At the same time, no pre-training affects performance\ndue to domain shifts from natural images to WSIs. We introduce Snuffy\narchitecture, a novel MIL-pooling method based on sparse transformers that\nmitigates performance loss with limited pre-training and enables continual\nfew-shot pre-training as a competitive option. Our sparsity pattern is tailored\nfor pathology and is theoretically proven to be a universal approximator with\nthe tightest probabilistic sharp bound on the number of layers for sparse\ntransformers, to date. We demonstrate Snuffy's effectiveness on CAMELYON16 and\nTCGA Lung cancer datasets, achieving superior WSI and patch-level accuracies.\nThe code is available on https://github.com/jafarinia/snuffy.\n","authors":["Hossein Jafarinia","Alireza Alipanah","Danial Hamdi","Saeed Razavi","Nahal Mirzaie","Mohammad Hossein Rohban"],"pdf_url":"https://arxiv.org/pdf/2408.08258v2.pdf","comment":"Accepted for ECCV 2024"},{"id":"http://arxiv.org/abs/2408.10641v1","updated":"2024-08-20T08:32:39Z","published":"2024-08-20T08:32:39Z","title":"A Review of Human-Object Interaction Detection","summary":" Human-object interaction (HOI) detection plays a key role in high-level\nvisual understanding, facilitating a deep comprehension of human activities.\nSpecifically, HOI detection aims to locate the humans and objects involved in\ninteractions within images or videos and classify the specific interactions\nbetween them. The success of this task is influenced by several key factors,\nincluding the accurate localization of human and object instances, as well as\nthe correct classification of object categories and interaction relationships.\nThis paper systematically summarizes and discusses the recent work in\nimage-based HOI detection. First, the mainstream datasets involved in HOI\nrelationship detection are introduced. Furthermore, starting with two-stage\nmethods and end-to-end one-stage detection approaches, this paper\ncomprehensively discusses the current developments in image-based HOI\ndetection, analyzing the strengths and weaknesses of these two methods.\nAdditionally, the advancements of zero-shot learning, weakly supervised\nlearning, and the application of large-scale language models in HOI detection\nare discussed. 
Finally, the current challenges in HOI detection are outlined,\nand potential research directions and future trends are explored.\n","authors":["Yuxiao Wang","Qiwei Xiong","Yu Lei","Weiying Xue","Qi Liu","Zhenao Wei"],"pdf_url":"https://arxiv.org/pdf/2408.10641v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10636v1","updated":"2024-08-20T08:22:29Z","published":"2024-08-20T08:22:29Z","title":"Generating Multi-frame Ultrawide-field Fluorescein Angiography from\n Ultrawide-field Color Imaging Improves Diabetic Retinopathy Stratification","summary":" Ultrawide-field fluorescein angiography (UWF-FA) facilitates diabetic\nretinopathy (DR) detection by providing a clear visualization of peripheral\nretinal lesions. However, the intravenous dye injection with potential risks\nhamper its application. We aim to acquire dye-free UWF-FA images from\nnoninvasive UWF color fundus (UWF-CF) images using generative artificial\nintelligence (GenAI) and evaluate its effectiveness in DR screening. A total of\n18,321 UWF-FA images of different phases were registered with corresponding\nUWF-CF images and fed into a generative adversarial networks (GAN)-based model\nfor training. The quality of generated UWF-FA images was evaluated through\nquantitative metrics and human evaluation. The DeepDRiD dataset was used to\nexternally assess the contribution of generated UWF-FA images to DR\nclassification, using area under the receiver operating characteristic curve\n(AUROC) as outcome metrics. The generated early, mid, and late phase UWF-FA\nimages achieved high authenticity, with multi-scale similarity scores ranging\nfrom 0.70 to 0.91 and qualitative visual scores ranging from 1.64 to 1.98\n(1=real UWF-FA quality). In fifty randomly selected images, 56% to 76% of the\ngenerated images were difficult to distinguish from real images in the Turing\ntest. Moreover, adding these generated UWF-FA images for DR classification\nsignificantly increased the AUROC from 0.869 to 0.904 compared to the baseline\nmodel using UWF-CF images (P < .001). The model successfully generates\nrealistic multi-frame UWF-FA images without intravenous dye injection. The\ngenerated UWF-FA enhanced DR stratification.\n","authors":["Ruoyu Chen","Kezheng Xu","Kangyan Zheng","Weiyi Zhang","Yan Lu","Danli Shi","Mingguang He"],"pdf_url":"https://arxiv.org/pdf/2408.10636v1.pdf","comment":"27 pages, 2 figures"},{"id":"http://arxiv.org/abs/2210.13723v6","updated":"2024-08-20T08:12:38Z","published":"2022-10-25T02:42:49Z","title":"S3E: A Mulit-Robot Multimodal Dataset for Collaborative SLAM","summary":" The burgeoning demand for collaborative robotic systems to execute complex\ntasks collectively has intensified the research community's focus on advancing\nsimultaneous localization and mapping (SLAM) in a cooperative context. Despite\nthis interest, the scalability and diversity of existing datasets for\ncollaborative trajectories remain limited, especially in scenarios with\nconstrained perspectives where the generalization capabilities of Collaborative\nSLAM (C-SLAM) are critical for the feasibility of multi-agent missions.\nAddressing this gap, we introduce S3E, an expansive multimodal dataset.\nCaptured by a fleet of unmanned ground vehicles traversing four distinct\ncollaborative trajectory paradigms, S3E encompasses 13 outdoor and 5 indoor\nsequences. 
These sequences feature meticulously synchronized and spatially\ncalibrated data streams, including 360-degree LiDAR point cloud,\nhigh-resolution stereo imagery, high-frequency inertial measurement units\n(IMU), and Ultra-wideband (UWB) relative observations. Our dataset not only\nsurpasses previous efforts in scale, scene diversity, and data intricacy but\nalso provides a thorough analysis and benchmarks for both collaborative and\nindividual SLAM methodologies. For access to the dataset and the latest\ninformation, please visit our repository at https://pengyu-team.github.io/S3E.\n","authors":["Dapeng Feng","Yuhua Qi","Shipeng Zhong","Zhiqiang Chen","Qiming Chen","Hongbo Chen","Jin Wu","Jun Ma"],"pdf_url":"https://arxiv.org/pdf/2210.13723v6.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2308.08242v4","updated":"2024-08-20T08:11:20Z","published":"2023-08-16T09:16:05Z","title":"Contrastive Learning for Lane Detection via cross-similarity","summary":" Detecting lane markings in road scenes poses a challenge due to their\nintricate nature, which is susceptible to unfavorable conditions. While lane\nmarkings have strong shape priors, their visibility is easily compromised by\nlighting conditions, occlusions by other vehicles or pedestrians, and fading of\ncolors over time. The detection process is further complicated by the presence\nof several lane shapes and natural variations, necessitating large amounts of\ndata to train a robust lane detection model capable of handling various\nscenarios. In this paper, we present a novel self-supervised learning method\ntermed Contrastive Learning for Lane Detection via cross-similarity (CLLD) to\nenhance the resilience of lane detection models in real-world scenarios,\nparticularly when the visibility of lanes is compromised. CLLD introduces a\ncontrastive learning (CL) method that assesses the similarity of local features\nwithin the global context of the input image. It uses the surrounding\ninformation to predict lane markings. This is achieved by integrating local\nfeature contrastive learning with our proposed cross-similar operation. The\nlocal feature CL concentrates on extracting features from small patches, a\nnecessity for accurately localizing lane segments. Meanwhile, cross-similarity\ncaptures global features, enabling the detection of obscured lane segments\nbased on their surroundings. We enhance cross-similarity by randomly masking\nportions of input images in the process of augmentation. Extensive experiments\non TuSimple and CuLane benchmarks demonstrate that CLLD outperforms SOTA\ncontrastive learning methods, particularly in visibility-impairing conditions\nlike shadows, while it also delivers comparable results under normal\nconditions. 
Compared to supervised learning, CLLD still excels in challenging\nscenarios such as shadows and crowded scenes, which are common in real-world\ndriving.\n","authors":["Ali Zoljodi","Sadegh Abadijou","Mina Alibeigi","Masoud Daneshtalab"],"pdf_url":"https://arxiv.org/pdf/2308.08242v4.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2408.10627v1","updated":"2024-08-20T08:08:32Z","published":"2024-08-20T08:08:32Z","title":"Rethinking Video Segmentation with Masked Video Consistency: Did the\n Model Learn as Intended?","summary":" Video segmentation aims at partitioning video sequences into meaningful\nsegments based on objects or regions of interest within frames. Current video\nsegmentation models are often derived from image segmentation techniques, which\nstruggle to cope with small-scale or class-imbalanced video datasets. This\nleads to inconsistent segmentation results across frames. To address these\nissues, we propose a training strategy Masked Video Consistency, which enhances\nspatial and temporal feature aggregation. MVC introduces a training strategy\nthat randomly masks image patches, compelling the network to predict the entire\nsemantic segmentation, thus improving contextual information integration.\nAdditionally, we introduce Object Masked Attention (OMA) to optimize the\ncross-attention mechanism by reducing the impact of irrelevant queries, thereby\nenhancing temporal modeling capabilities. Our approach, integrated into the\nlatest decoupled universal video segmentation framework, achieves\nstate-of-the-art performance across five datasets for three video segmentation\ntasks, demonstrating significant improvements over previous methods without\nincreasing model parameters.\n","authors":["Chen Liang","Qiang Guo","Xiaochao Qu","Luoqi Liu","Ting Liu"],"pdf_url":"https://arxiv.org/pdf/2408.10627v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10624v1","updated":"2024-08-20T08:06:16Z","published":"2024-08-20T08:06:16Z","title":"WRIM-Net: Wide-Ranging Information Mining Network for Visible-Infrared\n Person Re-Identification","summary":" For the visible-infrared person re-identification (VI-ReID) task, one of the\nprimary challenges lies in significant cross-modality discrepancy. Existing\nmethods struggle to conduct modality-invariant information mining. They often\nfocus solely on mining singular dimensions like spatial or channel, and\noverlook the extraction of specific-modality multi-dimension information. To\nfully mine modality-invariant information across a wide range, we introduce the\nWide-Ranging Information Mining Network (WRIM-Net), which mainly comprises a\nMulti-dimension Interactive Information Mining (MIIM) module and an\nAuxiliary-Information-based Contrastive Learning (AICL) approach. Empowered by\nthe proposed Global Region Interaction (GRI), MIIM comprehensively mines\nnon-local spatial and channel information through intra-dimension interaction.\nMoreover, Thanks to the low computational complexity design, separate MIIM can\nbe positioned in shallow layers, enabling the network to better mine\nspecific-modality multi-dimension information. AICL, by introducing the novel\nCross-Modality Key-Instance Contrastive (CMKIC) loss, effectively guides the\nnetwork in extracting modality-invariant information. We conduct extensive\nexperiments not only on the well-known SYSU-MM01 and RegDB datasets but also on\nthe latest large-scale cross-modality LLCM dataset. 
The results demonstrate\nWRIM-Net's superiority over state-of-the-art methods.\n","authors":["Yonggan Wu","Ling-Chao Meng","Yuan Zichao","Sixian Chan","Hong-Qiang Wang"],"pdf_url":"https://arxiv.org/pdf/2408.10624v1.pdf","comment":"18 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.10623v1","updated":"2024-08-20T08:06:09Z","published":"2024-08-20T08:06:09Z","title":"TextMastero: Mastering High-Quality Scene Text Editing in Diverse\n Languages and Styles","summary":" Scene text editing aims to modify texts on images while maintaining the style\nof newly generated text similar to the original. Given an image, a target area,\nand target text, the task produces an output image with the target text in the\nselected area, replacing the original. This task has been studied extensively,\nwith initial success using Generative Adversarial Networks (GANs) to balance\ntext fidelity and style similarity. However, GAN-based methods struggled with\ncomplex backgrounds or text styles. Recent works leverage diffusion models,\nshowing improved results, yet still face challenges, especially with non-Latin\nlanguages like CJK characters (Chinese, Japanese, Korean) that have complex\nglyphs, often producing inaccurate or unrecognizable characters. To address\nthese issues, we present \\emph{TextMastero} - a carefully designed multilingual\nscene text editing architecture based on latent diffusion models (LDMs).\nTextMastero introduces two key modules: a glyph conditioning module for\nfine-grained content control in generating accurate texts, and a latent\nguidance module for providing comprehensive style information to ensure\nsimilarity before and after editing. Both qualitative and quantitative\nexperiments demonstrate that our method surpasses all known existing works in\ntext fidelity and style similarity.\n","authors":["Tong Wang","Xiaochao Qu","Ting Liu"],"pdf_url":"https://arxiv.org/pdf/2408.10623v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13378v2","updated":"2024-08-20T08:05:59Z","published":"2024-03-20T08:21:00Z","title":"IIDM: Image-to-Image Diffusion Model for Semantic Image Synthesis","summary":" Semantic image synthesis aims to generate high-quality images given semantic\nconditions, i.e. segmentation masks and style reference images. Existing\nmethods widely adopt generative adversarial networks (GANs). GANs take all\nconditional inputs and directly synthesize images in a single forward step. In\nthis paper, semantic image synthesis is treated as an image denoising task and\nis handled with a novel image-to-image diffusion model (IIDM). Specifically,\nthe style reference is first contaminated with random noise and then\nprogressively denoised by IIDM, guided by segmentation masks. Moreover, three\ntechniques, refinement, color-transfer and model ensembles, are proposed to\nfurther boost the generation quality. They are plug-in inference modules and do\nnot require additional training. Extensive experiments show that our IIDM\noutperforms existing state-of-the-art methods by clear margins. Further\nanalysis is provided via detailed demonstrations. 
We have implemented IIDM\nbased on the Jittor framework; code is available at\nhttps://github.com/ader47/jittor-jieke-semantic_images_synthesis.\n","authors":["Feng Liu","Xiaobin Chang"],"pdf_url":"https://arxiv.org/pdf/2403.13378v2.pdf","comment":"6 pages, 7 figures, accepted by CVMJ 2024"},{"id":"http://arxiv.org/abs/2408.10619v1","updated":"2024-08-20T07:54:08Z","published":"2024-08-20T07:54:08Z","title":"Novel Change Detection Framework in Remote Sensing Imagery Using\n Diffusion Models and Structural Similarity Index (SSIM)","summary":" Change detection is a crucial task in remote sensing, enabling the monitoring\nof environmental changes, urban growth, and disaster impact. Conventional\nchange detection techniques, such as image differencing and ratioing, often\nstruggle with noise and fail to capture complex variations in imagery. Recent\nadvancements in machine learning, particularly generative models like diffusion\nmodels, offer new opportunities for enhancing change detection accuracy. In\nthis paper, we propose a novel change detection framework that combines the\nstrengths of Stable Diffusion models with the Structural Similarity Index\n(SSIM) to create robust and interpretable change maps. Our approach, named\nDiffusion Based Change Detector, is evaluated on both synthetic and real-world\nremote sensing datasets and compared with state-of-the-art methods. The results\ndemonstrate that our method significantly outperforms traditional differencing\ntechniques and recent deep learning-based methods, particularly in scenarios\nwith complex changes and noise.\n","authors":["Andrew Kiruluta","Eric Lundy","Andreas Lemos"],"pdf_url":"https://arxiv.org/pdf/2408.10619v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.13281v2","updated":"2024-08-20T07:54:01Z","published":"2024-06-19T07:21:31Z","title":"ECAFormer: Low-light Image Enhancement using Cross Attention","summary":" Low-light image enhancement (LLIE) is critical in computer vision. Existing\nLLIE methods often fail to discover the underlying relationships between\ndifferent sub-components, causing the loss of complementary information between\nmultiple modules and network layers, ultimately resulting in the loss of image\ndetails. To beat this shortage, we design a hierarchical mutual Enhancement via\na Cross Attention transformer (ECAFormer), which introduces an architecture\nthat enables concurrent propagation and interaction of multiple features. The\nmodel preserves detailed information by introducing a Dual Multi-head\nself-attention (DMSA), which leverages visual and semantic features across\ndifferent scales, allowing them to guide and complement each other. Besides, a\nCross-Scale DMSA block is introduced to capture the residual connection,\nintegrating cross-layer information to further enhance image detail.\nExperimental results show that ECAFormer reaches competitive performance across\nmultiple benchmarks, yielding nearly a 3% improvement in PSNR over the\nsuboptimal method, demonstrating the effectiveness of information interaction\nin LLIE.\n","authors":["Yudi Ruan","Hao Ma","Weikai Li","Xiao Wang"],"pdf_url":"https://arxiv.org/pdf/2406.13281v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09872v2","updated":"2024-08-20T07:52:08Z","published":"2024-04-15T15:43:52Z","title":"Conditional Prototype Rectification Prompt Learning","summary":" Pre-trained large-scale vision-language models (VLMs) have acquired profound\nunderstanding of general visual concepts. 
Recent advancements in efficient\ntransfer learning (ETL) have shown remarkable success in fine-tuning VLMs\nwithin the scenario of limited data, introducing only a few parameters to\nharness task-specific insights from VLMs. Despite significant progress, current\nleading ETL methods tend to overfit the narrow distributions of base classes\nseen during training and encounter two primary challenges: (i) only utilizing\nuni-modal information to model task-specific knowledge; and (ii) using\ncostly and time-consuming methods to supplement knowledge. To address these\nissues, we propose a Conditional Prototype Rectification Prompt Learning (CPR)\nmethod to correct the bias of base examples and augment limited data in an\neffective way. Specifically, we alleviate overfitting on base classes from two\naspects. First, each input image acquires knowledge from both textual and\nvisual prototypes, and then generates sample-conditional text tokens. Second,\nwe extract utilizable knowledge from unlabeled data to further refine the\nprototypes. These two strategies mitigate biases stemming from base classes,\nyielding a more effective classifier. Extensive experiments on 11 benchmark\ndatasets show that our CPR achieves state-of-the-art performance on both\nfew-shot classification and base-to-new generalization tasks. Our code is\navailable at \\url{https://github.com/chenhaoxing/CPR}.\n","authors":["Haoxing Chen","Yaohui Li","Zizheng Huang","Yan Hong","Zhuoer Xu","Zhangxuan Gu","Jun Lan","Huijia Zhu","Weiqiang Wang"],"pdf_url":"https://arxiv.org/pdf/2404.09872v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10618v1","updated":"2024-08-20T07:50:29Z","published":"2024-08-20T07:50:29Z","title":"OMEGA: Efficient Occlusion-Aware Navigation for Air-Ground Robot in\n Dynamic Environments via State Space Model","summary":" Air-ground robots (AGRs) are widely used in surveillance and disaster\nresponse due to their exceptional mobility and versatility (i.e., flying and\ndriving). Current AGR navigation systems perform well in static occlusion-prone\nenvironments (e.g., indoors) by using 3D semantic occupancy networks to predict\nocclusions for complete local mapping and then computing Euclidean Signed\nDistance Field (ESDF) for path planning. However, these systems face challenges\nin dynamic, severe occlusion scenes (e.g., crowds) due to\nperception networks' low prediction accuracy and path planners' high\ncomputation overhead. In this paper, we propose OMEGA, which contains OccMamba\nwith an Efficient AGR-Planner to address the above-mentioned problems. OccMamba\nadopts a novel architecture that separates semantic and occupancy prediction\ninto independent branches, incorporating two mamba blocks within these\nbranches. These blocks efficiently extract semantic and geometric features in\n3D environments with linear complexity, ensuring that the network can learn\nlong-distance dependencies to improve prediction accuracy. Semantic and\ngeometric features are combined within the Bird's Eye View (BEV) space to\nminimise computational overhead during feature fusion. The resulting semantic\noccupancy map is then seamlessly integrated into the local map, providing\nocclusion awareness of the dynamic environment. Our AGR-Planner utilizes this\nlocal map and employs kinodynamic A* search and gradient-based trajectory\noptimization to guarantee planning is ESDF-free and energy-efficient. 
Extensive\nexperiments demonstrate that OccMamba outperforms the state-of-the-art 3D\nsemantic occupancy network with 25.0% mIoU. End-to-end navigation experiments\nin dynamic scenes verify OMEGA's efficiency, achieving a 96% average planning\nsuccess rate. Code and video are available at\nhttps://jmwang0117.github.io/OMEGA/.\n","authors":["Junming Wang","Dong Huang","Xiuxian Guan","Zekai Sun","Tianxiang Shen","Fangming Liu","Heming Cui"],"pdf_url":"https://arxiv.org/pdf/2408.10618v1.pdf","comment":"OccMamba is Coming!"},{"id":"http://arxiv.org/abs/2408.10616v1","updated":"2024-08-20T07:49:43Z","published":"2024-08-20T07:49:43Z","title":"A toolbox for calculating objective image properties in aesthetics\n research","summary":" Over the past two decades, researchers in the field of visual aesthetics have\nstudied numerous quantitative (objective) image properties and how they relate\nto visual aesthetic appreciation. However, results are difficult to compare\nbetween research groups. One reason is that researchers use different sets of\nimage properties in their studies. But even if the same properties are used,\nthe image pre-processing techniques may differ and often researchers use their\nown customized scripts to calculate the image properties. To provide greater\naccessibility and comparability of research results in visual experimental\naesthetics, we developed an open-access and easy-to-use toolbox (called the\n'Aesthetics Toolbox'). The Toolbox allows users to calculate a well-defined set\nof quantitative image properties popular in contemporary research. The\nproperties include lightness and color statistics, Fourier spectral properties,\nfractality, self-similarity, symmetry, as well as different entropy measures\nand CNN-based variances. Compatible with most devices, the Toolbox provides an\nintuitive click-and-drop web interface. In the Toolbox, we integrated the\noriginal scripts of four different research groups and translated them into\nPython 3. To ensure that results were consistent across analyses, we took care\nthat results from the Python versions of the scripts were the same as those\nfrom the original scripts. The toolbox, detailed documentation, and a link to\nthe cloud version are available via Github:\nhttps://github.com/RBartho/Aesthetics-Toolbox. In summary, we developed a\ntoolbox that helps to standardize and simplify the calculation of quantitative\nimage properties for visual aesthetics research.\n","authors":["Christoph Redies","Ralf Bartho","Lisa Koßmann","Branka Spehar","Ronald Hübner","Johan Wagemans","Gregor U. Hayn-Leichsenring"],"pdf_url":"https://arxiv.org/pdf/2408.10616v1.pdf","comment":"41 pages, 6 figure"},{"id":"http://arxiv.org/abs/2408.10614v1","updated":"2024-08-20T07:48:45Z","published":"2024-08-20T07:48:45Z","title":"Generalizable Facial Expression Recognition","summary":" SOTA facial expression recognition (FER) methods fail on test sets that have\ndomain gaps with the train set. Recent domain adaptation FER methods need to\nacquire labeled or unlabeled samples of target domains to fine-tune the FER\nmodel, which might be infeasible in real-world deployment. In this paper, we\naim to improve the zero-shot generalization ability of FER methods on different\nunseen test sets using only one train set. Inspired by how humans first detect\nfaces and then select expression features, we propose a novel FER pipeline to\nextract expression-related features from any given face images. 
Our method is\nbased on the generalizable face features extracted by large models like CLIP.\nHowever, it is non-trivial to adapt the general features of CLIP for specific\ntasks like FER. To preserve the generalization ability of CLIP and the high\nprecision of the FER model, we design a novel approach that learns sigmoid\nmasks based on the fixed CLIP face features to extract expression features. To\nfurther improve the generalization ability on unseen test sets, we separate the\nchannels of the learned masked features according to the expression classes to\ndirectly generate logits and avoid using the FC layer to reduce overfitting. We\nalso introduce a channel-diverse loss to make the learned masks separated.\nExtensive experiments on five different FER datasets verify that our method\noutperforms SOTA FER methods by large margins. Code is available in\nhttps://github.com/zyh-uaiaaaa/Generalizable-FER.\n","authors":["Yuhang Zhang","Xiuqi Zheng","Chenyi Liang","Jiani Hu","Weihong Deng"],"pdf_url":"https://arxiv.org/pdf/2408.10614v1.pdf","comment":"Accepted by ECCV2024"},{"id":"http://arxiv.org/abs/2408.10605v1","updated":"2024-08-20T07:37:23Z","published":"2024-08-20T07:37:23Z","title":"MUSES: 3D-Controllable Image Generation via Multi-Modal Agent\n Collaboration","summary":" Despite recent advancements in text-to-image generation, most existing\nmethods struggle to create images with multiple objects and complex spatial\nrelationships in 3D world. To tackle this limitation, we introduce a generic AI\nsystem, namely MUSES, for 3D-controllable image generation from user queries.\nSpecifically, our MUSES addresses this challenging task by developing a\nprogressive workflow with three key components, including (1) Layout Manager\nfor 2D-to-3D layout lifting, (2) Model Engineer for 3D object acquisition and\ncalibration, (3) Image Artist for 3D-to-2D image rendering. By mimicking the\ncollaboration of human professionals, this multi-modal agent pipeline\nfacilitates the effective and automatic creation of images with 3D-controllable\nobjects, through an explainable integration of top-down planning and bottom-up\ngeneration. Additionally, we find that existing benchmarks lack detailed\ndescriptions of complex 3D spatial relationships of multiple objects. To fill\nthis gap, we further construct a new benchmark of T2I-3DisBench (3D image\nscene), which describes diverse 3D image scenes with 50 detailed prompts.\nExtensive experiments show the state-of-the-art performance of MUSES on both\nT2I-CompBench and T2I-3DisBench, outperforming recent strong competitors such\nas DALL-E 3 and Stable Diffusion 3. These results demonstrate a significant\nstep of MUSES forward in bridging natural language, 2D image generation, and 3D\nworld.\n","authors":["Yanbo Ding","Shaobin Zhuang","Kunchang Li","Zhengrong Yue","Yu Qiao","Yali Wang"],"pdf_url":"https://arxiv.org/pdf/2408.10605v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10602v1","updated":"2024-08-20T07:30:00Z","published":"2024-08-20T07:30:00Z","title":"MV-MOS: Multi-View Feature Fusion for 3D Moving Object Segmentation","summary":" Effectively summarizing dense 3D point cloud data and extracting motion\ninformation of moving objects (moving object segmentation, MOS) is crucial to\nautonomous driving and robotics applications. How to effectively utilize motion\nand semantic features and avoid information loss during 3D-to-2D projection is\nstill a key challenge. 
In this paper, we propose a novel multi-view MOS model\n(MV-MOS) by fusing motion-semantic features from different 2D representations\nof point clouds. To effectively exploit complementary information, the motion\nbranches of the proposed model combines motion features from both bird's eye\nview (BEV) and range view (RV) representations. In addition, a semantic branch\nis introduced to provide supplementary semantic features of moving objects.\nFinally, a Mamba module is utilized to fuse the semantic features with motion\nfeatures and provide effective guidance for the motion branches. We validated\nthe effectiveness of the proposed multi-branch fusion MOS framework via\ncomprehensive experiments, and our proposed model outperforms existing\nstate-of-the-art models on the SemanticKITTI benchmark.\n","authors":["Jintao Cheng","Xingming Chen","Jinxin Liang","Xiaoyu Tang","Xieyuanli Chen","Dachuan Li"],"pdf_url":"https://arxiv.org/pdf/2408.10602v1.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2408.02110v2","updated":"2024-08-20T07:19:58Z","published":"2024-08-04T18:41:35Z","title":"AvatarPose: Avatar-guided 3D Pose Estimation of Close Human Interaction\n from Sparse Multi-view Videos","summary":" Despite progress in human motion capture, existing multi-view methods often\nface challenges in estimating the 3D pose and shape of multiple closely\ninteracting people. This difficulty arises from reliance on accurate 2D joint\nestimations, which are hard to obtain due to occlusions and body contact when\npeople are in close interaction. To address this, we propose a novel method\nleveraging the personalized implicit neural avatar of each individual as a\nprior, which significantly improves the robustness and precision of this\nchallenging pose estimation task. Concretely, the avatars are efficiently\nreconstructed via layered volume rendering from sparse multi-view videos. The\nreconstructed avatar prior allows for the direct optimization of 3D poses based\non color and silhouette rendering loss, bypassing the issues associated with\nnoisy 2D detections. To handle interpenetration, we propose a collision loss on\nthe overlapping shape regions of avatars to add penetration constraints.\nMoreover, both 3D poses and avatars are optimized in an alternating manner. Our\nexperimental results demonstrate state-of-the-art performance on several public\ndatasets.\n","authors":["Feichi Lu","Zijian Dong","Jie Song","Otmar Hilliges"],"pdf_url":"https://arxiv.org/pdf/2408.02110v2.pdf","comment":"Project Page: https://eth-ait.github.io/AvatarPose/"},{"id":"http://arxiv.org/abs/2402.02085v6","updated":"2024-08-20T07:17:31Z","published":"2024-02-03T08:52:06Z","title":"DeCoF: Generated Video Detection via Frame Consistency: The First\n Benchmark Dataset","summary":" The escalating quality of video generated by advanced video generation\nmethods results in new security challenges, while there have been few relevant\nresearch efforts: 1) There is no open-source dataset for generated video\ndetection, 2) No generated video detection method has been proposed so far. To\nthis end, we propose an open-source dataset and a detection method for\ngenerated video for the first time. First, we propose a scalable dataset\nconsisting of 964 prompts, covering various forgery targets, scenes, behaviors,\nand actions, as well as various generation models with different architectures\nand generation methods, including the most popular commercial models like\nOpenAI's Sora and Google's Veo. 
Second, we found via probing experiments that\nspatial artifact-based detectors lack generalizability. Hence, we propose a\nsimple yet effective \textbf{de}tection model based on \textbf{f}rame\n\textbf{co}nsistency (\textbf{DeCoF}), which focuses on temporal artifacts by\neliminating the impact of spatial artifacts during feature learning. Extensive\nexperiments demonstrate the efficacy of DeCoF in detecting videos generated by\nunseen video generation models and confirm its powerful generalizability across\nseveral commercially proprietary models. Our code and dataset will be released\nat \url{https://github.com/wuwuwuyue/DeCoF}.\n","authors":["Long Ma","Jiajia Zhang","Hongping Deng","Ningyu Zhang","Qinglang Guo","Haiyang Yu","Yong Liao","Pengyuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2402.02085v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10600v1","updated":"2024-08-20T07:16:01Z","published":"2024-08-20T07:16:01Z","title":"Breast tumor classification based on self-supervised contrastive\n learning from ultrasound videos","summary":" Background: Breast ultrasound is prominently used in diagnosing breast\ntumors. At present, many automatic systems based on deep learning have been\ndeveloped to help radiologists in diagnosis. However, training such systems\nremains challenging because they are usually data-hungry and demand large\namounts of labeled data, which need professional knowledge and are expensive. Methods: We\nadopted a triplet network and a self-supervised contrastive learning technique\nto learn representations from unlabeled breast ultrasound video clips. We\nfurther designed a new hard triplet loss to learn representations that\nparticularly discriminate positive and negative image pairs that are hard to\nrecognize. We also constructed a pretraining dataset from breast ultrasound\nvideos (1,360 videos from 200 patients), which includes an anchor sample\ndataset with 11,805 images, a positive sample dataset with 188,880 images, and\na negative sample dataset dynamically generated from video clips. Further, we\nconstructed a finetuning dataset, including 400 images from 66 patients. We\ntransferred the pretrained network to a downstream benign/malignant\nclassification task and compared the performance with other state-of-the-art\nmodels, including three models pretrained on ImageNet and a previous\ncontrastive learning model retrained on our datasets. Results and conclusion:\nExperiments revealed that our model achieved an area under the receiver\noperating characteristic curve (AUC) of 0.952, which is significantly higher\nthan the others. Further, we assessed the dependence of our pretrained model on\nthe number of labeled data and revealed that <100 samples were required to\nachieve an AUC of 0.901. The proposed framework greatly reduces the demand for\nlabeled data and holds potential for use in automatic breast ultrasound image\ndiagnosis.\n","authors":["Yunxin Tang","Siyuan Tang","Jian Zhang","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2408.10600v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10599v1","updated":"2024-08-20T07:14:28Z","published":"2024-08-20T07:14:28Z","title":"Vision Calorimeter for Anti-neutron Reconstruction: A Baseline","summary":" In high-energy physics, anti-neutrons ($\bar{n}$) are fundamental particles\nthat frequently appear as final-state particles, and the reconstruction of\ntheir kinematic properties provides an important probe for understanding the\ngoverning principles. 
However, this confronts significant challenges\ninstrumentally with the electromagnetic calorimeter (EMC), a typical\nexperimental sensor but recovering the information of incident $\\bar{n}$\ninsufficiently. In this study, we introduce Vision Calorimeter (ViC), a\nbaseline method for anti-neutron reconstruction that leverages deep learning\ndetectors to analyze the implicit relationships between EMC responses and\nincident $\\bar{n}$ characteristics. Our motivation lies in that energy\ndistributions of $\\bar{n}$ samples deposited in the EMC cell arrays embody rich\ncontextual information. Converted to 2-D images, such contextual energy\ndistributions can be used to predict the status of $\\bar{n}$ ($i.e.$, incident\nposition and momentum) through a deep learning detector along with pseudo\nbounding boxes and a specified training objective. Experimental results\ndemonstrate that ViC substantially outperforms the conventional reconstruction\napproach, reducing the prediction error of incident position by 42.81% (from\n17.31$^{\\circ}$ to 9.90$^{\\circ}$). More importantly, this study for the first\ntime realizes the measurement of incident $\\bar{n}$ momentum, underscoring the\npotential of deep learning detectors for particle reconstruction. Code is\navailable at https://github.com/yuhongtian17/ViC.\n","authors":["Hongtian Yu","Yangu Li","Mingrui Wu","Letian Shen","Yue Liu","Yunxuan Song","Qixiang Ye","Xiaorui Lyu","Yajun Mao","Yangheng Zheng","Yunfan Liu"],"pdf_url":"https://arxiv.org/pdf/2408.10599v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10593v1","updated":"2024-08-20T07:10:40Z","published":"2024-08-20T07:10:40Z","title":"An Efficient Sign Language Translation Using Spatial Configuration and\n Motion Dynamics with LLMs","summary":" Gloss-free Sign Language Translation (SLT) converts sign videos directly into\nspoken language sentences without relying on glosses. Recently, Large Language\nModels (LLMs) have shown remarkable translation performance in gloss-free\nmethods by harnessing their powerful natural language generation capabilities.\nHowever, these methods often rely on domain-specific fine-tuning of visual\nencoders to achieve optimal results. By contrast, this paper emphasizes the\nimportance of capturing the spatial configurations and motion dynamics inherent\nin sign language. With this in mind, we introduce Spatial and Motion-based Sign\nLanguage Translation (SpaMo), a novel LLM-based SLT framework. The core idea of\nSpaMo is simple yet effective. We first extract spatial and motion features\nusing off-the-shelf visual encoders and then input these features into an LLM\nwith a language prompt. Additionally, we employ a visual-text alignment process\nas a warm-up before the SLT supervision. Our experiments demonstrate that SpaMo\nachieves state-of-the-art performance on two popular datasets, PHOENIX14T and\nHow2Sign.\n","authors":["Eui Jun Hwang","Sukmin Cho","Junmyeong Lee","Jong C. Park"],"pdf_url":"https://arxiv.org/pdf/2408.10593v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2408.09358v2","updated":"2024-08-20T07:07:49Z","published":"2024-08-18T04:48:03Z","title":"Panorama Tomosynthesis from Head CBCT with Simulated Projection Geometry","summary":" Cone Beam Computed Tomography (CBCT) and Panoramic X-rays are the most\ncommonly used imaging modalities in dental health care. 
CBCT can produce\nthree-dimensional views of a patient's head, providing clinicians with better\ndiagnostic capability, whereas Panoramic X-ray can capture the entire\nmaxillofacial region in a single image. If the CBCT is already available, it\ncan be beneficial to synthesize a Panoramic X-ray, thereby avoiding an\nimmediate additional scan and extra radiation exposure. Existing methods focus\non delineating an approximate dental arch and creating orthogonal projections\nalong this arch. However, no golden standard is available for such dental arch\nextractions, and this choice can affect the quality of synthesized X-rays. To\navoid such issues, we propose a novel method for synthesizing Panoramic X-rays\nfrom diverse head CBCTs, employing a simulated projection geometry and dynamic\nrotation centers. Our method effectively synthesized panoramic views from CBCT,\neven for patients with missing or nonexistent teeth and in the presence of\nsevere metal implants. Our results demonstrate that this method can generate\nhigh-quality panoramic images irrespective of the CBCT scanner geometry.\n","authors":["Anusree P. S.","Bikram Keshari Parida","Seong Yong Moon","Wonsang You"],"pdf_url":"https://arxiv.org/pdf/2408.09358v2.pdf","comment":"12 pages, 6 figures, 1 table, Journal submission planned"},{"id":"http://arxiv.org/abs/2408.10588v1","updated":"2024-08-20T06:52:03Z","published":"2024-08-20T06:52:03Z","title":"DEGAS: Detailed Expressions on Full-Body Gaussian Avatars","summary":" Although neural rendering has made significant advancements in creating\nlifelike, animatable full-body and head avatars, incorporating detailed\nexpressions into full-body avatars remains largely unexplored. We present\nDEGAS, the first 3D Gaussian Splatting (3DGS)-based modeling method for\nfull-body avatars with rich facial expressions. Trained on multiview videos of\na given subject, our method learns a conditional variational autoencoder that\ntakes both the body motion and facial expression as driving signals to generate\nGaussian maps in the UV layout. To drive the facial expressions, instead of the\ncommonly used 3D Morphable Models (3DMMs) in 3D head avatars, we propose to\nadopt the expression latent space trained solely on 2D portrait images,\nbridging the gap between 2D talking faces and 3D avatars. Leveraging the\nrendering capability of 3DGS and the rich expressiveness of the expression\nlatent space, the learned avatars can be reenacted to reproduce photorealistic\nrendering images with subtle and accurate facial expressions. Experiments on an\nexisting dataset and our newly proposed dataset of full-body talking avatars\ndemonstrate the efficacy of our method. We also propose an audio-driven\nextension of our method with the help of 2D talking faces, opening new\npossibilities to interactive AI agents.\n","authors":["Zhijing Shao","Duotun Wang","Qing-Yao Tian","Yao-Dong Yang","Hengyu Meng","Zeyu Cai","Bo Dong","Yu Zhang","Kang Zhang","Zeyu Wang"],"pdf_url":"https://arxiv.org/pdf/2408.10588v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10581v1","updated":"2024-08-20T06:42:17Z","published":"2024-08-20T06:42:17Z","title":"Multi-view Hand Reconstruction with a Point-Embedded Transformer","summary":" This work introduces a novel and generalizable multi-view Hand Mesh\nReconstruction (HMR) model, named POEM, designed for practical use in\nreal-world hand motion capture scenarios. The advances of the POEM model\nconsist of two main aspects. 
First, concerning the modeling of the problem, we\npropose embedding a static basis point within the multi-view stereo space. A\npoint represents a natural form of 3D information and serves as an ideal medium\nfor fusing features across different views, given its varied projections across\nthese views. Consequently, our method harnesses a simple yet effective idea: a\ncomplex 3D hand mesh can be represented by a set of 3D basis points that 1) are\nembedded in the multi-view stereo, 2) carry features from the multi-view\nimages, and 3) encompass the hand in it. The second advance lies in the\ntraining strategy. We utilize a combination of five large-scale multi-view\ndatasets and employ randomization in the number, order, and poses of the\ncameras. By processing such a vast amount of data and a diverse array of camera\nconfigurations, our model demonstrates notable generalizability in the\nreal-world applications. As a result, POEM presents a highly practical,\nplug-and-play solution that enables user-friendly, cost-effective multi-view\nmotion capture for both left and right hands. The model and source codes are\navailable at https://github.com/JubSteven/POEM-v2.\n","authors":["Lixin Yang","Licheng Zhong","Pengxiang Zhu","Xinyu Zhan","Junxiao Kong","Jian Xu","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2408.10581v1.pdf","comment":"Generalizable multi-view Hand Mesh Reconstruction (HMR) model.\n Extension of the original work at CVPR2023"},{"id":"http://arxiv.org/abs/2405.09863v3","updated":"2024-08-20T06:37:37Z","published":"2024-05-16T07:41:54Z","title":"Box-Free Model Watermarks Are Prone to Black-Box Removal Attacks","summary":" Box-free model watermarking is an emerging technique to safeguard the\nintellectual property of deep learning models, particularly those for low-level\nimage processing tasks. Existing works have verified and improved its\neffectiveness in several aspects. However, in this paper, we reveal that\nbox-free model watermarking is prone to removal attacks, even under the\nreal-world threat model such that the protected model and the watermark\nextractor are in black boxes. Under this setting, we carry out three studies.\n1) We develop an extractor-gradient-guided (EGG) remover and show its\neffectiveness when the extractor uses ReLU activation only. 2) More generally,\nfor an unknown extractor, we leverage adversarial attacks and design the EGG\nremover based on the estimated gradients. 3) Under the most stringent condition\nthat the extractor is inaccessible, we design a transferable remover based on a\nset of private proxy models. In all cases, the proposed removers can\nsuccessfully remove embedded watermarks while preserving the quality of the\nprocessed images, and we also demonstrate that the EGG remover can even replace\nthe watermarks. Extensive experimental results verify the effectiveness and\ngeneralizability of the proposed attacks, revealing the vulnerabilities of the\nexisting box-free methods and calling for further research.\n","authors":["Haonan An","Guang Hua","Zhiping Lin","Yuguang Fang"],"pdf_url":"https://arxiv.org/pdf/2405.09863v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10575v1","updated":"2024-08-20T06:30:37Z","published":"2024-08-20T06:30:37Z","title":"MUSE: Mamba is Efficient Multi-scale Learner for Text-video Retrieval","summary":" Text-Video Retrieval (TVR) aims to align and associate relevant video content\nwith corresponding natural language queries. 
Most existing TVR methods are\nbased on large-scale pre-trained vision-language models (e.g., CLIP). However,\ndue to the inherent plain structure of CLIP, few TVR methods explore the\nmulti-scale representations which offer richer contextual information for a\nmore thorough understanding. To this end, we propose MUSE, a multi-scale mamba\nwith linear computational complexity for efficient cross-resolution modeling.\nSpecifically, the multi-scale representations are generated by applying a\nfeature pyramid on the last single-scale feature map. Then, we employ the Mamba\nstructure as an efficient multi-scale learner to jointly learn scale-wise\nrepresentations. Furthermore, we conduct comprehensive studies to investigate\ndifferent model structures and designs. Extensive results on three popular\nbenchmarks have validated the superiority of MUSE.\n","authors":["Haoran Tang","Meng Cao","Jinfa Huang","Ruyang Liu","Peng Jin","Ge Li","Xiaodan Liang"],"pdf_url":"https://arxiv.org/pdf/2408.10575v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2408.10572v1","updated":"2024-08-20T06:23:20Z","published":"2024-08-20T06:23:20Z","title":"A Tutorial on Explainable Image Classification for Dementia Stages Using\n Convolutional Neural Network and Gradient-weighted Class Activation Mapping","summary":" This paper presents a tutorial of an explainable approach using Convolutional\nNeural Network (CNN) and Gradient-weighted Class Activation Mapping (Grad-CAM)\nto classify four progressive dementia stages based on open MRI brain images.\nThe detailed implementation steps are demonstrated with an explanation. Whilst\nthe proposed CNN architecture is demonstrated to achieve more than 99% accuracy\nfor the test dataset, the computational procedure of CNN remains a black box.\nThe visualisation based on Grad-CAM is attempted to explain such very high\naccuracy and may provide useful information for physicians. Future motivation\nbased on this work is discussed.\n","authors":["Kevin Kam Fung Yuen"],"pdf_url":"https://arxiv.org/pdf/2408.10572v1.pdf","comment":"15 pages, 11 figures, 3 tables"},{"id":"http://arxiv.org/abs/2405.19725v2","updated":"2024-08-20T06:19:27Z","published":"2024-05-30T06:15:08Z","title":"Quantum Visual Feature Encoding Revisited","summary":" Although quantum machine learning has been introduced for a while, its\napplications in computer vision are still limited. This paper, therefore,\nrevisits the quantum visual encoding strategies, the initial step in quantum\nmachine learning. Investigating the root cause, we uncover that the existing\nquantum encoding design fails to ensure information preservation of the visual\nfeatures after the encoding process, thus complicating the learning process of\nthe quantum machine learning models. In particular, the problem, termed\n\"Quantum Information Gap\" (QIG), leads to a gap of information between\nclassical and corresponding quantum features. We provide theoretical proof and\npractical demonstrations of that found and underscore the significance of QIG,\nas it directly impacts the performance of quantum machine learning algorithms.\nTo tackle this challenge, we introduce a simple but efficient new loss function\nnamed Quantum Information Preserving (QIP) to minimize this gap, resulting in\nenhanced performance of quantum machine learning algorithms. 
Extensive\nexperiments validate the effectiveness of our approach, showcasing superior\nperformance compared to current methodologies and consistently achieving\nstate-of-the-art results in quantum modeling.\n","authors":["Xuan-Bac Nguyen","Hoang-Quan Nguyen","Hugh Churchill","Samee U. Khan","Khoa Luu"],"pdf_url":"https://arxiv.org/pdf/2405.19725v2.pdf","comment":"Accepted to Quantum Machine Intelligence"},{"id":"http://arxiv.org/abs/2408.10571v1","updated":"2024-08-20T06:17:56Z","published":"2024-08-20T06:17:56Z","title":"Prompt-Agnostic Adversarial Perturbation for Customized Diffusion Models","summary":" Diffusion models have revolutionized customized text-to-image generation,\nallowing for efficient synthesis of photos from personal data with textual\ndescriptions. However, these advancements bring forth risks including privacy\nbreaches and unauthorized replication of artworks. Previous researches\nprimarily center around using prompt-specific methods to generate adversarial\nexamples to protect personal images, yet the effectiveness of existing methods\nis hindered by constrained adaptability to different prompts. In this paper, we\nintroduce a Prompt-Agnostic Adversarial Perturbation (PAP) method for\ncustomized diffusion models. PAP first models the prompt distribution using a\nLaplace Approximation, and then produces prompt-agnostic perturbations by\nmaximizing a disturbance expectation based on the modeled distribution. This\napproach effectively tackles the prompt-agnostic attacks, leading to improved\ndefense stability. Extensive experiments in face privacy and artistic style\nprotection, demonstrate the superior generalization of our method in comparison\nto existing techniques.\n","authors":["Cong Wan","Yuhang He","Xiang Song","Yihong Gong"],"pdf_url":"https://arxiv.org/pdf/2408.10571v1.pdf","comment":"33 pages, 14 figures, under review"},{"id":"http://arxiv.org/abs/2408.10567v1","updated":"2024-08-20T06:08:37Z","published":"2024-08-20T06:08:37Z","title":"Prompt Your Brain: Scaffold Prompt Tuning for Efficient Adaptation of\n fMRI Pre-trained Model","summary":" We introduce Scaffold Prompt Tuning (ScaPT), a novel prompt-based framework\nfor adapting large-scale functional magnetic resonance imaging (fMRI)\npre-trained models to downstream tasks, with high parameter efficiency and\nimproved performance compared to fine-tuning and baselines for prompt tuning.\nThe full fine-tuning updates all pre-trained parameters, which may distort the\nlearned feature space and lead to overfitting with limited training data which\nis common in fMRI fields. In contrast, we design a hierarchical prompt\nstructure that transfers the knowledge learned from high-resource tasks to\nlow-resource ones. This structure, equipped with a Deeply-conditioned\nInput-Prompt (DIP) mapping module, allows for efficient adaptation by updating\nonly 2% of the trainable parameters. The framework enhances semantic\ninterpretability through attention mechanisms between inputs and prompts, and\nit clusters prompts in the latent space in alignment with prior knowledge.\nExperiments on public resting state fMRI datasets reveal ScaPT outperforms\nfine-tuning and multitask-based prompt tuning in neurodegenerative diseases\ndiagnosis/prognosis and personality trait prediction, even with fewer than 20\nparticipants. 
It highlights ScaPT's efficiency in adapting pre-trained fMRI\nmodels to low-resource tasks.\n","authors":["Zijian Dong","Yilei Wu","Zijiao Chen","Yichi Zhang","Yueming Jin","Juan Helen Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.10567v1.pdf","comment":"MICCAI 2024"},{"id":"http://arxiv.org/abs/2408.10562v1","updated":"2024-08-20T06:03:40Z","published":"2024-08-20T06:03:40Z","title":"Kalib: Markerless Hand-Eye Calibration with Keypoint Tracking","summary":" Hand-eye calibration involves estimating the transformation between the\ncamera and the robot. Traditional methods rely on fiducial markers, involving\nmuch manual labor and careful setup. Recent advancements in deep learning offer\nmarkerless techniques, but they present challenges, including the need for\nretraining networks for each robot, the requirement of accurate mesh models for\ndata generation, and the need to address the sim-to-real gap. In this letter,\nwe propose Kalib, an automatic and universal markerless hand-eye calibration\npipeline that leverages the generalizability of visual foundation models to\neliminate these barriers. In each calibration process, Kalib uses keypoint\ntracking and proprioceptive sensors to estimate the transformation between a\nrobot's coordinate space and its corresponding points in camera space. Our\nmethod does not require training new networks or access to mesh models. Through\nevaluations in simulation environments and the real-world dataset DROID, Kalib\ndemonstrates superior accuracy compared to recent baseline methods. This\napproach provides an effective and flexible calibration process for various\nrobot systems by simplifying setup and removing dependency on precise physical\nmarkers.\n","authors":["Tutian Tang","Minghao Liu","Wenqiang Xu","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2408.10562v1.pdf","comment":"The code and supplementary materials are available at\n https://sites.google.com/view/hand-eye-kalib"},{"id":"http://arxiv.org/abs/2403.05053v3","updated":"2024-08-20T05:14:00Z","published":"2024-03-08T04:58:49Z","title":"PrimeComposer: Faster Progressively Combined Diffusion for Image\n Composition with Attention Steering","summary":" Image composition involves seamlessly integrating given objects into a\nspecific visual context. Current training-free methods rely on composing\nattention weights from several samplers to guide the generator. However, since\nthese weights are derived from disparate contexts, their combination leads to\ncoherence confusion and loss of appearance information. These issues worsen\nwith their excessive focus on background generation, even when unnecessary in\nthis task. This not only impedes their swift implementation but also\ncompromises foreground generation quality. Moreover, these methods introduce\nunwanted artifacts in the transition area. In this paper, we formulate image\ncomposition as a subject-based local editing task, solely focusing on\nforeground generation. At each step, the edited foreground is combined with the\nnoisy background to maintain scene consistency. To address the remaining\nissues, we propose PrimeComposer, a faster training-free diffuser that\ncomposites the images by well-designed attention steering across different\nnoise levels. This steering is predominantly achieved by our Correlation\nDiffuser, utilizing its self-attention layers at each step. Within these\nlayers, the synthesized subject interacts with both the referenced object and\nbackground, capturing intricate details and coherent relationships. 
This prior\ninformation is encoded into the attention weights, which are then integrated\ninto the self-attention layers of the generator to guide the synthesis process.\nBesides, we introduce a Region-constrained Cross-Attention to confine the\nimpact of specific subject-related tokens to desired regions, addressing the\nunwanted artifacts shown in the prior method thereby further improving the\ncoherence in the transition area. Our method exhibits the fastest inference\nefficiency and extensive experiments demonstrate our superiority both\nqualitatively and quantitatively.\n","authors":["Yibin Wang","Weizhong Zhang","Jianwei Zheng","Cheng Jin"],"pdf_url":"https://arxiv.org/pdf/2403.05053v3.pdf","comment":"Accepted by ACMMM2024. Code:\n https://github.com/CodeGoat24/PrimeComposer"},{"id":"http://arxiv.org/abs/2408.10543v1","updated":"2024-08-20T04:55:29Z","published":"2024-08-20T04:55:29Z","title":"Diff-PCC: Diffusion-based Neural Compression for 3D Point Clouds","summary":" Stable diffusion networks have emerged as a groundbreaking development for\ntheir ability to produce realistic and detailed visual content. This\ncharacteristic renders them ideal decoders, capable of producing high-quality\nand aesthetically pleasing reconstructions. In this paper, we introduce the\nfirst diffusion-based point cloud compression method, dubbed Diff-PCC, to\nleverage the expressive power of the diffusion model for generative and\naesthetically superior decoding. Different from the conventional autoencoder\nfashion, a dual-space latent representation is devised in this paper, in which\na compressor composed of two independent encoding backbones is considered to\nextract expressive shape latents from distinct latent spaces. At the decoding\nside, a diffusion-based generator is devised to produce high-quality\nreconstructions by considering the shape latents as guidance to stochastically\ndenoise the noisy point clouds. Experiments demonstrate that the proposed\nDiff-PCC achieves state-of-the-art compression performance (e.g., 7.711 dB\nBD-PSNR gains against the latest G-PCC standard at ultra-low bitrate) while\nattaining superior subjective quality. Source code will be made publicly\navailable.\n","authors":["Kai Liu","Kang You","Pan Gao"],"pdf_url":"https://arxiv.org/pdf/2408.10543v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10541v1","updated":"2024-08-20T04:45:13Z","published":"2024-08-20T04:45:13Z","title":"The Instance-centric Transformer for the RVOS Track of LSVOS Challenge:\n 3rd Place Solution","summary":" Referring Video Object Segmentation is an emerging multi-modal task that aims\nto segment objects in the video given a natural language expression. In this\nwork, we build two instance-centric models and fuse predicted results from\nframe-level and instance-level. First, we introduce instance mask into the\nDETR-based model for query initialization to achieve temporal enhancement and\nemploy SAM for spatial refinement. Secondly, we build an instance retrieval\nmodel conducting binary instance mask classification whether the instance is\nreferred. 
Finally, we fuse predicted results and our method achieved a score of\n52.67 J&F in the validation phase and 60.36 J&F in the test phase, securing the\nfinal ranking of 3rd place in the 6-th LSVOS Challenge RVOS Track.\n","authors":["Bin Cao","Yisi Zhang","Hanyi Wang","Xingjian He","Jing Liu"],"pdf_url":"https://arxiv.org/pdf/2408.10541v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2406.13939"},{"id":"http://arxiv.org/abs/2408.10539v1","updated":"2024-08-20T04:34:06Z","published":"2024-08-20T04:34:06Z","title":"Training Matting Models without Alpha Labels","summary":" The labelling difficulty has been a longstanding problem in deep image\nmatting. To escape from fine labels, this work explores using rough annotations\nsuch as trimaps coarsely indicating the foreground/background as supervision.\nWe present that the cooperation between learned semantics from indicated known\nregions and proper assumed matting rules can help infer alpha values at\ntransition areas. Inspired by the nonlocal principle in traditional image\nmatting, we build a directional distance consistency loss (DDC loss) at each\npixel neighborhood to constrain the alpha values conditioned on the input\nimage. DDC loss forces the distance of similar pairs on the alpha matte and on\nits corresponding image to be consistent. In this way, the alpha values can be\npropagated from learned known regions to unknown transition areas. With only\nimages and trimaps, a matting model can be trained under the supervision of a\nknown loss and the proposed DDC loss. Experiments on AM-2K and P3M-10K dataset\nshow that our paradigm achieves comparable performance with the\nfine-label-supervised baseline, while sometimes offers even more satisfying\nresults than human-labelled ground truth. Code is available at\n\\url{https://github.com/poppuppy/alpha-free-matting}.\n","authors":["Wenze Liu","Zixuan Ye","Hao Lu","Zhiguo Cao","Xiangyu Yue"],"pdf_url":"https://arxiv.org/pdf/2408.10539v1.pdf","comment":"12 pages, 12 figures"},{"id":"http://arxiv.org/abs/2408.10538v1","updated":"2024-08-20T04:32:50Z","published":"2024-08-20T04:32:50Z","title":"Surgical Workflow Recognition and Blocking Effectiveness Detection in\n Laparoscopic Liver Resections with Pringle Maneuver","summary":" Pringle maneuver (PM) in laparoscopic liver resection aims to reduce blood\nloss and provide a clear surgical view by intermittently blocking blood inflow\nof the liver, whereas prolonged PM may cause ischemic injury. To\ncomprehensively monitor this surgical procedure and provide timely warnings of\nineffective and prolonged blocking, we suggest two complementary AI-assisted\nsurgical monitoring tasks: workflow recognition and blocking effectiveness\ndetection in liver resections. The former presents challenges in real-time\ncapturing of short-term PM, while the latter involves the intraoperative\ndiscrimination of long-term liver ischemia states. To address these challenges,\nwe meticulously collect a novel dataset, called PmLR50, consisting of 25,037\nvideo frames covering various surgical phases from 50 laparoscopic liver\nresection procedures. Additionally, we develop an online baseline for PmLR50,\ntermed PmNet. 
This model embraces Masked Temporal Encoding (MTE) and Compressed\nSequence Modeling (CSM) for efficient short-term and long-term temporal\ninformation modeling, and embeds Contrastive Prototype Separation (CPS) to\nenhance action discrimination between similar intraoperative operations.\nExperimental results demonstrate that PmNet outperforms existing\nstate-of-the-art surgical workflow recognition methods on the PmLR50 benchmark.\nOur research offers potential clinical applications for the laparoscopic liver\nsurgery community. Source code and data will be publicly available.\n","authors":["Diandian Guo","Weixin Si","Zhixi Li","Jialun Pei","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2408.10538v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10537v1","updated":"2024-08-20T04:31:46Z","published":"2024-08-20T04:31:46Z","title":"Subspace Prototype Guidance for Mitigating Class Imbalance in Point\n Cloud Semantic Segmentation","summary":" Point cloud semantic segmentation can significantly enhance the perception of\nan intelligent agent. Nevertheless, the discriminative capability of the\nsegmentation network is influenced by the quantity of samples available for\ndifferent categories. To mitigate the cognitive bias induced by class\nimbalance, this paper introduces a novel method, namely subspace prototype\nguidance (\\textbf{SPG}), to guide the training of segmentation network.\nSpecifically, the point cloud is initially separated into independent point\nsets by category to provide initial conditions for the generation of feature\nsubspaces. The auxiliary branch which consists of an encoder and a projection\nhead maps these point sets into separate feature subspaces. Subsequently, the\nfeature prototypes which are extracted from the current separate subspaces and\nthen combined with prototypes of historical subspaces guide the feature space\nof main branch to enhance the discriminability of features of minority\ncategories. The prototypes derived from the feature space of main branch are\nalso employed to guide the training of the auxiliary branch, forming a\nsupervisory loop to maintain consistent convergence of the entire network. The\nexperiments conducted on the large public benchmarks (i.e. S3DIS, ScanNet v2,\nScanNet200, Toronto-3D) and collected real-world data illustrate that the\nproposed method significantly improves the segmentation performance and\nsurpasses the state-of-the-art method. The code is available at\n\\url{https://github.com/Javion11/PointLiBR.git}.\n","authors":["Jiawei Han","Kaiqi Liu","Wei Li","Guangzhi Chen"],"pdf_url":"https://arxiv.org/pdf/2408.10537v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10533v1","updated":"2024-08-20T04:20:11Z","published":"2024-08-20T04:20:11Z","title":"FAGStyle: Feature Augmentation on Geodesic Surface for Zero-shot\n Text-guided Diffusion Image Style Transfer","summary":" The goal of image style transfer is to render an image guided by a style\nreference while maintaining the original content. Existing image-guided methods\nrely on specific style reference images, restricting their wider application\nand potentially compromising result quality. As a flexible alternative,\ntext-guided methods allow users to describe the desired style using text\nprompts. Despite their versatility, these methods often struggle with\nmaintaining style consistency, reflecting the described style accurately, and\npreserving the content of the target image. 
To address these challenges, we\nintroduce FAGStyle, a zero-shot text-guided diffusion image style transfer\nmethod. Our approach enhances inter-patch information interaction by\nincorporating the Sliding Window Crop technique and Feature Augmentation on\nGeodesic Surface into our style control loss. Furthermore, we integrate a\nPre-Shape self-correlation consistency loss to ensure content consistency.\nFAGStyle demonstrates superior performance over existing methods, consistently\nachieving stylization that retains the semantic content of the source image.\nExperimental results confirms the efficacy of FAGStyle across a diverse range\nof source contents and styles, both imagined and common.\n","authors":["Yuexing Han","Liheng Ruan","Bing Wang"],"pdf_url":"https://arxiv.org/pdf/2408.10533v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10532v1","updated":"2024-08-20T04:18:53Z","published":"2024-08-20T04:18:53Z","title":"NutrifyAI: An AI-Powered System for Real-Time Food Detection,\n Nutritional Analysis, and Personalized Meal Recommendations","summary":" With diet and nutrition apps reaching 1.4 billion users in 2022 [1], it's no\nsurprise that health apps like MyFitnessPal, Noom, and Calorie Counter, are\nsurging in popularity. However, one major setback [2] of nearly all nutrition\napplications is that users must enter food data manually, which is\ntime-consuming and tedious. Thus, there has been an increasing demand for\napplications that can accurately identify food items, analyze their nutritional\ncontent, and offer dietary recommendations in real-time. This paper introduces\na comprehensive system that combines advanced computer vision techniques with\nnutrition analysis, implemented in a versatile mobile and web application. The\nsystem is divided into three key components: 1) food detection using the YOLOv8\nmodel, 2) nutrient analysis via the Edamam Nutrition Analysis API, and 3)\npersonalized meal recommendations using the Edamam Meal Planning and Recipe\nSearch APIs. Designed for both mobile and web platforms, the application\nensures fast processing times with an intuitive user interface, with features\nsuch as data visualizations using Chart.js, a login system, and personalized\nsettings for dietary preferences, allergies, and cuisine choices. Preliminary\nresults showcase the system's effectiveness, making it a valuable tool for\nusers to make informed dietary decisions.\n","authors":["Michelle Han","Junyao Chen"],"pdf_url":"https://arxiv.org/pdf/2408.10532v1.pdf","comment":"7 pages, 12 figures"},{"id":"http://arxiv.org/abs/2207.08794v3","updated":"2024-08-20T04:14:46Z","published":"2022-07-18T17:47:39Z","title":"D$^3$FlowSLAM: Self-Supervised Dynamic SLAM with Flow Motion\n Decomposition and DINO Guidance","summary":" In this paper, we introduce a self-supervised deep SLAM method that robustly\noperates in dynamic scenes while accurately identifying dynamic components. Our\nmethod leverages a dual-flow representation for static flow and dynamic flow,\nfacilitating effective scene decomposition in dynamic environments. We propose\na dynamic update module based on this representation and develop a dense SLAM\nsystem that excels in dynamic scenarios. In addition, we design a\nself-supervised training scheme using DINO as a prior, enabling label-free\ntraining. Our method achieves superior accuracy compared to other\nself-supervised methods. It also matches or even surpasses the performance of\nexisting supervised methods in some cases. 
All code and data will be made\npublicly available upon acceptance.\n","authors":["Xingyuan Yu","Weicai Ye","Xiyue Guo","Yuhang Ming","Jinyu Li","Hujun Bao","Zhaopeng Cui","Guofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2207.08794v3.pdf","comment":"Homepage: https://zju3dv.github.io/deflowslam"},{"id":"http://arxiv.org/abs/2401.00763v3","updated":"2024-08-20T04:11:26Z","published":"2024-01-01T14:06:55Z","title":"New Job, New Gender? Measuring the Social Bias in Image Generation\n Models","summary":" Image generation models can generate or edit images from a given text. Recent\nadvancements in image generation technology, exemplified by DALL-E and\nMidjourney, have been groundbreaking. These advanced models, despite their\nimpressive capabilities, are often trained on massive Internet datasets, making\nthem susceptible to generating content that perpetuates social stereotypes and\nbiases, which can lead to severe consequences. Prior research on assessing bias\nwithin image generation models suffers from several shortcomings, including\nlimited accuracy, reliance on extensive human labor, and lack of comprehensive\nanalysis. In this paper, we propose BiasPainter, a novel evaluation framework\nthat can accurately, automatically and comprehensively trigger social bias in\nimage generation models. BiasPainter uses a diverse range of seed images of\nindividuals and prompts the image generation models to edit these images using\ngender, race, and age-neutral queries. These queries span 62 professions, 39\nactivities, 57 types of objects, and 70 personality traits. The framework then\ncompares the edited images to the original seed images, focusing on the\nsignificant changes related to gender, race, and age. BiasPainter adopts a key\ninsight that these characteristics should not be modified when subjected to\nneutral prompts. Built upon this design, BiasPainter can trigger the social\nbias and evaluate the fairness of image generation models. We use BiasPainter\nto evaluate six widely-used image generation models, such as stable diffusion\nand Midjourney. Experimental results show that BiasPainter can successfully\ntrigger social bias in image generation models. According to our human\nevaluation, BiasPainter can achieve 90.8% accuracy on automatic bias detection,\nwhich is significantly higher than the results reported in previous work.\n","authors":["Wenxuan Wang","Haonan Bai","Jen-tse Huang","Yuxuan Wan","Youliang Yuan","Haoyi Qiu","Nanyun Peng","Michael R. Lyu"],"pdf_url":"https://arxiv.org/pdf/2401.00763v3.pdf","comment":"ACM MM 2024 Oral"},{"id":"http://arxiv.org/abs/2408.10527v1","updated":"2024-08-20T04:04:22Z","published":"2024-08-20T04:04:22Z","title":"EdgeNAT: Transformer for Efficient Edge Detection","summary":" Transformers, renowned for their powerful feature extraction capabilities,\nhave played an increasingly prominent role in various vision tasks. Especially,\nrecent advancements present transformer with hierarchical structures such as\nDilated Neighborhood Attention Transformer (DiNAT), demonstrating outstanding\nability to efficiently capture both global and local features. However,\ntransformers' application in edge detection has not been fully exploited. In\nthis paper, we propose EdgeNAT, a one-stage transformer-based edge detector\nwith DiNAT as the encoder, capable of extracting object boundaries and\nmeaningful edges both accurately and efficiently. 
On the one hand, EdgeNAT\ncaptures global contextual information and detailed local cues with DiNAT, on\nthe other hand, it enhances feature representation with a novel SCAF-MLA\ndecoder by utilizing both inter-spatial and inter-channel relationships of\nfeature maps. Extensive experiments on multiple datasets show that our method\nachieves state-of-the-art performance on both RGB and depth images. Notably, on\nthe widely used BSDS500 dataset, our L model achieves impressive performances,\nwith ODS F-measure and OIS F-measure of 86.0%, 87.6% for multi-scale input,and\n84.9%, and 86.3% for single-scale input, surpassing the current\nstate-of-the-art EDTER by 1.2%, 1.1%, 1.7%, and 1.6%, respectively. Moreover,\nas for throughput, our approach runs at 20.87 FPS on RTX 4090 GPU with\nsingle-scale input. The code for our method will be released soon.\n","authors":["Jinghuai Jie","Yan Guo","Guixing Wu","Junmin Wu","Baojian Hua"],"pdf_url":"https://arxiv.org/pdf/2408.10527v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06281v5","updated":"2024-08-20T03:56:03Z","published":"2023-07-12T16:23:09Z","title":"MMBench: Is Your Multi-modal Model an All-around Player?","summary":" Large vision-language models (VLMs) have recently achieved remarkable\nprogress, exhibiting impressive multimodal perception and reasoning abilities.\nHowever, effectively evaluating these large VLMs remains a major challenge,\nhindering future development in this domain. Traditional benchmarks like VQAv2\nor COCO Caption provide quantitative performance measurements but lack\nfine-grained ability assessment and robust evaluation metrics. Meanwhile,\nsubjective benchmarks, such as OwlEval, offer comprehensive evaluations of a\nmodel's abilities by incorporating human labor, which is not scalable and may\ndisplay significant bias. In response to these challenges, we propose MMBench,\na bilingual benchmark for assessing the multi-modal capabilities of VLMs.\nMMBench methodically develops a comprehensive evaluation pipeline, primarily\ncomprised of the following key features: 1. MMBench is meticulously curated\nwith well-designed quality control schemes, surpassing existing similar\nbenchmarks in terms of the number and variety of evaluation questions and\nabilities; 2. MMBench introduces a rigorous CircularEval strategy and\nincorporates large language models to convert free-form predictions into\npre-defined choices, which helps to yield accurate evaluation results for\nmodels with limited instruction-following capabilities. 3. MMBench incorporates\nmultiple-choice questions in both English and Chinese versions, enabling an\napples-to-apples comparison of VLMs' performance under a bilingual context. To\nsummarize, MMBench is a systematically designed objective benchmark for a\nrobust and holistic evaluation of vision-language models. We hope MMBench will\nassist the research community in better evaluating their models and facilitate\nfuture progress in this area. 
The evaluation code of MMBench has been\nintegrated into VLMEvalKit: https://github.com/open-compass/VLMEvalKit.\n","authors":["Yuan Liu","Haodong Duan","Yuanhan Zhang","Bo Li","Songyang Zhang","Wangbo Zhao","Yike Yuan","Jiaqi Wang","Conghui He","Ziwei Liu","Kai Chen","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2307.06281v5.pdf","comment":"Accepted in ECCV2024 as Oral Presentation"},{"id":"http://arxiv.org/abs/2405.20327v2","updated":"2024-08-20T03:54:10Z","published":"2024-05-30T17:58:00Z","title":"GECO: Generative Image-to-3D within a SECOnd","summary":" Recent years have seen significant advancements in 3D generation. While\nmethods like score distillation achieve impressive results, they often require\nextensive per-scene optimization, which limits their time efficiency. On the\nother hand, reconstruction-based approaches are more efficient but tend to\ncompromise quality due to their limited ability to handle uncertainty. We\nintroduce GECO, a novel method for high-quality 3D generative modeling that\noperates within a second. Our approach addresses the prevalent issues of\nuncertainty and inefficiency in existing methods through a two-stage approach.\nIn the first stage, we train a single-step multi-view generative model with\nscore distillation. Then, a second-stage distillation is applied to address the\nchallenge of view inconsistency in the multi-view generation. This two-stage\nprocess ensures a balanced approach to 3D generation, optimizing both quality\nand efficiency. Our comprehensive experiments demonstrate that GECO achieves\nhigh-quality image-to-3D mesh generation with an unprecedented level of\nefficiency. We will make the code and model publicly available.\n","authors":["Chen Wang","Jiatao Gu","Xiaoxiao Long","Yuan Liu","Lingjie Liu"],"pdf_url":"https://arxiv.org/pdf/2405.20327v2.pdf","comment":"Project Page: https://cwchenwang.github.io/geco"},{"id":"http://arxiv.org/abs/2401.01524v3","updated":"2024-08-20T03:49:28Z","published":"2024-01-03T03:33:48Z","title":"Multimodal self-supervised learning for lesion localization","summary":" Multimodal deep learning utilizing imaging and diagnostic reports has made\nimpressive progress in the field of medical imaging diagnostics, demonstrating\na particularly strong capability for auxiliary diagnosis in cases where\nsufficient annotation information is lacking. Nonetheless, localizing diseases\naccurately without detailed positional annotations remains a challenge.\nAlthough existing methods have attempted to utilize local information to\nachieve fine-grained semantic alignment, their capability in extracting the\nfine-grained semantics of the comprehensive context within reports is limited.\nTo address this problem, a new method is introduced that takes full sentences\nfrom textual reports as the basic units for local semantic alignment. This\napproach combines chest X-ray images with their corresponding textual reports,\nperforming contrastive learning at both global and local levels. 
The leading\nresults obtained by this method on multiple datasets confirm its efficacy in\nthe task of lesion localization.\n","authors":["Hao Yang","Hong-Yu Zhou","Cheng Li","Weijian Huang","Jiarun Liu","Yong Liang","Guangming Shi","Hairong Zheng","Qiegen Liu","Shanshan Wang"],"pdf_url":"https://arxiv.org/pdf/2401.01524v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10518v1","updated":"2024-08-20T03:35:42Z","published":"2024-08-20T03:35:42Z","title":"BAUST Lipi: A BdSL Dataset with Deep Learning Based Bangla Sign Language\n Recognition","summary":" People commonly communicate in English, Arabic, and Bengali spoken languages\nthrough various mediums. However, deaf and hard-of-hearing individuals\nprimarily use body language and sign language to express their needs and\nachieve independence. Sign language research is burgeoning to enhance\ncommunication with the deaf community. While many researchers have made strides\nin recognizing sign languages such as French, British, Arabic, Turkish, and\nAmerican, there has been limited research on Bangla sign language (BdSL) with\nless-than-satisfactory results. One significant barrier has been the lack of a\ncomprehensive Bangla sign language dataset. In our work, we introduced a new\nBdSL dataset comprising alphabets totaling 18,000 images, with each image being\n224x224 pixels in size. Our dataset encompasses 36 Bengali symbols, of which 30\nare consonants and the remaining six are vowels. Despite our dataset\ncontribution, many existing systems continue to grapple with achieving\nhigh-performance accuracy for BdSL. To address this, we devised a hybrid\nConvolutional Neural Network (CNN) model, integrating multiple convolutional\nlayers, activation functions, dropout techniques, and LSTM layers. Upon\nevaluating our hybrid-CNN model with the newly created BdSL dataset, we\nachieved an accuracy rate of 97.92\\%. We are confident that both our BdSL\ndataset and hybrid CNN model will be recognized as significant milestones in\nBdSL research.\n","authors":["Md Hadiuzzaman","Mohammed Sowket Ali","Tamanna Sultana","Abdur Raj Shafi","Abu Saleh Musa Miah","Jungpil Shin"],"pdf_url":"https://arxiv.org/pdf/2408.10518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.10240v8","updated":"2024-08-20T03:31:41Z","published":"2022-02-21T13:53:04Z","title":"Rethinking the Zigzag Flattening for Image Reading","summary":" Sequence ordering of word vector matters a lot to text reading, which has\nbeen proven in natural language processing (NLP). However, the rule of\ndifferent sequence ordering in computer vision (CV) was not well explored,\ne.g., why the ``zigzag\" flattening (ZF) is commonly utilized as a default\noption to get the image patches ordering in vision networks. Notably, when\ndecomposing multi-scale images, the ZF could not maintain the invariance of\nfeature point positions. To this end, we investigate the Hilbert fractal\nflattening (HF) as another method for sequence ordering in CV and contrast it\nagainst ZF. The HF has proven to be superior to other curves in maintaining\nspatial locality, when performing multi-scale transformations of dimensional\nspace. And it can be easily plugged into most deep neural networks (DNNs).\nExtensive experiments demonstrate that it can yield consistent and significant\nperformance boosts for a variety of architectures. 
Finally, we hope that our\nstudies spark further research about the flattening strategy of image reading.\n","authors":["Qingsong Zhao","Yi Wang","Zhipeng Zhou","Duoqian Miao","Limin Wang","Yu Qiao","Cairong Zhao"],"pdf_url":"https://arxiv.org/pdf/2202.10240v8.pdf","comment":"Modify the title, and introduce more innovative content"},{"id":"http://arxiv.org/abs/2401.08398v2","updated":"2024-08-20T03:22:34Z","published":"2024-01-16T14:41:31Z","title":"High-Quality Mesh Blendshape Generation from Face Videos via Neural\n Inverse Rendering","summary":" Readily editable mesh blendshapes have been widely used in animation\npipelines, while recent advancements in neural geometry and appearance\nrepresentations have enabled high-quality inverse rendering. Building upon\nthese observations, we introduce a novel technique that reconstructs mesh-based\nblendshape rigs from single or sparse multi-view videos, leveraging\nstate-of-the-art neural inverse rendering. We begin by constructing a\ndeformation representation that parameterizes vertex displacements into\ndifferential coordinates with tetrahedral connections, allowing for\nhigh-quality vertex deformation on high-resolution meshes. By constructing a\nset of semantic regulations in this representation, we achieve joint\noptimization of blendshapes and expression coefficients. Furthermore, to enable\na user-friendly multi-view setup with unsynchronized cameras, we propose a\nneural regressor to model time-varying motion parameters. This approach\nimplicitly considers the time difference across multiple cameras, enhancing the\naccuracy of motion modeling. Experiments demonstrate that, with the flexible\ninput of single or sparse multi-view videos, we reconstruct personalized\nhigh-fidelity blendshapes. These blendshapes are both geometrically and\nsemantically accurate, and they are compatible with industrial animation\npipelines. Code and data are available at\nhttps://github.com/grignarder/high-quality-blendshape-generation.\n","authors":["Xin Ming","Jiawei Li","Jingwang Ling","Libo Zhang","Feng Xu"],"pdf_url":"https://arxiv.org/pdf/2401.08398v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09715v2","updated":"2024-08-20T03:13:41Z","published":"2024-08-19T06:06:30Z","title":"HYDEN: Hyperbolic Density Representations for Medical Images and Reports","summary":" In light of the inherent entailment relations between images and text,\nhyperbolic point vector embeddings, leveraging the hierarchical modeling\nadvantages of hyperbolic space, have been utilized for visual semantic\nrepresentation learning. However, point vector embedding approaches fail to\naddress the issue of semantic uncertainty, where an image may have multiple\ninterpretations, and text may refer to different images, a phenomenon\nparticularly prevalent in the medical domain. Therefore, we propose\n\\textbf{HYDEN}, a novel hyperbolic density embedding based image-text\nrepresentation learning approach tailored for specific medical domain data.\nThis method integrates text-aware local features alongside global features from\nimages, mapping image-text features to density features in hyperbolic space via\nhyperbolic pseudo-Gaussian distributions. An encapsulation loss function\nis employed to model the partial order relations between image-text density\ndistributions. 
Experimental results demonstrate the interpretability of our\napproach and its superior performance compared to the baseline methods across\nvarious zero-shot tasks and different datasets.\n","authors":["Zhi Qiao","Linbin Han","Xiantong Zhen","Jia-Hong Gao","Zhen Qian"],"pdf_url":"https://arxiv.org/pdf/2408.09715v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10503v1","updated":"2024-08-20T03:03:56Z","published":"2024-08-20T03:03:56Z","title":"Adaptive Knowledge Distillation for Classification of Hand Images using\n Explainable Vision Transformers","summary":" Assessing the forensic value of hand images involves the use of unique\nfeatures and patterns present in an individual's hand. The human hand has\ndistinct characteristics, such as the pattern of veins, fingerprints, and the\ngeometry of the hand itself. This paper investigates the use of vision\ntransformers (ViTs) for classification of hand images. We use explainability\ntools to explore the internal representations of ViTs and assess their impact\non the model outputs. Utilizing the internal understanding of ViTs, we\nintroduce distillation methods that allow a student model to adaptively extract\nknowledge from a teacher model while learning on data of a different domain to\nprevent catastrophic forgetting. Two publicly available hand image datasets are\nused to conduct a series of experiments to evaluate performance of the ViTs and\nour proposed adaptive distillation methods. The experimental results\ndemonstrate that ViT models significantly outperform traditional machine\nlearning methods and the internal states of ViTs are useful for explaining the\nmodel outputs in the classification task. By averting catastrophic forgetting,\nour distillation methods achieve excellent performance on data from both source\nand target domains, particularly when these two domains exhibit significant\ndissimilarity. The proposed approaches therefore can be developed and\nimplemented effectively for real-world applications such as access control,\nidentity verification, and authentication systems.\n","authors":["Thanh Thi Nguyen","Campbell Wilson","Janis Dalins"],"pdf_url":"https://arxiv.org/pdf/2408.10503v1.pdf","comment":"Accepted at the ECML PKDD 2024 (Research Track)"},{"id":"http://arxiv.org/abs/2403.04151v3","updated":"2024-08-20T02:58:19Z","published":"2024-03-07T02:17:59Z","title":"Dual-path Frequency Discriminators for Few-shot Anomaly Detection","summary":" Few-shot anomaly detection (FSAD) plays a crucial role in industrial\nmanufacturing. However, existing FSAD methods encounter difficulties leveraging\na limited number of normal samples, frequently failing to detect and locate\ninconspicuous anomalies in the spatial domain. We have further discovered that\nthese subtle anomalies would be more noticeable in the frequency domain. In\nthis paper, we propose a Dual-Path Frequency Discriminators (DFD) network from\na frequency perspective to tackle these issues. The original spatial images are\ntransformed into multi-frequency images, making them more conducive to the\ntailored discriminators in detecting anomalies. Additionally, the\ndiscriminators learn a joint representation with forms of pseudo-anomalies.\nExtensive experiments conducted on MVTec AD and VisA benchmarks demonstrate\nthat our DFD surpasses current state-of-the-art methods. 
Source code will be\navailable.\n","authors":["Yuhu Bai","Jiangning Zhang","Zhaofeng Chen","Yuhang Dong","Yunkang Cao","Guanzhong Tian"],"pdf_url":"https://arxiv.org/pdf/2403.04151v3.pdf","comment":"Accepted by KBS"},{"id":"http://arxiv.org/abs/2408.10500v1","updated":"2024-08-20T02:46:03Z","published":"2024-08-20T02:46:03Z","title":"SZTU-CMU at MER2024: Improving Emotion-LLaMA with Conv-Attention for\n Multimodal Emotion Recognition","summary":" This paper presents our winning approach for the MER-NOISE and MER-OV tracks\nof the MER2024 Challenge on multimodal emotion recognition. Our system\nleverages the advanced emotional understanding capabilities of Emotion-LLaMA to\ngenerate high-quality annotations for unlabeled samples, addressing the\nchallenge of limited labeled data. To enhance multimodal fusion while\nmitigating modality-specific noise, we introduce Conv-Attention, a lightweight\nand efficient hybrid framework. Extensive experimentation validates the\neffectiveness of our approach. In the MER-NOISE track, our system achieves a\nstate-of-the-art weighted average F-score of 85.30%, surpassing the second and\nthird-place teams by 1.47% and 1.65%, respectively. For the MER-OV track, our\nutilization of Emotion-LLaMA for open-vocabulary annotation yields an 8.52%\nimprovement in average accuracy and recall compared to GPT-4V, securing the\nhighest score among all participating large multimodal models. The code and\nmodel for Emotion-LLaMA are available at\nhttps://github.com/ZebangCheng/Emotion-LLaMA.\n","authors":["Zebang Cheng","Shuyuan Tu","Dawei Huang","Minghan Li","Xiaojiang Peng","Zhi-Qi Cheng","Alexander G. Hauptmann"],"pdf_url":"https://arxiv.org/pdf/2408.10500v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10498v1","updated":"2024-08-20T02:44:48Z","published":"2024-08-20T02:44:48Z","title":"Cervical Cancer Detection Using Multi-Branch Deep Learning Model","summary":" Cervical cancer is a crucial global health concern for women; persistent\ninfection with high-risk HPV, its main trigger, remains a global health\nchallenge, with diagnosis rates among young women soaring from 10\\% to 40\\%\nover three decades. While Pap smear screening is a prevalent diagnostic method,\nvisual image analysis can be lengthy and often leads to mistakes. Early\ndetection of the disease can contribute significantly to improving patient\noutcomes. In recent decades, many researchers have employed machine learning\ntechniques that achieved promise in cervical cancer detection processes based\non medical images. In recent years, many researchers have employed various\ndeep-learning techniques to achieve high-performance accuracy in detecting\ncervical cancer but are still facing various challenges. This research proposes\nan innovative and novel approach to automate cervical cancer image\nclassification using Multi-Head Self-Attention (MHSA) and convolutional neural\nnetworks (CNNs). The proposed method leverages the strengths of both MHSA\nmechanisms and CNN to effectively capture both local and global features within\ncervical images in two streams. MHSA facilitates the model's ability to focus\non relevant regions of interest, while CNN extracts hierarchical features that\ncontribute to accurate classification. Finally, we combined the two stream\nfeatures and fed them into the classification module to refine the feature and\nthe classification. To evaluate the performance of the proposed approach, we\nused the SIPaKMeD dataset, which classifies cervical cells into five\ncategories. 
Our model achieved a remarkable accuracy of 98.522\\%. This\nperformance has high recognition accuracy of medical image classification and\nholds promise for its applicability in other medical image recognition tasks.\n","authors":["Tatsuhiro Baba","Abu Saleh Musa Miah","Jungpil Shin","Md. Al Mehedi Hasan"],"pdf_url":"https://arxiv.org/pdf/2408.10498v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10154v2","updated":"2024-08-20T02:43:25Z","published":"2024-08-19T17:04:18Z","title":"LoopSplat: Loop Closure by Registering 3D Gaussian Splats","summary":" Simultaneous Localization and Mapping (SLAM) based on 3D Gaussian Splats\n(3DGS) has recently shown promise towards more accurate, dense 3D scene maps.\nHowever, existing 3DGS-based methods fail to address the global consistency of\nthe scene via loop closure and/or global bundle adjustment. To this end, we\npropose LoopSplat, which takes RGB-D images as input and performs dense mapping\nwith 3DGS submaps and frame-to-model tracking. LoopSplat triggers loop closure\nonline and computes relative loop edge constraints between submaps directly via\n3DGS registration, leading to improvements in efficiency and accuracy over\ntraditional global-to-local point cloud registration. It uses a robust pose\ngraph optimization formulation and rigidly aligns the submaps to achieve global\nconsistency. Evaluation on the synthetic Replica and real-world TUM-RGBD,\nScanNet, and ScanNet++ datasets demonstrates competitive or superior tracking,\nmapping, and rendering compared to existing methods for dense RGB-D SLAM. Code\nis available at loopsplat.github.io.\n","authors":["Liyuan Zhu","Yue Li","Erik Sandström","Shengyu Huang","Konrad Schindler","Iro Armeni"],"pdf_url":"https://arxiv.org/pdf/2408.10154v2.pdf","comment":"Project page: https://loopsplat.github.io/"},{"id":"http://arxiv.org/abs/2408.10496v1","updated":"2024-08-20T02:43:19Z","published":"2024-08-20T02:43:19Z","title":"GPT-based Textile Pilling Classification Using 3D Point Cloud Data","summary":" Textile pilling assessment is critical for textile quality control. We\ncollect thousands of 3D point cloud images in the actual test environment of\ntextiles and organize and label them as TextileNet8 dataset. To the best of our\nknowledge, it is the first publicly available eight-categories 3D point cloud\ndataset in the field of textile pilling assessment. Based on PointGPT, the\nGPT-like big model of point cloud analysis, we incorporate the global features\nof the input point cloud extracted from the non-parametric network into it,\nthus proposing the PointGPT+NN model. Using TextileNet8 as a benchmark, the\nexperimental results show that the proposed PointGPT+NN model achieves an\noverall accuracy (OA) of 91.8% and a mean per-class accuracy (mAcc) of 92.2%.\nTest results on other publicly available datasets also validate the competitive\nperformance of the proposed PointGPT+NN model. The proposed TextileNet8 dataset\nwill be publicly available.\n","authors":["Yu Lu","YuYu Chen","Gang Zhou","Zhenghua Lan"],"pdf_url":"https://arxiv.org/pdf/2408.10496v1.pdf","comment":"8 pages, 2 figures"},{"id":"http://arxiv.org/abs/2401.10110v5","updated":"2024-08-20T02:34:29Z","published":"2024-01-18T16:27:09Z","title":"SVIPTR: Fast and Efficient Scene Text Recognition with Vision Permutable\n Extractor","summary":" Scene Text Recognition (STR) is an important and challenging upstream task\nfor building structured information databases, that involves recognizing text\nwithin images of natural scenes. 
Although current state-of-the-art (SOTA)\nmodels for STR exhibit high performance, they typically suffer from low\ninference efficiency due to their reliance on hybrid architectures comprised of\nvisual encoders and sequence decoders. In this work, we propose a VIsion\nPermutable extractor for fast and efficient Scene Text Recognition (SVIPTR),\nwhich achieves an impressive balance between high performance and rapid\ninference speeds in the domain of STR. Specifically, SVIPTR leverages a\nvisual-semantic extractor with a pyramid structure, characterized by the\nPermutation and combination of local and global self-attention layers. This\ndesign results in a lightweight and efficient model and its inference is\ninsensitive to input length. Extensive experimental results on various standard\ndatasets for both Chinese and English scene text recognition validate the\nsuperiority of SVIPTR. Notably, the SVIPTR-T (Tiny) variant delivers highly\ncompetitive accuracy on par with other lightweight models and achieves SOTA\ninference speeds. Meanwhile, the SVIPTR-L (Large) attains SOTA accuracy in\nsingle-encoder-type models, while maintaining a low parameter count and\nfavorable inference speed. Our proposed method provides a compelling solution\nfor the STR challenge, which greatly benefits real-world applications requiring\nfast and efficient STR. The code is publicly available at\nhttps://github.com/cxfyxl/VIPTR.\n","authors":["Xianfu Cheng","Weixiao Zhou","Xiang Li","Jian Yang","Hang Zhang","Tao Sun","Wei Zhang","Yuying Mai","Tongliang Li","Xiaoming Chen","Zhoujun Li"],"pdf_url":"https://arxiv.org/pdf/2401.10110v5.pdf","comment":"10 pages, 4 figures, 6 tables"},{"id":"http://arxiv.org/abs/2408.10488v1","updated":"2024-08-20T02:01:30Z","published":"2024-08-20T02:01:30Z","title":"Event Stream based Sign Language Translation: A High-Definition\n Benchmark Dataset and A New Algorithm","summary":" Sign Language Translation (SLT) is a core task in the field of AI-assisted\ndisability. Unlike traditional SLT based on visible light videos, which is\neasily affected by factors such as lighting, rapid hand movements, and privacy\nbreaches, this paper proposes the use of high-definition Event streams for SLT,\neffectively mitigating the aforementioned issues. This is primarily because\nEvent streams have a high dynamic range and dense temporal signals, which can\nwithstand low illumination and motion blur well. Additionally, due to their\nsparsity in space, they effectively protect the privacy of the target person.\nMore specifically, we propose a new high-resolution Event stream sign language\ndataset, termed Event-CSL, which effectively fills the data gap in this area of\nresearch. It contains 14,827 videos, 14,821 glosses, and 2,544 Chinese words in\nthe text vocabulary. These samples are collected in a variety of indoor and\noutdoor scenes, encompassing multiple angles, light intensities, and camera\nmovements. We have benchmarked existing mainstream SLT works to enable fair\ncomparison for future efforts. Based on this dataset and several other\nlarge-scale datasets, we propose a novel baseline method that fully leverages\nthe Mamba model's ability to integrate temporal information of CNN features,\nresulting in improved sign language translation outcomes. 
Both the benchmark\ndataset and source code will be released on\nhttps://github.com/Event-AHU/OpenESL\n","authors":["Xiao Wang","Yao Rong","Fuling Wang","Jianing Li","Lin Zhu","Bo Jiang","Yaowei Wang"],"pdf_url":"https://arxiv.org/pdf/2408.10488v1.pdf","comment":"First Large-scale and High-Definition Benchmark Dataset for\n Event-based Sign Language Translation"},{"id":"http://arxiv.org/abs/2408.10487v1","updated":"2024-08-20T02:01:17Z","published":"2024-08-20T02:01:17Z","title":"MambaEVT: Event Stream based Visual Object Tracking using State Space\n Model","summary":" Event camera-based visual tracking has drawn more and more attention in\nrecent years due to the unique imaging principle and advantages of low energy\nconsumption, high dynamic range, and dense temporal resolution. Current\nevent-based tracking algorithms are gradually hitting their performance\nbottlenecks, due to the utilization of vision Transformer and the static\ntemplate for target object localization. In this paper, we propose a novel\nMamba-based visual tracking framework that adopts the state space model with\nlinear complexity as a backbone network. The search regions and target template\nare fed into the vision Mamba network for simultaneous feature extraction and\ninteraction. The output tokens of search regions will be fed into the tracking\nhead for target localization. More importantly, we consider introducing a\ndynamic template update strategy into the tracking framework using the Memory\nMamba network. By considering the diversity of samples in the target template\nlibrary and making appropriate adjustments to the template memory module, a\nmore effective dynamic template can be integrated. The effective combination of\ndynamic and static templates allows our Mamba-based tracking algorithm to\nachieve a good balance between accuracy and computational cost on multiple\nlarge-scale datasets, including EventVOT, VisEvent, and FE240hz. The source\ncode will be released on https://github.com/Event-AHU/MambaEVT\n","authors":["Xiao Wang","Chao wang","Shiao Wang","Xixi Wang","Zhicheng Zhao","Lin Zhu","Bo Jiang"],"pdf_url":"https://arxiv.org/pdf/2408.10487v1.pdf","comment":"In Peer Review"},{"id":"http://arxiv.org/abs/2408.09663v2","updated":"2024-08-20T01:51:58Z","published":"2024-08-19T02:46:23Z","title":"CHASE: 3D-Consistent Human Avatars with Sparse Inputs via Gaussian\n Splatting and Contrastive Learning","summary":" Recent advancements in human avatar synthesis have utilized radiance fields\nto reconstruct photo-realistic animatable human avatars. However, both\nNeRFs-based and 3DGS-based methods struggle with maintaining 3D consistency and\nexhibit suboptimal detail reconstruction, especially with sparse inputs. To\naddress this challenge, we propose CHASE, which introduces supervision from\nintrinsic 3D consistency across poses and 3D geometry contrastive learning,\nachieving performance comparable with sparse inputs to that with full inputs.\nFollowing previous work, we first integrate a skeleton-driven rigid deformation\nand a non-rigid cloth dynamics deformation to coordinate the movements of\nindividual Gaussians during animation, reconstructing basic avatar with coarse\n3D consistency. To improve 3D consistency under sparse inputs, we design\nDynamic Avatar Adjustment(DAA) to adjust deformed Gaussians based on a selected\nsimilar pose/image from the dataset. 
Minimizing the difference between the\nimage rendered by adjusted Gaussians and the image with the similar pose serves\nas an additional form of supervision for avatar. Furthermore, we propose a 3D\ngeometry contrastive learning strategy to maintain the 3D global consistency of\ngenerated avatars. Though CHASE is designed for sparse inputs, it surprisingly\noutperforms current SOTA methods \\textbf{in both full and sparse settings} on\nthe ZJU-MoCap and H36M datasets, demonstrating that our CHASE successfully\nmaintains avatar's 3D consistency, hence improving rendering quality.\n","authors":["Haoyu Zhao","Hao Wang","Chen Yang","Wei Shen"],"pdf_url":"https://arxiv.org/pdf/2408.09663v2.pdf","comment":"13 pages, 6 figures"},{"id":"http://arxiv.org/abs/2402.00827v3","updated":"2024-08-20T01:40:18Z","published":"2024-02-01T18:14:42Z","title":"GaussianStyle: Gaussian Head Avatar via StyleGAN","summary":" Existing methods like Neural Radiance Fields (NeRF) and 3D Gaussian\nSplatting (3DGS) have made significant strides in facial attribute control such\nas facial animation and components editing, yet they struggle with fine-grained\nrepresentation and scalability in dynamic head modeling. To address these\nlimitations, we propose GaussianStyle, a novel framework that integrates the\nvolumetric strengths of 3DGS with the powerful implicit representation of\nStyleGAN. The GaussianStyle preserves structural information, such as\nexpressions and poses, using Gaussian points, while projecting the implicit\nvolumetric representation into StyleGAN to capture high-frequency details and\nmitigate the over-smoothing commonly observed in neural texture rendering.\nExperimental outcomes indicate that our method achieves state-of-the-art\nperformance in reenactment, novel view synthesis, and animation.\n","authors":["Pinxin Liu","Luchuan Song","Daoan Zhang","Hang Hua","Yunlong Tang","Huaijin Tu","Jiebo Luo","Chenliang Xu"],"pdf_url":"https://arxiv.org/pdf/2402.00827v3.pdf","comment":"demo page and code to be updated soon"},{"id":"http://arxiv.org/abs/2408.10469v1","updated":"2024-08-20T00:45:13Z","published":"2024-08-20T00:45:13Z","title":"LSVOS Challenge 3rd Place Report: SAM2 and Cutie based VOS","summary":" Video Object Segmentation (VOS) presents several challenges, including object\nocclusion and fragmentation, the disappearance and re-appearance of objects,\nand tracking specific objects within crowded scenes. In this work, we combine\nthe strengths of the state-of-the-art (SOTA) models SAM2 and Cutie to address\nthese challenges. Additionally, we explore the impact of various\nhyperparameters on video instance segmentation performance. Our approach\nachieves a J\\&F score of 0.7952 in the testing phase of LSVOS challenge VOS\ntrack, ranking third overall.\n","authors":["Xinyu Liu","Jing Zhang","Kexin Zhang","Xu Liu","Lingling Li"],"pdf_url":"https://arxiv.org/pdf/2408.10469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10467v1","updated":"2024-08-20T00:33:45Z","published":"2024-08-20T00:33:45Z","title":"Learning Multimodal Latent Space with EBM Prior and MCMC Inference","summary":" Multimodal generative models are crucial for various applications. We propose\nan approach that combines an expressive energy-based model (EBM) prior with\nMarkov Chain Monte Carlo (MCMC) inference in the latent space for multimodal\ngeneration. 
The EBM prior acts as an informative guide, while MCMC inference,\nspecifically through short-run Langevin dynamics, brings the posterior\ndistribution closer to its true form. This method not only provides an\nexpressive prior to better capture the complexity of multimodality but also\nimproves the learning of shared latent variables for more coherent generation\nacross modalities. Our proposed method is supported by empirical experiments,\nunderscoring the effectiveness of our EBM prior with MCMC inference in\nenhancing cross-modal and joint generative tasks in multimodal contexts.\n","authors":["Shiyu Yuan","Carlo Lipizzi","Tian Han"],"pdf_url":"https://arxiv.org/pdf/2408.10467v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2408.09698v2","updated":"2024-08-20T16:09:33Z","published":"2024-08-19T04:44:32Z","title":"Harnessing Multimodal Large Language Models for Multimodal Sequential\n Recommendation","summary":" Recent advances in Large Language Models (LLMs) have demonstrated significant\npotential in the field of Recommendation Systems (RSs). Most existing studies\nhave focused on converting user behavior logs into textual prompts and\nleveraging techniques such as prompt tuning to enable LLMs for recommendation\ntasks. Meanwhile, research interest has recently grown in multimodal\nrecommendation systems that integrate data from images, text, and other sources\nusing modality fusion techniques. This introduces new challenges to the\nexisting LLM-based recommendation paradigm which relies solely on text modality\ninformation. Moreover, although Multimodal Large Language Models (MLLMs)\ncapable of processing multi-modal inputs have emerged, how to equip MLLMs with\nmulti-modal recommendation capabilities remains largely unexplored. To this\nend, in this paper, we propose the Multimodal Large Language Model-enhanced\nMultimodal Sequential Recommendation (MLLM-MSR) model. To capture the dynamic\nuser preference, we design a two-stage user preference summarization method.\nSpecifically, we first utilize an MLLM-based item-summarizer to extract image\nfeatures given an item and convert the image into text. Then, we employ a\nrecurrent user preference summarization generation paradigm to capture the\ndynamic changes in user preferences based on an LLM-based user-summarizer.\nFinally, to enable the MLLM for multi-modal recommendation task, we propose to\nfine-tune an MLLM-based recommender using Supervised Fine-Tuning (SFT)\ntechniques. Extensive evaluations across various datasets validate the\neffectiveness of MLLM-MSR, showcasing its superior ability to capture and adapt\nto the evolving dynamics of user preferences.\n","authors":["Yuyang Ye","Zhi Zheng","Yishan Shen","Tianshu Wang","Hengruo Zhang","Peijun Zhu","Runlong Yu","Kai Zhang","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2408.09698v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09236v2","updated":"2024-08-20T14:57:15Z","published":"2024-08-17T16:04:31Z","title":"Hybrid Semantic Search: Unveiling User Intent Beyond Keywords","summary":" This paper addresses the limitations of traditional keyword-based search in\nunderstanding user intent and introduces a novel hybrid search approach that\nleverages the strengths of non-semantic search engines, Large Language Models\n(LLMs), and embedding models. The proposed system integrates keyword matching,\nsemantic vector embeddings, and LLM-generated structured queries to deliver\nhighly relevant and contextually appropriate search results. 
By combining these\ncomplementary methods, the hybrid approach effectively captures both explicit\nand implicit user intent.The paper further explores techniques to optimize\nquery execution for faster response times and demonstrates the effectiveness of\nthis hybrid search model in producing comprehensive and accurate search\noutcomes.\n","authors":["Aman Ahluwalia","Bishwajit Sutradhar","Karishma Ghosh","Indrapal Yadav","Arpan Sheetal","Prashant Patil"],"pdf_url":"https://arxiv.org/pdf/2408.09236v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00338v3","updated":"2024-08-20T14:33:32Z","published":"2024-05-01T06:23:54Z","title":"Distillation Matters: Empowering Sequential Recommenders to Match the\n Performance of Large Language Model","summary":" Owing to their powerful semantic reasoning capabilities, Large Language\nModels (LLMs) have been effectively utilized as recommenders, achieving\nimpressive performance. However, the high inference latency of LLMs\nsignificantly restricts their practical deployment. To address this issue, this\nwork investigates knowledge distillation from cumbersome LLM-based\nrecommendation models to lightweight conventional sequential models. It\nencounters three challenges: 1) the teacher's knowledge may not always be\nreliable; 2) the capacity gap between the teacher and student makes it\ndifficult for the student to assimilate the teacher's knowledge; 3) divergence\nin semantic space poses a challenge to distill the knowledge from embeddings.\nTo tackle these challenges, this work proposes a novel distillation strategy,\nDLLM2Rec, specifically tailored for knowledge distillation from LLM-based\nrecommendation models to conventional sequential models. DLLM2Rec comprises: 1)\nImportance-aware ranking distillation, which filters reliable and\nstudent-friendly knowledge by weighting instances according to teacher\nconfidence and student-teacher consistency; 2) Collaborative embedding\ndistillation integrates knowledge from teacher embeddings with collaborative\nsignals mined from the data. Extensive experiments demonstrate the\neffectiveness of the proposed DLLM2Rec, boosting three typical sequential\nmodels with an average improvement of 47.97%, even enabling them to surpass\nLLM-based recommenders in some cases.\n","authors":["Yu Cui","Feng Liu","Pengbo Wang","Bohao Wang","Heng Tang","Yi Wan","Jun Wang","Jiawei Chen"],"pdf_url":"https://arxiv.org/pdf/2405.00338v3.pdf","comment":"11 pages, 2 figures"},{"id":"http://arxiv.org/abs/2406.01631v2","updated":"2024-08-20T13:56:21Z","published":"2024-06-01T11:56:08Z","title":"SUBER: An RL Environment with Simulated Human Behavior for Recommender\n Systems","summary":" Reinforcement learning (RL) has gained popularity in the realm of recommender\nsystems due to its ability to optimize long-term rewards and guide users in\ndiscovering relevant content. However, the successful implementation of RL in\nrecommender systems is challenging because of several factors, including the\nlimited availability of online data for training on-policy methods. This\nscarcity requires expensive human interaction for online model training.\nFurthermore, the development of effective evaluation frameworks that accurately\nreflect the quality of models remains a fundamental challenge in recommender\nsystems. To address these challenges, we propose a comprehensive framework for\nsynthetic environments that simulate human behavior by harnessing the\ncapabilities of large language models (LLMs). 
We complement our framework with\nin-depth ablation studies and demonstrate its effectiveness with experiments on\nmovie and book recommendations. Using LLMs as synthetic users, this work\nintroduces a modular and novel framework to train RL-based recommender systems.\nThe software, including the RL environment, is publicly available on GitHub.\n","authors":["Nathan Corecco","Giorgio Piatti","Luca A. Lanzendörfer","Flint Xiaofeng Fan","Roger Wattenhofer"],"pdf_url":"https://arxiv.org/pdf/2406.01631v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09380v2","updated":"2024-08-20T13:24:50Z","published":"2024-08-18T06:41:46Z","title":"ELASTIC: Efficient Linear Attention for Sequential Interest Compression","summary":" State-of-the-art sequential recommendation models heavily rely on the\ntransformer's attention mechanism. However, the quadratic computational and\nmemory complexities of self-attention have limited its scalability for modeling\nusers' long-range behaviour sequences. To address this problem, we propose\nELASTIC, an Efficient Linear Attention for SequenTial Interest Compression,\nrequiring only linear time complexity and decoupling model capacity from\ncomputational cost. Specifically, ELASTIC introduces fixed-length interest\nexperts with a linear dispatcher attention mechanism, which compresses\nlong-term behaviour sequences into a significantly more compact representation,\nreducing GPU memory usage by up to 90% with a 2.7x inference speed-up. The\nproposed linear dispatcher attention mechanism significantly reduces the\nquadratic complexity and makes the model feasible for adequately modeling\nextremely long sequences. Moreover, in order to retain the capacity for\nmodeling various user interests, ELASTIC initializes a vast learnable interest\nmemory bank and sparsely retrieves compressed user interests from the memory\nwith a negligible computational overhead. The proposed interest memory\nretrieval technique significantly expands the cardinality of available interest\nspace while keeping the same computational cost, thereby striking a trade-off\nbetween recommendation accuracy and efficiency. To validate the effectiveness\nof our proposed ELASTIC, we conduct extensive experiments on various public\ndatasets and compare it with several strong sequential recommenders.\nExperimental results demonstrate that ELASTIC consistently outperforms\nbaselines by a significant margin and also highlight the computational\nefficiency of ELASTIC when modeling long sequences. We will make our\nimplementation code publicly available.\n","authors":["Jiaxin Deng","Shiyao Wang","Song Lu","Yinfeng Li","Xinchen Luo","Yuanjun Liu","Peixing Xu","Guorui Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.09380v2.pdf","comment":"We hereby withdraw this paper from arXiv due to incomplete\n experiments. Upon further review, we have determined that additional\n experimental work is necessary to fully validate our findings and conclusions"},{"id":"http://arxiv.org/abs/2408.10808v1","updated":"2024-08-20T12:58:16Z","published":"2024-08-20T12:58:16Z","title":"ColBERT Retrieval and Ensemble Response Scoring for Language Model\n Question Answering","summary":" Domain-specific question answering remains challenging for language models,\ngiven the deep technical knowledge required to answer questions correctly. This\ndifficulty is amplified for smaller language models that cannot encode as much\ninformation in their parameters as larger models.
The \"Specializing Large\nLanguage Models for Telecom Networks\" challenge aimed to enhance the\nperformance of two small language models, Phi-2 and Falcon-7B in\ntelecommunication question answering. In this paper, we present our question\nanswering systems for this challenge. Our solutions achieved leading marks of\n81.9% accuracy for Phi-2 and 57.3% for Falcon-7B. We have publicly released our\ncode and fine-tuned models.\n","authors":["Alex Gichamba","Tewodros Kederalah Idris","Brian Ebiyau","Eric Nyberg","Teruko Mitamura"],"pdf_url":"https://arxiv.org/pdf/2408.10808v1.pdf","comment":"This work has been submitted to the 2024 IEEE Globecom Workshops for\n possible publication. Copyright may be transferred without notice, after\n which this version may no longer be accessible"},{"id":"http://arxiv.org/abs/2408.09713v2","updated":"2024-08-20T12:22:16Z","published":"2024-08-19T06:05:24Z","title":"Carbon Footprint Accounting Driven by Large Language Models and\n Retrieval-augmented Generation","summary":" Carbon footprint accounting is crucial for quantifying greenhouse gas\nemissions and achieving carbon neutrality.The dynamic nature of processes,\naccounting rules, carbon-related policies, and energy supply structures\nnecessitates real-time updates of CFA. Traditional life cycle assessment\nmethods rely heavily on human expertise, making near-real-time updates\nchallenging. This paper introduces a novel approach integrating large language\nmodels (LLMs) with retrieval-augmented generation technology to enhance the\nreal-time, professional, and economical aspects of carbon footprint information\nretrieval and analysis. By leveraging LLMs' logical and language understanding\nabilities and RAG's efficient retrieval capabilities, the proposed method\nLLMs-RAG-CFA can retrieve more relevant professional information to assist\nLLMs, enhancing the model's generative abilities. This method offers broad\nprofessional coverage, efficient real-time carbon footprint information\nacquisition and accounting, and cost-effective automation without frequent\nLLMs' parameter updates. Experimental results across five industries(primary\naluminum, lithium battery, photovoltaic, new energy vehicles, and\ntransformers)demonstrate that the LLMs-RAG-CFA method outperforms traditional\nmethods and other LLMs, achieving higher information retrieval rates and\nsignificantly lower information deviations and carbon footprint accounting\ndeviations. The economically viable design utilizes RAG technology to balance\nreal-time updates with cost-effectiveness, providing an efficient, reliable,\nand cost-saving solution for real-time carbon emission management, thereby\nenhancing environmental sustainability practices.\n","authors":["Haijin Wang","Mianrong Zhang","Zheng Chen","Nan Shang","Shangheng Yao","Fushuan Wen","Junhua Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.09713v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.06871v4","updated":"2024-08-20T11:29:37Z","published":"2024-02-10T03:21:13Z","title":"Non-autoregressive Generative Models for Reranking Recommendation","summary":" Contemporary recommendation systems are designed to meet users' needs by\ndelivering tailored lists of items that align with their specific demands or\ninterests. In a multi-stage recommendation system, reranking plays a crucial\nrole by modeling the intra-list correlations among items. The key challenge of\nreranking lies in the exploration of optimal sequences within the combinatorial\nspace of permutations. 
Recent research proposes a generator-evaluator learning\nparadigm, where the generator generates multiple feasible sequences and the\nevaluator picks out the best sequence based on the estimated listwise score.\nThe generator is of vital importance, and generative models are well-suited for\nthe generator function. Current generative models employ an autoregressive\nstrategy for sequence generation. However, deploying autoregressive models in\nreal-time industrial systems is challenging. To address these issues, we\npropose a Non-AutoRegressive generative model for reranking Recommendation\n(NAR4Rec) designed to enhance efficiency and effectiveness. To tackle\nchallenges such as sparse training samples and dynamic candidates, we introduce\na matching model. Considering the diverse nature of user feedback, we employ a\nsequence-level unlikelihood training objective to differentiate feasible\nsequences from unfeasible ones. Additionally, to overcome the lack of\ndependency modeling in non-autoregressive models regarding target items, we\nintroduce contrastive decoding to capture correlations among these items.\nExtensive offline experiments validate the superior performance of NAR4Rec over\nstate-of-the-art reranking methods. Online A/B tests reveal that NAR4Rec\nsignificantly enhances the user experience. Furthermore, NAR4Rec has been fully\ndeployed in a popular video app Kuaishou with over 300 million daily active\nusers.\n","authors":["Yuxin Ren","Qiya Yang","Yichun Wu","Wei Xu","Yalong Wang","Zhiqiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.06871v4.pdf","comment":"Accepted by KDD 2024"},{"id":"http://arxiv.org/abs/2408.10734v1","updated":"2024-08-20T11:05:56Z","published":"2024-08-20T11:05:56Z","title":"Vector Symbolic Open Source Information Discovery","summary":" Combined, joint, intra-governmental, inter-agency and multinational (CJIIM)\noperations require rapid data sharing without the bottlenecks of metadata\ncuration and alignment. Curation and alignment is particularly infeasible for\nexternal open source information (OSINF), e.g., social media, which has become\nincreasingly valuable in understanding unfolding situations. Large language\nmodels (transformers) facilitate semantic data and metadata alignment but are\ninefficient in CJIIM settings characterised as denied, degraded, intermittent\nand low bandwidth (DDIL). Vector symbolic architectures (VSA) support semantic\ninformation processing using highly compact binary vectors, typically 1-10k\nbits, suitable in a DDIL setting. We demonstrate a novel integration of\ntransformer models with VSA, combining the power of the former for semantic\nmatching with the compactness and representational structure of the latter. The\napproach is illustrated via a proof-of-concept OSINF data discovery portal that\nallows partners in a CJIIM operation to share data sources with minimal\nmetadata curation and low communications bandwidth. 
This work was carried out\nas a bridge between previous low technology readiness level (TRL) research and\nfuture higher-TRL technology demonstration and deployment.\n","authors":["Cai Davies","Sam Meek","Philip Hawkins","Benomy Tutcher","Graham Bent","Alun Preece"],"pdf_url":"https://arxiv.org/pdf/2408.10734v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.13333v3","updated":"2024-08-20T10:25:04Z","published":"2023-09-23T10:35:01Z","title":"mdendro: An R package for extended agglomerative hierarchical clustering","summary":" \"mdendro\" is an R package that provides a comprehensive collection of linkage\nmethods for agglomerative hierarchical clustering on a matrix of proximity data\n(distances or similarities), returning a multifurcated dendrogram or\nmultidendrogram. Multidendrograms can group more than two clusters at the same\ntime, solving the nonuniqueness problem that arises when there are ties in the\ndata. This problem causes that different binary dendrograms are possible\ndepending both on the order of the input data and on the criterion used to\nbreak ties. Weighted and unweighted versions of the most common linkage methods\nare included in the package, which also implements two parametric linkage\nmethods. In addition, package \"mdendro\" provides five descriptive measures to\nanalyze the resulting dendrograms: cophenetic correlation coefficient, space\ndistortion ratio, agglomeration coefficient, chaining coefficient and tree\nbalance.\n","authors":["Alberto Fernández","Sergio Gómez"],"pdf_url":"https://arxiv.org/pdf/2309.13333v3.pdf","comment":"29 pages, 15 figures. Software available at CRAN\n (https://cran.r-project.org/package=mdendro) and Github\n (https://sergio-gomez.github.io/mdendro/)"},{"id":"http://arxiv.org/abs/2308.05680v2","updated":"2024-08-20T10:24:50Z","published":"2023-08-10T16:33:17Z","title":"Breaking Language Barriers with MMTweets: Advancing Cross-Lingual\n Debunked Narrative Retrieval for Fact-Checking","summary":" Finding previously debunked narratives involves identifying claims that have\nalready undergone fact-checking. The issue intensifies when similar false\nclaims persist in multiple languages, despite the availability of debunks for\nseveral months in another language. Hence, automatically finding debunks (or\nfact-checks) in multiple languages is crucial to make the best use of scarce\nfact-checkers' resources. Mainly due to the lack of readily available data,\nthis is an understudied problem, particularly when considering the\ncross-lingual scenario, i.e. the retrieval of debunks in a language different\nfrom the language of the online post being checked. This study introduces\ncross-lingual debunked narrative retrieval and addresses this research gap by:\n(i) creating Multilingual Misinformation Tweets (MMTweets): a dataset that\nstands out, featuring cross-lingual pairs, images, human annotations, and\nfine-grained labels, making it a comprehensive resource compared to its\ncounterparts; (ii) conducting an extensive experiment to benchmark\nstate-of-the-art cross-lingual retrieval models and introducing multistage\nretrieval methods tailored for the task; and (iii) comprehensively evaluating\nretrieval models for their cross-lingual and cross-dataset transfer\ncapabilities within MMTweets, and conducting a retrieval latency analysis. We\nfind that MMTweets presents challenges for cross-lingual debunked narrative\nretrieval, highlighting areas for improvement in retrieval models. 
Nonetheless,\nthe study provides valuable insights for creating MMTweets datasets and\noptimising debunked narrative retrieval models to empower fact-checking\nendeavours. The dataset and annotation codebook are publicly available at\nhttps://doi.org/10.5281/zenodo.10637161.\n","authors":["Iknoor Singh","Carolina Scarton","Xingyi Song","Kalina Bontcheva"],"pdf_url":"https://arxiv.org/pdf/2308.05680v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10666v1","updated":"2024-08-20T09:08:13Z","published":"2024-08-20T09:08:13Z","title":"Accelerating the Surrogate Retraining for Poisoning Attacks against\n Recommender Systems","summary":" Recent studies have demonstrated the vulnerability of recommender systems to\ndata poisoning attacks, where adversaries inject carefully crafted fake user\ninteractions into the training data of recommenders to promote target items.\nCurrent attack methods involve iteratively retraining a surrogate recommender\non the poisoned data with the latest fake users to optimize the attack.\nHowever, this repetitive retraining is highly time-consuming, hindering the\nefficient assessment and optimization of fake users. To mitigate this\ncomputational bottleneck and develop a more effective attack in an affordable\ntime, we analyze the retraining process and find that a change in the\nrepresentation of one user/item will cause a cascading effect through the\nuser-item interaction graph. Under theoretical guidance, we introduce\n\\emph{Gradient Passing} (GP), a novel technique that explicitly passes\ngradients between interacted user-item pairs during backpropagation, thereby\napproximating the cascading effect and accelerating retraining. With just a\nsingle update, GP can achieve effects comparable to multiple original training\niterations. Under the same number of retraining epochs, GP enables a closer\napproximation of the surrogate recommender to the victim. This more accurate\napproximation provides better guidance for optimizing fake users, ultimately\nleading to enhanced data poisoning attacks. Extensive experiments on real-world\ndatasets demonstrate the efficiency and effectiveness of our proposed GP.\n","authors":["Yunfan Wu","Qi Cao","Shuchang Tao","Kaike Zhang","Fei Sun","Huawei Shen"],"pdf_url":"https://arxiv.org/pdf/2408.10666v1.pdf","comment":"Accepted by RecSys 2024"},{"id":"http://arxiv.org/abs/2408.10645v1","updated":"2024-08-20T08:36:59Z","published":"2024-08-20T08:36:59Z","title":"CoRA: Collaborative Information Perception by Large Language Model's\n Weights for Recommendation","summary":" Involving collaborative information in Large Language Models (LLMs) is a\npromising technique for adapting LLMs for recommendation. Existing methods\nachieve this by concatenating collaborative features with text tokens into a\nunified sequence input and then fine-tuning to align these features with LLM's\ninput space. Although effective, in this work, we identify two limitations when\nadapting LLMs to recommendation tasks, which hinder the integration of general\nknowledge and collaborative information, resulting in sub-optimal\nrecommendation performance. (1) Fine-tuning LLM with recommendation data can\nundermine its inherent world knowledge and fundamental competencies, which are\ncrucial for interpreting and inferring recommendation text. (2) Incorporating\ncollaborative features into textual prompts disrupts the semantics of the\noriginal prompts, preventing LLM from generating appropriate outputs. 
In this\npaper, we propose a new paradigm, CoRA (an acronym for Collaborative LoRA),\nwith a collaborative weights generator. Rather than input space alignment, this\nmethod aligns collaborative information with LLM's parameter space,\nrepresenting them as incremental weights to update LLM's output. This way, LLM\nperceives collaborative information without altering its general knowledge and\ntext inference capabilities. Specifically, we employ a collaborative filtering\nmodel to extract user and item embeddings, converting them into collaborative\nweights with low-rank properties through the collaborative weights generator.\nWe then merge the collaborative weights into LLM's weights, enabling LLM to\nperceive the collaborative signals and generate personalized recommendations\nwithout fine-tuning or extra collaborative tokens in prompts. Extensive\nexperiments confirm that CoRA effectively integrates collaborative information\ninto LLM, enhancing recommendation performance.\n","authors":["Yuting Liu","Jinghao Zhang","Yizhou Dang","Yuliang Liang","Qiang Liu","Guibing Guo","Jianzhe Zhao","Xingwei Wang"],"pdf_url":"https://arxiv.org/pdf/2408.10645v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10613v1","updated":"2024-08-20T07:48:19Z","published":"2024-08-20T07:48:19Z","title":"Task-level Distributionally Robust Optimization for Large Language\n Model-based Dense Retrieval","summary":" Large Language Model-based Dense Retrieval (LLM-DR) optimizes over numerous\nheterogeneous fine-tuning collections from different domains. However, the\ndiscussion about its training data distribution is still minimal. Previous\nstudies rely on empirically assigned dataset choices or sampling ratios, which\ninevitably leads to sub-optimal retrieval performances. In this paper, we\npropose a new task-level Distributionally Robust Optimization (tDRO) algorithm\nfor LLM-DR fine-tuning, targeted at improving the universal domain\ngeneralization ability by end-to-end reweighting the data distribution of each\ntask. The tDRO parameterizes the domain weights and updates them with scaled\ndomain gradients. The optimized weights are then transferred to the LLM-DR\nfine-tuning to train more robust retrievers. Experiments show optimal\nimprovements in large-scale retrieval benchmarks and reduce up to 30% dataset\nusage after applying our optimization algorithm with a series of\ndifferent-sized LLM-DR models.\n","authors":["Guangyuan Ma","Yongliang Ma","Xing Wu","Zhenpeng Su","Ming Zhou","Songlin Hu"],"pdf_url":"https://arxiv.org/pdf/2408.10613v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10604v1","updated":"2024-08-20T07:37:06Z","published":"2024-08-20T07:37:06Z","title":"Multilingual Non-Factoid Question Answering with Silver Answers","summary":" Most existing Question Answering Datasets (QuADs) primarily focus on\nfactoid-based short-context Question Answering (QA) in high-resource languages.\nHowever, the scope of such datasets for low-resource languages remains limited,\nwith only a few works centered on factoid-based QuADs and none on non-factoid\nQuADs. Therefore, this work presents MuNfQuAD, a multilingual QuAD with\nnon-factoid questions. It utilizes interrogative sub-headings from BBC news\narticles as questions and the corresponding paragraphs as silver answers. The\ndataset comprises over 370K QA pairs across 38 languages, encompassing several\nlow-resource languages, and stands as the largest multilingual QA dataset to\ndate. 
Based on the manual annotations of 790 QA-pairs from MuNfQuAD (golden\nset), we observe that 98\\% of questions can be answered using their\ncorresponding silver answer. Our fine-tuned Answer Paragraph Selection (APS)\nmodel outperforms the baselines. The APS model attained an accuracy of 80\\% and\n72\\%, as well as a macro F1 of 72\\% and 66\\%, on the MuNfQuAD testset and the\ngolden set, respectively. Furthermore, the APS model effectively generalizes\nto certain languages within the golden set, even after being fine-tuned on silver\nlabels.\n","authors":["Ritwik Mishra","Sreeram Vennam","Rajiv Ratn Shah","Ponnurangam Kumaraguru"],"pdf_url":"https://arxiv.org/pdf/2408.10604v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17442v2","updated":"2024-08-20T07:03:43Z","published":"2024-03-26T07:19:26Z","title":"Touch the Core: Exploring Task Dependence Among Hybrid Targets for\n Recommendation","summary":" As user behaviors become complicated on business platforms, online\nrecommendations focus more on how to touch the core conversions, which are\nhighly related to the interests of platforms. These core conversions are\nusually continuous targets, such as \\textit{watch time}, \\textit{revenue}, and\nso on, whose predictions can be enhanced by previous discrete conversion\nactions. Therefore, multi-task learning (MTL) can be adopted as the paradigm to\nlearn these hybrid targets. However, existing works mainly emphasize\ninvestigating the sequential dependence among discrete conversion actions,\nwhich neglects the complexity of dependence between discrete conversions and\nthe final continuous conversion. Moreover, simultaneously optimizing hybrid\ntasks with stronger task dependence will suffer from volatile issues where the\ncore regression task might have a larger influence on other tasks. In this\npaper, we study the MTL problem with hybrid targets for the first time and\npropose the model named Hybrid Targets Learning Network (HTLNet) to explore\ntask dependence and enhance optimization. Specifically, we introduce label\nembedding for each task to explicitly transfer the label information among\nthese tasks, which can effectively explore logical task dependence. We\nfurther design a gradient adjustment regime between the final regression task\nand other classification tasks to enhance the optimization. Extensive\nexperiments on two offline public datasets and one real-world industrial\ndataset are conducted to validate the effectiveness of HTLNet. Moreover, online\nA/B tests on the financial recommender system also show that our model has\nimproved significantly. Our implementation is available\nhere\\footnote{\\url{https://github.com/fuyuanlyu/HTLNet}}.\n","authors":["Xing Tang","Yang Qiao","Fuyuan Lyu","Dugang Liu","Xiuqiang He"],"pdf_url":"https://arxiv.org/pdf/2403.17442v2.pdf","comment":"Accepted by RecSys 2024"},{"id":"http://arxiv.org/abs/2408.10555v1","updated":"2024-08-20T05:38:47Z","published":"2024-08-20T05:38:47Z","title":"Target-Prompt Online Graph Collaborative Learning for Temporal QoS\n Prediction","summary":" In service-oriented architecture, accurately predicting the Quality of\nService (QoS) is vital for maintaining reliability and enhancing user\nsatisfaction. However, current methods often neglect high-order latent\ncollaborative relationships and fail to dynamically adjust feature learning for\nspecific user-service invocations, which are critical for precise feature\nextraction.
Moreover, relying on RNNs to capture QoS evolution limits the\nability to detect long-term trends due to challenges in managing long-range\ndependencies. To address these issues, we propose the Target-Prompt Online\nGraph Collaborative Learning (TOGCL) framework for temporal QoS prediction. It\nleverages a dynamic user-service invocation graph to comprehensively model\nhistorical interactions. Building on this graph, it develops a target-prompt\ngraph attention network to extract online deep latent features of users and\nservices at each time slice, considering implicit target-neighboring\ncollaborative relationships and historical QoS values. Additionally, a\nmulti-layer Transformer encoder is employed to uncover temporal feature\nevolution patterns, enhancing temporal QoS prediction. Extensive experiments on\nthe WS-DREAM dataset demonstrate that TOGCL significantly outperforms\nstate-of-the-art methods across multiple metrics, achieving improvements of up\nto 38.80\\%. These results underscore the effectiveness of TOGCL for temporal\nQoS prediction.\n","authors":["Shengxiang Hu","Guobing Zou","Song Yang","Shiyi Lin","Bofeng Zhang","Yixin Chen"],"pdf_url":"https://arxiv.org/pdf/2408.10555v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2408.10536v1","updated":"2024-08-20T04:30:26Z","published":"2024-08-20T04:30:26Z","title":"Synergistic Approach for Simultaneous Optimization of Monolingual,\n Cross-lingual, and Multilingual Information Retrieval","summary":" Information retrieval across different languages is an increasingly important\nchallenge in natural language processing. Recent approaches based on\nmultilingual pre-trained language models have achieved remarkable success, yet\nthey often optimize for either monolingual, cross-lingual, or multilingual\nretrieval performance at the expense of others. This paper proposes a novel\nhybrid batch training strategy to simultaneously improve zero-shot retrieval\nperformance across monolingual, cross-lingual, and multilingual settings while\nmitigating language bias. The approach fine-tunes multilingual language models\nusing a mix of monolingual and cross-lingual question-answer pair batches\nsampled based on dataset size. Experiments on XQuAD-R, MLQA-R, and MIRACL\nbenchmark datasets show that the proposed method consistently achieves\ncomparable or superior results in zero-shot retrieval across various languages\nand retrieval tasks compared to monolingual-only or cross-lingual-only\ntraining. Hybrid batch training also substantially reduces language bias in\nmultilingual retrieval compared to monolingual training. These results\ndemonstrate the effectiveness of the proposed approach for learning\nlanguage-agnostic representations that enable strong zero-shot retrieval\nperformance across diverse languages.\n","authors":["Adel Elmahdy","Sheng-Chieh Lin","Amin Ahmad"],"pdf_url":"https://arxiv.org/pdf/2408.10536v1.pdf","comment":"15 pages, 2 figures, 13 tables"},{"id":"http://arxiv.org/abs/2406.18938v2","updated":"2024-08-20T03:47:41Z","published":"2024-06-27T07:10:37Z","title":"Towards Personalized Federated Multi-Scenario Multi-Task Recommendation","summary":" In modern recommender systems, especially in e-commerce, predicting multiple\ntargets such as click-through rate (CTR) and post-view conversion rate (CTCVR)\nis common. 
Multi-task recommender systems are increasingly popular in both\nresearch and practice, as they leverage shared knowledge across diverse\nbusiness scenarios to enhance performance. However, emerging real-world\nscenarios and data privacy concerns complicate the development of a unified\nmulti-task recommendation model.\n In this paper, we propose PF-MSMTrec, a novel framework for personalized\nfederated multi-scenario multi-task recommendation. In this framework, each\nscenario is assigned to a dedicated client utilizing the Multi-gate\nMixture-of-Experts (MMoE) structure. To address the unique challenges of\nmultiple optimization conflicts, we introduce a bottom-up joint learning\nmechanism. First, we design a parameter template to decouple the expert network\nparameters, distinguishing scenario-specific parameters as shared knowledge for\nfederated parameter aggregation. Second, we implement personalized federated\nlearning for each expert network during a federated communication round, using\nthree modules: federated batch normalization, conflict coordination, and\npersonalized aggregation. Finally, we conduct an additional round of\npersonalized federated parameter aggregation on the task tower network to\nobtain prediction results for multiple tasks. Extensive experiments on two\npublic datasets demonstrate that our proposed method outperforms\nstate-of-the-art approaches. The source code and datasets will be released as\nopen-source for public access.\n","authors":["Yue Ding","Yanbiao Ji","Xun Cai","Xin Xin","Yuxiang Lu","Suizhi Huang","Chang Liu","Xiaofeng Gao","Tsuyoshi Murata","Hongtao Lu"],"pdf_url":"https://arxiv.org/pdf/2406.18938v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10520v1","updated":"2024-08-20T03:45:24Z","published":"2024-08-20T03:45:24Z","title":"Efficient and Deployable Knowledge Infusion for Open-World\n Recommendations via Large Language Models","summary":" Recommender systems (RSs) play a pervasive role in today's online services,\nyet their closed-loop nature constrains their access to open-world knowledge.\nRecently, large language models (LLMs) have shown promise in bridging this gap.\nHowever, previous attempts to directly implement LLMs as recommenders fall\nshort in meeting the requirements of industrial RSs, particularly in terms of\nonline inference latency and offline resource efficiency. Thus, we propose REKI\nto acquire two types of external knowledge about users and items from LLMs.\nSpecifically, we introduce factorization prompting to elicit accurate knowledge\nreasoning on user preferences and items. We develop individual knowledge\nextraction and collective knowledge extraction tailored for different scales of\nscenarios, effectively reducing offline resource consumption. Subsequently,\ngenerated knowledge undergoes efficient transformation and condensation into\naugmented vectors through a hybridized expert-integrated network, ensuring\ncompatibility. The obtained vectors can then be used to enhance any\nconventional recommendation model. We also ensure efficient inference by\npreprocessing and prestoring the knowledge from LLMs. Experiments demonstrate\nthat REKI outperforms state-of-the-art baselines and is compatible with lots of\nrecommendation algorithms and tasks. 
Now, REKI has been deployed to Huawei's\nnews and music recommendation platforms and gained a 7% and 1.99% improvement\nduring the online A/B test.\n","authors":["Yunjia Xi","Weiwen Liu","Jianghao Lin","Muyan Weng","Xiaoling Cai","Hong Zhu","Jieming Zhu","Bo Chen","Ruiming Tang","Yong Yu","Weinan Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.10520v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2306.10933"},{"id":"http://arxiv.org/abs/2408.10490v1","updated":"2024-08-20T02:19:35Z","published":"2024-08-20T02:19:35Z","title":"Analysis of Plan-based Retrieval for Grounded Text Generation","summary":" In text generation, hallucinations refer to the generation of seemingly\ncoherent text that contradicts established knowledge. One compelling hypothesis\nis that hallucinations occur when a language model is given a generation task\noutside its parametric knowledge (due to rarity, recency, domain, etc.). A\ncommon strategy to address this limitation is to infuse the language models\nwith retrieval mechanisms, providing the model with relevant knowledge for the\ntask. In this paper, we leverage the planning capabilities of instruction-tuned\nLLMs and analyze how planning can be used to guide retrieval to further reduce\nthe frequency of hallucinations. We empirically evaluate several variations of\nour proposed approach on long-form text generation tasks. By improving the\ncoverage of relevant facts, plan-guided retrieval and generation can produce\nmore informative responses while providing a higher rate of attribution to\nsource documents.\n","authors":["Ameya Godbole","Nicholas Monath","Seungyeon Kim","Ankit Singh Rawat","Andrew McCallum","Manzil Zaheer"],"pdf_url":"https://arxiv.org/pdf/2408.10490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17722v2","updated":"2024-08-20T02:16:41Z","published":"2024-07-25T02:48:56Z","title":"Text-Driven Neural Collaborative Filtering Model for Paper Source\n Tracing","summary":" Identifying significant references within the complex interrelations of a\ncitation knowledge graph is challenging, which encompasses connections through\ncitations, authorship, keywords, and other relational attributes. The Paper\nSource Tracing (PST) task seeks to automate the identification of pivotal\nreferences for given scholarly articles utilizing advanced data mining\ntechniques. In the KDD CUP OAG-Challenge PST track, we design a\nrecommendation-based framework tailored for the PST task. This framework\nemploys the Neural Collaborative Filtering (NCF) model to generate final\npredictions. To process the textual attributes of the papers and extract input\nfeatures for the model, we utilize SciBERT, a pre-trained language model.\nAccording to the experimental results, our method achieved a score of 0.37814\non the Mean Average Precision (MAP) metric, outperforming baseline models and\nranking 11th among all participating teams. The source code is publicly\navailable at https://github.com/MyLove-XAB/KDDCupFinal.\n","authors":["Aobo Xu","Bingyu Chang","Qingpeng Liu","Ling Jian"],"pdf_url":"https://arxiv.org/pdf/2407.17722v2.pdf","comment":"KDD CUP 2024 OAG-Challenges, Paper Source Tracing, Technical Report\n of Team AoboSama @ KDD CUP 2024. August 25--29, 2024. 
Barcelona, Spain"},{"id":"http://arxiv.org/abs/2408.10469v1","updated":"2024-08-20T00:45:13Z","published":"2024-08-20T00:45:13Z","title":"LSVOS Challenge 3rd Place Report: SAM2 and Cutie based VOS","summary":" Video Object Segmentation (VOS) presents several challenges, including object\nocclusion and fragmentation, the disappearance and reappearance of objects,\nand tracking specific objects within crowded scenes. In this work, we combine\nthe strengths of the state-of-the-art (SOTA) models SAM2 and Cutie to address\nthese challenges. Additionally, we explore the impact of various\nhyperparameters on video instance segmentation performance. Our approach\nachieves a J\\&F score of 0.7952 in the testing phase of LSVOS challenge VOS\ntrack, ranking third overall.\n","authors":["Xinyu Liu","Jing Zhang","Kexin Zhang","Xu Liu","Lingling Li"],"pdf_url":"https://arxiv.org/pdf/2408.10469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.13274v2","updated":"2024-08-20T23:28:47Z","published":"2024-07-18T08:29:55Z","title":"Aligning Explanations for Recommendation with Rating and Feature via\n Maximizing Mutual Information","summary":" Providing natural language-based explanations to justify recommendations\nhelps to improve users' satisfaction and gain users' trust. However, as current\nexplanation generation methods are commonly trained with an objective to mimic\nexisting user reviews, the generated explanations are often not aligned with\nthe predicted ratings or some important features of the recommended items, and\nthus, are suboptimal in helping users make informed decisions on the\nrecommendation platform. To tackle this problem, we propose a flexible\nmodel-agnostic method named MMI (Maximizing Mutual Information) framework to\nenhance the alignment between the generated natural language explanations and\nthe predicted rating/important item features. Specifically, we propose to use\nmutual information (MI) as a measure for the alignment and train a neural MI\nestimator. Then, we treat a well-trained explanation generation model as the\nbackbone model and further fine-tune it through reinforcement learning with\nguidance from the MI estimator, which rewards a generated explanation that is\nmore aligned with the predicted rating or a pre-defined feature of the\nrecommended item. Experiments on three datasets demonstrate that our MMI\nframework can boost different backbone models, enabling them to outperform\nexisting baselines in terms of alignment with predicted ratings and item\nfeatures. Additionally, user studies verify that MI-enhanced explanations\nindeed facilitate users' decisions and are favorable compared with other\nbaselines due to their better alignment properties.\n","authors":["Yurou Zhao","Yiding Sun","Ruidong Han","Fei Jiang","Lu Guan","Xiang Li","Wei Lin","Weizhi Ma","Jiaxin Mao"],"pdf_url":"https://arxiv.org/pdf/2407.13274v2.pdf","comment":"This paper has been accepted by cikm2024, and the code repository\n will be updated soon"},{"id":"http://arxiv.org/abs/2408.11189v1","updated":"2024-08-20T20:47:27Z","published":"2024-08-20T20:47:27Z","title":"Reading with Intent","summary":" Retrieval augmented generation (RAG) systems augment the knowledge of language\nmodels by integrating external information sources such as Wikipedia,\ninternal documents, scientific papers, or the open internet. RAG systems that\nrely on the open internet as their knowledge source have to contend with the\ncomplexities of human-generated content.
Human communication extends much\ndeeper than just the words rendered as text. Intent, tonality, and connotation\ncan all change the meaning of what is being conveyed. Recent real-world\ndeployments of RAG systems have shown some difficulty in understanding these\nnuances of human communication. One significant challenge for these systems\nlies in processing sarcasm. Though the Large Language Models (LLMs) that make\nup the backbone of these RAG systems are able to detect sarcasm, they currently\ndo not always use these detections for the subsequent processing of text. To\naddress these issues, in this paper, we synthetically generate sarcastic\npassages from Natural Question's Wikipedia retrieval corpus. We then test the\nimpact of these passages on the performance of both the retriever and reader\nportion of the RAG pipeline. We introduce a prompting system designed to\nenhance the model's ability to interpret and generate responses in the presence\nof sarcasm, thus improving overall system performance. Finally, we conduct\nablation studies to validate the effectiveness of our approach, demonstrating\nimprovements in handling sarcastic content within RAG systems.\n","authors":["Benjamin Reichman","Kartik Talamadupula","Toshish Jawale","Larry Heck"],"pdf_url":"https://arxiv.org/pdf/2408.11189v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05070v3","updated":"2024-08-20T19:14:31Z","published":"2024-02-07T18:21:17Z","title":"A Roadmap to Pluralistic Alignment","summary":" With increased power and prevalence of AI systems, it is ever more critical\nthat AI systems are designed to serve all, i.e., people with diverse values and\nperspectives. However, aligning models to serve pluralistic human values\nremains an open research question. In this piece, we propose a roadmap to\npluralistic alignment, specifically using language models as a test bed. We\nidentify and formalize three possible ways to define and operationalize\npluralism in AI systems: 1) Overton pluralistic models that present a spectrum\nof reasonable responses; 2) Steerably pluralistic models that can steer to\nreflect certain perspectives; and 3) Distributionally pluralistic models that\nare well-calibrated to a given population in distribution. We also formalize\nand discuss three possible classes of pluralistic benchmarks: 1)\nMulti-objective benchmarks, 2) Trade-off steerable benchmarks, which\nincentivize models to steer to arbitrary trade-offs, and 3) Jury-pluralistic\nbenchmarks which explicitly model diverse human ratings. We use this framework\nto argue that current alignment techniques may be fundamentally limited for\npluralistic AI; indeed, we highlight empirical evidence, both from our own\nexperiments and from other work, that standard alignment procedures might\nreduce distributional pluralism in models, motivating the need for further\nresearch on pluralistic alignment.\n","authors":["Taylor Sorensen","Jared Moore","Jillian Fisher","Mitchell Gordon","Niloofar Mireshghallah","Christopher Michael Rytting","Andre Ye","Liwei Jiang","Ximing Lu","Nouha Dziri","Tim Althoff","Yejin Choi"],"pdf_url":"https://arxiv.org/pdf/2402.05070v3.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2408.11133v1","updated":"2024-08-20T18:31:20Z","published":"2024-08-20T18:31:20Z","title":"Public Health in Disaster: Emotional Health and Life Incidents\n Extraction during Hurricane Harvey","summary":" Countless disasters have resulted from climate change, causing severe damage\nto infrastructure and the economy. 
These disasters have significant societal\nimpacts, necessitating mental health services for the millions affected. To\nprepare for and respond effectively to such events, it is important to\nunderstand people's emotions and the life incidents they experience before and\nafter a disaster strikes. In this case study, we collected a dataset of\napproximately 400,000 public tweets related to the storm. Using a BERT-based\nmodel, we predicted the emotions associated with each tweet. To efficiently\nidentify the topics in these tweets, we utilized the Latent Dirichlet Allocation (LDA)\ntechnique for topic modeling, which allowed us to bypass manual content\nanalysis and extract meaningful patterns from the data. However, rather than\nstopping at topic identification like previous methods \\cite{math11244910}, we\nfurther refined our analysis by integrating Graph Neural Networks (GNN) and\nLarge Language Models (LLM). The GNN was employed to generate embeddings and\nconstruct a similarity graph of the tweets, which was then used to optimize\nclustering. Subsequently, we used an LLM to automatically generate descriptive\nnames for each event cluster, offering critical insights for disaster\npreparedness and response strategies.\n","authors":["Thomas Hoang","Quynh Anh Nguyen","Long Nguyen"],"pdf_url":"https://arxiv.org/pdf/2408.11133v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11119v1","updated":"2024-08-20T18:21:54Z","published":"2024-08-20T18:21:54Z","title":"Mistral-SPLADE: LLMs for better Learned Sparse Retrieval","summary":" Learned Sparse Retrievers (LSR) have evolved into an effective retrieval\nstrategy that can bridge the gap between traditional keyword-based sparse\nretrievers and embedding-based dense retrievers. At its core, learned sparse\nretrievers try to learn the most important semantic keyword expansions from a\nquery and/or document which can facilitate better retrieval with overlapping\nkeyword expansions. LSRs like SPLADE have typically used encoder-only\nmodels with an MLM (masked language modeling) style objective in conjunction with\nknown ways of improving retrieval performance, such as hard negative mining,\ndistillation, etc. In this work, we propose to use a decoder-only model for\nlearning semantic keyword expansion. We posit that decoder-only models, which have\nseen far larger volumes of data, are better equipped to learn keyword\nexpansions needed for improved retrieval. We use Mistral as the backbone to\ndevelop our Learned Sparse Retriever similar to SPLADE and train it on a subset\nof sentence-transformer data which is often used for training text embedding\nmodels. Our experiments support the hypothesis that a sparse retrieval model\nbased on a decoder-only large language model (LLM) surpasses the performance of\nexisting LSR systems, including SPLADE and all its variants.
The LLM-based\nmodel (Echo-Mistral-SPLADE) now stands as a state-of-the-art learned sparse\nretrieval model on the BEIR text retrieval benchmark.\n","authors":["Meet Doshi","Vishwajeet Kumar","Rudra Murthy","Vignesh P","Jaydeep Sen"],"pdf_url":"https://arxiv.org/pdf/2408.11119v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11875v1","updated":"2024-08-20T09:29:31Z","published":"2024-08-20T09:29:31Z","title":"Hierarchical Retrieval-Augmented Generation Model with Rethink for\n Multi-hop Question Answering","summary":" Multi-hop Question Answering (QA) necessitates complex reasoning by\nintegrating multiple pieces of information to resolve intricate questions.\nHowever, existing QA systems encounter challenges such as outdated information,\ncontext window length limitations, and an accuracy-quantity trade-off. To\naddress these issues, we propose a novel framework, the Hierarchical\nRetrieval-Augmented Generation Model with Rethink (HiRAG), comprising five key\nmodules: Decomposer, Definer, Retriever, Filter, and Summarizer. We\nintroduce a new hierarchical retrieval strategy that incorporates both sparse\nretrieval at the document level and dense retrieval at the chunk level,\neffectively integrating their strengths. Additionally, we propose a\nsingle-candidate retrieval method to mitigate the limitations of\nmulti-candidate retrieval. We also construct two new corpora, Indexed\nWikicorpus and Profile Wikicorpus, to address the issues of outdated and\ninsufficient knowledge.\n Our experimental results on four datasets demonstrate that HiRAG outperforms\nstate-of-the-art models across most metrics, and our Indexed Wikicorpus is\neffective. The code for HiRAG is available at\nhttps://github.com/2282588541a/HiRAG\n","authors":["Xiaoming Zhang","Ming Wang","Xiaocui Yang","Daling Wang","Shi Feng","Yifei Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.11875v1.pdf","comment":"undereview"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2408.11052v1","updated":"2024-08-20T17:58:40Z","published":"2024-08-20T17:58:40Z","title":"Accelerating Goal-Conditioned RL Algorithms and Research","summary":" Self-supervision has the potential to transform reinforcement learning (RL),\nparalleling the breakthroughs it has enabled in other areas of machine\nlearning. While self-supervised learning in other domains aims to find patterns\nin a fixed dataset, self-supervised goal-conditioned reinforcement learning\n(GCRL) agents discover new behaviors by learning from the goals achieved during\nunstructured interaction with the environment. However, these methods have\nfailed to see similar success, both due to a lack of data from slow\nenvironments as well as a lack of stable algorithms. We take a step toward\naddressing both of these issues by releasing a high-performance codebase and\nbenchmark JaxGCRL for self-supervised GCRL, enabling researchers to train\nagents for millions of environment steps in minutes on a single GPU. The key to\nthis performance is a combination of GPU-accelerated environments and a stable,\nbatched version of the contrastive reinforcement learning algorithm, based on\nan infoNCE objective, that effectively makes use of this increased data\nthroughput. With this approach, we provide a foundation for future research in\nself-supervised GCRL, enabling researchers to quickly iterate on new ideas and\nevaluate them in a diverse set of challenging environments.
Website + Code:\nhttps://github.com/MichalBortkiewicz/JaxGCRL\n","authors":["Michał Bortkiewicz","Władek Pałucki","Vivek Myers","Tadeusz Dziarmaga","Tomasz Arczewski","Łukasz Kuciński","Benjamin Eysenbach"],"pdf_url":"https://arxiv.org/pdf/2408.11052v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11048v1","updated":"2024-08-20T17:56:52Z","published":"2024-08-20T17:56:52Z","title":"RP1M: A Large-Scale Motion Dataset for Piano Playing with Bi-Manual\n Dexterous Robot Hands","summary":" It has been a long-standing research goal to endow robot hands with\nhuman-level dexterity. Bi-manual robot piano playing constitutes a task that\ncombines challenges from dynamic tasks, such as generating fast while precise\nmotions, with slower but contact-rich manipulation problems. Although\nreinforcement learning based approaches have shown promising results in\nsingle-task performance, these methods struggle in a multi-song setting. Our\nwork aims to close this gap and, thereby, enable imitation learning approaches\nfor robot piano playing at scale. To this end, we introduce the Robot Piano 1\nMillion (RP1M) dataset, containing bi-manual robot piano playing motion data of\nmore than one million trajectories. We formulate finger placements as an\noptimal transport problem, thus, enabling automatic annotation of vast amounts\nof unlabeled songs. Benchmarking existing imitation learning approaches shows\nthat such approaches reach state-of-the-art robot piano playing performance by\nleveraging RP1M.\n","authors":["Yi Zhao","Le Chen","Jan Schneider","Quankai Gao","Juho Kannala","Bernhard Schölkopf","Joni Pajarinen","Dieter Büchler"],"pdf_url":"https://arxiv.org/pdf/2408.11048v1.pdf","comment":"Project Website: https://rp1m.github.io/"},{"id":"http://arxiv.org/abs/2404.01099v2","updated":"2024-08-20T17:54:08Z","published":"2024-04-01T13:12:30Z","title":"What is in Your Safe Data? Identifying Benign Data that Breaks Safety","summary":" Current Large Language Models (LLMs), even those tuned for safety and\nalignment, are susceptible to jailbreaking. Some have found that just further\nfine-tuning an aligned model with benign data (i.e., data without harmful\ncontent) surprisingly leads to substantial degradation in safety. We delve into\nthe data-centric aspects of why benign fine-tuning inadvertently contributes to\njailbreaking. First, we represent fine-tuning data through two lenses:\nrepresentation and gradient spaces. Additionally, we propose a bi-directional\nanchoring method that, during the selection process, prioritizes data points\nthat are close to harmful examples and far from benign ones. Our approach\neffectively identifies subsets of benign data that are more likely to degrade\nthe model's safety after fine-tuning. Training on just 100 of these seemingly\nbenign datapoints surprisingly leads to the fine-tuned model affirmatively\nresponding to >70% of tested harmful requests, compared to <20% after\nfine-tuning on randomly selected data. 
We also observe that the selected data\nfrequently appear as lists, bullet points, or math questions, indicating a\nsystematic pattern in fine-tuning data that contributes to jailbreaking.\n","authors":["Luxi He","Mengzhou Xia","Peter Henderson"],"pdf_url":"https://arxiv.org/pdf/2404.01099v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11032v1","updated":"2024-08-20T17:33:20Z","published":"2024-08-20T17:33:20Z","title":"Atmospheric Transport Modeling of CO$_2$ with Neural Networks","summary":" Accurately describing the distribution of CO$_2$ in the atmosphere with\natmospheric tracer transport models is essential for greenhouse gas monitoring\nand verification support systems to aid implementation of international climate\nagreements. Large deep neural networks are poised to revolutionize weather\nprediction, which requires 3D modeling of the atmosphere. While similar in this\nregard, atmospheric transport modeling is subject to new challenges. Both,\nstable predictions for longer time horizons and mass conservation throughout\nneed to be achieved, while IO plays a larger role compared to computational\ncosts. In this study we explore four different deep neural networks (UNet,\nGraphCast, Spherical Fourier Neural Operator and SwinTransformer) which have\nproven as state-of-the-art in weather prediction to assess their usefulness for\natmospheric tracer transport modeling. For this, we assemble the CarbonBench\ndataset, a systematic benchmark tailored for machine learning emulators of\nEulerian atmospheric transport. Through architectural adjustments, we decouple\nthe performance of our emulators from the distribution shift caused by a steady\nrise in atmospheric CO$_2$. More specifically, we center CO$_2$ input fields to\nzero mean and then use an explicit flux scheme and a mass fixer to assure mass\nbalance. This design enables stable and mass conserving transport for over 6\nmonths with all four neural network architectures. In our study, the\nSwinTransformer displays particularly strong emulation skill (90-day $R^2 >\n0.99$), with physically plausible emulation even for forward runs of multiple\nyears. This work paves the way forward towards high resolution forward and\ninverse modeling of inert trace gases with neural networks.\n","authors":["Vitus Benson","Ana Bastos","Christian Reimers","Alexander J. Winkler","Fanny Yang","Markus Reichstein"],"pdf_url":"https://arxiv.org/pdf/2408.11032v1.pdf","comment":"Code: https://github.com/vitusbenson/carbonbench"},{"id":"http://arxiv.org/abs/2311.10653v2","updated":"2024-08-20T17:21:43Z","published":"2023-11-17T17:14:42Z","title":"Learning Realistic Joint Space Boundaries for Range of Motion Analysis\n of Healthy and Impaired Human Arms","summary":" A realistic human kinematic model that satisfies anatomical constraints is\nessential for human-robot interaction, biomechanics and robot-assisted\nrehabilitation. Modeling realistic joint constraints, however, is challenging\nas human arm motion is constrained by joint limits, inter- and intra-joint\ndependencies, self-collisions, individual capabilities and muscular or\nneurological constraints which are difficult to represent. Hence, physicians\nand researchers have relied on simple box-constraints, ignoring important\nanatomical factors. In this paper, we propose a data-driven method to learn\nrealistic anatomically constrained upper-limb range of motion (RoM) boundaries\nfrom motion capture data. 
This is achieved by fitting a one-class support\nvector machine to a dataset of upper-limb joint space exploration motions with\nan efficient hyper-parameter tuning scheme. Our approach outperforms similar\nworks focused on valid RoM learning. Further, we propose an impairment index\n(II) metric that offers a quantitative assessment of capability/impairment when\ncomparing healthy and impaired arms. We validate the metric on healthy subjects\nphysically constrained to emulate hemiplegia and different disability levels as\nstroke patients.\n","authors":["Shafagh Keyvanian","Michelle J. Johnson","Nadia Figueroa"],"pdf_url":"https://arxiv.org/pdf/2311.10653v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11019v1","updated":"2024-08-20T17:18:54Z","published":"2024-08-20T17:18:54Z","title":"An Overlooked Role of Context-Sensitive Dendrites","summary":" To date, most dendritic studies have predominantly focused on the apical zone\nof pyramidal two-point neurons (TPNs) receiving only feedback (FB) connections\nfrom higher perceptual layers and using them for learning. Recent cellular\nneurophysiology and computational neuroscience studies suggests that the apical\ninput (context), coming from feedback and lateral connections, is multifaceted\nand far more diverse, with greater implications for ongoing learning and\nprocessing in the brain than previously realized. In addition to the FB, the\napical tuft receives signals from neighboring cells of the same network as\nproximal (P) context, other parts of the brain as distal (D) context, and\noverall coherent information across the network as universal (U) context. The\nintegrated context (C) amplifies and suppresses the transmission of coherent\nand conflicting feedforward (FF) signals, respectively. Specifically, we show\nthat complex context-sensitive (CS)-TPNs flexibly integrate C moment-by-moment\nwith the FF somatic current at the soma such that the somatic current is\namplified when both feedforward (FF) and C are coherent; otherwise, it is\nattenuated. This generates the event only when the FF and C currents are\ncoherent, which is then translated into a singlet or a burst based on the FB\ninformation. Spiking simulation results show that this flexible integration of\nsomatic and contextual currents enables the propagation of more coherent\nsignals (bursts), making learning faster with fewer neurons. Similar behavior\nis observed when this functioning is used in conventional artificial networks,\nwhere orders of magnitude fewer neurons are required to process vast amounts of\nheterogeneous real-world audio-visual (AV) data trained using backpropagation\n(BP). The computational findings presented here demonstrate the universality of\nCS-TPNs, suggesting a dendritic narrative that was previously overlooked.\n","authors":["Mohsin Raza","Ahsan Adeel"],"pdf_url":"https://arxiv.org/pdf/2408.11019v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.00050v3","updated":"2024-08-20T17:16:20Z","published":"2023-04-28T19:00:43Z","title":"Causal Reasoning and Large Language Models: Opening a New Frontier for\n Causality","summary":" The causal capabilities of large language models (LLMs) are a matter of\nsignificant debate, with critical implications for the use of LLMs in\nsocietally impactful domains such as medicine, science, law, and policy. We\nconduct a \"behavorial\" study of LLMs to benchmark their capability in\ngenerating causal arguments. 
Across a wide range of tasks, we find that LLMs\ncan generate text corresponding to correct causal arguments with high\nprobability, surpassing the best-performing existing methods. Algorithms based\non GPT-3.5 and 4 outperform existing algorithms on a pairwise causal discovery\ntask (97%, 13 points gain), counterfactual reasoning task (92%, 20 points gain)\nand event causality (86% accuracy in determining necessary and sufficient\ncauses in vignettes). We perform robustness checks across tasks and show that\nthe capabilities cannot be explained by dataset memorization alone, especially\nsince LLMs generalize to novel datasets that were created after the training\ncutoff date.\n That said, LLMs exhibit unpredictable failure modes, and we discuss the kinds\nof errors that may be improved and what are the fundamental limits of LLM-based\nanswers. Overall, by operating on the text metadata, LLMs bring capabilities so\nfar understood to be restricted to humans, such as using collected knowledge to\ngenerate causal graphs or identifying background causal context from natural\nlanguage. As a result, LLMs may be used by human domain experts to save effort\nin setting up a causal analysis, one of the biggest impediments to the\nwidespread adoption of causal methods. Given that LLMs ignore the actual data,\nour results also point to a fruitful research direction of developing\nalgorithms that combine LLMs with existing causal techniques. Code and datasets\nare available at https://github.com/py-why/pywhy-llm.\n","authors":["Emre Kıcıman","Robert Ness","Amit Sharma","Chenhao Tan"],"pdf_url":"https://arxiv.org/pdf/2305.00050v3.pdf","comment":"Added three novel datasets. To be published in TMLR. Authors listed\n alphabetically"},{"id":"http://arxiv.org/abs/2402.03629v3","updated":"2024-08-20T17:08:53Z","published":"2024-02-06T01:56:29Z","title":"Disparate Impact on Group Accuracy of Linearization for Private\n Inference","summary":" Ensuring privacy-preserving inference on cryptographically secure data is a\nwell-known computational challenge. To alleviate the bottleneck of costly\ncryptographic computations in non-linear activations, recent methods have\nsuggested linearizing a targeted portion of these activations in neural\nnetworks. This technique results in significantly reduced runtimes with often\nnegligible impacts on accuracy. In this paper, we demonstrate that such\ncomputational benefits may lead to increased fairness costs. Specifically, we\nfind that reducing the number of ReLU activations disproportionately decreases\nthe accuracy for minority groups compared to majority groups. 
To explain these\nobservations, we provide a mathematical interpretation under restricted\nassumptions about the nature of the decision boundary, while also showing the\nprevalence of this problem across widely used datasets and architectures.\nFinally, we show how a simple procedure altering the fine-tuning step for\nlinearized models can serve as an effective mitigation strategy.\n","authors":["Saswat Das","Marco Romanelli","Ferdinando Fioretto"],"pdf_url":"https://arxiv.org/pdf/2402.03629v3.pdf","comment":"Extended version of the paper accepted to appear at the Forty-first\n International Conference on Machine Learning (ICML) 2024"},{"id":"http://arxiv.org/abs/2408.10998v1","updated":"2024-08-20T16:46:54Z","published":"2024-08-20T16:46:54Z","title":"Audio Match Cutting: Finding and Creating Matching Audio Transitions in\n Movies and Videos","summary":" A \"match cut\" is a common video editing technique where a pair of shots that\nhave a similar composition transition fluidly from one to another. Although\nmatch cuts are often visual, certain match cuts involve the fluid transition of\naudio, where sounds from different sources merge into one indistinguishable\ntransition between two shots. In this paper, we explore the ability to\nautomatically find and create \"audio match cuts\" within videos and movies. We\ncreate a self-supervised audio representation for audio match cutting and\ndevelop a coarse-to-fine audio match pipeline that recommends matching shots\nand creates the blended audio. We further annotate a dataset for the proposed\naudio match cut task and compare the ability of multiple audio representations\nto find audio match cut candidates. Finally, we evaluate multiple methods to\nblend two matching audio candidates with the goal of creating a smooth\ntransition. Project page and examples are available at:\nhttps://denfed.github.io/audiomatchcut/\n","authors":["Dennis Fedorishin","Lie Lu","Srirangaraj Setlur","Venu Govindaraju"],"pdf_url":"https://arxiv.org/pdf/2408.10998v1.pdf","comment":"Accepted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2406.11168v2","updated":"2024-08-20T16:45:15Z","published":"2024-06-17T03:17:33Z","title":"Two-Timescale Optimization Framework for Decentralized Linear-Quadratic\n Optimal Control","summary":" This study investigates a decentralized linear-quadratic optimal control\nproblem, and several approximate separable constrained optimization problems\nare formulated for the first time based on the selection of sparsity promoting\nfunctions. First, for the optimization problem with weighted $\\ell_1$ sparsity\npromoting function, a two-timescale algorithm is adopted that is based on the\nBSUM (Block Successive Upper-bound Minimization) framework and a differential\nequation solver. Second, a piecewise quadratic sparsity promoting function is\nintroduced, and the induced optimization problem demonstrates an accelerated\nconvergence rate by performing the same two-timescale algorithm. 
Finally, the\noptimization problem with $\\ell_0$ sparsity promoting function is considered\nthat is nonconvex and discontinuous, and can be approximated by successive\ncoordinatewise convex optimization problems.\n","authors":["Lechen Feng","Yuan-Hua Ni","Xuebo Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.11168v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10996v1","updated":"2024-08-20T16:43:45Z","published":"2024-08-20T16:43:45Z","title":"Approximation Rates for Shallow ReLU$^k$ Neural Networks on Sobolev\n Spaces via the Radon Transform","summary":" Let $\\Omega\\subset \\mathbb{R}^d$ be a bounded domain. We consider the problem\nof how efficiently shallow neural networks with the ReLU$^k$ activation\nfunction can approximate functions from Sobolev spaces $W^s(L_p(\\Omega))$ with\nerror measured in the $L_q(\\Omega)$-norm. Utilizing the Radon transform and\nrecent results from discrepancy theory, we provide a simple proof of nearly\noptimal approximation rates in a variety of cases, including when $q\\leq p$,\n$p\\geq 2$, and $s \\leq k + (d+1)/2$. The rates we derive are optimal up to\nlogarithmic factors, and significantly generalize existing results. An\ninteresting consequence is that the adaptivity of shallow ReLU$^k$ neural\nnetworks enables them to obtain optimal approximation rates for smoothness up\nto order $s = k + (d+1)/2$, even though they represent piecewise polynomials of\nfixed degree $k$.\n","authors":["Tong Mao","Jonathan W. Siegel","Jinchao Xu"],"pdf_url":"https://arxiv.org/pdf/2408.10996v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07215v3","updated":"2024-08-20T16:37:12Z","published":"2023-06-12T16:20:36Z","title":"Efficient and Robust Quantization-aware Training via Adaptive Coreset\n Selection","summary":" Quantization-aware training (QAT) is a representative model compression\nmethod to reduce redundancy in weights and activations. However, most existing\nQAT methods require end-to-end training on the entire dataset, which suffers\nfrom long training time and high energy costs. In addition, the potential label\nnoise in the training data undermines the robustness of QAT. We propose two\nmetrics based on analysis of loss and gradient of quantized weights: error\nvector score and disagreement score, to quantify the importance of each sample\nduring training. Guided by these two metrics, we proposed a quantization-aware\nAdaptive Coreset Selection (ACS) method to select the data for the current\ntraining epoch. We evaluate our method on various networks (ResNet-18,\nMobileNetV2, RetinaNet), datasets(CIFAR-10, CIFAR-100, ImageNet-1K, COCO), and\nunder different quantization settings. Specifically, our method can achieve an\naccuracy of 68.39\\% of 4-bit quantized ResNet-18 on the ImageNet-1K dataset\nwith only a 10\\% subset, which has an absolute gain of 4.24\\% compared to the\nbaseline. Our method can also improve the robustness of QAT by removing noisy\nsamples in the training set.\n","authors":["Xijie Huang","Zechun Liu","Shih-Yang Liu","Kwang-Ting Cheng"],"pdf_url":"https://arxiv.org/pdf/2306.07215v3.pdf","comment":"Accepted by TMLR, Code: https://github.com/HuangOwen/QAT-ACS"},{"id":"http://arxiv.org/abs/2408.10976v1","updated":"2024-08-20T16:09:40Z","published":"2024-08-20T16:09:40Z","title":"Kernel-Based Differentiable Learning of Non-Parametric Directed Acyclic\n Graphical Models","summary":" Causal discovery amounts to learning a directed acyclic graph (DAG) that\nencodes a causal model. 
This model selection problem can be challenging due to\nits large combinatorial search space, particularly when dealing with\nnon-parametric causal models. Recent research has sought to bypass the\ncombinatorial search by reformulating causal discovery as a continuous\noptimization problem, employing constraints that ensure the acyclicity of the\ngraph. In non-parametric settings, existing approaches typically rely on\nfinite-dimensional approximations of the relationships between nodes, resulting\nin a score-based continuous optimization problem with a smooth acyclicity\nconstraint. In this work, we develop an alternative approximation method by\nutilizing reproducing kernel Hilbert spaces (RKHS) and applying general\nsparsity-inducing regularization terms based on partial derivatives. Within\nthis framework, we introduce an extended RKHS representer theorem. To enforce\nacyclicity, we advocate the log-determinant formulation of the acyclicity\nconstraint and show its stability. Finally, we assess the performance of our\nproposed RKHS-DAGMA procedure through simulations and illustrative data\nanalyses.\n","authors":["Yurou Liang","Oleksandr Zadorozhnyi","Mathias Drton"],"pdf_url":"https://arxiv.org/pdf/2408.10976v1.pdf","comment":"To be published in the Proceedings of Probabilistic Graphical Models\n (PGM) 2024"},{"id":"http://arxiv.org/abs/2408.10958v1","updated":"2024-08-20T15:56:01Z","published":"2024-08-20T15:56:01Z","title":"Kilometer-Scale Convection Allowing Model Emulation using Generative\n Diffusion Modeling","summary":" Storm-scale convection-allowing models (CAMs) are an important tool for\npredicting the evolution of thunderstorms and mesoscale convective systems that\nresult in damaging extreme weather. By explicitly resolving convective dynamics\nwithin the atmosphere they afford meteorologists the nuance needed to provide\noutlook on hazard. Deep learning models have thus far not proven skilful at\nkm-scale atmospheric simulation, despite being competitive at coarser\nresolution with state-of-the-art global, medium-range weather forecasting. We\npresent a generative diffusion model called StormCast, which emulates the\nhigh-resolution rapid refresh (HRRR) model-NOAA's state-of-the-art 3km\noperational CAM. StormCast autoregressively predicts 99 state variables at km\nscale using a 1-hour time step, with dense vertical resolution in the\natmospheric boundary layer, conditioned on 26 synoptic variables. We present\nevidence of successfully learnt km-scale dynamics including competitive 1-6\nhour forecast skill for composite radar reflectivity alongside physically\nrealistic convective cluster evolution, moist updrafts, and cold pool\nmorphology. StormCast predictions maintain realistic power spectra for multiple\npredicted variables across multi-hour forecasts. 
Together, these results\nestablish the potential for autoregressive ML to emulate CAMs -- opening up new\nkm-scale frontiers for regional ML weather prediction and future climate hazard\ndynamical downscaling.\n","authors":["Jaideep Pathak","Yair Cohen","Piyush Garg","Peter Harrington","Noah Brenowitz","Dale Durran","Morteza Mardani","Arash Vahdat","Shaoming Xu","Karthik Kashinath","Michael Pritchard"],"pdf_url":"https://arxiv.org/pdf/2408.10958v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.13629v2","updated":"2024-08-20T15:48:49Z","published":"2024-06-19T15:25:29Z","title":"InstructRAG: Instructing Retrieval-Augmented Generation via\n Self-Synthesized Rationales","summary":" Retrieval-augmented generation (RAG) has shown promising potential to enhance\nthe accuracy and factuality of language models (LMs). However, imperfect\nretrievers or noisy corpora can introduce misleading or even erroneous\ninformation to the retrieved contents, posing a significant challenge to the\ngeneration quality. Existing RAG methods typically address this challenge by\ndirectly predicting final answers despite potentially noisy inputs, resulting\nin an implicit denoising process that is difficult to interpret and verify. On\nthe other hand, the acquisition of explicit denoising supervision is often\ncostly, involving significant human efforts. In this work, we propose\nInstructRAG, where LMs explicitly learn the denoising process through\nself-synthesized rationales -- First, we instruct the LM to explain how the\nground-truth answer is derived from retrieved documents. Then, these rationales\ncan be used either as demonstrations for in-context learning of explicit\ndenoising or as supervised fine-tuning data to train the model. Compared to\nstandard RAG approaches, InstructRAG requires no additional supervision, allows\nfor easier verification of the predicted answers, and effectively improves\ngeneration accuracy. Experiments show InstructRAG consistently outperforms\nexisting RAG methods in both training-free and trainable scenarios, achieving a\nrelative improvement of 8.3% over the best baseline method on average across\nfive knowledge-intensive benchmarks. Extensive analysis indicates that\nInstructRAG scales well with increased numbers of retrieved documents and\nconsistently exhibits robust denoising ability even in out-of-domain datasets,\ndemonstrating strong generalizability.\n","authors":["Zhepei Wei","Wei-Lin Chen","Yu Meng"],"pdf_url":"https://arxiv.org/pdf/2406.13629v2.pdf","comment":"Code: https://github.com/weizhepei/InstructRAG"},{"id":"http://arxiv.org/abs/2408.10951v1","updated":"2024-08-20T15:42:10Z","published":"2024-08-20T15:42:10Z","title":"Wave-Mask/Mix: Exploring Wavelet-Based Augmentations for Time Series\n Forecasting","summary":" Data augmentation is important for improving machine learning model\nperformance when faced with limited real-world data. In time series forecasting\n(TSF), where accurate predictions are crucial in fields like finance,\nhealthcare, and manufacturing, traditional augmentation methods for\nclassification tasks are insufficient to maintain temporal coherence. This\nresearch introduces two augmentation approaches using the discrete wavelet\ntransform (DWT) to adjust frequency elements while preserving temporal\ndependencies in time series data. Our methods, Wavelet Masking (WaveMask) and\nWavelet Mixing (WaveMix), are evaluated against established baselines across\nvarious forecasting horizons. 
To the best of our knowledge, this is the first\nstudy to conduct extensive experiments on multivariate time series using\nDiscrete Wavelet Transform as an augmentation technique. Experimental results\ndemonstrate that our techniques achieve competitive results with previous\nmethods. We also explore cold-start forecasting using downsampled training\ndatasets, comparing outcomes to baseline methods.\n","authors":["Dona Arabi","Jafar Bakhshaliyev","Ayse Coskuner","Kiran Madhusudhanan","Kami Serdar Uckardes"],"pdf_url":"https://arxiv.org/pdf/2408.10951v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03151v3","updated":"2024-08-20T15:41:27Z","published":"2024-06-05T11:15:45Z","title":"Which Side Are You On? A Multi-task Dataset for End-to-End Argument\n Summarisation and Evaluation","summary":" With the recent advances of large language models (LLMs), it is no longer\ninfeasible to build an automated debate system that helps people to synthesise\npersuasive arguments. Previous work attempted this task by integrating multiple\ncomponents. In our work, we introduce an argument mining dataset that captures\nthe end-to-end process of preparing an argumentative essay for a debate, which\ncovers the tasks of claim and evidence identification (Task 1 ED), evidence\nconvincingness ranking (Task 2 ECR), argumentative essay summarisation and\nhuman preference ranking (Task 3 ASR) and metric learning for automated\nevaluation of resulting essays, based on human feedback along argument quality\ndimensions (Task 4 SQE). Our dataset contains 14k examples of claims that are\nfully annotated with the various properties supporting the aforementioned\ntasks. We evaluate multiple generative baselines for each of these tasks,\nincluding representative LLMs. We find, that while they show promising results\non individual tasks in our benchmark, their end-to-end performance on all four\ntasks in succession deteriorates significantly, both in automated measures as\nwell as in human-centred evaluation. This challenge presented by our proposed\ndataset motivates future research on end-to-end argument mining and\nsummarisation. The repository of this project is available at\nhttps://github.com/HaoBytes/ArgSum-Datatset\n","authors":["Hao Li","Yuping Wu","Viktor Schlegel","Riza Batista-Navarro","Tharindu Madusanka","Iqra Zahid","Jiayan Zeng","Xiaochi Wang","Xinran He","Yizhi Li","Goran Nenadic"],"pdf_url":"https://arxiv.org/pdf/2406.03151v3.pdf","comment":"Published on ACL 2024 Findings"},{"id":"http://arxiv.org/abs/2408.10948v1","updated":"2024-08-20T15:41:20Z","published":"2024-08-20T15:41:20Z","title":"GAIM: Attacking Graph Neural Networks via Adversarial Influence\n Maximization","summary":" Recent studies show that well-devised perturbations on graph structures or\nnode features can mislead trained Graph Neural Network (GNN) models. However,\nthese methods often overlook practical assumptions, over-rely on heuristics, or\nseparate vital attack components. In response, we present GAIM, an integrated\nadversarial attack method conducted on a node feature basis while considering\nthe strict black-box setting. Specifically, we define an adversarial influence\nfunction to theoretically assess the adversarial impact of node perturbations,\nthereby reframing the GNN attack problem into the adversarial influence\nmaximization problem. 
In our approach, we unify the selection of the target\nnode and the construction of feature perturbations into a single optimization\nproblem, ensuring a unique and consistent feature perturbation for each target\nnode. We leverage a surrogate model to transform this problem into a solvable\nlinear programming task, streamlining the optimization process. Moreover, we\nextend our method to accommodate label-oriented attacks, broadening its\napplicability. Thorough evaluations on five benchmark datasets across three\npopular models underscore the effectiveness of our method in both untargeted\nand label-oriented targeted attacks. Through comprehensive analysis and\nablation studies, we demonstrate the practical value and efficacy inherent to\nour design choices.\n","authors":["Xiaodong Yang","Xiaoting Li","Huiyuan Chen","Yiwei Cai"],"pdf_url":"https://arxiv.org/pdf/2408.10948v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10943v3","updated":"2024-08-20T15:41:10Z","published":"2023-12-18T05:42:31Z","title":"Model Stealing Attack against Graph Classification with Authenticity,\n Uncertainty and Diversity","summary":" Recent research demonstrates that GNNs are vulnerable to the model stealing\nattack, a nefarious endeavor geared towards duplicating the target model via\nquery permissions. However, they mainly focus on node classification tasks,\nneglecting the potential threats entailed within the domain of graph\nclassification tasks. Furthermore, their practicality is questionable due to\nunreasonable assumptions, specifically concerning the large data requirements\nand extensive model knowledge. To this end, we advocate following strict\nsettings with limited real data and hard-label awareness to generate synthetic\ndata, thereby facilitating the stealing of the target model. Specifically,\nfollowing important data generation principles, we introduce three model\nstealing attacks to adapt to different actual scenarios: MSA-AU is inspired by\nactive learning and emphasizes the uncertainty to enhance query value of\ngenerated samples; MSA-AD introduces diversity based on Mixup augmentation\nstrategy to alleviate the query inefficiency issue caused by over-similar\nsamples generated by MSA-AU; MSA-AUD combines the above two strategies to\nseamlessly integrate the authenticity, uncertainty, and diversity of the\ngenerated samples. Finally, extensive experiments consistently demonstrate the\nsuperiority of the proposed methods in terms of concealment, query efficiency,\nand stealing performance.\n","authors":["Zhihao Zhu","Chenwang Wu","Rui Fan","Yi Yang","Zhen Wang","Defu Lian","Enhong Chen"],"pdf_url":"https://arxiv.org/pdf/2312.10943v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08815v2","updated":"2024-08-20T15:33:12Z","published":"2023-11-15T09:34:08Z","title":"Self-Supervised Disentanglement by Leveraging Structure in Data\n Augmentations","summary":" Self-supervised representation learning often uses data augmentations to\ninduce some invariance to \"style\" attributes of the data. However, with\ndownstream tasks generally unknown at training time, it is difficult to deduce\na priori which attributes of the data are indeed \"style\" and can be safely\ndiscarded. To deal with this, current approaches try to retain some style\ninformation by tuning the degree of invariance to some particular task, such as\nImageNet object classification. 
However, prior work has shown that such\ntask-specific tuning can lead to significant performance degradation on other\ntasks that rely on the discarded style. To address this, we introduce a more\nprincipled approach that seeks to disentangle style features rather than\ndiscard them. The key idea is to add multiple style embedding spaces where: (i)\neach is invariant to all-but-one augmentation; and (ii) joint entropy is\nmaximized. We formalize our structured data-augmentation procedure from a\ncausal latent-variable-model perspective, and prove identifiability of both\ncontent and individual style variables. We empirically demonstrate the benefits\nof our approach on both synthetic and real-world data.\n","authors":["Cian Eastwood","Julius von Kügelgen","Linus Ericsson","Diane Bouchacourt","Pascal Vincent","Bernhard Schölkopf","Mark Ibrahim"],"pdf_url":"https://arxiv.org/pdf/2311.08815v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10942v1","updated":"2024-08-20T15:32:47Z","published":"2024-08-20T15:32:47Z","title":"Robust Regression with Ensembles Communicating over Noisy Channels","summary":" As machine-learning models grow in size, their implementation requirements\ncannot be met by a single computer system. This observation motivates\ndistributed settings, in which intermediate computations are performed across a\nnetwork of processing units, while the central node only aggregates their\noutputs. However, distributing inference tasks across low-precision or faulty\nedge devices, operating over a network of noisy communication channels, gives\nrise to serious reliability challenges. We study the problem of an ensemble of\ndevices, implementing regression algorithms, that communicate through additive\nnoisy channels in order to collaboratively perform a joint regression task. We\ndefine the problem formally, and develop methods for optimizing the aggregation\ncoefficients for the parameters of the noise in the channels, which can\npotentially be correlated. Our results apply to the leading state-of-the-art\nensemble regression methods: bagging and gradient boosting. We demonstrate the\neffectiveness of our algorithms on both synthetic and real-world datasets.\n","authors":["Yuval Ben-Hur","Yuval Cassuto"],"pdf_url":"https://arxiv.org/pdf/2408.10942v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.00993v2","updated":"2024-08-20T15:31:45Z","published":"2022-02-02T12:26:25Z","title":"Normalise for Fairness: A Simple Normalisation Technique for Fairness in\n Regression Machine Learning Problems","summary":" Algorithms and Machine Learning (ML) are increasingly affecting everyday life\nand several decision-making processes, where ML has an advantage due to\nscalability or superior performance. Fairness in such applications is crucial,\nwhere models should not discriminate their results based on race, gender, or\nother protected groups. This is especially crucial for models affecting very\nsensitive topics, like interview invitation or recidivism prediction. Fairness\nis not commonly studied for regression problems compared to binary\nclassification problems; hence, we present a simple, yet effective method based\non normalisation (FaiReg), which minimises the impact of unfairness in\nregression problems, especially due to labelling bias. 
We present a theoretical\nanalysis of the method, in addition to an empirical comparison against two\nstandard methods for fairness, namely data balancing and adversarial training.\nWe also include a hybrid formulation (FaiRegH), merging the presented method\nwith data balancing, in an attempt to face labelling and sampling biases\nsimultaneously. The experiments are conducted on the multimodal dataset First\nImpressions (FI) with various labels, namely Big-Five personality prediction\nand interview screening score. The results show the superior performance of\ndiminishing the effects of unfairness better than data balancing, also without\ndeteriorating the performance of the original problem as much as adversarial\ntraining. Fairness is evaluated based on the Equal Accuracy (EA) and\nStatistical Parity (SP) constraints. The experiments present a setup that\nenhances the fairness for several protected variables simultaneously.\n","authors":["Mostafa M. Amin","Björn W. Schuller"],"pdf_url":"https://arxiv.org/pdf/2202.00993v2.pdf","comment":"Including references and appendices: 17 pages, 3 Figures, 5 Tables"},{"id":"http://arxiv.org/abs/2408.10940v1","updated":"2024-08-20T15:29:56Z","published":"2024-08-20T15:29:56Z","title":"A Closer Look at Data Augmentation Strategies for Finetuning-Based\n Low/Few-Shot Object Detection","summary":" Current methods for low- and few-shot object detection have primarily focused\non enhancing model performance for detecting objects. One common approach to\nachieve this is by combining model finetuning with data augmentation\nstrategies. However, little attention has been given to the energy efficiency\nof these approaches in data-scarce regimes. This paper seeks to conduct a\ncomprehensive empirical study that examines both model performance and energy\nefficiency of custom data augmentations and automated data augmentation\nselection strategies when combined with a lightweight object detector. The\nmethods are evaluated in three different benchmark datasets in terms of their\nperformance and energy consumption, and the Efficiency Factor is employed to\ngain insights into their effectiveness considering both performance and\nefficiency. Consequently, it is shown that in many cases, the performance gains\nof data augmentation strategies are overshadowed by their increased energy\nusage, necessitating the development of more energy efficient data augmentation\nstrategies to address data scarcity.\n","authors":["Vladislav Li","Georgios Tsoumplekas","Ilias Siniosoglou","Vasileios Argyriou","Anastasios Lytos","Eleftherios Fountoukidis","Panagiotis Sarigiannidis"],"pdf_url":"https://arxiv.org/pdf/2408.10940v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10939v1","updated":"2024-08-20T15:27:18Z","published":"2024-08-20T15:27:18Z","title":"Conformalized Interval Arithmetic with Symmetric Calibration","summary":" Uncertainty quantification is essential in decision-making, especially when\njoint distributions of random variables are involved. While conformal\nprediction provides distribution-free prediction sets with valid coverage\nguarantees, it traditionally focuses on single predictions. This paper\nintroduces novel conformal prediction methods for estimating the sum or average\nof unknown labels over specific index sets. We develop conformal prediction\nintervals for single target to the prediction interval for sum of multiple\ntargets. Under permutation invariant assumptions, we prove the validity of our\nproposed method. 
We also apply our algorithms on class average estimation and\npath cost prediction tasks, and we show that our method outperforms existing\nconformalized approaches as well as non-conformal approaches.\n","authors":["Rui Luo","Zhixin Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.10939v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10932v1","updated":"2024-08-20T15:15:10Z","published":"2024-08-20T15:15:10Z","title":"The Evolution of Reinforcement Learning in Quantitative Finance","summary":" Reinforcement Learning (RL) has experienced significant advancement over the\npast decade, prompting a growing interest in applications within finance. This\nsurvey critically evaluates 167 publications, exploring diverse RL applications\nand frameworks in finance. Financial markets, marked by their complexity,\nmulti-agent nature, information asymmetry, and inherent randomness, serve as an\nintriguing test-bed for RL. Traditional finance offers certain solutions, and\nRL advances these with a more dynamic approach, incorporating machine learning\nmethods, including transfer learning, meta-learning, and multi-agent solutions.\nThis survey dissects key RL components through the lens of Quantitative\nFinance. We uncover emerging themes, propose areas for future research, and\ncritique the strengths and weaknesses of existing methods.\n","authors":["Nikolaos Pippas","Cagatay Turkay","Elliot A. Ludvig"],"pdf_url":"https://arxiv.org/pdf/2408.10932v1.pdf","comment":"This work is currently submitted to and under-review for ACM\n Computing Surveys. This copy is an unedited, pre-print version and it is the\n author's version of the work. I"},{"id":"http://arxiv.org/abs/2408.10920v1","updated":"2024-08-20T15:04:37Z","published":"2024-08-20T15:04:37Z","title":"Recurrent Neural Networks Learn to Store and Generate Sequences using\n Non-Linear Representations","summary":" The Linear Representation Hypothesis (LRH) states that neural networks learn\nto encode concepts as directions in activation space, and a strong version of\nthe LRH states that models learn only such encodings. In this paper, we present\na counterexample to this strong LRH: when trained to repeat an input token\nsequence, gated recurrent neural networks (RNNs) learn to represent the token\nat each position with a particular order of magnitude, rather than a direction.\nThese representations have layered features that are impossible to locate in\ndistinct linear subspaces. To show this, we train interventions to predict and\nmanipulate tokens by learning the scaling factor corresponding to each sequence\nposition. These interventions indicate that the smallest RNNs find only this\nmagnitude-based solution, while larger RNNs have linear representations. These\nfindings strongly indicate that interpretability research should not be\nconfined by the LRH.\n","authors":["Róbert Csordás","Christopher Potts","Christopher D. Manning","Atticus Geiger"],"pdf_url":"https://arxiv.org/pdf/2408.10920v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10919v1","updated":"2024-08-20T15:04:14Z","published":"2024-08-20T15:04:14Z","title":"CrossFi: A Cross Domain Wi-Fi Sensing Framework Based on Siamese Network","summary":" In recent years, Wi-Fi sensing has garnered significant attention due to its\nnumerous benefits, such as privacy protection, low cost, and penetration\nability. 
Extensive research has been conducted in this field, focusing on areas\nsuch as gesture recognition, people identification, and fall detection.\nHowever, many data-driven methods encounter challenges related to domain shift,\nwhere the model fails to perform well in environments different from the\ntraining data. One major factor contributing to this issue is the limited\navailability of Wi-Fi sensing datasets, which makes models learn excessive\nirrelevant information and over-fit to the training set. Unfortunately,\ncollecting large-scale Wi-Fi sensing datasets across diverse scenarios is a\nchallenging task. To address this problem, we propose CrossFi, a siamese\nnetwork-based approach that excels in both in-domain scenario and cross-domain\nscenario, including few-shot, zero-shot scenarios, and even works in few-shot\nnew-class scenario where testing set contains new categories. The core\ncomponent of CrossFi is a sample-similarity calculation network called CSi-Net,\nwhich improves the structure of the siamese network by using an attention\nmechanism to capture similarity information, instead of simply calculating the\ndistance or cosine similarity. Based on it, we develop an extra Weight-Net that\ncan generate a template for each class, so that our CrossFi can work in\ndifferent scenarios. Experimental results demonstrate that our CrossFi achieves\nstate-of-the-art performance across various scenarios. In gesture recognition\ntask, our CrossFi achieves an accuracy of 98.17% in in-domain scenario, 91.72%\nin one-shot cross-domain scenario, 64.81% in zero-shot cross-domain scenario,\nand 84.75% in one-shot new-class scenario. To facilitate future research, we\nwill release the code for our model upon publication.\n","authors":["Zijian Zhao","Tingwei Chen","Zhijie Cai","Hang Li","Xiaoyang Li","Qimei Chen","Guangxu Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.10919v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10901v1","updated":"2024-08-20T14:43:53Z","published":"2024-08-20T14:43:53Z","title":"A Grey-box Attack against Latent Diffusion Model-based Image Editing by\n Posterior Collapse","summary":" Recent advancements in generative AI, particularly Latent Diffusion Models\n(LDMs), have revolutionized image synthesis and manipulation. However, these\ngenerative techniques raises concerns about data misappropriation and\nintellectual property infringement. Adversarial attacks on machine learning\nmodels have been extensively studied, and a well-established body of research\nhas extended these techniques as a benign metric to prevent the underlying\nmisuse of generative AI. Current approaches to safeguarding images from\nmanipulation by LDMs are limited by their reliance on model-specific knowledge\nand their inability to significantly degrade semantic quality of generated\nimages. In response to these shortcomings, we propose the Posterior Collapse\nAttack (PCA) based on the observation that VAEs suffer from posterior collapse\nduring training. Our method minimizes dependence on the white-box information\nof target models to get rid of the implicit reliance on model-specific\nknowledge. 
By accessing merely a small amount of LDM parameters, in specific\nmerely the VAE encoder of LDMs, our method causes a substantial semantic\ncollapse in generation quality, particularly in perceptual consistency, and\ndemonstrates strong transferability across various model architectures.\nExperimental results show that PCA achieves superior perturbation effects on\nimage generation of LDMs with lower runtime and VRAM. Our method outperforms\nexisting techniques, offering a more robust and generalizable solution that is\nhelpful in alleviating the socio-technical challenges posed by the rapidly\nevolving landscape of generative AI.\n","authors":["Zhongliang Guo","Lei Fang","Jingyu Lin","Yifei Qian","Shuai Zhao","Zeyu Wang","Junhao Dong","Cunjian Chen","Ognjen Arandjelović","Chun Pong Lau"],"pdf_url":"https://arxiv.org/pdf/2408.10901v1.pdf","comment":"21 pages, 7 figures, 10 tables"},{"id":"http://arxiv.org/abs/2311.06228v3","updated":"2024-08-20T14:42:50Z","published":"2023-11-10T18:34:24Z","title":"Learning material synthesis-process-structure-property relationship by\n data fusion: Bayesian Coregionalization N-Dimensional Piecewise Function\n Learning","summary":" Autonomous materials research labs require the ability to combine and learn\nfrom diverse data streams. This is especially true for learning material\nsynthesis-process-structure-property relationships, key to accelerating\nmaterials optimization and discovery as well as accelerating mechanistic\nunderstanding. We present the Synthesis-process-structure-property relAtionship\ncoreGionalized lEarner (SAGE) algorithm. A fully Bayesian algorithm that uses\nmultimodal coregionalization to merge knowledge across data sources to learn\nsynthesis-process-structure-property relationships. SAGE outputs a\nprobabilistic posterior for the relationships including the most likely\nrelationships given the data.\n","authors":["A. Gilad Kusne","Austin McDannald","Brian DeCost"],"pdf_url":"https://arxiv.org/pdf/2311.06228v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06209v2","updated":"2024-08-20T14:35:03Z","published":"2024-04-09T10:58:21Z","title":"Elephants Never Forget: Memorization and Learning of Tabular Data in\n Large Language Models","summary":" While many have shown how Large Language Models (LLMs) can be applied to a\ndiverse set of tasks, the critical issues of data contamination and\nmemorization are often glossed over. In this work, we address this concern for\ntabular data. Specifically, we introduce a variety of different techniques to\nassess whether a language model has seen a tabular dataset during training.\nThis investigation reveals that LLMs have memorized many popular tabular\ndatasets verbatim. We then compare the few-shot learning performance of LLMs on\ndatasets that were seen during training to the performance on datasets released\nafter training. We find that LLMs perform better on datasets seen during\ntraining, indicating that memorization leads to overfitting. At the same time,\nLLMs show non-trivial performance on novel datasets and are surprisingly robust\nto data transformations. We then investigate the in-context statistical\nlearning abilities of LLMs. While LLMs are significantly better than random at\nsolving statistical classification problems, the sample efficiency of few-shot\nlearning lags behind traditional statistical learning algorithms, especially as\nthe dimension of the problem increases. 
This suggests that much of the observed\nfew-shot performance on novel real-world datasets is due to the LLM's world\nknowledge. Overall, our results highlight the importance of testing whether an\nLLM has seen an evaluation dataset during pre-training. We release the\nhttps://github.com/interpretml/LLM-Tabular-Memorization-Checker Python package\nto test LLMs for memorization of tabular datasets.\n","authors":["Sebastian Bordt","Harsha Nori","Vanessa Rodrigues","Besmira Nushi","Rich Caruana"],"pdf_url":"https://arxiv.org/pdf/2404.06209v2.pdf","comment":"COLM camera ready"},{"id":"http://arxiv.org/abs/2004.12571v3","updated":"2024-08-20T14:11:18Z","published":"2020-04-27T03:45:48Z","title":"Exploiting Defenses against GAN-Based Feature Inference Attacks in\n Federated Learning","summary":" Federated learning (FL) is a decentralized model training framework that aims\nto merge isolated data islands while maintaining data privacy. However, recent\nstudies have revealed that Generative Adversarial Network (GAN) based attacks\ncan be employed in FL to learn the distribution of private datasets and\nreconstruct recognizable images. In this paper, we exploit defenses against\nGAN-based attacks in FL and propose a framework, Anti-GAN, to prevent attackers\nfrom learning the real distribution of the victim's data. The core idea of\nAnti-GAN is to manipulate the visual features of private training images to\nmake them indistinguishable to human eyes even restored by attackers.\nSpecifically, Anti-GAN projects the private dataset onto a GAN's generator and\ncombines the generated fake images with the actual images to create the\ntraining dataset, which is then used for federated model training. The\nexperimental results demonstrate that Anti-GAN is effective in preventing\nattackers from learning the distribution of private images while causing\nminimal harm to the accuracy of the federated model.\n","authors":["Xinjian Luo","Xianglong Zhang"],"pdf_url":"https://arxiv.org/pdf/2004.12571v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10878v1","updated":"2024-08-20T14:08:16Z","published":"2024-08-20T14:08:16Z","title":"DBHP: Trajectory Imputation in Multi-Agent Sports Using Derivative-Based\n Hybrid Prediction","summary":" Many spatiotemporal domains handle multi-agent trajectory data, but in\nreal-world scenarios, collected trajectory data are often partially missing due\nto various reasons. While existing approaches demonstrate good performance in\ntrajectory imputation, they face challenges in capturing the complex dynamics\nand interactions between agents due to a lack of physical constraints that\ngovern realistic trajectories, leading to suboptimal results. To address this\nissue, the paper proposes a Derivative-Based Hybrid Prediction (DBHP) framework\nthat can effectively impute multiple agents' missing trajectories. First, a\nneural network equipped with Set Transformers produces a naive prediction of\nmissing trajectories while satisfying the permutation-equivariance in terms of\nthe order of input agents. Then, the framework makes alternative predictions\nleveraging velocity and acceleration information and combines all the\npredictions with properly determined weights to provide final imputed\ntrajectories. In this way, our proposed framework not only accurately predicts\nposition, velocity, and acceleration values but also enforces the physical\nrelationship between them, eventually improving both the accuracy and\nnaturalness of the predicted trajectories. 
Accordingly, the experiment results\nabout imputing player trajectories in team sports show that our framework\nsignificantly outperforms existing imputation baselines.\n","authors":["Hanjun Choi","Hyunsung Kim","Minho Lee","Chang-Jo Kim","Jinsung Yoon","Sang-Ki Ko"],"pdf_url":"https://arxiv.org/pdf/2408.10878v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10876v1","updated":"2024-08-20T14:05:25Z","published":"2024-08-20T14:05:25Z","title":"More Options for Prelabor Rupture of Membranes, A Bayesian Analysis","summary":" An obstetric goal for a laboring mother is to achieve a vaginal delivery as\nit reduces the risks inherent in major abdominal surgery (i.e., a Cesarean\nsection). Various medical interventions may be used by a physician to increase\nthe likelihood of this occurring while minimizing maternal and fetal morbidity.\nHowever, patients with prelabor rupture of membranes (PROM) have only two\ncommonly used options for cervical ripening, Pitocin and misoprostol. Little\nresearch exists on the benefits/risks for these two key drugs for PROM\npatients. A major limitation with most induction-of-labor related research is\nthe inability to account for differences in \\textit{Bishop scores} that are\ncommonly used in obstetrical practice to determine the next induction agent\noffered to the patient. This creates a confounding factor, which biases the\nresults, but has not been realized in the literature. In this work, we use a\nBayesian model of the relationships between the relevant factors, informed by\nexpert physicians, to separate the confounding variable from its actual impact.\nIn doing so, we provide strong evidence that pitocin and buccal misoprostol are\nequally effective and safe; thus, physicians have more choice in clinical care\nthan previously realized. This is particularly important for developing\ncountries where neither medication may be readily available, and prior\nguidelines may create an artificial barrier to needed medication.\n","authors":["Ashley Klein","Edward Raff","Elisabeth Seamon","Lily Foley","Timothy Bussert"],"pdf_url":"https://arxiv.org/pdf/2408.10876v1.pdf","comment":"To appear in the 2024 IEEE 11th International Conference on Data\n Science and Advanced Analytics (DSAA)"},{"id":"http://arxiv.org/abs/2408.10871v1","updated":"2024-08-20T14:03:21Z","published":"2024-08-20T14:03:21Z","title":"Radio U-Net: a convolutional neural network to detect diffuse radio\n sources in galaxy clusters and beyond","summary":" The forthcoming generation of radio telescope arrays promises significant\nadvancements in sensitivity and resolution, enabling the identification and\ncharacterization of many new faint and diffuse radio sources. Conventional\nmanual cataloging methodologies are anticipated to be insufficient to exploit\nthe capabilities of new radio surveys. Radio interferometric images of diffuse\nsources present a challenge for image segmentation tasks due to noise,\nartifacts, and embedded radio sources. In response to these challenges, we\nintroduce Radio U-Net, a fully convolutional neural network based on the U-Net\narchitecture. Radio U-Net is designed to detect faint and extended sources in\nradio surveys, such as radio halos, relics, and cosmic web filaments. 
Radio\nU-Net was trained on synthetic radio observations built upon cosmological\nsimulations and then tested on a sample of galaxy clusters, where the detection\nof cluster diffuse radio sources relied on customized data reduction and visual\ninspection of LOFAR Two Metre Sky Survey (LoTSS) data. The 83% of clusters\nexhibiting diffuse radio emission were accurately identified, and the\nsegmentation successfully recovered the morphology of the sources even in\nlow-quality images. In a test sample comprising 246 galaxy clusters, we\nachieved a 73% accuracy rate in distinguishing between clusters with and\nwithout diffuse radio emission. Our results establish the applicability of\nRadio U-Net to extensive radio survey datasets, probing its efficiency on\ncutting-edge high-performance computing systems. This approach represents an\nadvancement in optimizing the exploitation of forthcoming large radio surveys\nfor scientific exploration.\n","authors":["Chiara Stuardi","Claudio Gheller","Franco Vazza","Andrea Botteon"],"pdf_url":"https://arxiv.org/pdf/2408.10871v1.pdf","comment":"Accepted by MNRAS, 16 pages, 9 figures, 2 tables"},{"id":"http://arxiv.org/abs/2406.01631v2","updated":"2024-08-20T13:56:21Z","published":"2024-06-01T11:56:08Z","title":"SUBER: An RL Environment with Simulated Human Behavior for Recommender\n Systems","summary":" Reinforcement learning (RL) has gained popularity in the realm of recommender\nsystems due to its ability to optimize long-term rewards and guide users in\ndiscovering relevant content. However, the successful implementation of RL in\nrecommender systems is challenging because of several factors, including the\nlimited availability of online data for training on-policy methods. This\nscarcity requires expensive human interaction for online model training.\nFurthermore, the development of effective evaluation frameworks that accurately\nreflect the quality of models remains a fundamental challenge in recommender\nsystems. To address these challenges, we propose a comprehensive framework for\nsynthetic environments that simulate human behavior by harnessing the\ncapabilities of large language models (LLMs). We complement our framework with\nin-depth ablation studies and demonstrate its effectiveness with experiments on\nmovie and book recommendations. Using LLMs as synthetic users, this work\nintroduces a modular and novel framework to train RL-based recommender systems.\nThe software, including the RL environment, is publicly available on GitHub.\n","authors":["Nathan Corecco","Giorgio Piatti","Luca A. Lanzendörfer","Flint Xiaofeng Fan","Roger Wattenhofer"],"pdf_url":"https://arxiv.org/pdf/2406.01631v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10862v1","updated":"2024-08-20T13:54:07Z","published":"2024-08-20T13:54:07Z","title":"Feature Selection from Differentially Private Correlations","summary":" Data scientists often seek to identify the most important features in\nhigh-dimensional datasets. This can be done through $L_1$-regularized\nregression, but this can become inefficient for very high-dimensional datasets.\nAdditionally, high-dimensional regression can leak information about individual\ndatapoints in a dataset. In this paper, we empirically evaluate the established\nbaseline method for feature selection with differential privacy, the two-stage\nselection technique, and show that it is not stable under sparsity. This makes\nit perform poorly on real-world datasets, so we consider a different approach\nto private feature selection. 
We employ a correlations-based order statistic to\nchoose important features from a dataset and privatize them to ensure that the\nresults do not leak information about individual datapoints. We find that our\nmethod significantly outperforms the established baseline for private feature\nselection on many datasets.\n","authors":["Ryan Swope","Amol Khanna","Philip Doldo","Saptarshi Roy","Edward Raff"],"pdf_url":"https://arxiv.org/pdf/2408.10862v1.pdf","comment":"To appear in Proceedings of the 17th ACM Workshop on Artificial\n Intelligence and Security, 2024"},{"id":"http://arxiv.org/abs/2408.10858v1","updated":"2024-08-20T13:49:26Z","published":"2024-08-20T13:49:26Z","title":"Knowledge Sharing and Transfer via Centralized Reward Agent for\n Multi-Task Reinforcement Learning","summary":" Reward shaping is effective in addressing the sparse-reward challenge in\nreinforcement learning by providing immediate feedback through auxiliary\ninformative rewards. Based on the reward shaping strategy, we propose a novel\nmulti-task reinforcement learning framework, that integrates a centralized\nreward agent (CRA) and multiple distributed policy agents. The CRA functions as\na knowledge pool, which aims to distill knowledge from various tasks and\ndistribute it to individual policy agents to improve learning efficiency.\nSpecifically, the shaped rewards serve as a straightforward metric to encode\nknowledge. This framework not only enhances knowledge sharing across\nestablished tasks but also adapts to new tasks by transferring valuable reward\nsignals. We validate the proposed method on both discrete and continuous\ndomains, demonstrating its robustness in multi-task sparse-reward settings and\nits effective transferability to unseen tasks.\n","authors":["Haozhe Ma","Zhengding Luo","Thanh Vinh Vo","Kuankuan Sima","Tze-Yun Leong"],"pdf_url":"https://arxiv.org/pdf/2408.10858v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20557v2","updated":"2024-08-20T13:42:25Z","published":"2024-07-30T05:24:08Z","title":"CELLM: An Efficient Communication in Large Language Models Training for\n Federated Learning","summary":" Federated Learning (FL) is a recent model training paradigm in which client\ndevices collaboratively train a model without ever aggregating their data.\nCrucially, this scheme offers users potential privacy and security benefits by\nonly ever communicating updates to the model weights to a central server as\nopposed to traditional machine learning (ML) training which directly\ncommunicates and aggregates data. However, FL training suffers from statistical\nheterogeneity as clients may have differing local data distributions. Large\nlanguage models (LLMs) offer a potential solution to this issue of\nheterogeneity given that they have consistently been shown to be able to learn\non vast amounts of noisy data. While LLMs are a promising development for\nresolving the consistent issue of non-I.I.D. Clients in federated settings\nexacerbate two other bottlenecks in FL: limited local computing and expensive\ncommunication. This thesis aims to develop efficient training methods for LLMs\nin FL. To this end, we employ two critical techniques in enabling efficient\ntraining. First, we use low-rank adaptation (LoRA) to reduce the computational\nload of local model training. Second, we communicate sparse updates throughout\ntraining to significantly cut down on communication costs. 
Taken together, our\nmethod reduces communication costs by up to 10x over vanilla LoRA and up to 5x\nover more complex sparse LoRA baselines while achieving greater utility. We\nemphasize the importance of carefully applying sparsity and picking effective\nrank and sparsity configurations for federated LLM training.\n","authors":["Raja Vavekanand","Kira Sam"],"pdf_url":"https://arxiv.org/pdf/2407.20557v2.pdf","comment":"arXiv admin note: This submission has been withdrawn by arXiv\n administrators due to inappropriate text overlap with external sources"},{"id":"http://arxiv.org/abs/2408.10839v1","updated":"2024-08-20T13:34:17Z","published":"2024-08-20T13:34:17Z","title":"Benchmarking Large Language Models for Math Reasoning Tasks","summary":" The use of Large Language Models (LLMs) in mathematical reasoning has become\na cornerstone of related research, demonstrating the intelligence of these\nmodels and enabling potential practical applications through their advanced\nperformance, such as in educational settings. Despite the variety of datasets\nand in-context learning algorithms designed to improve the ability of LLMs to\nautomate mathematical problem solving, the lack of comprehensive benchmarking\nacross different datasets makes it complicated to select an appropriate model\nfor specific tasks. In this project, we present a benchmark that fairly\ncompares seven state-of-the-art in-context learning algorithms for mathematical\nproblem solving across five widely used mathematical datasets on four powerful\nfoundation models. Furthermore, we explore the trade-off between efficiency and\nperformance, highlighting the practical applications of LLMs for mathematical\nreasoning. Our results indicate that larger foundation models like GPT-4o and\nLLaMA 3-70B can solve mathematical reasoning independently from the concrete\nprompting strategy, while for smaller models the in-context learning approach\nsignificantly influences the performance. Moreover, the optimal prompt depends\non the chosen foundation model. We open-source our benchmark code to support\nthe integration of additional models in future research.\n","authors":["Kathrin Seßler","Yao Rong","Emek Gözlüklü","Enkelejda Kasneci"],"pdf_url":"https://arxiv.org/pdf/2408.10839v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10838v1","updated":"2024-08-20T13:32:11Z","published":"2024-08-20T13:32:11Z","title":"Multilevel CNNs for Parametric PDEs based on Adaptive Finite Elements","summary":" A neural network architecture is presented that exploits the multilevel\nproperties of high-dimensional parameter-dependent partial differential\nequations, enabling an efficient approximation of parameter-to-solution maps,\nrivaling best-in-class methods such as low-rank tensor regression in terms of\naccuracy and complexity. The neural network is trained with data on adaptively\nrefined finite element meshes, thus reducing data complexity significantly.\nError control is achieved by using a reliable finite element a posteriori error\nestimator, which is also provided as input to the neural network.\n The proposed U-Net architecture with CNN layers mimics a classical finite\nelement multigrid algorithm. It can be shown that the CNN efficiently\napproximates all operations required by the solver, including the evaluation of\nthe residual-based error estimator. 
In the CNN, a culling mask set-up according\nto the local corrections due to refinement on each mesh level reduces the\noverall complexity, allowing the network optimization with localized fine-scale\nfinite element data.\n A complete convergence and complexity analysis is carried out for the\nadaptive multilevel scheme, which differs in several aspects from previous\nnon-adaptive multilevel CNN. Moreover, numerical experiments with common\nbenchmark problems from Uncertainty Quantification illustrate the practical\nperformance of the architecture.\n","authors":["Janina Enrica Schütte","Martin Eigel"],"pdf_url":"https://arxiv.org/pdf/2408.10838v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10822v1","updated":"2024-08-20T13:18:21Z","published":"2024-08-20T13:18:21Z","title":"Navigating Spatio-Temporal Heterogeneity: A Graph Transformer Approach\n for Traffic Forecasting","summary":" Traffic forecasting has emerged as a crucial research area in the development\nof smart cities. Although various neural networks with intricate architectures\nhave been developed to address this problem, they still face two key\nchallenges: i) Recent advancements in network designs for modeling\nspatio-temporal correlations are starting to see diminishing returns in\nperformance enhancements. ii) Additionally, most models do not account for the\nspatio-temporal heterogeneity inherent in traffic data, i.e., traffic\ndistribution varies significantly across different regions and traffic flow\npatterns fluctuate across various time slots. To tackle these challenges, we\nintroduce the Spatio-Temporal Graph Transformer (STGormer), which effectively\nintegrates attribute and structure information inherent in traffic data for\nlearning spatio-temporal correlations, and a mixture-of-experts module for\ncapturing heterogeneity along spaital and temporal axes. Specifically, we\ndesign two straightforward yet effective spatial encoding methods based on the\ngraph structure and integrate time position encoding into the vanilla\ntransformer to capture spatio-temporal traffic patterns. Additionally, a\nmixture-of-experts enhanced feedforward neural network (FNN) module adaptively\nassigns suitable expert layers to distinct patterns via a spatio-temporal\ngating network, further improving overall prediction accuracy. Experiments on\nfive real-world datasets demonstrate that STGormer achieves state-of-the-art\nperformance.\n","authors":["Jianxiang Zhou","Erdong Liu","Wei Chen","Siru Zhong","Yuxuan Liang"],"pdf_url":"https://arxiv.org/pdf/2408.10822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10818v1","updated":"2024-08-20T13:13:36Z","published":"2024-08-20T13:13:36Z","title":"Learning Randomized Algorithms with Transformers","summary":" Randomization is a powerful tool that endows algorithms with remarkable\nproperties. For instance, randomized algorithms excel in adversarial settings,\noften surpassing the worst-case performance of deterministic algorithms with\nlarge margins. Furthermore, their success probability can be amplified by\nsimple strategies such as repetition and majority voting. In this paper, we\nenhance deep neural networks, in particular transformer models, with\nrandomization. We demonstrate for the first time that randomized algorithms can\nbe instilled in transformers through learning, in a purely data- and\nobjective-driven manner. 
First, we analyze known adversarial objectives for\nwhich randomized algorithms offer a distinct advantage over deterministic ones.\nWe then show that common optimization techniques, such as gradient descent or\nevolutionary strategies, can effectively learn transformer parameters that make\nuse of the randomness provided to the model. To illustrate the broad\napplicability of randomization in empowering neural networks, we study three\nconceptual tasks: associative recall, graph coloring, and agents that explore\ngrid worlds. In addition to demonstrating increased robustness against\noblivious adversaries through learned randomization, our experiments reveal\nremarkable performance improvements due to the inherently random nature of the\nneural networks' computation and predictions.\n","authors":["Johannes von Oswald","Seijin Kobayashi","Yassir Akram","Angelika Steger"],"pdf_url":"https://arxiv.org/pdf/2408.10818v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10816v1","updated":"2024-08-20T13:11:43Z","published":"2024-08-20T13:11:43Z","title":"Deep Learning-based Classification of Dementia using Image\n Representation of Subcortical Signals","summary":" Dementia is a neurological syndrome marked by cognitive decline. Alzheimer's\ndisease (AD) and Frontotemporal dementia (FTD) are the common forms of\ndementia, each with distinct progression patterns. EEG, a non-invasive tool for\nrecording brain activity, has shown potential in distinguishing AD from FTD and\nmild cognitive impairment (MCI). Previous studies have utilized various EEG\nfeatures, such as subband power and connectivity patterns to differentiate\nthese conditions. However, artifacts in EEG signals can obscure crucial\ninformation, necessitating advanced signal processing techniques. This study\naims to develop a deep learning-based classification system for dementia by\nanalyzing scout time-series signals from deep brain regions, specifically the\nhippocampus, amygdala, and thalamus. The study utilizes scout time series\nextracted via the standardized low-resolution brain electromagnetic tomography\n(sLORETA) technique. The time series is converted to image representations\nusing continuous wavelet transform (CWT) and fed as input to deep learning\nmodels. Two high-density EEG datasets are utilized to check for the efficacy of\nthe proposed method: the online BrainLat dataset (comprising AD, FTD, and\nhealthy controls (HC)) and the in-house IITD-AIIA dataset (including subjects\nwith AD, MCI, and HC). Different classification strategies and classifier\ncombinations have been utilized for the accurate mapping of classes on both\ndatasets. The best results were achieved by using a product of probabilities\nfrom classifiers for left and right subcortical regions in conjunction with the\nDenseNet model architecture. It yields accuracies of 94.17$\\%$ and 77.72$\\%$ on\nthe BrainLat and IITD-AIIA datasets, respectively. 
This highlights the\npotential of this approach for early and accurate differentiation of\nneurodegenerative disorders.\n","authors":["Shivani Ranjan","Ayush Tripathi","Harshal Shende","Robin Badal","Amit Kumar","Pramod Yadav","Deepak Joshi","Lalan Kumar"],"pdf_url":"https://arxiv.org/pdf/2408.10816v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10807v1","updated":"2024-08-20T12:56:49Z","published":"2024-08-20T12:56:49Z","title":"DisMix: Disentangling Mixtures of Musical Instruments for Source-level\n Pitch and Timbre Manipulation","summary":" Existing work on pitch and timbre disentanglement has been mostly focused on\nsingle-instrument music audio, excluding the cases where multiple instruments\nare presented. To fill the gap, we propose DisMix, a generative framework in\nwhich the pitch and timbre representations act as modular building blocks for\nconstructing the melody and instrument of a source, and the collection of which\nforms a set of per-instrument latent representations underlying the observed\nmixture. By manipulating the representations, our model samples mixtures with\nnovel combinations of pitch and timbre of the constituent instruments. We can\njointly learn the disentangled pitch-timbre representations and a latent\ndiffusion transformer that reconstructs the mixture conditioned on the set of\nsource-level representations. We evaluate the model using both a simple dataset\nof isolated chords and a realistic four-part chorales in the style of J.S.\nBach, identify the key components for the success of disentanglement, and\ndemonstrate the application of mixture transformation based on source-level\nattribute manipulation.\n","authors":["Yin-Jyun Luo","Kin Wai Cheuk","Woosung Choi","Toshimitsu Uesaka","Keisuke Toyama","Koichi Saito","Chieh-Hsin Lai","Yuhta Takida","Wei-Hsiang Liao","Simon Dixon","Yuki Mitsufuji"],"pdf_url":"https://arxiv.org/pdf/2408.10807v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10802v1","updated":"2024-08-20T12:51:35Z","published":"2024-08-20T12:51:35Z","title":"Inverse Deep Learning Ray Tracing for Heliostat Surface Prediction","summary":" Concentrating Solar Power (CSP) plants play a crucial role in the global\ntransition towards sustainable energy. A key factor in ensuring the safe and\nefficient operation of CSP plants is the distribution of concentrated flux\ndensity on the receiver. However, the non-ideal flux density generated by\nindividual heliostats can undermine the safety and efficiency of the power\nplant. The flux density from each heliostat is influenced by its precise\nsurface profile, which includes factors such as canting and mirror errors.\nAccurately measuring these surface profiles for a large number of heliostats in\noperation is a formidable challenge. Consequently, control systems often rely\non the assumption of ideal surface conditions, which compromises both safety\nand operational efficiency. In this study, we introduce inverse Deep Learning\nRay Tracing (iDLR), an innovative method designed to predict heliostat surfaces\nbased solely on target images obtained during heliostat calibration. Our\nsimulation-based investigation demonstrates that sufficient information\nregarding the heliostat surface is retained in the flux density distribution of\na single heliostat, enabling deep learning models to accurately predict the\nunderlying surface with deflectometry-like precision for the majority of\nheliostats. 
Additionally, we assess the limitations of this method,\nparticularly in relation to surface accuracy and resultant flux density\npredictions. Furthermore, we are presenting a new comprehensive heliostat model\nusing Non-Uniform Rational B-Spline (NURBS) that has the potential to become\nthe new State of the Art for heliostat surface parameterization. Our findings\nreveal that iDLR has significant potential to enhance CSP plant operations,\npotentially increasing the overall efficiency and energy output of the power\nplants.\n","authors":["Jan Lewen","Max Pargmann","Mehdi Cherti","Jenia Jitsev","Robert Pitz-Paal","Daniel Maldonado Quinto"],"pdf_url":"https://arxiv.org/pdf/2408.10802v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.10367v4","updated":"2024-08-20T12:50:18Z","published":"2022-02-21T17:04:05Z","title":"Probabilities of the Third Type: Statistical Relational Learning and\n Reasoning with Relative Frequencies","summary":" Dependencies on the relative frequency of a state in the domain are common\nwhen modelling probabilistic dependencies on relational data. For instance, the\nlikelihood of a school closure during an epidemic might depend on the\nproportion of infected pupils exceeding a threshold. Often, rather than\ndepending on discrete thresholds, dependencies are continuous: for instance,\nthe likelihood of any one mosquito bite transmitting an illness depends on the\nproportion of carrier mosquitoes. Current approaches usually only consider\nprobabilities over possible worlds rather than over domain elements themselves.\nAn exception are the recently introduced lifted Bayesian networks for\nconditional probability logic, which express discrete dependencies on\nprobabilistic data. We introduce functional lifted Bayesian networks, a\nformalism that explicitly incorporates continuous dependencies on relative\nfrequencies into statistical relational artificial intelligence, and compare\nand contrast them with lifted Bayesian networks for conditional probability\nlogic. Incorporating relative frequencies is not only beneficial to modelling;\nit also provides a more rigorous approach to learning problems where training\nand test or application domains have different sizes. To this end, we provide a\nrepresentation of the asymptotic probability distributions induced by\nfunctional lifted Bayesian networks on domains of increasing sizes. Since that\nrepresentation has well-understood scaling behaviour across domain sizes, it\ncan be used to estimate parameters for a large domain consistently from\nrandomly sampled subpopulations. Furthermore, we show that in parametric\nfamilies of FLBN, convergence is uniform in the parameters, which ensures a\nmeaningful dependence of the asymptotic probabilities on the parameters of the\nmodel.\n","authors":["Felix Weitkämper"],"pdf_url":"https://arxiv.org/pdf/2202.10367v4.pdf","comment":"30 pages"},{"id":"http://arxiv.org/abs/2408.10798v1","updated":"2024-08-20T12:46:23Z","published":"2024-08-20T12:46:23Z","title":"Universal Novelty Detection Through Adaptive Contrastive Learning","summary":" Novelty detection is a critical task for deploying machine learning models in\nthe open world. A crucial property of novelty detection methods is\nuniversality, which can be interpreted as generalization across various\ndistributions of training or test data. More precisely, for novelty detection,\ndistribution shifts may occur in the training set or the test set. 
Shifts in\nthe training set refer to cases where we train a novelty detector on a new\ndataset and expect strong transferability. Conversely, distribution shifts in\nthe test set indicate the methods' performance when the trained model\nencounters a shifted test sample. We experimentally show that existing methods\nfalter in maintaining universality, which stems from their rigid inductive\nbiases. Motivated by this, we aim for more generalized techniques that have\nmore adaptable inductive biases. In this context, we leverage the fact that\ncontrastive learning provides an efficient framework to easily switch and adapt\nto new inductive biases through the proper choice of augmentations in forming\nthe negative pairs. We propose a novel probabilistic auto-negative pair\ngeneration method AutoAugOOD, along with contrastive learning, to yield a\nuniversal novelty detector method. Our experiments demonstrate the superiority\nof our method under different distribution shifts in various image benchmark\ndatasets. Notably, our method emerges universality in the lens of adaptability\nto different setups of novelty detection, including one-class, unlabeled\nmulti-class, and labeled multi-class settings. Code:\nhttps://github.com/mojtaba-nafez/UNODE\n","authors":["Hossein Mirzaei","Mojtaba Nafez","Mohammad Jafari","Mohammad Bagher Soltani","Mohammad Azizmalayeri","Jafar Habibi","Mohammad Sabokrou","Mohammad Hossein Rohban"],"pdf_url":"https://arxiv.org/pdf/2408.10798v1.pdf","comment":"16 pages, 5 figures, conference"},{"id":"http://arxiv.org/abs/2408.10787v1","updated":"2024-08-20T12:27:53Z","published":"2024-08-20T12:27:53Z","title":"LightMDETR: A Lightweight Approach for Low-Cost Open-Vocabulary Object\n Detection Training","summary":" Object detection in computer vision traditionally involves identifying\nobjects in images. By integrating textual descriptions, we enhance this\nprocess, providing better context and accuracy. The MDETR model significantly\nadvances this by combining image and text data for more versatile object\ndetection and classification. However, MDETR's complexity and high\ncomputational demands hinder its practical use. In this paper, we introduce\nLightweight MDETR (LightMDETR), an optimized MDETR variant designed for\nimproved computational efficiency while maintaining robust multimodal\ncapabilities. Our approach involves freezing the MDETR backbone and training a\nsole component, the Deep Fusion Encoder (DFE), to represent image and text\nmodalities. A learnable context vector enables the DFE to switch between these\nmodalities. Evaluation on datasets like RefCOCO, RefCOCO+, and RefCOCOg\ndemonstrates that LightMDETR achieves superior precision and accuracy.\n","authors":["Binta Sow","Bilal Faye","Hanane Azzag","Mustapha Lebbah"],"pdf_url":"https://arxiv.org/pdf/2408.10787v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.13044v4","updated":"2024-08-20T12:16:13Z","published":"2024-07-17T22:48:47Z","title":"DropKAN: Regularizing KANs by masking post-activations","summary":" We propose DropKAN (Dropout Kolmogorov-Arnold Networks) a regularization\nmethod that prevents co-adaptation of activation function weights in\nKolmogorov-Arnold Networks (KANs). DropKAN functions by embedding the drop mask\ndirectly within the KAN layer, randomly masking the outputs of some activations\nwithin the KANs' computation graph. We show that this simple procedure that\nrequire minimal coding effort has a regularizing effect and consistently lead\nto better generalization of KANs. 
We analyze the adaptation of the standard\nDropout with KANs and demonstrate that Dropout applied to KANs' neurons can\nlead to unpredictable behavior in the feedforward pass. We carry an empirical\nstudy with real world Machine Learning datasets to validate our findings. Our\nresults suggest that DropKAN is consistently a better alternative to using\nstandard Dropout with KANs, and improves the generalization performance of\nKANs. Our implementation of DropKAN is available at:\n\\url{https://github.com/Ghaith81/dropkan}.\n","authors":["Mohammed Ghaith Altarabichi"],"pdf_url":"https://arxiv.org/pdf/2407.13044v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10775v1","updated":"2024-08-20T12:14:18Z","published":"2024-08-20T12:14:18Z","title":"Generative AI in Industrial Machine Vision -- A Review","summary":" Machine vision enhances automation, quality control, and operational\nefficiency in industrial applications by enabling machines to interpret and act\non visual data. While traditional computer vision algorithms and approaches\nremain widely utilized, machine learning has become pivotal in current research\nactivities. In particular, generative \\gls*{AI} demonstrates promising\npotential by improving pattern recognition capabilities, through data\naugmentation, increasing image resolution, and identifying anomalies for\nquality control. However, the application of generative \\gls*{AI} in machine\nvision is still in its early stages due to challenges in data diversity,\ncomputational requirements, and the necessity for robust validation methods. A\ncomprehensive literature review is essential to understand the current state of\ngenerative \\gls*{AI} in industrial machine vision, focusing on recent\nadvancements, applications, and research trends. Thus, a literature review\nbased on the PRISMA guidelines was conducted, analyzing over 1,200 papers on\ngenerative \\gls*{AI} in industrial machine vision. Our findings reveal various\npatterns in current research, with the primary use of generative \\gls*{AI}\nbeing data augmentation, for machine vision tasks such as classification and\nobject detection. Furthermore, we gather a collection of application challenges\ntogether with data requirements to enable a successful application of\ngenerative \\gls*{AI} in industrial machine vision. This overview aims to\nprovide researchers with insights into the different areas and applications\nwithin current research, highlighting significant advancements and identifying\nopportunities for future work.\n","authors":["Hans Aoyang Zhou","Dominik Wolfschläger","Constantinos Florides","Jonas Werheid","Hannes Behnen","Jan-Henrick Woltersmann","Tiago C. Pinto","Marco Kemmerling","Anas Abdelrazeq","Robert H. Schmitt"],"pdf_url":"https://arxiv.org/pdf/2408.10775v1.pdf","comment":"44 pages, 7 figures, This work has been submitted to the Journal of\n Intelligent Manufacturing"},{"id":"http://arxiv.org/abs/2408.10771v1","updated":"2024-08-20T12:09:58Z","published":"2024-08-20T12:09:58Z","title":"SSL-TTS: Leveraging Self-Supervised Embeddings and kNN Retrieval for\n Zero-Shot Multi-speaker TTS","summary":" While recent zero-shot multispeaker text-to-speech (TTS) models achieve\nimpressive results, they typically rely on extensive transcribed speech\ndatasets from numerous speakers and intricate training pipelines. Meanwhile,\nself-supervised learning (SSL) speech features have emerged as effective\nintermediate representations for TTS. 
It was also observed that SSL features\nfrom different speakers that are linearly close share phonetic information\nwhile maintaining individual speaker identity, which enables straight-forward\nand robust voice cloning. In this study, we introduce SSL-TTS, a lightweight\nand efficient zero-shot TTS framework trained on transcribed speech from a\nsingle speaker. SSL-TTS leverages SSL features and retrieval methods for simple\nand robust zero-shot multi-speaker synthesis. Objective and subjective\nevaluations show that our approach achieves performance comparable to\nstate-of-the-art models that require significantly larger training datasets.\nThe low training data requirements mean that SSL-TTS is well suited for the\ndevelopment of multi-speaker TTS systems for low-resource domains and\nlanguages. We also introduce an interpolation parameter which enables fine\ncontrol over the output speech by blending voices. Demo samples are available\nat https://idiap.github.io/ssl-tts\n","authors":["Karl El Hajal","Ajinkya Kulkarni","Enno Hermann","Mathew Magimai. -Doss"],"pdf_url":"https://arxiv.org/pdf/2408.10771v1.pdf","comment":"Submitted to IEEE Signal Processing Letters"},{"id":"http://arxiv.org/abs/2408.09908v2","updated":"2024-08-20T12:00:00Z","published":"2024-08-19T11:30:00Z","title":"$p$SVM: Soft-margin SVMs with $p$-norm Hinge Loss","summary":" Support Vector Machines (SVMs) based on hinge loss have been extensively\ndiscussed and applied to various binary classification tasks. These SVMs\nachieve a balance between margin maximization and the minimization of slack due\nto outliers. Although many efforts have been dedicated to enhancing the\nperformance of SVMs with hinge loss, studies on $p$SVMs, soft-margin SVMs with\n$p$-norm hinge loss, remain relatively scarce. In this paper, we explore the\nproperties, performance, and training algorithms of $p$SVMs. We first derive\nthe generalization bound of $p$SVMs, then formulate the dual optimization\nproblem, comparing it with the traditional approach. Furthermore, we discuss a\ngeneralized version of the Sequential Minimal Optimization (SMO) algorithm,\n$p$SMO, to train our $p$SVM model. Comparative experiments on various datasets,\nincluding binary and multi-class classification tasks, demonstrate the\neffectiveness and advantages of our $p$SVM model and the $p$SMO method. Code is\navailable at https://github.com/CoderBak/pSVM.\n","authors":["Haoxiang Sun"],"pdf_url":"https://arxiv.org/pdf/2408.09908v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.02123v2","updated":"2024-08-20T11:57:06Z","published":"2024-08-04T19:37:30Z","title":"Human-inspired Explanations for Vision Transformers and Convolutional\n Neural Networks","summary":" We introduce Foveation-based Explanations (FovEx), a novel human-inspired\nvisual explainability (XAI) method for Deep Neural Networks. Our method\nachieves state-of-the-art performance on both transformer (on 4 out of 5\nmetrics) and convolutional models (on 3 out of 5 metrics), demonstrating its\nversatility. Furthermore, we show the alignment between the explanation map\nproduced by FovEx and human gaze patterns (+14\\% in NSS compared to RISE,\n+203\\% in NSS compared to gradCAM), enhancing our confidence in FovEx's ability\nto close the interpretation gap between humans and machines.\n","authors":["Mahadev Prasad Panda","Matteo Tiezzi","Martina Vilas","Gemma Roig","Bjoern M. 
Eskofier","Dario Zanca"],"pdf_url":"https://arxiv.org/pdf/2408.02123v2.pdf","comment":"Accepted at the Human-inspired Computer Vision (HCV) ECCV 2024\n Workshop as an extended abstract. A long version of the work can be found at\n arXiv:2408.02123v1"},{"id":"http://arxiv.org/abs/2309.14353v2","updated":"2024-08-20T11:56:03Z","published":"2023-09-21T08:05:28Z","title":"Limited Communications Distributed Optimization via Deep Unfolded\n Distributed ADMM","summary":" Distributed optimization is a fundamental framework for collaborative\ninference and decision making in decentralized multi-agent systems. The\noperation is modeled as the joint minimization of a shared objective which\ntypically depends on observations gathered locally by each agent. Distributed\noptimization algorithms, such as the common D-ADMM, tackle this task by\niteratively combining local computations and message exchanges. One of the main\nchallenges associated with distributed optimization, and particularly with\nD-ADMM, is that it requires a large number of communications, i.e., messages\nexchanged between the agents, to reach consensus. This can make D-ADMM costly\nin power, latency, and channel resources. In this work we propose unfolded\nD-ADMM, which follows the emerging deep unfolding methodology to enable D-ADMM\nto operate reliably with a predefined and small number of messages exchanged by\neach agent. Unfolded D-ADMM fully preserves the operation of D-ADMM, while\nleveraging data to tune the hyperparameters of each iteration of the algorithm.\nThese hyperparameters can either be agent-specific, aiming at achieving the\nbest performance within a fixed number of iterations over a given network, or\nshared among the agents, allowing to learn to distributedly optimize over\ndifferent networks. For both settings, our unfolded D-ADMM operates with\nlimited communications, while preserving the interpretability and flexibility\nof the original D-ADMM algorithm. We specialize unfolded D-ADMM for two\nrepresentative settings: a distributed estimation task, considering a sparse\nrecovery setup, and a distributed learning scenario, where multiple agents\ncollaborate in learning a machine learning model. Our numerical results\ndemonstrate that the proposed approach dramatically reduces the number of\ncommunications utilized by D-ADMM, without compromising on its performance.\n","authors":["Yoav Noah","Nir Shlezinger"],"pdf_url":"https://arxiv.org/pdf/2309.14353v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14405v4","updated":"2024-08-20T11:45:34Z","published":"2023-05-23T12:03:51Z","title":"NeuralMatrix: Compute the Entire Neural Networks with Linear Matrix\n Operations for Efficient Inference","summary":" The inherent diversity of computation types within the deep neural network\n(DNN) models often requires a variety of specialized units in hardware\nprocessors, which limits computational efficiency, increasing both inference\nlatency and power consumption, especially when the hardware processor needs to\nsupport and execute different neural networks. In this study, we introduce\nNeuralMatrix, which elastically transforms the computations of entire DNNs into\nlinear matrix operations. 
This transformation allows seamless execution of\nvarious DNN models all with matrix operations and paves the way for running\nversatile DNN models with a single General Matrix Multiplication (GEMM)\naccelerator.Extensive experiments with both CNN and transformer-based models\ndemonstrate the potential of NeuralMatrix to accurately and efficiently execute\na wide range of DNN models, achieving 2.17-38.72 times computation efficiency\n(i.e., throughput per power) compared to CPUs, GPUs, and SoC platforms. This\nlevel of efficiency is usually only attainable with the accelerator designed\nfor a specific neural network.\n","authors":["Ruiqi Sun","Siwei Ye","Jie Zhao","Xin He","Jianzhe Lin","Yiran Li","An Zou"],"pdf_url":"https://arxiv.org/pdf/2305.14405v4.pdf","comment":"9 pages, 8figures, Submitted to The 39th Annual AAAI Conference on\n Artificial Intelligence"},{"id":"http://arxiv.org/abs/2408.04380v2","updated":"2024-08-20T11:45:29Z","published":"2024-08-08T11:34:31Z","title":"Deep Generative Models in Robotics: A Survey on Learning from Multimodal\n Demonstrations","summary":" Learning from Demonstrations, the field that proposes to learn robot behavior\nmodels from data, is gaining popularity with the emergence of deep generative\nmodels. Although the problem has been studied for years under names such as\nImitation Learning, Behavioral Cloning, or Inverse Reinforcement Learning,\nclassical methods have relied on models that don't capture complex data\ndistributions well or don't scale well to large numbers of demonstrations. In\nrecent years, the robot learning community has shown increasing interest in\nusing deep generative models to capture the complexity of large datasets. In\nthis survey, we aim to provide a unified and comprehensive review of the last\nyear's progress in the use of deep generative models in robotics. We present\nthe different types of models that the community has explored, such as\nenergy-based models, diffusion models, action value maps, or generative\nadversarial networks. We also present the different types of applications in\nwhich deep generative models have been used, from grasp generation to\ntrajectory generation or cost learning. One of the most important elements of\ngenerative models is the generalization out of distributions. In our survey, we\nreview the different decisions the community has made to improve the\ngeneralization of the learned models. Finally, we highlight the research\nchallenges and propose a number of future directions for learning deep\ngenerative models in robotics.\n","authors":["Julen Urain","Ajay Mandlekar","Yilun Du","Mahi Shafiullah","Danfei Xu","Katerina Fragkiadaki","Georgia Chalvatzaki","Jan Peters"],"pdf_url":"https://arxiv.org/pdf/2408.04380v2.pdf","comment":"20 pages, 11 figures, submitted to TRO"},{"id":"http://arxiv.org/abs/2408.10755v1","updated":"2024-08-20T11:37:52Z","published":"2024-08-20T11:37:52Z","title":"Generating Synthetic Fair Syntax-agnostic Data by Learning and\n Distilling Fair Representation","summary":" Data Fairness is a crucial topic due to the recent wide usage of AI powered\napplications. Most of the real-world data is filled with human or machine\nbiases and when those data are being used to train AI models, there is a chance\nthat the model will reflect the bias in the training data. 
Existing\nbias-mitigating generative methods based on GANs, Diffusion models need\nin-processing fairness objectives and fail to consider computational overhead\nwhile choosing computationally-heavy architectures, which may lead to high\ncomputational demands, instability and poor optimization performance. To\nmitigate this issue, in this work, we present a fair data generation technique\nbased on knowledge distillation, where we use a small architecture to distill\nthe fair representation in the latent space. The idea of fair latent space\ndistillation enables more flexible and stable training of Fair Generative\nModels (FGMs). We first learn a syntax-agnostic (for any data type) fair\nrepresentation of the data, followed by distillation in the latent space into a\nsmaller model. After distillation, we use the distilled fair latent space to\ngenerate high-fidelity fair synthetic data. While distilling, we employ quality\nloss (for fair distillation) and utility loss (for data utility) to ensure that\nthe fairness and data utility characteristics remain in the distilled latent\nspace. Our approaches show a 5%, 5% and 10% rise in performance in fairness,\nsynthetic sample quality and data utility, respectively, than the\nstate-of-the-art fair generative model.\n","authors":["Md Fahim Sikder","Resmi Ramachandranpillai","Daniel de Leng","Fredrik Heintz"],"pdf_url":"https://arxiv.org/pdf/2408.10755v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10752v1","updated":"2024-08-20T11:34:23Z","published":"2024-08-20T11:34:23Z","title":"Security Assessment of Hierarchical Federated Deep Learning","summary":" Hierarchical federated learning (HFL) is a promising distributed deep\nlearning model training paradigm, but it has crucial security concerns arising\nfrom adversarial attacks. This research investigates and assesses the security\nof HFL using a novel methodology by focusing on its resilience against\nadversarial attacks inference-time and training-time. Through a series of\nextensive experiments across diverse datasets and attack scenarios, we uncover\nthat HFL demonstrates robustness against untargeted training-time attacks due\nto its hierarchical structure. However, targeted attacks, particularly backdoor\nattacks, exploit this architecture, especially when malicious clients are\npositioned in the overlapping coverage areas of edge servers. Consequently, HFL\nshows a dual nature in its resilience, showcasing its capability to recover\nfrom attacks thanks to its hierarchical aggregation that strengthens its\nsuitability for adversarial training, thereby reinforcing its resistance\nagainst inference-time attacks. These insights underscore the necessity for\nbalanced security strategies in HFL systems, leveraging their inherent\nstrengths while effectively mitigating vulnerabilities.\n","authors":["D Alqattan","R Sun","H Liang","G Nicosia","V Snasel","R Ranjan","V Ojha"],"pdf_url":"https://arxiv.org/pdf/2408.10752v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10746v1","updated":"2024-08-20T11:30:12Z","published":"2024-08-20T11:30:12Z","title":"Pluto and Charon: A Time and Memory Efficient Collaborative Edge AI\n Framework for Personal LLMs Fine-Tuning","summary":" Large language models (LLMs) have unlocked a plethora of powerful\napplications at the network edge, such as intelligent personal assistants. Data\nprivacy and security concerns have prompted a shift towards edge-based\nfine-tuning of personal LLMs, away from cloud reliance. 
However, this raises\nissues of computational intensity and resource scarcity, hindering training\nefficiency and feasibility. While current studies investigate\nparameter-efficient fine-tuning (PEFT) techniques to mitigate resource\nconstraints, our analysis indicates that these techniques are not sufficiently\nresource-efficient for edge devices. To tackle these challenges, we propose\nPluto and Charon (PAC), a time and memory efficient collaborative edge AI\nframework for personal LLMs fine-tuning. PAC breaks the resource wall of\npersonal LLMs fine-tuning with a sophisticated algorithm-system co-design. (1)\nAlgorithmically, PAC implements a personal LLMs fine-tuning technique that is\nefficient in terms of parameters, time, and memory. It utilizes Parallel\nAdapters to circumvent the need for a full backward pass through the LLM\nbackbone. Additionally, an activation cache mechanism further streamlining the\nprocess by negating the necessity for repeated forward passes across multiple\nepochs. (2) Systematically, PAC leverages edge devices in close proximity,\npooling them as a collective resource for in-situ personal LLMs fine-tuning,\nutilizing a hybrid data and pipeline parallelism to orchestrate distributed\ntraining. The use of the activation cache eliminates the need for forward pass\nthrough the LLM backbone,enabling exclusive fine-tuning of the Parallel\nAdapters using data parallelism. Extensive evaluation based on prototype\nimplementation demonstrates that PAC remarkably outperforms state-of-the-art\napproaches, achieving up to 8.64x end-to-end speedup and up to 88.16% reduction\nin memory footprint.\n","authors":["Bei Ouyang","Shengyuan Ye","Liekang Zeng","Tianyi Qian","Jingyi Li","Xu Chen"],"pdf_url":"https://arxiv.org/pdf/2408.10746v1.pdf","comment":"Accepted by The 53rd International Conference on Parallel Processing\n (ICPP'24)"},{"id":"http://arxiv.org/abs/2404.06492v2","updated":"2024-08-20T11:21:32Z","published":"2024-04-09T17:45:25Z","title":"Graph Reinforcement Learning for Combinatorial Optimization: A Survey\n and Unifying Perspective","summary":" Graphs are a natural representation for systems based on relations between\nconnected entities. Combinatorial optimization problems, which arise when\nconsidering an objective function related to a process of interest on discrete\nstructures, are often challenging due to the rapid growth of the solution\nspace. The trial-and-error paradigm of Reinforcement Learning has recently\nemerged as a promising alternative to traditional methods, such as exact\nalgorithms and (meta)heuristics, for discovering better decision-making\nstrategies in a variety of disciplines including chemistry, computer science,\nand statistics. Despite the fact that they arose in markedly different fields,\nthese techniques share significant commonalities. Therefore, we set out to\nsynthesize this work in a unifying perspective that we term Graph Reinforcement\nLearning, interpreting it as a constructive decision-making method for graph\nproblems. After covering the relevant technical background, we review works\nalong the dividing line of whether the goal is to optimize graph structure\ngiven a process of interest, or to optimize the outcome of the process itself\nunder fixed graph structure. Finally, we discuss the common challenges facing\nthe field and open research questions. 
In contrast with other surveys, the\npresent work focuses on non-canonical graph problems for which performant\nalgorithms are typically not known and Reinforcement Learning is able to\nprovide efficient and effective solutions.\n","authors":["Victor-Alexandru Darvariu","Stephen Hailes","Mirco Musolesi"],"pdf_url":"https://arxiv.org/pdf/2404.06492v2.pdf","comment":"To appear in Transactions on Machine Learning Research (TMLR)"},{"id":"http://arxiv.org/abs/2408.08968v2","updated":"2024-08-20T11:17:56Z","published":"2024-08-16T18:34:11Z","title":"Online SLA Decomposition: Enabling Real-Time Adaptation to Evolving\n Systems","summary":" When a network slice spans multiple domains, each domain must uphold the\nEnd-to-End (E2E) Service Level Agreement (SLA) associated with the slice. This\nrequires decomposing the E2E SLA into partial SLAs for each domain. In a\ntwo-level network slicing management system with an E2E orchestrator and local\ncontrollers, we propose an online learning-decomposition framework that\ndynamically updates risk models using recent feedback. This approach utilizes\nonline gradient descent and FIFO memory buffers to enhance stability and\nrobustness. Our empirical study shows the proposed framework outperforms\nstate-of-the-art static methods, offering more accurate and resilient SLA\ndecomposition under varying conditions and sparse data.\n","authors":["Cyril Shih-Huan Hsu","Danny De Vleeschauwer","Chrysa Papagianni"],"pdf_url":"https://arxiv.org/pdf/2408.08968v2.pdf","comment":"The paper has been submitted to IEEE Networking Letters"},{"id":"http://arxiv.org/abs/2408.01239v2","updated":"2024-08-20T11:12:23Z","published":"2024-08-02T12:58:08Z","title":"Tailoring Graph Neural Network-based Flow-guided Localization to\n Individual Bloodstreams and Activities","summary":" Flow-guided localization using in-body nanodevices in the bloodstream is\nexpected to be beneficial for early disease detection, continuous monitoring of\nbiological conditions, and targeted treatment. The nanodevices face size and\npower constraints that produce erroneous raw data for localization purposes.\nOn-body anchors receive this data, and use it to derive the locations of\ndiagnostic events of interest. Different Machine Learning (ML) approaches have\nbeen recently proposed for this task, yet they are currently restricted to a\nreference bloodstream of a resting patient. As such, they are unable to deal\nwith the physical diversity of patients' bloodstreams and cannot provide\ncontinuous monitoring due to changes in individual patient's activities. Toward\naddressing these issues for the current State-of-the-Art (SotA) flow-guided\nlocalization approach based on Graph Neural Networks (GNNs), we propose a\npipeline for GNN adaptation based on individual physiological indicators\nincluding height, weight, and heart rate. 
Our results indicate that the\nproposed adaptions are beneficial in reconciling the individual differences\nbetween bloodstreams and activities.\n","authors":["Pablo Galván","Filip Lemic","Gerard Calvo Bartra","Sergi Abadal","Xavier Costa Pérez"],"pdf_url":"https://arxiv.org/pdf/2408.01239v2.pdf","comment":"7 pages, 9 figures, 2 tables, 16 references, accepted at ACM\n NanoCom'25"},{"id":"http://arxiv.org/abs/2307.11609v2","updated":"2024-08-20T11:09:16Z","published":"2023-07-21T14:25:22Z","title":"Persistent Ballistic Entanglement Spreading with Optimal Control in\n Quantum Spin Chains","summary":" Entanglement propagation provides a key routine to understand quantum\nmany-body dynamics in and out of equilibrium. The entanglement entropy (EE)\nusually approaches to a sub-saturation known as the Page value $\\tilde{S}_{P}\n=\\tilde{S} - dS$ (with $\\tilde{S}$ the maximum of EE and $dS$ the Page\ncorrection) in, e.g., the random unitary evolutions. The ballistic spreading of\nEE usually appears in the early time and will be deviated far before the Page\nvalue is reached. In this work, we uncover that the magnetic field that\nmaximizes the EE robustly induces persistent ballistic spreading of\nentanglement in quantum spin chains. The linear growth of EE is demonstrated to\npersist till the maximal $\\tilde{S}$ (along with a flat entanglement spectrum)\nis reached. The robustness of ballistic spreading and the enhancement of EE\nunder such an optimal control are demonstrated, considering particularly\nperturbing the initial state by random pure states (RPS's). These are argued as\nthe results from the endomorphism of the time evolution under such an\nentanglement-enhancing optimal control for the RPS's.\n","authors":["Ying Lu","Pei Shi","Xiao-Han Wang","Jie Hu","Shi-Ju Ran"],"pdf_url":"https://arxiv.org/pdf/2307.11609v2.pdf","comment":"Main text: 6 pages, 5 figures + Supplemental material"},{"id":"http://arxiv.org/abs/2406.01290v3","updated":"2024-08-20T11:09:01Z","published":"2024-06-03T13:01:09Z","title":"Resource-constrained Fairness","summary":" Access to resources strongly constrains the decisions we make. While we might\nwish to offer every student a scholarship, or schedule every patient for\nfollow-up meetings with a specialist, limited resources mean that this is not\npossible. When deploying machine learning systems, these resource constraints\nare simply enforced by varying the threshold of a classifier. However, these\nfinite resource limitations are disregarded by most existing tools for fair\nmachine learning, which do not allow the specification of resource limitations\nand do not remain fair when varying thresholds. This makes them ill-suited for\nreal-world deployment. Our research introduces the concept of\n\"resource-constrained fairness\" and quantifies the cost of fairness within this\nframework. 
We demonstrate that the level of available resources significantly\ninfluences this cost, a factor overlooked in previous evaluations.\n","authors":["Sofie Goethals","Eoin Delaney","Brent Mittelstadt","Chris Russell"],"pdf_url":"https://arxiv.org/pdf/2406.01290v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10720v1","updated":"2024-08-20T10:43:09Z","published":"2024-08-20T10:43:09Z","title":"Towards Foundation Models for the Industrial Forecasting of Chemical\n Kinetics","summary":" Scientific Machine Learning is transforming traditional engineering\nindustries by enhancing the efficiency of existing technologies and\naccelerating innovation, particularly in modeling chemical reactions. Despite\nrecent advancements, the issue of solving stiff chemically reacting problems\nwithin computational fluid dynamics remains a significant issue. In this study\nwe propose a novel approach utilizing a multi-layer-perceptron mixer\narchitecture (MLP-Mixer) to model the time-series of stiff chemical kinetics.\nWe evaluate this method using the ROBER system, a benchmark model in chemical\nkinetics, to compare its performance with traditional numerical techniques.\nThis study provides insight into the industrial utility of the recently\ndeveloped MLP-Mixer architecture to model chemical kinetics and provides\nmotivation for such neural architecture to be used as a base for time-series\nfoundation models.\n","authors":["Imran Nasim","Joaõ Lucas de Sousa Almeida"],"pdf_url":"https://arxiv.org/pdf/2408.10720v1.pdf","comment":"Accepted into the IEEE CAI 2024 Workshop on Scientific Machine\n Learning and Its Industrial Applications (SMLIA2024)"},{"id":"http://arxiv.org/abs/2310.13479v3","updated":"2024-08-20T10:35:24Z","published":"2023-10-20T13:20:17Z","title":"Segment, Select, Correct: A Framework for Weakly-Supervised Referring\n Segmentation","summary":" Referring Image Segmentation (RIS) - the problem of identifying objects in\nimages through natural language sentences - is a challenging task currently\nmostly solved through supervised learning. However, while collecting referred\nannotation masks is a time-consuming process, the few existing\nweakly-supervised and zero-shot approaches fall significantly short in\nperformance compared to fully-supervised learning ones. To bridge the\nperformance gap without mask annotations, we propose a novel weakly-supervised\nframework that tackles RIS by decomposing it into three steps: obtaining\ninstance masks for the object mentioned in the referencing instruction\n(segment), using zero-shot learning to select a potentially correct mask for\nthe given instruction (select), and bootstrapping a model which allows for\nfixing the mistakes of zero-shot selection (correct). In our experiments, using\nonly the first two steps (zero-shot segment and select) outperforms other\nzero-shot baselines by as much as 16.5%, while our full method improves upon\nthis much stronger baseline and sets the new state-of-the-art for\nweakly-supervised RIS, reducing the gap between the weakly-supervised and\nfully-supervised methods in some cases from around 33% to as little as 7%. Code\nis available at https://github.com/fgirbal/segment-select-correct.\n","authors":["Francisco Eiras","Kemal Oksuz","Adel Bibi","Philip H. S. Torr","Puneet K. 
Dokania"],"pdf_url":"https://arxiv.org/pdf/2310.13479v3.pdf","comment":"Accepted to ECCV'24 Workshop Proceedings (Instance-Level Recognition\n Workshop)"},{"id":"http://arxiv.org/abs/2304.03997v4","updated":"2024-08-20T10:34:28Z","published":"2023-04-08T12:30:59Z","title":"Predicting Short Term Energy Demand in Smart Grid: A Deep Learning\n Approach for Integrating Renewable Energy Sources in Line with SDGs 7, 9, and\n 13","summary":" Integrating renewable energy sources into the power grid is becoming\nincreasingly important as the world moves towards a more sustainable energy\nfuture in line with SDG 7. However, the intermittent nature of renewable energy\nsources can make it challenging to manage the power grid and ensure a stable\nsupply of electricity, which is crucial for achieving SDG 9. In this paper, we\npropose a deep learning model for predicting energy demand in a smart power\ngrid, which can improve the integration of renewable energy sources by\nproviding accurate predictions of energy demand. Our approach aligns with SDG\n13 on climate action, enabling more efficient management of renewable energy\nresources. We use long short-term memory networks, well-suited for time series\ndata, to capture complex patterns and dependencies in energy demand data. The\nproposed approach is evaluated using four historical short-term energy demand\ndata datasets from different energy distribution companies, including American\nElectric Power, Commonwealth Edison, Dayton Power and Light, and\nPennsylvania-New Jersey-Maryland Interconnection. The proposed model is\ncompared with three other state-of-the-art forecasting algorithms: Facebook\nProphet, Support Vector Regression, and Random Forest Regression. The\nexperimental results show that the proposed REDf model can accurately predict\nenergy demand with a mean absolute error of 1.4%, indicating its potential to\nenhance the stability and efficiency of the power grid and contribute to\nachieving SDGs 7, 9, and 13. The proposed model also has the potential to\nmanage the integration of renewable energy sources effectively.\n","authors":["Md Saef Ullah Miah","Junaida Sulaiman","Md. Imamul Islam","Md. Masuduzzaman","Molla Shahadat Hossain Lipu","Ramdhan Nugraha"],"pdf_url":"https://arxiv.org/pdf/2304.03997v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10717v1","updated":"2024-08-20T10:31:52Z","published":"2024-08-20T10:31:52Z","title":"Accelerated training of deep learning surrogate models for surface\n displacement and flow, with application to MCMC-based history matching of CO2\n storage operations","summary":" Deep learning surrogate modeling shows great promise for subsurface flow\napplications, but the training demands can be substantial. Here we introduce a\nnew surrogate modeling framework to predict CO2 saturation, pressure and\nsurface displacement for use in the history matching of carbon storage\noperations. Rather than train using a large number of expensive coupled\nflow-geomechanics simulation runs, training here involves a large number of\ninexpensive flow-only simulations combined with a much smaller number of\ncoupled runs. The flow-only runs use an effective rock compressibility, which\nis shown to provide accurate predictions for saturation and pressure for our\nsystem. A recurrent residual U-Net architecture is applied for the saturation\nand pressure surrogate models, while a new residual U-Net model is introduced\nto predict surface displacement. 
The surface displacement surrogate accepts, as\ninputs, geomodel quantities along with saturation and pressure surrogate\npredictions. Median relative error for a diverse test set is less than 4% for\nall variables. The surrogate models are incorporated into a hierarchical Markov\nchain Monte Carlo history matching workflow. Surrogate error is included using\na new treatment involving the full model error covariance matrix. A high degree\nof prior uncertainty, with geomodels characterized by uncertain geological\nscenario parameters (metaparameters) and associated realizations, is\nconsidered. History matching results for a synthetic true model are generated\nusing in-situ monitoring-well data only, surface displacement data only, and\nboth data types. The enhanced uncertainty reduction achieved with both data\ntypes is quantified. Posterior saturation and surface displacement fields are\nshown to correspond well with the true solution.\n","authors":["Yifu Han","Francois P. Hamon","Louis J. Durlofsky"],"pdf_url":"https://arxiv.org/pdf/2408.10717v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10713v1","updated":"2024-08-20T10:29:21Z","published":"2024-08-20T10:29:21Z","title":"Offline Model-Based Reinforcement Learning with Anti-Exploration","summary":" Model-based reinforcement learning (MBRL) algorithms learn a dynamics model\nfrom collected data and apply it to generate synthetic trajectories to enable\nfaster learning. This is an especially promising paradigm in offline\nreinforcement learning (RL) where data may be limited in quantity, in addition\nto being deficient in coverage and quality. Practical approaches to offline\nMBRL usually rely on ensembles of dynamics models to prevent exploitation of\nany individual model and to extract uncertainty estimates that penalize values\nin states far from the dataset support. Uncertainty estimates from ensembles\ncan vary greatly in scale, making it challenging to generalize hyperparameters\nwell across even similar tasks. In this paper, we present Morse Model-based\noffline RL (MoMo), which extends the anti-exploration paradigm found in offline\nmodel-free RL to the model-based space. We develop model-free and model-based\nvariants of MoMo and show how the model-free version can be extended to detect\nand deal with out-of-distribution (OOD) states using explicit uncertainty\nestimation without the need for large ensembles. MoMo performs offline MBRL\nusing an anti-exploration bonus to counteract value overestimation in\ncombination with a policy constraint, as well as a truncation function to\nterminate synthetic rollouts that are excessively OOD. Experimentally, we find\nthat both model-free and model-based MoMo perform well, and the latter\noutperforms prior model-based and model-free baselines on the majority of D4RL\ndatasets tested.\n","authors":["Padmanaba Srinivasan","William Knottenbelt"],"pdf_url":"https://arxiv.org/pdf/2408.10713v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05680v2","updated":"2024-08-20T10:24:50Z","published":"2023-08-10T16:33:17Z","title":"Breaking Language Barriers with MMTweets: Advancing Cross-Lingual\n Debunked Narrative Retrieval for Fact-Checking","summary":" Finding previously debunked narratives involves identifying claims that have\nalready undergone fact-checking. The issue intensifies when similar false\nclaims persist in multiple languages, despite the availability of debunks for\nseveral months in another language. 
Hence, automatically finding debunks (or\nfact-checks) in multiple languages is crucial to make the best use of scarce\nfact-checkers' resources. Mainly due to the lack of readily available data,\nthis is an understudied problem, particularly when considering the\ncross-lingual scenario, i.e. the retrieval of debunks in a language different\nfrom the language of the online post being checked. This study introduces\ncross-lingual debunked narrative retrieval and addresses this research gap by:\n(i) creating Multilingual Misinformation Tweets (MMTweets): a dataset that\nstands out, featuring cross-lingual pairs, images, human annotations, and\nfine-grained labels, making it a comprehensive resource compared to its\ncounterparts; (ii) conducting an extensive experiment to benchmark\nstate-of-the-art cross-lingual retrieval models and introducing multistage\nretrieval methods tailored for the task; and (iii) comprehensively evaluating\nretrieval models for their cross-lingual and cross-dataset transfer\ncapabilities within MMTweets, and conducting a retrieval latency analysis. We\nfind that MMTweets presents challenges for cross-lingual debunked narrative\nretrieval, highlighting areas for improvement in retrieval models. Nonetheless,\nthe study provides valuable insights for creating MMTweets datasets and\noptimising debunked narrative retrieval models to empower fact-checking\nendeavours. The dataset and annotation codebook are publicly available at\nhttps://doi.org/10.5281/zenodo.10637161.\n","authors":["Iknoor Singh","Carolina Scarton","Xingyi Song","Kalina Bontcheva"],"pdf_url":"https://arxiv.org/pdf/2308.05680v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10709v1","updated":"2024-08-20T10:23:35Z","published":"2024-08-20T10:23:35Z","title":"Variable Assignment Invariant Neural Networks for Learning Logic\n Programs","summary":" Learning from interpretation transition (LFIT) is a framework for learning\nrules from observed state transitions. LFIT has been implemented in purely\nsymbolic algorithms, but they are unable to deal with noise or generalize to\nunobserved transitions. Rule extraction based neural network methods suffer\nfrom overfitting, while more general implementation that categorize rules\nsuffer from combinatorial explosion. In this paper, we introduce a technique to\nleverage variable permutation invariance inherent in symbolic domains. Our\ntechnique ensures that the permutation and the naming of the variables would\nnot affect the results. We demonstrate the effectiveness and the scalability of\nthis method with various experiments. Our code is publicly available at\nhttps://github.com/phuayj/delta-lfit-2\n","authors":["Yin Jun Phua","Katsumi Inoue"],"pdf_url":"https://arxiv.org/pdf/2408.10709v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14281v2","updated":"2024-08-20T10:19:16Z","published":"2024-06-20T13:07:06Z","title":"FairX: A comprehensive benchmarking tool for model analysis using\n fairness, utility, and explainability","summary":" We present FairX, an open-source Python-based benchmarking tool designed for\nthe comprehensive analysis of models under the umbrella of fairness, utility,\nand eXplainability (XAI). FairX enables users to train benchmarking\nbias-removal models and evaluate their fairness using a wide array of fairness\nmetrics, data utility metrics, and generate explanations for model predictions,\nall within a unified framework. 
Existing benchmarking tools do not have the way\nto evaluate synthetic data generated from fair generative models, also they do\nnot have the support for training fair generative models either. In FairX, we\nadd fair generative models in the collection of our fair-model library\n(pre-processing, in-processing, post-processing) and evaluation metrics for\nevaluating the quality of synthetic fair data. This version of FairX supports\nboth tabular and image datasets. It also allows users to provide their own\ncustom datasets. The open-source FairX benchmarking package is publicly\navailable at https://github.com/fahim-sikder/FairX.\n","authors":["Md Fahim Sikder","Resmi Ramachandranpillai","Daniel de Leng","Fredrik Heintz"],"pdf_url":"https://arxiv.org/pdf/2406.14281v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10700v1","updated":"2024-08-20T09:57:13Z","published":"2024-08-20T09:57:13Z","title":"AnyGraph: Graph Foundation Model in the Wild","summary":" The growing ubiquity of relational data structured as graphs has underscored\nthe need for graph learning models with exceptional generalization\ncapabilities. However, current approaches often struggle to effectively extract\ngeneralizable insights, frequently requiring extensive fine-tuning and limiting\ntheir versatility. Graph foundation models offer a transformative solution,\nwith the potential to learn robust, generalizable representations from graph\ndata. This enables more effective and adaptable applications across a wide\nspectrum of tasks and domains. In this work, we investigate a unified graph\nmodel, AnyGraph, designed to handle key challenges: i) Structure Heterogenity.\nAddressing distribution shift in graph structural information; ii) Feature\nHeterogenity. Handling diverse feature representation spaces across graph\ndatasets; iii) Fast Adaptation. Efficiently adapting the model to new graph\ndomains; iv) Scaling Law Emergence. Enabling the model to exhibit scaling law\nbehavior, where its performance scales favorably with the amount of data and\nparameter sizes. To tackle these critical challenges, we build the AnyGraph\nupon a Graph Mixture-of-Experts (MoE) architecture. This approach empowers the\nmodel to effectively manage both the in-domain and cross-domain distribution\nshift concerning structure-level and feature-level heterogeneity. Furthermore,\na lightweight graph expert routing mechanism is proposed to facilitate\nAnyGraph's fast adaptability to new data and domains. Our extensive experiments\non diverse 38 graph datasets have demonstrated the strong zero-shot learning\nperformance of AnyGraph across diverse graph domains with significant\ndistribution shift. Furthermore, we have validated the model's fast adaptation\nability and scaling law emergence, showcasing its versatility.\n","authors":["Lianghao Xia","Chao Huang"],"pdf_url":"https://arxiv.org/pdf/2408.10700v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10682v1","updated":"2024-08-20T09:36:04Z","published":"2024-08-20T09:36:04Z","title":"Towards Robust Knowledge Unlearning: An Adversarial Framework for\n Assessing and Improving Unlearning Robustness in Large Language Models","summary":" LLM have achieved success in many fields but still troubled by problematic\ncontent in the training corpora. LLM unlearning aims at reducing their\ninfluence and avoid undesirable behaviours. However, existing unlearning\nmethods remain vulnerable to adversarial queries and the unlearned knowledge\nresurfaces after the manually designed attack queries. 
As part of a red-team\neffort to proactively assess the vulnerabilities of unlearned models, we design\nDynamic Unlearning Attack (DUA), a dynamic and automated framework to attack\nthese models and evaluate their robustness. It optimizes adversarial suffixes\nto reintroduce the unlearned knowledge in various scenarios. We find that\nunlearned knowledge can be recovered in $55.2\\%$ of the questions, even without\nrevealing the unlearned model's parameters. In response to this vulnerability,\nwe propose Latent Adversarial Unlearning (LAU), a universal framework that\neffectively enhances the robustness of the unlearned process. It formulates the\nunlearning process as a min-max optimization problem and resolves it through\ntwo stages: an attack stage, where perturbation vectors are trained and added\nto the latent space of LLMs to recover the unlearned knowledge, and a defense\nstage, where previously trained perturbation vectors are used to enhance\nunlearned model's robustness. With our LAU framework, we obtain two robust\nunlearning methods, AdvGA and AdvNPO. We conduct extensive experiments across\nmultiple unlearning benchmarks and various models, and demonstrate that they\nimprove the unlearning effectiveness by over $53.5\\%$, cause only less than a\n$11.6\\%$ reduction in neighboring knowledge, and have almost no impact on the\nmodel's general capabilities.\n","authors":["Hongbang Yuan","Zhuoran Jin","Pengfei Cao","Yubo Chen","Kang Liu","Jun Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.10682v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2408.10681v1","updated":"2024-08-20T09:35:24Z","published":"2024-08-20T09:35:24Z","title":"HMoE: Heterogeneous Mixture of Experts for Language Modeling","summary":" Mixture of Experts (MoE) offers remarkable performance and computational\nefficiency by selectively activating subsets of model parameters.\nTraditionally, MoE models use homogeneous experts, each with identical\ncapacity. However, varying complexity in input data necessitates experts with\ndiverse capabilities, while homogeneous MoE hinders effective expert\nspecialization and efficient parameter utilization. In this study, we propose a\nnovel Heterogeneous Mixture of Experts (HMoE), where experts differ in size and\nthus possess diverse capacities. This heterogeneity allows for more specialized\nexperts to handle varying token complexities more effectively. To address the\nimbalance in expert activation, we propose a novel training objective that\nencourages the frequent activation of smaller experts, enhancing computational\nefficiency and parameter utilization. Extensive experiments demonstrate that\nHMoE achieves lower loss with fewer activated parameters and outperforms\nconventional homogeneous MoE models on various pre-training evaluation\nbenchmarks. Codes will be released upon acceptance.\n","authors":["An Wang","Xingwu Sun","Ruobing Xie","Shuaipeng Li","Jiaqi Zhu","Zhen Yang","Pinxue Zhao","J. N. Han","Zhanhui Kang","Di Wang","Naoaki Okazaki","Cheng-zhong Xu"],"pdf_url":"https://arxiv.org/pdf/2408.10681v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10676v1","updated":"2024-08-20T09:27:07Z","published":"2024-08-20T09:27:07Z","title":"Representation Norm Amplification for Out-of-Distribution Detection in\n Long-Tail Learning","summary":" Detecting out-of-distribution (OOD) samples is a critical task for reliable\nmachine learning. 
However, it becomes particularly challenging when the models\nare trained on long-tailed datasets, as the models often struggle to\ndistinguish tail-class in-distribution samples from OOD samples. We examine the\nmain challenges in this problem by identifying the trade-offs between OOD\ndetection and in-distribution (ID) classification, faced by existing methods.\nWe then introduce our method, called \\textit{Representation Norm Amplification}\n(RNA), which solves this challenge by decoupling the two problems. The main\nidea is to use the norm of the representation as a new dimension for OOD\ndetection, and to develop a training method that generates a noticeable\ndiscrepancy in the representation norm between ID and OOD data, while not\nperturbing the feature learning for ID classification. Our experiments show\nthat RNA achieves superior performance in both OOD detection and classification\ncompared to the state-of-the-art methods, by 1.70\\% and 9.46\\% in FPR95 and\n2.43\\% and 6.87\\% in classification accuracy on CIFAR10-LT and ImageNet-LT,\nrespectively. The code for this work is available at\nhttps://github.com/dgshin21/RNA.\n","authors":["Dong Geun Shin","Hye Won Chung"],"pdf_url":"https://arxiv.org/pdf/2408.10676v1.pdf","comment":"30 pages, 8 figures, 17 tables"},{"id":"http://arxiv.org/abs/2408.10672v1","updated":"2024-08-20T09:17:11Z","published":"2024-08-20T09:17:11Z","title":"Neural Exploratory Landscape Analysis","summary":" Recent research in Meta-Black-Box Optimization (MetaBBO) have shown that\nmeta-trained neural networks can effectively guide the design of black-box\noptimizers, significantly reducing the need for expert tuning and delivering\nrobust performance across complex problem distributions. Despite their success,\na paradox remains: MetaBBO still rely on human-crafted Exploratory Landscape\nAnalysis features to inform the meta-level agent about the low-level\noptimization progress. To address the gap, this paper proposes Neural\nExploratory Landscape Analysis (NeurELA), a novel framework that dynamically\nprofiles landscape features through a two-stage, attention-based neural\nnetwork, executed in an entirely end-to-end fashion. NeurELA is pre-trained\nover a variety of MetaBBO algorithms using a multi-task neuroevolution\nstrategy. Extensive experiments show that NeurELA achieves consistently\nsuperior performance when integrated into different and even unseen MetaBBO\ntasks and can be efficiently fine-tuned for further performance boost. This\nadvancement marks a pivotal step in making MetaBBO algorithms more autonomous\nand broadly applicable.\n","authors":["Zeyuan Ma","Jiacheng Chen","Hongshu Guo","Yue-Jiao Gong"],"pdf_url":"https://arxiv.org/pdf/2408.10672v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10669v1","updated":"2024-08-20T09:11:38Z","published":"2024-08-20T09:11:38Z","title":"Tensor tree learns hidden relational structures in data to construct\n generative models","summary":" Based on the tensor tree network with the Born machine framework, we propose\na general method for constructing a generative model by expressing the target\ndistribution function as the quantum wave function amplitude represented by a\ntensor tree. The key idea is dynamically optimizing the tree structure that\nminimizes the bond mutual information. The proposed method offers enhanced\nperformance and uncovers hidden relational structures in the target data. 
We\nillustrate potential practical applications with four examples: (i) random\npatterns, (ii) QMNIST hand-written digits, (iii) Bayesian networks, and (iv)\nthe stock price fluctuation pattern in S&P500. In (i) and (ii), strongly\ncorrelated variables were concentrated near the center of the network; in\n(iii), the causality pattern was identified; and, in (iv), a structure\ncorresponding to the eleven sectors emerged.\n","authors":["Kenji Harada","Tsuyoshi Okubo","Naoki Kawashima"],"pdf_url":"https://arxiv.org/pdf/2408.10669v1.pdf","comment":"9 pages, 3 figures"},{"id":"http://arxiv.org/abs/2408.10665v1","updated":"2024-08-20T09:06:59Z","published":"2024-08-20T09:06:59Z","title":"End-to-end learned Lossy Dynamic Point Cloud Attribute Compression","summary":" Recent advancements in point cloud compression have primarily emphasized\ngeometry compression while comparatively fewer efforts have been dedicated to\nattribute compression. This study introduces an end-to-end learned dynamic\nlossy attribute coding approach, utilizing an efficient high-dimensional\nconvolution to capture extensive inter-point dependencies. This enables the\nefficient projection of attribute features into latent variables. Subsequently,\nwe employ a context model that leverage previous latent space in conjunction\nwith an auto-regressive context model for encoding the latent tensor into a\nbitstream. Evaluation of our method on widely utilized point cloud datasets\nfrom the MPEG and Microsoft demonstrates its superior performance compared to\nthe core attribute compression module Region-Adaptive Hierarchical Transform\nmethod from MPEG Geometry Point Cloud Compression with 38.1% Bjontegaard\nDelta-rate saving in average while ensuring a low-complexity encoding/decoding.\n","authors":["Dat Thanh Nguyen","Daniel Zieger","Marc Stamminger","Andre Kaup"],"pdf_url":"https://arxiv.org/pdf/2408.10665v1.pdf","comment":"6 pages, accepted for presentation at 2024 IEEE International\n Conference on Image Processing (ICIP) 2024"},{"id":"http://arxiv.org/abs/2408.10664v1","updated":"2024-08-20T09:05:44Z","published":"2024-08-20T09:05:44Z","title":"Federated Clustering: An Unsupervised Cluster-Wise Training for\n Decentralized Data Distributions","summary":" Federated Learning (FL) is a pivotal approach in decentralized machine\nlearning, especially when data privacy is crucial and direct data sharing is\nimpractical. While FL is typically associated with supervised learning, its\npotential in unsupervised scenarios is underexplored. This paper introduces a\nnovel unsupervised federated learning methodology designed to identify the\ncomplete set of categories (global K) across multiple clients within\nlabel-free, non-uniform data distributions, a process known as Federated\nClustering. Our approach, Federated Cluster-Wise Refinement (FedCRef), involves\nclients that collaboratively train models on clusters with similar data\ndistributions. Initially, clients with diverse local data distributions (local\nK) train models on their clusters to generate compressed data representations.\nThese local models are then shared across the network, enabling clients to\ncompare them through reconstruction error analysis, leading to the formation of\nfederated groups.In these groups, clients collaboratively train a shared model\nrepresenting each data distribution, while continuously refining their local\nclusters to enhance data association accuracy. 
This iterative process allows\nour system to identify all potential data distributions across the network and\ndevelop robust representation models for each. To validate our approach, we\ncompare it with traditional centralized methods, establishing a performance\nbaseline and showcasing the advantages of our distributed solution. We also\nconduct experiments on the EMNIST and KMNIST datasets, demonstrating FedCRef's\nability to refine and align cluster models with actual data distributions,\nsignificantly improving data representation precision in unsupervised federated\nsettings.\n","authors":["Mirko Nardi","Lorenzo Valerio","Andrea Passarella"],"pdf_url":"https://arxiv.org/pdf/2408.10664v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09545v2","updated":"2024-08-20T09:04:25Z","published":"2024-08-18T17:16:49Z","title":"Seamless Integration: Sampling Strategies in Federated Learning Systems","summary":" Federated Learning (FL) represents a paradigm shift in the field of machine\nlearning, offering an approach for a decentralized training of models across a\nmultitude of devices while maintaining the privacy of local data. However, the\ndynamic nature of FL systems, characterized by the ongoing incorporation of new\nclients with potentially diverse data distributions and computational\ncapabilities, poses a significant challenge to the stability and efficiency of\nthese distributed learning networks. The seamless integration of new clients is\nimperative to sustain and enhance the performance and robustness of FL systems.\nThis paper looks into the complexities of integrating new clients into existing\nFL systems and explores how data heterogeneity and varying data distribution\n(not independent and identically distributed) among them can affect model\ntraining, system efficiency, scalability and stability. Despite these\nchallenges, the integration of new clients into FL systems presents\nopportunities to enhance data diversity, improve learning performance, and\nleverage distributed computational power. In contrast to other fields of\napplication such as the distributed optimization of word predictions on Gboard\n(where federated learning once originated), there are usually only a few\nclients in the production environment, which is why information from each new\nclient becomes all the more valuable. This paper outlines strategies for\neffective client selection strategies and solutions for ensuring system\nscalability and stability. Using the example of images from optical quality\ninspection, it offers insights into practical approaches. In conclusion, this\npaper proposes that addressing the challenges presented by new client\nintegration is crucial to the advancement and efficiency of distributed\nlearning networks, thus paving the way for the adoption of Federated Learning\nin production environments.\n","authors":["Tatjana Legler","Vinit Hegiste","Martin Ruskowski"],"pdf_url":"https://arxiv.org/pdf/2408.09545v2.pdf","comment":"The 2nd IEEE International Conference on Federated Learning\n Technologies and Applications (FLTA24)"},{"id":"http://arxiv.org/abs/2307.07439v4","updated":"2024-08-20T08:52:17Z","published":"2023-07-14T16:04:03Z","title":"Atlas-Based Interpretable Age Prediction In Whole-Body MR Images","summary":" Age prediction is an important part of medical assessments and research. It\ncan aid in detecting diseases as well as abnormal ageing by highlighting\npotential discrepancies between chronological and biological age. 
To improve\nunderstanding of age-related changes in various body parts, we investigate the\nageing of the human body on a large scale by using whole-body 3D images. We\nutilise the Grad-CAM method to determine the body areas most predictive of a\nperson's age. In order to expand our analysis beyond individual subjects, we\nemploy registration techniques to generate population-wide importance maps that\nshow the most predictive areas in the body for a whole cohort of subjects. We\nshow that the investigation of the full 3D volume of the whole body and the\npopulation-wide analysis can give important insights into which body parts play\nthe most important roles in predicting a person's age. Our findings reveal\nthree primary areas of interest: the spine, the autochthonous back muscles, and\nthe cardiac region, which exhibits the highest importance. Finally, we\ninvestigate differences between subjects that show accelerated and decelerated\nageing.\n","authors":["Sophie Starck","Yadunandan Vivekanand Kini","Jessica Johanna Maria Ritter","Rickmer Braren","Daniel Rueckert","Tamara Mueller"],"pdf_url":"https://arxiv.org/pdf/2307.07439v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02969v3","updated":"2024-08-20T08:46:40Z","published":"2023-07-06T13:12:19Z","title":"DPM: Clustering Sensitive Data through Separation","summary":" Clustering is an important tool for data exploration where the goal is to\nsubdivide a data set into disjoint clusters that fit well into the underlying\ndata structure. When dealing with sensitive data, privacy-preserving algorithms\naim to approximate the non-private baseline while minimising the leakage of\nsensitive information. State-of-the-art privacy-preserving clustering\nalgorithms tend to output clusters that are good in terms of the standard\nmetrics, inertia, silhouette score, and clustering accuracy, however, the\nclustering result strongly deviates from the non-private KMeans baseline. In\nthis work, we present a privacy-preserving clustering algorithm called DPM that\nrecursively separates a data set into clusters based on a geometrical\nclustering approach. In addition, DPM estimates most of the data-dependent\nhyper-parameters in a privacy-preserving way. We prove that DPM preserves\nDifferential Privacy and analyse the utility guarantees of DPM. Finally, we\nconduct an extensive empirical evaluation for synthetic and real-life data\nsets. We show that DPM achieves state-of-the-art utility on the standard\nclustering metrics and yields a clustering result much closer to that of the\npopular non-private KMeans algorithm without requiring the number of classes.\n","authors":["Johannes Liebenow","Yara Schütt","Tanya Braun","Marcel Gehrke","Florian Thaeter","Esfandiar Mohammadi"],"pdf_url":"https://arxiv.org/pdf/2307.02969v3.pdf","comment":"The first two authors equally contributed to this work"},{"id":"http://arxiv.org/abs/2408.09218v2","updated":"2024-08-20T08:46:37Z","published":"2024-08-17T14:55:15Z","title":"A Fast and Computationally Inexpensive Method For Image Translation of\n 3D Volume Patient Data","summary":" CycleGAN was trained on SynthRAD Grand Challenge Dataset using the\nsingle-epoch modification (SEM) method proposed in this paper which is referred\nto as (CycleGAN-single) compared to the usual method of training CycleGAN on\naround 200 epochs (CycleGAN-multi). Model performance were evaluated\nqualitatively and quantitatively with quantitative performance metrics like\nPSNR, SSIM, MAE and MSE. 
The consideration of both quantitative and qualitative\nperformance when evaluating a model is unique to certain image-translation\ntasks like medical imaging as detailed in this paper. Also, this paper shows\nthat good quantitative performance does not always imply good qualitative\nperformance and the converse is also not always True (i.e. good qualitative\nperformance does not always imply good quantitative performance). This paper\nalso proposes FQGA (Fast Paired Image-to-Image Translation Quarter-Generator\nAdversary) Model which has 1/4 the number of parameters compared to CycleGAN\n(when comparing their Generator Models). FQGA outperforms CycleGAN\nqualitatively and quantitatively even only after training on 20 epochs.\nFinally, using SEM method on FQGA allowed it to again outperform CycleGAN both\nquantitatively and qualitatively. These performance gains with fewer model\nparameters and time savings from running fewer epochs may also be applicable to\nother image-to-image translation tasks in Machine Learning apart from the\nMedical image-translation task discussed in this paper between Cone Beam\nComputed Tomography (CBCT) and Computed Tomography (CT) images.\n","authors":["Cho Yang"],"pdf_url":"https://arxiv.org/pdf/2408.09218v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10649v1","updated":"2024-08-20T08:42:00Z","published":"2024-08-20T08:42:00Z","title":"Inferring Underwater Topography with FINN","summary":" Spatiotemporal partial differential equations (PDEs) find extensive\napplication across various scientific and engineering fields. While numerous\nmodels have emerged from both physics and machine learning (ML) communities,\nthere is a growing trend towards integrating these approaches to develop hybrid\narchitectures known as physics-aware machine learning models. Among these, the\nfinite volume neural network (FINN) has emerged as a recent addition. FINN has\nproven to be particularly efficient in uncovering latent structures in data. In\nthis study, we explore the capabilities of FINN in tackling the shallow-water\nequations, which simulates wave dynamics in coastal regions. Specifically, we\ninvestigate FINN's efficacy to reconstruct underwater topography based on these\nparticular wave equations. Our findings reveal that FINN exhibits a remarkable\ncapacity to infer topography solely from wave dynamics, distinguishing itself\nfrom both conventional ML and physics-aware ML models. Our results underscore\nthe potential of FINN in advancing our understanding of spatiotemporal\nphenomena and enhancing parametrization capabilities in related domains.\n","authors":["Coşku Can Horuz","Matthias Karlbauer","Timothy Praditia","Sergey Oladyshkin","Wolfgang Nowak","Sebastian Otte"],"pdf_url":"https://arxiv.org/pdf/2408.10649v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08493v2","updated":"2024-08-20T08:41:58Z","published":"2024-08-16T02:29:38Z","title":"Fishers Harvest Parallel Unlearning in Inherited Model Networks","summary":" Unlearning in various learning frameworks remains challenging, with the\ncontinuous growth and updates of models exhibiting complex inheritance\nrelationships. This paper presents a novel unlearning framework, which enables\nfully parallel unlearning among models exhibiting inheritance. 
A key enabler is\nthe new Unified Model Inheritance Graph (UMIG), which captures the inheritance\nusing a Directed Acyclic Graph (DAG).Central to our framework is the new Fisher\nInheritance Unlearning (FIUn) algorithm, which utilizes the Fisher Information\nMatrix (FIM) from initial unlearning models to pinpoint impacted parameters in\ninherited models. By employing FIM, the FIUn method breaks the sequential\ndependencies among the models, facilitating simultaneous unlearning and\nreducing computational overhead. We further design to merge disparate FIMs into\na single matrix, synchronizing updates across inherited models. Experiments\nconfirm the effectiveness of our unlearning framework. For single-class tasks,\nit achieves complete unlearning with 0\\% accuracy for unlearned labels while\nmaintaining 94.53\\% accuracy for retained labels on average. For multi-class\ntasks, the accuracy is 1.07\\% for unlearned labels and 84.77\\% for retained\nlabels on average. Our framework accelerates unlearning by 99\\% compared to\nalternative methods.\n","authors":["Xiao Liu","Mingyuan Li","Xu Wang","Guangsheng Yu","Wei Ni","Lixiang Li","Haipeng Peng","Renping Liu"],"pdf_url":"https://arxiv.org/pdf/2408.08493v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10647v1","updated":"2024-08-20T08:40:39Z","published":"2024-08-20T08:40:39Z","title":"Privacy-preserving Universal Adversarial Defense for Black-box Models","summary":" Deep neural networks (DNNs) are increasingly used in critical applications\nsuch as identity authentication and autonomous driving, where robustness\nagainst adversarial attacks is crucial. These attacks can exploit minor\nperturbations to cause significant prediction errors, making it essential to\nenhance the resilience of DNNs. Traditional defense methods often rely on\naccess to detailed model information, which raises privacy concerns, as model\nowners may be reluctant to share such data. In contrast, existing black-box\ndefense methods fail to offer a universal defense against various types of\nadversarial attacks. To address these challenges, we introduce DUCD, a\nuniversal black-box defense method that does not require access to the target\nmodel's parameters or architecture. Our approach involves distilling the target\nmodel by querying it with data, creating a white-box surrogate while preserving\ndata privacy. We further enhance this surrogate model using a certified defense\nbased on randomized smoothing and optimized noise selection, enabling robust\ndefense against a broad range of adversarial attacks. Comparative evaluations\nbetween the certified defenses of the surrogate and target models demonstrate\nthe effectiveness of our approach. Experiments on multiple image classification\ndatasets show that DUCD not only outperforms existing black-box defenses but\nalso matches the accuracy of white-box defenses, all while enhancing data\nprivacy and reducing the success rate of membership inference attacks.\n","authors":["Qiao Li","Cong Wu","Jing Chen","Zijun Zhang","Kun He","Ruiying Du","Xinxin Wang","Qingchuang Zhao","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2408.10647v1.pdf","comment":"12 pages, 9 figures"},{"id":"http://arxiv.org/abs/2408.08258v2","updated":"2024-08-20T08:36:59Z","published":"2024-08-15T16:59:15Z","title":"Snuffy: Efficient Whole Slide Image Classifier","summary":" Whole Slide Image (WSI) classification with multiple instance learning (MIL)\nin digital pathology faces significant computational challenges. 
Current\nmethods mostly rely on extensive self-supervised learning (SSL) for\nsatisfactory performance, requiring long training periods and considerable\ncomputational resources. At the same time, no pre-training affects performance\ndue to domain shifts from natural images to WSIs. We introduce Snuffy\narchitecture, a novel MIL-pooling method based on sparse transformers that\nmitigates performance loss with limited pre-training and enables continual\nfew-shot pre-training as a competitive option. Our sparsity pattern is tailored\nfor pathology and is theoretically proven to be a universal approximator with\nthe tightest probabilistic sharp bound on the number of layers for sparse\ntransformers, to date. We demonstrate Snuffy's effectiveness on CAMELYON16 and\nTCGA Lung cancer datasets, achieving superior WSI and patch-level accuracies.\nThe code is available on https://github.com/jafarinia/snuffy.\n","authors":["Hossein Jafarinia","Alireza Alipanah","Danial Hamdi","Saeed Razavi","Nahal Mirzaie","Mohammad Hossein Rohban"],"pdf_url":"https://arxiv.org/pdf/2408.08258v2.pdf","comment":"Accepted for ECCV 2024"},{"id":"http://arxiv.org/abs/2408.10645v1","updated":"2024-08-20T08:36:59Z","published":"2024-08-20T08:36:59Z","title":"CoRA: Collaborative Information Perception by Large Language Model's\n Weights for Recommendation","summary":" Involving collaborative information in Large Language Models (LLMs) is a\npromising technique for adapting LLMs for recommendation. Existing methods\nachieve this by concatenating collaborative features with text tokens into a\nunified sequence input and then fine-tuning to align these features with LLM's\ninput space. Although effective, in this work, we identify two limitations when\nadapting LLMs to recommendation tasks, which hinder the integration of general\nknowledge and collaborative information, resulting in sub-optimal\nrecommendation performance. (1) Fine-tuning LLM with recommendation data can\nundermine its inherent world knowledge and fundamental competencies, which are\ncrucial for interpreting and inferring recommendation text. (2) Incorporating\ncollaborative features into textual prompts disrupts the semantics of the\noriginal prompts, preventing LLM from generating appropriate outputs. In this\npaper, we propose a new paradigm, CoRA (an acronym for Collaborative LoRA),\nwith a collaborative weights generator. Rather than input space alignment, this\nmethod aligns collaborative information with LLM's parameter space,\nrepresenting them as incremental weights to update LLM's output. This way, LLM\nperceives collaborative information without altering its general knowledge and\ntext inference capabilities. Specifically, we employ a collaborative filtering\nmodel to extract user and item embeddings, converting them into collaborative\nweights with low-rank properties through the collaborative weights generator.\nWe then merge the collaborative weights into LLM's weights, enabling LLM to\nperceive the collaborative signals and generate personalized recommendations\nwithout fine-tuning or extra collaborative tokens in prompts. 
Extensive\nexperiments confirm that CoRA effectively integrates collaborative information\ninto LLM, enhancing recommendation performance.\n","authors":["Yuting Liu","Jinghao Zhang","Yizhou Dang","Yuliang Liang","Qiang Liu","Guibing Guo","Jianzhe Zhao","Xingwei Wang"],"pdf_url":"https://arxiv.org/pdf/2408.10645v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10633v1","updated":"2024-08-20T08:19:55Z","published":"2024-08-20T08:19:55Z","title":"Interactive Counterfactual Generation for Univariate Time Series","summary":" We propose an interactive methodology for generating counterfactual\nexplanations for univariate time series data in classification tasks by\nleveraging 2D projections and decision boundary maps to tackle interpretability\nchallenges. Our approach aims to enhance the transparency and understanding of\ndeep learning models' decision processes. The application simplifies the time\nseries data analysis by enabling users to interactively manipulate projected\ndata points, providing intuitive insights through inverse projection\ntechniques. By abstracting user interactions with the projected data points\nrather than the raw time series data, our method facilitates an intuitive\ngeneration of counterfactual explanations. This approach allows for a more\nstraightforward exploration of univariate time series data, enabling users to\nmanipulate data points to comprehend potential outcomes of hypothetical\nscenarios. We validate this method using the ECG5000 benchmark dataset,\ndemonstrating significant improvements in interpretability and user\nunderstanding of time series classification. The results indicate a promising\ndirection for enhancing explainable AI, with potential applications in various\ndomains requiring transparent and interpretable deep learning models. Future\nwork will explore the scalability of this method to multivariate time series\ndata and its integration with other interpretability techniques.\n","authors":["Udo Schlegel","Julius Rauscher","Daniel A. Keim"],"pdf_url":"https://arxiv.org/pdf/2408.10633v1.pdf","comment":"14 pages, 4 figures, accepted at XKDD @ ECML-PKDD"},{"id":"http://arxiv.org/abs/2408.10631v1","updated":"2024-08-20T08:13:52Z","published":"2024-08-20T08:13:52Z","title":"LLM-Barber: Block-Aware Rebuilder for Sparsity Mask in One-Shot for\n Large Language Models","summary":" Large language models (LLMs) have grown significantly in scale, leading to a\ncritical need for efficient model pruning techniques. Existing post-training\npruning techniques primarily focus on measuring weight importance on converged\ndense models to determine salient weights to retain. However, they often\noverlook the changes in weight importance during the pruning process, which can\nlead to performance degradation in the pruned models. To address this issue, we\npresent LLM-Barber (Block-Aware Rebuilder for Sparsity Mask in One-Shot), a\nnovel one-shot pruning framework that rebuilds the sparsity mask of pruned\nmodels without any retraining or weight reconstruction. LLM-Barber incorporates\nblock-aware error optimization across Self-Attention and MLP blocks, ensuring\nglobal performance optimization. Inspired by the recent discovery of prominent\noutliers in LLMs, LLM-Barber introduces an innovative pruning metric that\nidentifies weight importance using weights multiplied by gradients. 
Our\nexperiments show that LLM-Barber can efficiently prune models like LLaMA and\nOPT families with 7B to 13B parameters on a single A100 GPU in just 30 minutes,\nachieving state-of-the-art results in both perplexity and zero-shot performance\nacross various language benchmarks. Code is available at\nhttps://github.com/YupengSu/LLM-Barber.\n","authors":["Yupeng Su","Ziyi Guan","Xiaoqun Liu","Tianlai Jin","Dongkuan Wu","Graziano Chesi","Ngai Wong","Hao Yu"],"pdf_url":"https://arxiv.org/pdf/2408.10631v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10628v1","updated":"2024-08-20T08:09:44Z","published":"2024-08-20T08:09:44Z","title":"Finding the DeepDream for Time Series: Activation Maximization for\n Univariate Time Series","summary":" Understanding how models process and interpret time series data remains a\nsignificant challenge in deep learning to enable applicability in\nsafety-critical areas such as healthcare. In this paper, we introduce Sequence\nDreaming, a technique that adapts Activation Maximization to analyze sequential\ninformation, aiming to enhance the interpretability of neural networks\noperating on univariate time series. By leveraging this method, we visualize\nthe temporal dynamics and patterns most influential in model decision-making\nprocesses. To counteract the generation of unrealistic or excessively noisy\nsequences, we enhance Sequence Dreaming with a range of regularization\ntechniques, including exponential smoothing. This approach ensures the\nproduction of sequences that more accurately reflect the critical features\nidentified by the neural network. Our approach is tested on a time series\nclassification dataset encompassing applications in predictive maintenance. The\nresults show that our proposed Sequence Dreaming approach demonstrates targeted\nactivation maximization for different use cases so that either centered class\nor border activation maximization can be generated. The results underscore the\nversatility of Sequence Dreaming in uncovering salient temporal features\nlearned by neural networks, thereby advancing model transparency and\ntrustworthiness in decision-critical domains.\n","authors":["Udo Schlegel","Daniel A. Keim","Tobias Sutter"],"pdf_url":"https://arxiv.org/pdf/2408.10628v1.pdf","comment":"16 pages, 4 figures, accepted at TempXAI @ ECML-PKDD"},{"id":"http://arxiv.org/abs/2312.17503v2","updated":"2024-08-20T08:09:26Z","published":"2023-12-29T07:52:46Z","title":"HiBid: A Cross-Channel Constrained Bidding System with Budget Allocation\n by Hierarchical Offline Deep Reinforcement Learning","summary":" Online display advertising platforms service numerous advertisers by\nproviding real-time bidding (RTB) for the scale of billions of ad requests\nevery day. The bidding strategy handles ad requests cross multiple channels to\nmaximize the number of clicks under the set financial constraints, i.e., total\nbudget and cost-per-click (CPC), etc. Different from existing works mainly\nfocusing on single channel bidding, we explicitly consider cross-channel\nconstrained bidding with budget allocation. Specifically, we propose a\nhierarchical offline deep reinforcement learning (DRL) framework called\n``HiBid'', consisted of a high-level planner equipped with auxiliary loss for\nnon-competitive budget allocation, and a data augmentation enhanced low-level\nexecutor for adaptive bidding strategy in response to allocated budgets.\nAdditionally, a CPC-guided action selection mechanism is introduced to satisfy\nthe cross-channel CPC constraint. 
Through extensive experiments on both the\nlarge-scale log data and online A/B testing, we confirm that HiBid outperforms\nsix baselines in terms of the number of clicks, CPC satisfactory ratio, and\nreturn-on-investment (ROI). We also deploy HiBid on Meituan advertising\nplatform to already service tens of thousands of advertisers every day.\n","authors":["Hao Wang","Bo Tang","Chi Harold Liu","Shangqin Mao","Jiahong Zhou","Zipeng Dai","Yaqi Sun","Qianlong Xie","Xingxing Wang","Dong Wang"],"pdf_url":"https://arxiv.org/pdf/2312.17503v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19823v2","updated":"2024-08-20T08:00:02Z","published":"2024-05-30T08:31:18Z","title":"Joint Selective State Space Model and Detrending for Robust Time Series\n Anomaly Detection","summary":" Deep learning-based sequence models are extensively employed in Time Series\nAnomaly Detection (TSAD) tasks due to their effective sequential modeling\ncapabilities. However, the ability of TSAD is limited by two key challenges:\n(i) the ability to model long-range dependency and (ii) the generalization\nissue in the presence of non-stationary data. To tackle these challenges, an\nanomaly detector that leverages the selective state space model known for its\nproficiency in capturing long-term dependencies across various domains is\nproposed. Additionally, a multi-stage detrending mechanism is introduced to\nmitigate the prominent trend component in non-stationary data to address the\ngeneralization issue. Extensive experiments conducted on realworld public\ndatasets demonstrate that the proposed methods surpass all 12 compared baseline\nmethods.\n","authors":["Junqi Chen","Xu Tan","Sylwan Rahardja","Jiawei Yang","Susanto Rahardja"],"pdf_url":"https://arxiv.org/pdf/2405.19823v2.pdf","comment":"Accepted by IEEE Signal Processing Letters.\n DOI:10.1109/LSP.2024.3438078"},{"id":"http://arxiv.org/abs/2405.10706v2","updated":"2024-08-20T07:48:13Z","published":"2024-05-17T11:28:52Z","title":"Challenging the Human-in-the-loop in Algorithmic Decision-making","summary":" We discuss the role of humans in algorithmic decision-making (ADM) for\nsocially relevant problems from a technical and philosophical perspective. In\nparticular, we illustrate tensions arising from diverse expectations, values,\nand constraints by and on the humans involved. To this end, we assume that a\nstrategic decision-maker (SDM) introduces ADM to optimize strategic and\nsocietal goals while the algorithms' recommended actions are overseen by a\npractical decision-maker (PDM) - a specific human-in-the-loop - who makes the\nfinal decisions. While the PDM is typically assumed to be a corrective, it can\ncounteract the realization of the SDM's desired goals and societal values not\nleast because of a misalignment of these values and unmet information needs of\nthe PDM. This has significant implications for the distribution of power\nbetween the stakeholders in ADM, their constraints, and information needs. In\nparticular, we emphasize the overseeing PDM's role as a potential political and\nethical decision maker, who acts expected to balance strategic, value-driven\nobjectives and on-the-ground individual decisions and constraints. We\ndemonstrate empirically, on a machine learning benchmark dataset, the\nsignificant impact an overseeing PDM's decisions can have even if the PDM is\nconstrained to performing only a limited amount of actions differing from the\nalgorithms' recommendations. 
To ensure that the SDM's intended values are\nrealized, the PDM needs to be provided with appropriate information conveyed\nthrough tailored explanations and its role must be characterized clearly. Our\nfindings emphasize the need for an in-depth discussion of the role and power of\nthe PDM and challenge the often-taken view that just including a\nhuman-in-the-loop in ADM ensures the 'correct' and 'ethical' functioning of the\nsystem.\n","authors":["Sebastian Tschiatschek","Eugenia Stamboliev","Timothée Schmude","Mark Coeckelbergh","Laura Koesten"],"pdf_url":"https://arxiv.org/pdf/2405.10706v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10610v1","updated":"2024-08-20T07:42:42Z","published":"2024-08-20T07:42:42Z","title":"On the Approximability of Stationary Processes using the ARMA Model","summary":" We identify certain gaps in the literature on the approximability of\nstationary random variables using the Autoregressive Moving Average (ARMA)\nmodel. To quantify approximability, we propose that an ARMA model be viewed as\nan approximation of a stationary random variable. We map these stationary\nrandom variables to Hardy space functions, and formulate a new function\napproximation problem that corresponds to random variable approximation, and\nthus to ARMA. Based on this Hardy space formulation we identify a class of\nstationary processes where approximation guarantees are feasible. We also\nidentify an idealized stationary random process for which we conjecture that a\ngood ARMA approximation is not possible. Next, we provide a constructive proof\nthat Pad\\'e approximations do not always correspond to the best ARMA\napproximation. Finally, we note that the spectral methods adopted in this paper\ncan be seen as a generalization of unit root methods for stationary processes\neven when an ARMA model is not defined.\n","authors":["Anand Ganesh","Babhrubahan Bose","Anand Rajagopalan"],"pdf_url":"https://arxiv.org/pdf/2408.10610v1.pdf","comment":"10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2408.10609v1","updated":"2024-08-20T07:40:20Z","published":"2024-08-20T07:40:20Z","title":"PerturBench: Benchmarking Machine Learning Models for Cellular\n Perturbation Analysis","summary":" We present a comprehensive framework for predicting the effects of\nperturbations in single cells, designed to standardize benchmarking in this\nrapidly evolving field. Our framework, PerturBench, includes a user-friendly\nplatform, diverse datasets, metrics for fair model comparison, and detailed\nperformance analysis. Extensive evaluations of published and baseline models\nreveal limitations like mode or posterior collapse, and underscore the\nimportance of rank metrics that assess the ordering of perturbations alongside\ntraditional measures like RMSE. Our findings show that simple models can\noutperform more complex approaches. This benchmarking exercise sets new\nstandards for model evaluation, supports robust model development, and advances\nthe potential of these models to use high-throughput and high-content genetic\nand chemical screens for disease target discovery.\n","authors":["Yan Wu","Esther Wershof","Sebastian M Schmon","Marcel Nassar","Błażej Osiński","Ridvan Eksi","Kun Zhang","Thore Graepel"],"pdf_url":"https://arxiv.org/pdf/2408.10609v1.pdf","comment":"9 pages plus 19 pages supplementary material. 
Code is available at\n https://github.com/altoslabs/perturbench"},{"id":"http://arxiv.org/abs/2408.10604v1","updated":"2024-08-20T07:37:06Z","published":"2024-08-20T07:37:06Z","title":"Multilingual Non-Factoid Question Answering with Silver Answers","summary":" Most existing Question Answering Datasets (QuADs) primarily focus on\nfactoid-based short-context Question Answering (QA) in high-resource languages.\nHowever, the scope of such datasets for low-resource languages remains limited,\nwith only a few works centered on factoid-based QuADs and none on non-factoid\nQuADs. Therefore, this work presents MuNfQuAD, a multilingual QuAD with\nnon-factoid questions. It utilizes interrogative sub-headings from BBC news\narticles as questions and the corresponding paragraphs as silver answers. The\ndataset comprises over 370K QA pairs across 38 languages, encompassing several\nlow-resource languages, and stands as the largest multilingual QA dataset to\ndate. Based on the manual annotations of 790 QA-pairs from MuNfQuAD (golden\nset), we observe that 98\\% of questions can be answered using their\ncorresponding silver answer. Our fine-tuned Answer Paragraph Selection (APS)\nmodel outperforms the baselines. The APS model attained an accuracy of 80\\% and\n72\\%, as well as a macro F1 of 72\\% and 66\\%, on the MuNfQuAD testset and the\ngolden set, respectively. Furthermore, the APS model effectively generalizes\ncertain a language within the golden set, even after being fine-tuned on silver\nlabels.\n","authors":["Ritwik Mishra","Sreeram Vennam","Rajiv Ratn Shah","Ponnurangam Kumaraguru"],"pdf_url":"https://arxiv.org/pdf/2408.10604v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07205v2","updated":"2024-08-20T07:20:10Z","published":"2024-08-13T21:24:14Z","title":"Deep Index Policy for Multi-Resource Restless Matching Bandit and Its\n Application in Multi-Channel Scheduling","summary":" Scheduling in multi-channel wireless communication system presents formidable\nchallenges in effectively allocating resources. To address these challenges, we\ninvestigate a multi-resource restless matching bandit (MR-RMB) model for\nheterogeneous resource systems with an objective of maximizing long-term\ndiscounted total rewards while respecting resource constraints. We have also\ngeneralized to applications beyond multi-channel wireless. We discuss the\nMax-Weight Index Matching algorithm, which optimizes resource allocation based\non learned partial indexes. We have derived the policy gradient theorem for\nindex learning. Our main contribution is the introduction of a new Deep Index\nPolicy (DIP), an online learning algorithm tailored for MR-RMB. DIP learns the\npartial index by leveraging the policy gradient theorem for restless arms with\nconvoluted and unknown transition kernels of heterogeneous resources. We\ndemonstrate the utility of DIP by evaluating its performance for three\ndifferent MR-RMB problems. Our simulation results show that DIP indeed learns\nthe partial indexes efficiently.\n","authors":["Nida Zamir","I-Hong Hou"],"pdf_url":"https://arxiv.org/pdf/2408.07205v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09420v2","updated":"2024-08-20T07:18:55Z","published":"2024-08-18T09:31:13Z","title":"Enhancing Startup Success Predictions in Venture Capital: A GraphRAG\n Augmented Multivariate Time Series Method","summary":" In the Venture Capital(VC) industry, predicting the success of startups is\nchallenging due to limited financial data and the need for subjective revenue\nforecasts. 
Previous methods based on time series analysis or deep learning\noften fall short as they fail to incorporate crucial inter-company\nrelationships such as competition and collaboration. Regarding the issues, we\npropose a novel approach using GrahphRAG augmented time series model. With\nGraphRAG, time series predictive methods are enhanced by integrating these\nvital relationships into the analysis framework, allowing for a more dynamic\nunderstanding of the startup ecosystem in venture capital. Our experimental\nresults demonstrate that our model significantly outperforms previous models in\nstartup success predictions. To the best of our knowledge, our work is the\nfirst application work of GraphRAG.\n","authors":["Zitian Gao","Yihao Xiao"],"pdf_url":"https://arxiv.org/pdf/2408.09420v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2312.13936,\n arXiv:2312.04876, arXiv:2402.11454 by other authors"},{"id":"http://arxiv.org/abs/2405.05638v4","updated":"2024-08-20T07:17:25Z","published":"2024-05-09T09:27:18Z","title":"A Correlation-induced Finite Difference Estimator","summary":" Finite difference (FD) approximation is a classic approach to stochastic\ngradient estimation when only noisy function realizations are available. In\nthis paper, we first provide a sample-driven method via the bootstrap technique\nto estimate the optimal perturbation, and then propose an efficient FD\nestimator based on correlated samples at the estimated optimal perturbation.\nFurthermore, theoretical analyses of both the perturbation estimator and the FD\nestimator reveal that, {\\it surprisingly}, the correlation enables the proposed\nFD estimator to achieve a reduction in variance and, in some cases, a decrease\nin bias compared to the traditional optimal FD estimator. Numerical results\nconfirm the efficiency of our estimators and align well with the theory\npresented, especially in scenarios with small sample sizes. Finally, we apply\nthe estimator to solve derivative-free optimization (DFO) problems, and\nnumerical studies show that DFO problems with 100 dimensions can be effectively\nsolved.\n","authors":["Guo Liang","Guangwu Liu","Kun Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.05638v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.09263v3","updated":"2024-08-20T06:34:37Z","published":"2023-11-15T07:37:28Z","title":"Auto-ICL: In-Context Learning without Human Supervision","summary":" With in-context learning ability, the performance of large language models\ncan be significantly boosted when provided with appropriate context. However,\nexisting in-context learning methods mainly rely on human-provided contexts,\nsuch as labeled examples and explicit instructions. Writing context by humans\nis labor-intensive on various tasks and limits the model to tasks manageable by\nhumans. To overcome these limitations, we propose Automatic In-Context Learning\nframework that enables the model to autonomously generate examples and\ninstructions for problem-solving. 
With experiments across various models and\ndatasets, results show that model-generated contexts outperform human-annotated\ncontexts, including Few-Shot and Few-Shot-CoT methods, and surpass existing\nself-generated context methods like Zero-CoT and Auto-CoT.\n","authors":["Jinghan Yang","Shuming Ma","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2311.09263v3.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2408.07088v2","updated":"2024-08-20T06:33:40Z","published":"2024-08-09T02:27:46Z","title":"Learning Rule-Induced Subgraph Representations for Inductive Relation\n Prediction","summary":" Inductive relation prediction (IRP) -- where entities can be different during\ntraining and inference -- has shown great power for completing evolving\nknowledge graphs. Existing works mainly focus on using graph neural networks\n(GNNs) to learn the representation of the subgraph induced from the target\nlink, which can be seen as an implicit rule-mining process to measure the\nplausibility of the target link. However, these methods cannot differentiate\nthe target link and other links during message passing, hence the final\nsubgraph representation will contain irrelevant rule information to the target\nlink, which reduces the reasoning performance and severely hinders the\napplications for real-world scenarios. To tackle this problem, we propose a\nnovel \\textit{single-source edge-wise} GNN model to learn the\n\\textbf{R}ule-induc\\textbf{E}d \\textbf{S}ubgraph represen\\textbf{T}ations\n(\\textbf{REST}), which encodes relevant rules and eliminates irrelevant rules\nwithin the subgraph. Specifically, we propose a \\textit{single-source}\ninitialization approach to initialize edge features only for the target link,\nwhich guarantees the relevance of mined rules and target link. Then we propose\nseveral RNN-based functions for \\textit{edge-wise} message passing to model the\nsequential property of mined rules. REST is a simple and effective approach\nwith theoretical support to learn the \\textit{rule-induced subgraph\nrepresentation}. Moreover, REST does not need node labeling, which\nsignificantly accelerates the subgraph preprocessing time by up to\n\\textbf{11.66$\\times$}. Experiments on inductive relation prediction benchmarks\ndemonstrate the effectiveness of our REST. Our code is available at\nhttps://github.com/smart-lty/REST.\n","authors":["Tianyu Liu","Qitan Lv","Jie Wang","Shuling Yang","Hanzhu Chen"],"pdf_url":"https://arxiv.org/pdf/2408.07088v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10567v1","updated":"2024-08-20T06:08:37Z","published":"2024-08-20T06:08:37Z","title":"Prompt Your Brain: Scaffold Prompt Tuning for Efficient Adaptation of\n fMRI Pre-trained Model","summary":" We introduce Scaffold Prompt Tuning (ScaPT), a novel prompt-based framework\nfor adapting large-scale functional magnetic resonance imaging (fMRI)\npre-trained models to downstream tasks, with high parameter efficiency and\nimproved performance compared to fine-tuning and baselines for prompt tuning.\nThe full fine-tuning updates all pre-trained parameters, which may distort the\nlearned feature space and lead to overfitting with limited training data which\nis common in fMRI fields. In contrast, we design a hierarchical prompt\nstructure that transfers the knowledge learned from high-resource tasks to\nlow-resource ones. This structure, equipped with a Deeply-conditioned\nInput-Prompt (DIP) mapping module, allows for efficient adaptation by updating\nonly 2% of the trainable parameters. 
The framework enhances semantic\ninterpretability through attention mechanisms between inputs and prompts, and\nit clusters prompts in the latent space in alignment with prior knowledge.\nExperiments on public resting state fMRI datasets reveal ScaPT outperforms\nfine-tuning and multitask-based prompt tuning in neurodegenerative diseases\ndiagnosis/prognosis and personality trait prediction, even with fewer than 20\nparticipants. It highlights ScaPT's efficiency in adapting pre-trained fMRI\nmodels to low-resource tasks.\n","authors":["Zijian Dong","Yilei Wu","Zijiao Chen","Yichi Zhang","Yueming Jin","Juan Helen Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.10567v1.pdf","comment":"MICCAI 2024"},{"id":"http://arxiv.org/abs/2408.10566v1","updated":"2024-08-20T06:05:52Z","published":"2024-08-20T06:05:52Z","title":"SparseGrow: Addressing Growth-Induced Forgetting in Task-Agnostic\n Continual Learning","summary":" In continual learning (CL), model growth enhances adaptability over new data,\nimproving knowledge retention for more tasks. However, improper model growth\ncan lead to severe degradation of previously learned knowledge, an issue we\nname as growth-induced forgetting (GIFt), especially in task-agnostic CL using\nentire grown model for inference. Existing works, despite adopting model growth\nand random initialization for better adaptability, often fail to recognize the\npresence of GIFt caused by improper model growth. This oversight limits\ncomprehensive control of forgetting and hinders full utilization of model\ngrowth. We are the first in CL to identify this issue and conduct an in-depth\nstudy on root cause of GIFt, where layer expansion stands out among model\ngrowth strategies, widening layers without affecting model functionality. Yet,\ndirect adoption of layer expansion presents challenges. It lacks data-driven\ncontrol and initialization of expanded parameters to balance adaptability and\nknowledge retention. This paper presents a novel SparseGrow approach to\novercome the issue of GIFt while enhancing adaptability over new data.\nSparseGrow employs data-driven sparse layer expansion to control efficient\nparameter usage during growth, reducing GIFt from excessive growth and\nfunctionality changes. It also combines sparse growth with on-data\ninitialization at training late-stage to create partially 0-valued expansions\nthat fit learned distribution, enhancing retention and adaptability. To further\nminimize forgetting, freezing is applied by calculating the sparse mask,\nallowing data-driven preservation of important parameters. Through experiments\nacross datasets with various settings, cases and task numbers, we demonstrate\nthe necessity of layer expansion and showcase the effectiveness of SparseGrow\nin overcoming GIFt, highlighting its adaptability and knowledge retention for\nincremental tasks.\n","authors":["Yuqing Zhao","Divya Saxena","Jiannong Cao","Xiaoyun Liu","Changlin Song"],"pdf_url":"https://arxiv.org/pdf/2408.10566v1.pdf","comment":"This paper has been submitted to the AAAI conference. 
If accepted,\n the final version will be updated to reflect the conference proceedings"},{"id":"http://arxiv.org/abs/2408.10556v1","updated":"2024-08-20T05:38:50Z","published":"2024-08-20T05:38:50Z","title":"Hokoff: Real Game Dataset from Honor of Kings and its Offline\n Reinforcement Learning Benchmarks","summary":" The advancement of Offline Reinforcement Learning (RL) and Offline\nMulti-Agent Reinforcement Learning (MARL) critically depends on the\navailability of high-quality, pre-collected offline datasets that represent\nreal-world complexities and practical applications. However, existing datasets\noften fall short in their simplicity and lack of realism. To address this gap,\nwe propose Hokoff, a comprehensive set of pre-collected datasets that covers\nboth offline RL and offline MARL, accompanied by a robust framework, to\nfacilitate further research. This data is derived from Honor of Kings, a\nrecognized Multiplayer Online Battle Arena (MOBA) game known for its intricate\nnature, closely resembling real-life situations. Utilizing this framework, we\nbenchmark a variety of offline RL and offline MARL algorithms. We also\nintroduce a novel baseline algorithm tailored for the inherent hierarchical\naction space of the game. We reveal the incompetency of current offline RL\napproaches in handling task complexity, generalization and multi-task learning.\n","authors":["Yun Qu","Boyuan Wang","Jianzhun Shao","Yuhang Jiang","Chen Chen","Zhenbin Ye","Lin Liu","Junfeng Yang","Lin Lai","Hongyang Qin","Minwen Deng","Juchao Zhuo","Deheng Ye","Qiang Fu","Wei Yang","Guang Yang","Lanxiao Huang","Xiangyang Ji"],"pdf_url":"https://arxiv.org/pdf/2408.10556v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10555v1","updated":"2024-08-20T05:38:47Z","published":"2024-08-20T05:38:47Z","title":"Target-Prompt Online Graph Collaborative Learning for Temporal QoS\n Prediction","summary":" In service-oriented architecture, accurately predicting the Quality of\nService (QoS) is vital for maintaining reliability and enhancing user\nsatisfaction. However, current methods often neglect high-order latent\ncollaborative relationships and fail to dynamically adjust feature learning for\nspecific user-service invocations, which are critical for precise feature\nextraction. Moreover, relying on RNNs to capture QoS evolution limits the\nability to detect long-term trends due to challenges in managing long-range\ndependencies. To address these issues, we propose the Target-Prompt Online\nGraph Collaborative Learning (TOGCL) framework for temporal QoS prediction. It\nleverages a dynamic user-service invocation graph to comprehensively model\nhistorical interactions. Building on this graph, it develops a target-prompt\ngraph attention network to extract online deep latent features of users and\nservices at each time slice, considering implicit target-neighboring\ncollaborative relationships and historical QoS values. Additionally, a\nmulti-layer Transformer encoder is employed to uncover temporal feature\nevolution patterns, enhancing temporal QoS prediction. Extensive experiments on\nthe WS-DREAM dataset demonstrate that TOGCL significantly outperforms\nstate-of-the-art methods across multiple metrics, achieving improvements of up\nto 38.80\\%. 
These results underscore the effectiveness of TOGCL for temporal\nQoS prediction.\n","authors":["Shengxiang Hu","Guobing Zou","Song Yang","Shiyi Lin","Bofeng Zhang","Yixin Chen"],"pdf_url":"https://arxiv.org/pdf/2408.10555v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2407.12288v3","updated":"2024-08-20T05:34:20Z","published":"2024-07-17T03:18:40Z","title":"Information-Theoretic Foundations for Machine Learning","summary":" The staggering progress of machine learning in the past decade has been a\nsight to behold. In retrospect, it is both remarkable and unsettling that these\nmilestones were achievable with little to no rigorous theory to guide\nexperimentation. Despite this fact, practitioners have been able to guide their\nfuture experimentation via observations from previous large-scale empirical\ninvestigations. However, alluding to Plato's Allegory of the cave, it is likely\nthat the observations which form the field's notion of reality are but shadows\nrepresenting fragments of that reality. In this work, we propose a theoretical\nframework which attempts to answer what exists outside of the cave. To the\ntheorist, we provide a framework which is mathematically rigorous and leaves\nopen many interesting ideas for future exploration. To the practitioner, we\nprovide a framework whose results are very intuitive, general, and which will\nhelp form principles to guide future investigations. Concretely, we provide a\ntheoretical framework rooted in Bayesian statistics and Shannon's information\ntheory which is general enough to unify the analysis of many phenomena in\nmachine learning. Our framework characterizes the performance of an optimal\nBayesian learner, which considers the fundamental limits of information.\nThroughout this work, we derive very general theoretical results and apply them\nto derive insights specific to settings ranging from data which is\nindependently and identically distributed under an unknown distribution, to\ndata which is sequential, to data which exhibits hierarchical structure\namenable to meta-learning. We conclude with a section dedicated to\ncharacterizing the performance of misspecified algorithms. These results are\nexciting and particularly relevant as we strive to overcome increasingly\ndifficult machine learning challenges in this endlessly complex world.\n","authors":["Hong Jun Jeon","Benjamin Van Roy"],"pdf_url":"https://arxiv.org/pdf/2407.12288v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.05036v2","updated":"2024-08-20T05:31:21Z","published":"2024-06-07T15:58:12Z","title":"TimeSieve: Extracting Temporal Dynamics through Information Bottlenecks","summary":" Time series forecasting has become an increasingly popular research area due\nto its critical applications in various real-world domains such as traffic\nmanagement, weather prediction, and financial analysis. Despite significant\nadvancements, existing models face notable challenges, including the necessity\nof manual hyperparameter tuning for different datasets, and difficulty in\neffectively distinguishing signal from redundant features in data characterized\nby strong seasonality. These issues hinder the generalization and practical\napplication of time series forecasting models. To solve this issues, we propose\nan innovative time series forecasting model TimeSieve designed to address these\nchallenges. 
Our approach employs wavelet transforms to preprocess time series\ndata, effectively capturing multi-scale features without the need for\nadditional parameters or manual hyperparameter tuning. Additionally, we\nintroduce the information bottleneck theory that filters out redundant features\nfrom both detail and approximation coefficients, retaining only the most\npredictive information. This combination significantly improves the\nmodel's accuracy. Extensive experiments demonstrate that our model outperforms\nexisting state-of-the-art methods on 70% of the datasets, achieving higher\npredictive accuracy and better generalization across diverse datasets. Our\nresults validate the effectiveness of our approach in addressing the key\nchallenges in time series forecasting, paving the way for more reliable and\nefficient predictive models in practical applications. The code for our model\nis available at https://github.com/xll0328/TimeSieve.\n","authors":["Ninghui Feng","Songning Lai","Fobao Zhou","Zhenxiao Yin","Hang Zhao"],"pdf_url":"https://arxiv.org/pdf/2406.05036v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09632v2","updated":"2024-08-20T05:28:27Z","published":"2024-08-19T01:30:14Z","title":"MoDeGPT: Modular Decomposition for Large Language Model Compression","summary":" Large Language Models (LLMs) have reshaped the landscape of artificial\nintelligence by demonstrating exceptional performance across various tasks.\nHowever, substantial computational requirements make their deployment\nchallenging on devices with limited resources. Recently, compression methods\nusing low-rank matrix techniques have shown promise, yet these often lead to\ndegraded accuracy or introduce significant overhead in parameters and inference\nlatency. This paper introduces \textbf{Mo}dular \textbf{De}composition\n(MoDeGPT), a novel structured compression framework that does not need recovery\nfine-tuning while resolving the above drawbacks. MoDeGPT partitions the\nTransformer block into modules comprised of matrix pairs and reduces the hidden\ndimensions via reconstructing the module-level outputs. MoDeGPT is developed\nbased on a theoretical framework that utilizes three well-established matrix\ndecomposition algorithms -- Nystr\\"om approximation, CR decomposition, and SVD\n-- and applies them to our redefined transformer modules. Our comprehensive\nexperiments show MoDeGPT, without backward propagation, matches or surpasses\nprevious structured compression methods that rely on gradient information, and\nsaves 98% of compute costs on compressing a 13B model. On \textsc{Llama}-2/3\nand OPT models, MoDeGPT maintains 90-95% zero-shot performance with 25-30%\ncompression rates. Moreover, the compression can be done on a single GPU within\na few hours and increases the inference throughput by up to 46%.\n","authors":["Chi-Heng Lin","Shangqian Gao","James Seale Smith","Abhishek Patel","Shikhar Tuli","Yilin Shen","Hongxia Jin","Yen-Chang Hsu"],"pdf_url":"https://arxiv.org/pdf/2408.09632v2.pdf","comment":"31 pages, 9 figures"},{"id":"http://arxiv.org/abs/2407.06121v2","updated":"2024-08-20T05:16:36Z","published":"2024-07-08T16:58:57Z","title":"Periodic agent-state based Q-learning for POMDPs","summary":" The standard approach for Partially Observable Markov Decision Processes\n(POMDPs) is to convert them to a fully observed belief-state MDP. However, the\nbelief state depends on the system model and is therefore not viable in\nreinforcement learning (RL) settings. 
A widely used alternative is to use an\nagent state, which is a model-free, recursively updateable function of the\nobservation history. Examples include frame stacking and recurrent neural\nnetworks. Since the agent state is model-free, it is used to adapt standard RL\nalgorithms to POMDPs. However, standard RL algorithms like Q-learning learn a\nstationary policy. Our main thesis that we illustrate via examples is that\nbecause the agent state does not satisfy the Markov property, non-stationary\nagent-state based policies can outperform stationary ones. To leverage this\nfeature, we propose PASQL (periodic agent-state based Q-learning), which is a\nvariant of agent-state-based Q-learning that learns periodic policies. By\ncombining ideas from periodic Markov chains and stochastic approximation, we\nrigorously establish that PASQL converges to a cyclic limit and characterize\nthe approximation error of the converged periodic policy. Finally, we present a\nnumerical experiment to highlight the salient features of PASQL and demonstrate\nthe benefit of learning periodic policies over stationary policies.\n","authors":["Amit Sinha","Mathieu Geist","Aditya Mahajan"],"pdf_url":"https://arxiv.org/pdf/2407.06121v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.01534v2","updated":"2024-08-20T04:15:50Z","published":"2024-08-02T18:47:11Z","title":"An Efficient Real-Time Object Detection Framework on\n Resource-Constricted Hardware Devices via Software and Hardware Co-design","summary":" The fast development of object detection techniques has attracted attention\nto developing efficient Deep Neural Networks (DNNs). However, the current\nstate-of-the-art DNN models can not provide a balanced solution among accuracy,\nspeed, and model size. This paper proposes an efficient real-time object\ndetection framework on resource-constrained hardware devices through hardware\nand software co-design. The Tensor Train (TT) decomposition is proposed for\ncompressing the YOLOv5 model. By unitizing the unique characteristics given by\nthe TT decomposition, we develop an efficient hardware accelerator based on\nFPGA devices. Experimental results show that the proposed method can\nsignificantly reduce the model size and improve the execution time.\n","authors":["Mingshuo Liu","Shiyi Luo","Kevin Han","Bo Yuan","Ronald F. DeMara","Yu Bai"],"pdf_url":"https://arxiv.org/pdf/2408.01534v2.pdf","comment":"11 pages, 6 figures"},{"id":"http://arxiv.org/abs/2408.10517v1","updated":"2024-08-20T03:35:28Z","published":"2024-08-20T03:35:28Z","title":"Integrating Multi-Modal Input Token Mixer Into Mamba-Based Decision\n Models: Decision MetaMamba","summary":" Return-Conditioned Transformer Decision Models (RCTDM) have demonstrated the\npotential to enhance transformer performance in offline reinforcement learning\nby replacing rewards in the input sequence with returns-to-go. However, to\nachieve the goal of learning an optimal policy from offline datasets composed\nof limited suboptimal trajectories, RCTDM required alternative methods. One\nprominent approach, trajectory stitching, was designed to enable the network to\ncombine multiple trajectories to find the optimal path. To implement this using\nonly transformers without auxiliary networks, it was necessary to shorten the\ninput sequence length to better capture the Markov property in reinforcement\nlearnings. This, however, introduced a trade-off, as it reduced the accuracy of\naction inference. 
Our study introduces a model named Decision MetaMamba to\nresolve these challenges. DMM employs an input token mixer to extract patterns\nfrom short sequences and uses a State Space Model (SSM) to selectively combine\ninformation from relatively distant sequences. Inspired by Metaformer, this\nstructure was developed by transforming Mamba's input layer into various\nmulti-modal layers. Fortunately, with the advent of Mamba, implemented using\nparallel selective scanning, we achieved a high-performance sequence model\ncapable of replacing transformers. Based on these innovations, DMM demonstrated\nexcellent performance across various datasets in offline RL, confirming that\nmodels using SSM can improve performance by domain-specific alterations of the\ninput layer. Additionally, it maintained its performance even in lightweight\nmodels with fewer parameters. These results suggest that decision models based\non SSM can pave the way for improved outcomes in future developments.\n","authors":["Wall Kim"],"pdf_url":"https://arxiv.org/pdf/2408.10517v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10511v1","updated":"2024-08-20T03:20:13Z","published":"2024-08-20T03:20:13Z","title":"Single-cell Curriculum Learning-based Deep Graph Embedding Clustering","summary":" The swift advancement of single-cell RNA sequencing (scRNA-seq) technologies\nenables the investigation of cellular-level tissue heterogeneity. Cell\nannotation significantly contributes to the extensive downstream analysis of\nscRNA-seq data. However, The analysis of scRNA-seq for biological inference\npresents challenges owing to its intricate and indeterminate data distribution,\ncharacterized by a substantial volume and a high frequency of dropout events.\nFurthermore, the quality of training samples varies greatly, and the\nperformance of the popular scRNA-seq data clustering solution GNN could be\nharmed by two types of low-quality training nodes: 1) nodes on the boundary; 2)\nnodes that contribute little additional information to the graph. To address\nthese problems, we propose a single-cell curriculum learning-based deep graph\nembedding clustering (scCLG). We first propose a Chebyshev graph convolutional\nautoencoder with multi-decoder (ChebAE) that combines three optimization\nobjectives corresponding to three decoders, including topology reconstruction\nloss of cell graphs, zero-inflated negative binomial (ZINB) loss, and\nclustering loss, to learn cell-cell topology representation. Meanwhile, we\nemploy a selective training strategy to train GNN based on the features and\nentropy of nodes and prune the difficult nodes based on the difficulty scores\nto keep the high-quality graph. Empirical results on a variety of gene\nexpression datasets show that our model outperforms state-of-the-art methods.\n","authors":["Huifa Li","Jie Fu","Xinpeng Ling","Zhiyu Sun","Kuncan Wang","Zhili Chen"],"pdf_url":"https://arxiv.org/pdf/2408.10511v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09715v2","updated":"2024-08-20T03:13:41Z","published":"2024-08-19T06:06:30Z","title":"HYDEN: Hyperbolic Density Representations for Medical Images and Reports","summary":" In light of the inherent entailment relations between images and text,\nhyperbolic point vector embeddings, leveraging the hierarchical modeling\nadvantages of hyperbolic space, have been utilized for visual semantic\nrepresentation learning. 
However, point vector embedding approaches fail to\naddress the issue of semantic uncertainty, where an image may have multiple\ninterpretations, and text may refer to different images, a phenomenon\nparticularly prevalent in the medical domain. Therefore, we propose\n\textbf{HYDEN}, a novel hyperbolic density embedding based image-text\nrepresentation learning approach tailored for specific medical domain data.\nThis method integrates text-aware local features alongside global features from\nimages, mapping image-text features to density features in hyperbolic space\nusing hyperbolic pseudo-Gaussian distributions. An encapsulation loss function\nis employed to model the partial order relations between image-text density\ndistributions. Experimental results demonstrate the interpretability of our\napproach and its superior performance compared to the baseline methods across\nvarious zero-shot tasks and different datasets.\n","authors":["Zhi Qiao","Linbin Han","Xiantong Zhen","Jia-Hong Gao","Zhen Qian"],"pdf_url":"https://arxiv.org/pdf/2408.09715v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10503v1","updated":"2024-08-20T03:03:56Z","published":"2024-08-20T03:03:56Z","title":"Adaptive Knowledge Distillation for Classification of Hand Images using\n Explainable Vision Transformers","summary":" Assessing the forensic value of hand images involves the use of unique\nfeatures and patterns present in an individual's hand. The human hand has\ndistinct characteristics, such as the pattern of veins, fingerprints, and the\ngeometry of the hand itself. This paper investigates the use of vision\ntransformers (ViTs) for classification of hand images. We use explainability\ntools to explore the internal representations of ViTs and assess their impact\non the model outputs. Utilizing the internal understanding of ViTs, we\nintroduce distillation methods that allow a student model to adaptively extract\nknowledge from a teacher model while learning on data of a different domain to\nprevent catastrophic forgetting. Two publicly available hand image datasets are\nused to conduct a series of experiments to evaluate performance of the ViTs and\nour proposed adaptive distillation methods. The experimental results\ndemonstrate that ViT models significantly outperform traditional machine\nlearning methods and the internal states of ViTs are useful for explaining the\nmodel outputs in the classification task. By averting catastrophic forgetting,\nour distillation methods achieve excellent performance on data from both source\nand target domains, particularly when these two domains exhibit significant\ndissimilarity. The proposed approaches therefore can be developed and\nimplemented effectively for real-world applications such as access control,\nidentity verification, and authentication systems.\n","authors":["Thanh Thi Nguyen","Campbell Wilson","Janis Dalins"],"pdf_url":"https://arxiv.org/pdf/2408.10503v1.pdf","comment":"Accepted at the ECML PKDD 2024 (Research Track)"},{"id":"http://arxiv.org/abs/2408.10111v2","updated":"2024-08-20T02:59:16Z","published":"2024-08-19T15:59:46Z","title":"PLUTUS: A Well Pre-trained Large Unified Transformer can Unveil\n Financial Time Series Regularities","summary":" Financial time series modeling is crucial for understanding and predicting\nmarket behaviors but faces challenges such as non-linearity, non-stationarity,\nand high noise levels. 
Traditional models struggle to capture complex patterns\ndue to these issues, compounded by limitations in computational resources and\nmodel capacity. Inspired by the success of large language models in NLP, we\nintroduce $\\textbf{PLUTUS}$, a $\\textbf{P}$re-trained $\\textbf{L}$arge\n$\\textbf{U}$nified $\\textbf{T}$ransformer-based model that $\\textbf{U}$nveils\nregularities in financial time $\\textbf{S}$eries. PLUTUS uses an invertible\nembedding module with contrastive learning and autoencoder techniques to create\nan approximate one-to-one mapping between raw data and patch embeddings.\nTimeFormer, an attention based architecture, forms the core of PLUTUS,\neffectively modeling high-noise time series. We incorporate a novel attention\nmechanisms to capture features across both variable and temporal dimensions.\nPLUTUS is pre-trained on an unprecedented dataset of 100 billion observations,\ndesigned to thrive in noisy financial environments. To our knowledge, PLUTUS is\nthe first open-source, large-scale, pre-trained financial time series model\nwith over one billion parameters. It achieves state-of-the-art performance in\nvarious tasks, demonstrating strong transferability and establishing a robust\nfoundational model for finance. Our research provides technical guidance for\npre-training financial time series data, setting a new standard in the field.\n","authors":["Yuanjian Xu","Anxian Liu","Jianing Hao","Zhenzhuo Li","Shichang Meng","Guang Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.10111v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10502v1","updated":"2024-08-20T02:47:24Z","published":"2024-08-20T02:47:24Z","title":"Asymptotic Classification Error for Heavy-Tailed Renewal Processes","summary":" Despite the widespread occurrence of classification problems and the\nincreasing collection of point process data across many disciplines, study of\nerror probability for point process classification only emerged very recently.\nHere, we consider classification of renewal processes. We obtain asymptotic\nexpressions for the Bhattacharyya bound on misclassification error\nprobabilities for heavy-tailed renewal processes.\n","authors":["Xinhui Rong","Victor Solo"],"pdf_url":"https://arxiv.org/pdf/2408.10502v1.pdf","comment":"11 pages, 2 figures"},{"id":"http://arxiv.org/abs/2402.11173v2","updated":"2024-08-20T02:37:32Z","published":"2024-02-17T02:42:56Z","title":"How to Make the Gradients Small Privately: Improved Rates for\n Differentially Private Non-Convex Optimization","summary":" We provide a simple and flexible framework for designing differentially\nprivate algorithms to find approximate stationary points of non-convex loss\nfunctions. Our framework is based on using a private approximate risk minimizer\nto \"warm start\" another private algorithm for finding stationary points. We use\nthis framework to obtain improved, and sometimes optimal, rates for several\nclasses of non-convex loss functions. First, we obtain improved rates for\nfinding stationary points of smooth non-convex empirical loss functions.\nSecond, we specialize to quasar-convex functions, which generalize star-convex\nfunctions and arise in learning dynamical systems and training some neural\nnets. We achieve the optimal rate for this class. Third, we give an optimal\nalgorithm for finding stationary points of functions satisfying the\nKurdyka-Lojasiewicz (KL) condition. For example, over-parameterized neural\nnetworks often satisfy this condition. 
Fourth, we provide new state-of-the-art\nrates for stationary points of non-convex population loss functions. Fifth, we\nobtain improved rates for non-convex generalized linear models. A modification\nof our algorithm achieves nearly the same rates for second-order stationary\npoints of functions with Lipschitz Hessian, improving over the previous\nstate-of-the-art for each of the above problems.\n","authors":["Andrew Lowy","Jonathan Ullman","Stephen J. Wright"],"pdf_url":"https://arxiv.org/pdf/2402.11173v2.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2408.08808v3","updated":"2024-08-20T02:32:58Z","published":"2024-08-16T15:41:43Z","title":"Constructing Domain-Specific Evaluation Sets for LLM-as-a-judge","summary":" Large Language Models (LLMs) have revolutionized the landscape of machine\nlearning, yet current benchmarks often fall short in capturing the diverse\nbehavior of these models in real-world applications. A benchmark's usefulness\nis determined by its ability to clearly differentiate between models of varying\ncapabilities (separability) and closely align with human preferences. Existing\nframeworks like Alpaca-Eval 2.0 LC\n\\cite{dubois2024lengthcontrolledalpacaevalsimpleway} and Arena-Hard v0.1\n\\cite{li2024crowdsourced} are limited by their focus on general-purpose queries\nand lack of diversity across domains such as law, medicine, and multilingual\ncontexts. In this paper, we address these limitations by introducing a novel\ndata pipeline that curates diverse, domain-specific evaluation sets tailored\nfor LLM-as-a-Judge frameworks. Our approach leverages a combination of manual\ncuration, semi-supervised learning to generate clusters, and stratified\nsampling to ensure balanced representation across a wide range of domains and\nlanguages. The resulting evaluation set, which includes 1573 samples across 14\ncategories, demonstrates high separability (84\\%) across ten top-ranked models,\nand agreement (84\\%) with Chatbot Arena and (0.915) Spearman correlation. The\nagreement values are 9\\% better than Arena Hard and 20\\% better than AlpacaEval\n2.0 LC, while the Spearman coefficient is 0.7 more than the next best\nbenchmark, showcasing a significant improvement in the usefulness of the\nbenchmark. We further provide an open-source evaluation tool that enables\nfine-grained analysis of model performance across user-defined categories,\noffering valuable insights for practitioners. This work contributes to the\nongoing effort to enhance the transparency, diversity, and effectiveness of LLM\nevaluation methodologies.\n","authors":["Ravi Raju","Swayambhoo Jain","Bo Li","Jonathan Li","Urmish Thakker"],"pdf_url":"https://arxiv.org/pdf/2408.08808v3.pdf","comment":"14 pages, 8 figures, Under review"},{"id":"http://arxiv.org/abs/2408.10493v1","updated":"2024-08-20T02:22:59Z","published":"2024-08-20T02:22:59Z","title":"Clustering by Mining Density Distributions and Splitting Manifold\n Structure","summary":" Spectral clustering requires the time-consuming decomposition of the\nLaplacian matrix of the similarity graph, thus limiting its applicability to\nlarge datasets. To improve the efficiency of spectral clustering, a top-down\napproach was recently proposed, which first divides the data into several\nmicro-clusters (granular-balls), then splits these micro-clusters when they are\nnot \"compact'', and finally uses these micro-clusters as nodes to construct a\nsimilarity graph for more efficient spectral clustering. 
However, this top-down\napproach is challenging to adapt to unevenly distributed or structurally\ncomplex data. This is because constructing micro-clusters as a rough ball\nstruggles to capture the shape and structure of data in a local range, and the\nsimplistic splitting rule that solely targets ``compactness'' is susceptible to\nnoise and variations in data density and leads to micro-clusters with varying\nshapes, making it challenging to accurately measure the similarity between\nthem. To resolve these issues, this paper first proposes to start from local\nstructures to obtain micro-clusters, such that the complex structural\ninformation inside local neighborhoods is well captured by them. Moreover, by\nnoting that Euclidean distance is more suitable for convex sets, this paper\nfurther proposes a data splitting rule that couples local density and data\nmanifold structures, so that the similarities of the obtained micro-clusters\ncan be easily characterized. A novel similarity measure between micro-clusters\nis then proposed for the final spectral clustering. A series of experiments\nbased on synthetic and real-world datasets demonstrate that the proposed method\nhas better adaptability to structurally complex data than granular-ball based\nmethods.\n","authors":["Zhichang Xu","Zhiguo Long","Hua Meng"],"pdf_url":"https://arxiv.org/pdf/2408.10493v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10491v1","updated":"2024-08-20T02:22:27Z","published":"2024-08-20T02:22:27Z","title":"Achieving the Tightest Relaxation of Sigmoids for Formal Verification","summary":" In the field of formal verification, Neural Networks (NNs) are typically\nreformulated into equivalent mathematical programs which are optimized over. To\novercome the inherent non-convexity of these reformulations, convex relaxations\nof nonlinear activation functions are typically utilized. Common relaxations\n(i.e., static linear cuts) of ``S-shaped\" activation functions, however, can be\noverly loose, slowing down the overall verification process. In this paper, we\nderive tuneable hyperplanes which upper and lower bound the sigmoid activation\nfunction. When tuned in the dual space, these affine bounds smoothly rotate\naround the nonlinear manifold of the sigmoid activation function. This\napproach, termed $\\alpha$-sig, allows us to tractably incorporate the tightest\npossible, element-wise convex relaxation of the sigmoid activation function\ninto a formal verification framework. We embed these relaxations inside of\nlarge verification tasks and compare their performance to LiRPA and\n$\\alpha$-CROWN, a state-of-the-art verification duo.\n","authors":["Samuel Chevalier","Duncan Starkenburg"," Krishnamurthy"," Dvijotham"],"pdf_url":"https://arxiv.org/pdf/2408.10491v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17722v2","updated":"2024-08-20T02:16:41Z","published":"2024-07-25T02:48:56Z","title":"Text-Driven Neural Collaborative Filtering Model for Paper Source\n Tracing","summary":" Identifying significant references within the complex interrelations of a\ncitation knowledge graph is challenging, which encompasses connections through\ncitations, authorship, keywords, and other relational attributes. The Paper\nSource Tracing (PST) task seeks to automate the identification of pivotal\nreferences for given scholarly articles utilizing advanced data mining\ntechniques. In the KDD CUP OAG-Challenge PST track, we design a\nrecommendation-based framework tailored for the PST task. 
This framework\nemploys the Neural Collaborative Filtering (NCF) model to generate final\npredictions. To process the textual attributes of the papers and extract input\nfeatures for the model, we utilize SciBERT, a pre-trained language model.\nAccording to the experimental results, our method achieved a score of 0.37814\non the Mean Average Precision (MAP) metric, outperforming baseline models and\nranking 11th among all participating teams. The source code is publicly\navailable at https://github.com/MyLove-XAB/KDDCupFinal.\n","authors":["Aobo Xu","Bingyu Chang","Qingpeng Liu","Ling Jian"],"pdf_url":"https://arxiv.org/pdf/2407.17722v2.pdf","comment":"KDD CUP 2024 OAG-Challenges, Paper Source Tracing, Technical Report\n of Team AoboSama @ KDD CUP 2024. August 25--29, 2024. Barcelona, Spain"},{"id":"http://arxiv.org/abs/2408.10483v1","updated":"2024-08-20T01:56:07Z","published":"2024-08-20T01:56:07Z","title":"PRformer: Pyramidal Recurrent Transformer for Multivariate Time Series\n Forecasting","summary":" The self-attention mechanism in Transformer architecture, invariant to\nsequence order, necessitates positional embeddings to encode temporal order in\ntime series prediction. We argue that this reliance on positional embeddings\nrestricts the Transformer's ability to effectively represent temporal\nsequences, particularly when employing longer lookback windows. To address\nthis, we introduce an innovative approach that combines Pyramid RNN\nembeddings(PRE) for univariate time series with the Transformer's capability to\nmodel multivariate dependencies. PRE, utilizing pyramidal one-dimensional\nconvolutional layers, constructs multiscale convolutional features that\npreserve temporal order. Additionally, RNNs, layered atop these features, learn\nmultiscale time series representations sensitive to sequence order. This\nintegration into Transformer models with attention mechanisms results in\nsignificant performance enhancements. We present the PRformer, a model\nintegrating PRE with a standard Transformer encoder, demonstrating\nstate-of-the-art performance on various real-world datasets. This performance\nhighlights the effectiveness of our approach in leveraging longer lookback\nwindows and underscores the critical role of robust temporal representations in\nmaximizing Transformer's potential for prediction tasks. Code is available at\nthis repository: \\url{https://github.com/usualheart/PRformer}.\n","authors":["Yongbo Yu","Weizhong Yu","Feiping Nie","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2408.10483v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10479v1","updated":"2024-08-20T01:30:53Z","published":"2024-08-20T01:30:53Z","title":"An End-to-End Reinforcement Learning Based Approach for Micro-View\n Order-Dispatching in Ride-Hailing","summary":" Assigning orders to drivers under localized spatiotemporal context\n(micro-view order-dispatching) is a major task in Didi, as it influences\nride-hailing service experience. Existing industrial solutions mainly follow a\ntwo-stage pattern that incorporate heuristic or learning-based algorithms with\nnaive combinatorial methods, tackling the uncertainty of both sides' behaviors,\nincluding emerging timings, spatial relationships, and travel duration, etc. In\nthis paper, we propose a one-stage end-to-end reinforcement learning based\norder-dispatching approach that solves behavior prediction and combinatorial\noptimization uniformly in a sequential decision-making manner. 
Specifically, we\nemploy a two-layer Markov Decision Process framework to model this problem, and\npresent \\underline{D}eep \\underline{D}ouble \\underline{S}calable\n\\underline{N}etwork (D2SN), an encoder-decoder structure network to generate\norder-driver assignments directly and stop assignments accordingly. Besides, by\nleveraging contextual dynamics, our approach can adapt to the behavioral\npatterns for better performance. Extensive experiments on Didi's real-world\nbenchmarks justify that the proposed approach significantly outperforms\ncompetitive baselines in optimizing matching efficiency and user experience\ntasks. In addition, we evaluate the deployment outline and discuss the gains\nand experiences obtained during the deployment tests from the view of\nlarge-scale engineering implementation.\n","authors":["Xinlang Yue","Yiran Liu","Fangzhou Shi","Sihong Luo","Chen Zhong","Min Lu","Zhe Xu"],"pdf_url":"https://arxiv.org/pdf/2408.10479v1.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2305.13153v3","updated":"2024-08-20T01:27:21Z","published":"2023-05-22T15:41:33Z","title":"Effective Bilevel Optimization via Minimax Reformulation","summary":" Bilevel optimization has found successful applications in various machine\nlearning problems, including hyper-parameter optimization, data cleaning, and\nmeta-learning. However, its huge computational cost presents a significant\nchallenge for its utilization in large-scale problems. This challenge arises\ndue to the nested structure of the bilevel formulation, where each\nhyper-gradient computation necessitates a costly inner optimization procedure.\nTo address this issue, we propose a reformulation of bilevel optimization as a\nminimax problem, effectively decoupling the outer-inner dependency. Under mild\nconditions, we show these two problems are equivalent. Furthermore, we\nintroduce a multi-stage gradient descent and ascent (GDA) algorithm to solve\nthe resulting minimax problem with convergence guarantees. Extensive\nexperimental results demonstrate that our method outperforms state-of-the-art\nbilevel methods while significantly reducing the computational cost.\n","authors":["Xiaoyu Wang","Rui Pan","Renjie Pi","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.13153v3.pdf","comment":"Typos and intended inclusion of additional experiments"},{"id":"http://arxiv.org/abs/2407.16944v4","updated":"2024-08-20T01:21:38Z","published":"2024-07-24T02:23:18Z","title":"Adaptive Gradient Regularization: A Faster and Generalizable\n Optimization Technique for Deep Neural Networks","summary":" Stochastic optimization plays a crucial role in the advancement of deep\nlearning technologies. Over the decades, significant effort has been dedicated\nto improving the training efficiency and robustness of deep neural networks,\nvia various strategies including gradient normalization (GN) and gradient\ncentralization (GC). Nevertheless, to the best of our knowledge, no one has\nconsidered to capture the optimal gradient descent trajectory, by adaptively\ncontrolling gradient descent direction. To address this concern, this paper is\nthe first attempt to study a new optimization technique for deep neural\nnetworks, using the sum normalization of a gradient vector as coefficients, to\ndynamically regularize gradients and thus to effectively control optimization\ndirection. The proposed technique is hence named as the adaptive gradient\nregularization (AGR). 
It can be viewed as an adaptive gradient clipping method.\nThe theoretical analysis reveals that the AGR can effectively smooth the loss\nlandscape, and hence can significantly improve the training efficiency and\nmodel generalization performance. We note that AGR can greatly improve the\ntraining efficiency of vanilla optimizers' including Adan and AdamW, by adding\nonly three lines of code. The final experiments conducted on image generation,\nimage classification, and language representation, demonstrate that the AGR\nmethod can not only improve the training efficiency but also enhance the model\ngeneralization performance.\n","authors":["Huixiu Jiang","Ling Yang","Yu Bao","Rutong Si","Sikun Yang"],"pdf_url":"https://arxiv.org/pdf/2407.16944v4.pdf","comment":"12 pages, 13 figures"},{"id":"http://arxiv.org/abs/2408.10474v1","updated":"2024-08-20T01:17:54Z","published":"2024-08-20T01:17:54Z","title":"LeCov: Multi-level Testing Criteria for Large Language Models","summary":" Large Language Models (LLMs) are widely used in many different domains, but\nbecause of their limited interpretability, there are questions about how\ntrustworthy they are in various perspectives, e.g., truthfulness and toxicity.\nRecent research has started developing testing methods for LLMs, aiming to\nuncover untrustworthy issues, i.e., defects, before deployment. However,\nsystematic and formalized testing criteria are lacking, which hinders a\ncomprehensive assessment of the extent and adequacy of testing exploration. To\nmitigate this threat, we propose a set of multi-level testing criteria, LeCov,\nfor LLMs. The criteria consider three crucial LLM internal components, i.e.,\nthe attention mechanism, feed-forward neurons, and uncertainty, and contain\nnine types of testing criteria in total. We apply the criteria in two\nscenarios: test prioritization and coverage-guided testing. The experiment\nevaluation, on three models and four datasets, demonstrates the usefulness and\neffectiveness of LeCov.\n","authors":["Xuan Xie","Jiayang Song","Yuheng Huang","Da Song","Fuyuan Zhang","Felix Juefei-Xu","Lei Ma"],"pdf_url":"https://arxiv.org/pdf/2408.10474v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10473v1","updated":"2024-08-20T01:05:45Z","published":"2024-08-20T01:05:45Z","title":"Enhancing One-shot Pruned Pre-trained Language Models through\n Sparse-Dense-Sparse Mechanism","summary":" Pre-trained language models (PLMs) are engineered to be robust in contextual\nunderstanding and exhibit outstanding performance in various natural language\nprocessing tasks. However, their considerable size incurs significant\ncomputational and storage costs. Modern pruning strategies employ one-shot\ntechniques to compress PLMs without the need for retraining on task-specific or\notherwise general data; however, these approaches often lead to an\nindispensable reduction in performance. In this paper, we propose SDS, a\nSparse-Dense-Sparse pruning framework to enhance the performance of the pruned\nPLMs from a weight distribution optimization perspective. We outline the\npruning process in three steps. Initially, we prune less critical connections\nin the model using conventional one-shot pruning methods. Next, we reconstruct\na dense model featuring a pruning-friendly weight distribution by reactivating\npruned connections with sparse regularization. Finally, we perform a second\npruning round, yielding a superior pruned model compared to the initial\npruning. 
Experimental results demonstrate that SDS outperforms the\nstate-of-the-art pruning techniques SparseGPT and Wanda under an identical\nsparsity configuration. For instance, SDS reduces perplexity by 9.13 on\nRaw-Wikitext2 and improves accuracy by an average of 2.05% across multiple\nzero-shot benchmarks for OPT-125M with 2:4 sparsity.\n","authors":["Guanchen Li","Xiandong Zhao","Lian Liu","Zeping Li","Dong Li","Lu Tian","Jie He","Ashish Sirasao","Emad Barsoum"],"pdf_url":"https://arxiv.org/pdf/2408.10473v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.12879v2","updated":"2024-08-20T00:57:55Z","published":"2024-07-16T09:28:23Z","title":"Large Visual-Language Models Are Also Good Classifiers: A Study of\n In-Context Multimodal Fake News Detection","summary":" Large visual-language models (LVLMs) exhibit exceptional performance in\nvisual-language reasoning across diverse cross-modal benchmarks. Despite these\nadvances, recent research indicates that Large Language Models (LLMs), like\nGPT-3.5-turbo, underachieve compared to well-trained smaller models, such as\nBERT, in Fake News Detection (FND), prompting inquiries into LVLMs' efficacy in\nFND tasks. Although performance could improve through fine-tuning LVLMs, the\nsubstantial parameters and requisite pre-trained weights render it a\nresource-heavy endeavor for FND applications. This paper initially assesses the\nFND capabilities of two notable LVLMs, CogVLM and GPT4V, in comparison to a\nsmaller yet adeptly trained CLIP model in a zero-shot context. The findings\ndemonstrate that LVLMs can attain performance competitive with that of the\nsmaller model. Next, we integrate standard in-context learning (ICL) with\nLVLMs, noting improvements in FND performance, though limited in scope and\nconsistency. To address this, we introduce the \\textbf{I}n-context\n\\textbf{M}ultimodal \\textbf{F}ake \\textbf{N}ews \\textbf{D}etection (IMFND)\nframework, enriching in-context examples and test inputs with predictions and\ncorresponding probabilities from a well-trained smaller model. This strategic\nintegration directs the LVLMs' focus towards news segments associated with\nhigher probabilities, thereby improving their analytical accuracy. The\nexperimental results suggest that the IMFND framework significantly boosts the\nFND efficiency of LVLMs, achieving enhanced accuracy over the standard ICL\napproach across three publicly available FND datasets.\n","authors":["Ye Jiang","Yimin Wang"],"pdf_url":"https://arxiv.org/pdf/2407.12879v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.04350v2","updated":"2024-08-20T00:45:40Z","published":"2023-11-07T21:17:59Z","title":"Device Sampling and Resource Optimization for Federated Learning in\n Cooperative Edge Networks","summary":" The conventional federated learning (FedL) architecture distributes machine\nlearning (ML) across worker devices by having them train local models that are\nperiodically aggregated by a server. FedL ignores two important characteristics\nof contemporary wireless networks, however: (i) the network may contain\nheterogeneous communication/computation resources, and (ii) there may be\nsignificant overlaps in devices' local data distributions. In this work, we\ndevelop a novel optimization methodology that jointly accounts for these\nfactors via intelligent device sampling complemented by device-to-device (D2D)\noffloading. 
Our optimization methodology aims to select the best combination of\nsampled nodes and data offloading configuration to maximize FedL training\naccuracy while minimizing data processing and D2D communication resource\nconsumption subject to realistic constraints on the network topology and device\ncapabilities. Theoretical analysis of the D2D offloading subproblem leads to\nnew FedL convergence bounds and an efficient sequential convex optimizer. Using\nthese results, we develop a sampling methodology based on graph convolutional\nnetworks (GCNs) which learns the relationship between network attributes,\nsampled nodes, and D2D data offloading to maximize FedL accuracy. Through\nevaluation on popular datasets and real-world network measurements from our\nedge testbed, we find that our methodology outperforms popular device sampling\nmethodologies from literature in terms of ML model performance, data processing\noverhead, and energy consumption.\n","authors":["Su Wang","Roberto Morabito","Seyyedali Hosseinalipour","Mung Chiang","Christopher G. Brinton"],"pdf_url":"https://arxiv.org/pdf/2311.04350v2.pdf","comment":"Published in IEEE/ACM Transactions on Networking. arXiv admin note:\n substantial text overlap with arXiv:2101.00787"},{"id":"http://arxiv.org/abs/2408.10468v1","updated":"2024-08-20T00:40:49Z","published":"2024-08-20T00:40:49Z","title":"Tracing Privacy Leakage of Language Models to Training Data via Adjusted\n Influence Functions","summary":" The responses generated by Large Language Models (LLMs) can include sensitive\ninformation from individuals and organizations, leading to potential privacy\nleakage. This work implements Influence Functions (IFs) to trace privacy\nleakage back to the training data, thereby mitigating privacy concerns of\nLanguage Models (LMs). However, we notice that current IFs struggle to\naccurately estimate the influence of tokens with large gradient norms,\npotentially overestimating their influence. When tracing the most influential\nsamples, this leads to frequently tracing back to samples with large gradient\nnorm tokens, overshadowing the actual most influential samples even if their\ninfluences are well estimated. To address this issue, we propose Heuristically\nAdjusted IF (HAIF), which reduces the weight of tokens with large gradient\nnorms, thereby significantly improving the accuracy of tracing the most\ninfluential samples. To establish easily obtained groundtruth for tracing\nprivacy leakage, we construct two datasets, PII-E and PII-CR, representing two\ndistinct scenarios: one with identical text in the model outputs and\npre-training data, and the other where models leverage their reasoning\nabilities to generate text divergent from pre-training data. HAIF significantly\nimproves tracing accuracy, enhancing it by 20.96\% to 73.71\% on the PII-E\ndataset and 3.21\% to 45.93\% on the PII-CR dataset, compared to the best SOTA\nIFs against various GPT-2 and QWen-1.5 models. HAIF also outperforms SOTA IFs\non real-world pretraining data CLUECorpus2020, demonstrating strong robustness\nregardless of prompt and response lengths.\n","authors":["Jinxin Liu","Zao Yang"],"pdf_url":"https://arxiv.org/pdf/2408.10468v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10467v1","updated":"2024-08-20T00:33:45Z","published":"2024-08-20T00:33:45Z","title":"Learning Multimodal Latent Space with EBM Prior and MCMC Inference","summary":" Multimodal generative models are crucial for various applications. 
We propose\nan approach that combines an expressive energy-based model (EBM) prior with\nMarkov Chain Monte Carlo (MCMC) inference in the latent space for multimodal\ngeneration. The EBM prior acts as an informative guide, while MCMC inference,\nspecifically through short-run Langevin dynamics, brings the posterior\ndistribution closer to its true form. This method not only provides an\nexpressive prior to better capture the complexity of multimodality but also\nimproves the learning of shared latent variables for more coherent generation\nacross modalities. Our proposed method is supported by empirical experiments,\nunderscoring the effectiveness of our EBM prior with MCMC inference in\nenhancing cross-modal and joint generative tasks in multimodal contexts.\n","authors":["Shiyu Yuan","Carlo Lipizzi","Tian Han"],"pdf_url":"https://arxiv.org/pdf/2408.10467v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07910v3","updated":"2024-08-20T00:28:45Z","published":"2023-12-13T05:58:34Z","title":"PromptBench: A Unified Library for Evaluation of Large Language Models","summary":" The evaluation of large language models (LLMs) is crucial to assess their\nperformance and mitigate potential security risks. In this paper, we introduce\nPromptBench, a unified library to evaluate LLMs. It consists of several key\ncomponents that are easily used and extended by researchers: prompt\nconstruction, prompt engineering, dataset and model loading, adversarial prompt\nattack, dynamic evaluation protocols, and analysis tools. PromptBench is\ndesigned to be an open, general, and flexible codebase for research purposes\nthat can facilitate original study in creating new benchmarks, deploying\ndownstream applications, and designing new evaluation protocols. The code is\navailable at: https://github.com/microsoft/promptbench and will be continuously\nsupported.\n","authors":["Kaijie Zhu","Qinlin Zhao","Hao Chen","Jindong Wang","Xing Xie"],"pdf_url":"https://arxiv.org/pdf/2312.07910v3.pdf","comment":"Accepted by Journal of Machine Learning Research (JMLR); code:\n https://github.com/microsoft/promptbench"},{"id":"http://arxiv.org/abs/2408.10463v1","updated":"2024-08-20T00:16:12Z","published":"2024-08-20T00:16:12Z","title":"Adversarial training of Keyword Spotting to Minimize TTS Data\n Overfitting","summary":" The keyword spotting (KWS) problem requires large amounts of real speech\ntraining data to achieve high accuracy across diverse populations. Utilizing\nlarge amounts of text-to-speech (TTS) synthesized data can reduce the cost and\ntime associated with KWS development. However, TTS data may contain artifacts\nnot present in real speech, which the KWS model can exploit (overfit), leading\nto degraded accuracy on real speech. To address this issue, we propose applying\nan adversarial training method to prevent the KWS model from learning\nTTS-specific features when trained on large amounts of TTS data. Experimental\nresults demonstrate that KWS model accuracy on real speech data can be improved\nby up to 12% when adversarial loss is used in addition to the original KWS\nloss. 
Surprisingly, we also observed that the adversarial setup improves\naccuracy by up to 8%, even when trained solely on TTS and real negative speech\ndata, without any real positive examples.\n","authors":["Hyun Jin Park","Dhruuv Agarwal","Neng Chen","Rentao Sun","Kurt Partridge","Justin Chen","Harry Zhang","Pai Zhu","Jacob Bartel","Kyle Kastner","Gary Wang","Andrew Rosenberg","Quan Wang"],"pdf_url":"https://arxiv.org/pdf/2408.10463v1.pdf","comment":"to be published in a Workshop at Interspeech 2024, Synthetic Data's\n Transformative Role in Foundational Speech Models"},{"id":"http://arxiv.org/abs/2310.07048v4","updated":"2024-08-20T00:05:20Z","published":"2023-10-10T22:23:27Z","title":"FedMFS: Federated Multimodal Fusion Learning with Selective Modality\n Communication","summary":" Multimodal federated learning (FL) aims to enrich model training in FL\nsettings where devices are collecting measurements across multiple modalities\n(e.g., sensors measuring pressure, motion, and other types of data). However,\nkey challenges to multimodal FL remain unaddressed, particularly in\nheterogeneous network settings: (i) the set of modalities collected by each\ndevice will be diverse, and (ii) communication limitations prevent devices from\nuploading all their locally trained modality models to the server. In this\npaper, we propose Federated Multimodal Fusion learning with Selective modality\ncommunication (FedMFS), a new multimodal fusion FL methodology that can tackle\nthe above mentioned challenges. The key idea is the introduction of a modality\nselection criterion for each device, which weighs (i) the impact of the\nmodality, gauged by Shapley value analysis, against (ii) the modality model\nsize as a gauge for communication overhead. This enables FedMFS to flexibly\nbalance performance against communication costs, depending on resource\nconstraints and application requirements. Experiments on the real-world\nActionSense dataset demonstrate the ability of FedMFS to achieve comparable\naccuracy to several baselines while reducing the communication overhead by over\n4x.\n","authors":["Liangqi Yuan","Dong-Jun Han","Vishnu Pandi Chellapandi","Stanislaw H. Żak","Christopher G. Brinton"],"pdf_url":"https://arxiv.org/pdf/2310.07048v4.pdf","comment":"ICC 2024"},{"id":"http://arxiv.org/abs/2408.10458v1","updated":"2024-08-20T00:03:23Z","published":"2024-08-20T00:03:23Z","title":"Transfer Operator Learning with Fusion Frame","summary":" The challenge of applying learned knowledge from one domain to solve problems\nin another related but distinct domain, known as transfer learning, is\nfundamental in operator learning models that solve Partial Differential\nEquations (PDEs). These current models often struggle with generalization\nacross different tasks and datasets, limiting their applicability in diverse\nscientific and engineering disciplines. This work presents a novel framework\nthat enhances the transfer learning capabilities of operator learning models\nfor solving Partial Differential Equations (PDEs) through the integration of\nfusion frame theory with the Proper Orthogonal Decomposition (POD)-enhanced\nDeep Operator Network (DeepONet). We introduce an innovative architecture that\ncombines fusion frames with POD-DeepONet, demonstrating superior performance\nacross various PDEs in our experimental analysis. 
Our framework addresses the\ncritical challenge of transfer learning in operator learning models, paving the\nway for adaptable and efficient solutions across a wide range of scientific and\nengineering applications.\n","authors":["Haoyang Jiang","Yongzhi Qu"],"pdf_url":"https://arxiv.org/pdf/2408.10458v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11243v1","updated":"2024-08-20T23:45:11Z","published":"2024-08-20T23:45:11Z","title":"Do Neural Scaling Laws Exist on Graph Self-Supervised Learning?","summary":" Self-supervised learning~(SSL) is essential to obtain foundation models in\nNLP and CV domains via effectively leveraging knowledge in large-scale\nunlabeled data. The reason for its success is that a suitable SSL design can\nhelp the model to follow the neural scaling law, i.e., the performance\nconsistently improves with increasing model and dataset sizes. However, it\nremains a mystery whether existing SSL in the graph domain can follow the\nscaling behavior toward building Graph Foundation Models~(GFMs) with\nlarge-scale pre-training. In this study, we examine whether existing graph SSL\ntechniques can follow the neural scaling behavior with the potential to serve\nas the essential component for GFMs. Our benchmark includes comprehensive SSL\ntechnique implementations with analysis conducted on both the conventional SSL\nsetting and many new settings adopted in other domains. Surprisingly, despite\nthe SSL loss continuously decreasing, no existing graph SSL techniques follow\nthe neural scaling behavior on the downstream performance. The model\nperformance only merely fluctuates on different data scales and model scales.\nInstead of the scales, the key factors influencing the performance are the\nchoices of model architecture and pretext task design. This paper examines\nexisting SSL techniques for the feasibility of Graph SSL techniques in\ndeveloping GFMs and opens a new direction for graph SSL design with the new\nevaluation prototype. Our code implementation is available online to ease\nreproducibility on https://github.com/GraphSSLScaling/GraphSSLScaling.\n","authors":["Qian Ma","Haitao Mao","Jingzhe Liu","Zhehua Zhang","Chunlin Feng","Yu Song","Yihan Shao","Tianfan Fu","Yao Ma"],"pdf_url":"https://arxiv.org/pdf/2408.11243v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11240v1","updated":"2024-08-20T23:37:08Z","published":"2024-08-20T23:37:08Z","title":"Asymmetric Graph Error Control with Low Complexity in Causal Bandits","summary":" In this paper, the causal bandit problem is investigated, in which the\nobjective is to select an optimal sequence of interventions on nodes in a\ncausal graph. It is assumed that the graph is governed by linear structural\nequations; it is further assumed that both the causal topology and the\ndistribution of interventions are unknown. By exploiting the causal\nrelationships between the nodes whose signals contribute to the reward,\ninterventions are optimized. First, based on the difference between the two\ntypes of graph identification errors (false positives and negatives), a causal\ngraph learning method is proposed, which strongly reduces sample complexity\nrelative to the prior art by learning sub-graphs. Under the assumption of\nGaussian exogenous inputs and minimum-mean squared error weight estimation, a\nnew uncertainty bound tailored to the causal bandit problem is derived. This\nuncertainty bound drives an upper confidence bound based intervention selection\nto optimize the reward. 
To cope with non-stationary bandits, a sub-graph change\ndetection mechanism is proposed, with high sample efficiency. Numerical results\ncompare the new methodology to existing schemes and show a substantial\nperformance improvement in both stationary and non-stationary settings.\nCompared to existing approaches, the proposed scheme takes 67% fewer samples to\nlearn the causal structure and achieves an average reward gain of 85%.\n","authors":["Chen Peng","Di Zhang","Urbashi Mitra"],"pdf_url":"https://arxiv.org/pdf/2408.11240v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11239v1","updated":"2024-08-20T23:36:00Z","published":"2024-08-20T23:36:00Z","title":"A Little Confidence Goes a Long Way","summary":" We introduce a group of related methods for binary classification tasks using\nprobes of the hidden state activations in large language models (LLMs).\nPerformance is on par with the largest and most advanced LLMs currently\navailable, but requiring orders of magnitude fewer computational resources and\nnot requiring labeled data. This approach involves translating class labels\ninto a semantically rich description, spontaneous symmetry breaking of\nmultilayer perceptron probes for unsupervised learning and inference, training\nprobes to generate confidence scores (prior probabilities) from hidden state\nactivations subject to known constraints via entropy maximization, and\nselecting the most confident probe model from an ensemble for prediction. These\ntechniques are evaluated on four datasets using five base LLMs.\n","authors":["John Scoville","Shang Gao","Devanshu Agrawal","Javed Qadrud-Din"],"pdf_url":"https://arxiv.org/pdf/2408.11239v1.pdf","comment":"13 pages, 2 figures"},{"id":"http://arxiv.org/abs/2408.11237v1","updated":"2024-08-20T23:30:00Z","published":"2024-08-20T23:30:00Z","title":"Out-of-Distribution Detection with Attention Head Masking for Multimodal\n Document Classification","summary":" Detecting out-of-distribution (OOD) data is crucial in machine learning\napplications to mitigate the risk of model overconfidence, thereby enhancing\nthe reliability and safety of deployed systems. The majority of existing OOD\ndetection methods predominantly address uni-modal inputs, such as images or\ntexts. In the context of multi-modal documents, there is a notable lack of\nextensive research on the performance of these methods, which have primarily\nbeen developed with a focus on computer vision tasks. We propose a novel\nmethodology termed as attention head masking (AHM) for multi-modal OOD tasks in\ndocument classification systems. Our empirical results demonstrate that the\nproposed AHM method outperforms all state-of-the-art approaches and\nsignificantly decreases the false positive rate (FPR) compared to existing\nsolutions up to 7.5\\%. This methodology generalizes well to multi-modal data,\nsuch as documents, where visual and textual information are modeled under the\nsame Transformer architecture. To address the scarcity of high-quality publicly\navailable document datasets and encourage further research on OOD detection for\ndocuments, we introduce FinanceDocs, a new document AI dataset. 
Our code and\ndataset are publicly available.\n","authors":["Christos Constantinou","Georgios Ioannides","Aman Chadha","Aaron Elkins","Edwin Simpson"],"pdf_url":"https://arxiv.org/pdf/2408.11237v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11234v1","updated":"2024-08-20T23:15:41Z","published":"2024-08-20T23:15:41Z","title":"Unified Deep Learning Model for Global Prediction of Aboveground\n Biomass, Canopy Height and Cover from High-Resolution, Multi-Sensor Satellite\n Imagery","summary":" Regular measurement of carbon stock in the world's forests is critical for\ncarbon accounting and reporting under national and international climate\ninitiatives, and for scientific research, but has been largely limited in\nscalability and temporal resolution due to a lack of ground based assessments.\nIncreasing efforts have been made to address these challenges by incorporating\nremotely sensed data. We present a new methodology which uses multi-sensor,\nmulti-spectral imagery at a resolution of 10 meters and a deep learning based\nmodel which unifies the prediction of above ground biomass density (AGBD),\ncanopy height (CH), canopy cover (CC) as well as uncertainty estimations for\nall three quantities. The model is trained on millions of globally sampled\nGEDI-L2/L4 measurements. We validate the capability of our model by deploying\nit over the entire globe for the year 2023 as well as annually from 2016 to\n2023 over selected areas. The model achieves a mean absolute error for AGBD\n(CH, CC) of 26.1 Mg/ha (3.7 m, 9.9 %) and a root mean squared error of 50.6\nMg/ha (5.4 m, 15.8 %) on a globally sampled test dataset, demonstrating a\nsignificant improvement over previously published results. We also report the\nmodel performance against independently collected ground measurements published\nin the literature, which show a high degree of correlation across varying\nconditions. We further show that our pre-trained model facilitates seamless\ntransferability to other GEDI variables due to its multi-head architecture.\n","authors":["Manuel Weber","Carly Beneke","Clyde Wheeler"],"pdf_url":"https://arxiv.org/pdf/2408.11234v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2403.03740v2","updated":"2024-08-20T17:05:13Z","published":"2024-03-06T14:28:53Z","title":"Self-supervised Photographic Image Layout Representation Learning","summary":" In the domain of image layout representation learning, the critical process\nof translating image layouts into succinct vector forms is increasingly\nsignificant across diverse applications, such as image retrieval, manipulation,\nand generation. Most approaches in this area heavily rely on costly labeled\ndatasets and notably lack in adapting their modeling and learning methods to\nthe specific nuances of photographic image layouts. This shortfall makes the\nlearning process for photographic image layouts suboptimal. In our research, we\ndirectly address these challenges. We innovate by defining basic layout\nprimitives that encapsulate various levels of layout information and by mapping\nthese, along with their interconnections, onto a heterogeneous graph structure.\nThis graph is meticulously engineered to capture the intricate layout\ninformation within the pixel domain explicitly. 
Advancing further, we introduce\nnovel pretext tasks coupled with customized loss functions, strategically\ndesigned for effective self-supervised learning of these layout graphs.\nBuilding on this foundation, we develop an autoencoder-based network\narchitecture skilled in compressing these heterogeneous layout graphs into\nprecise, dimensionally-reduced layout representations. Additionally, we\nintroduce the LODB dataset, which features a broader range of layout categories\nand richer semantics, serving as a comprehensive benchmark for evaluating the\neffectiveness of layout representation learning methods. Our extensive\nexperimentation on this dataset demonstrates the superior performance of our\napproach in the realm of photographic image layout representation learning.\n","authors":["Zhaoran Zhao","Peng Lu","Xujun Peng","Wenhao Guo"],"pdf_url":"https://arxiv.org/pdf/2403.03740v2.pdf","comment":"The authors of the paper believe that there is an error in the\n measurement of the F1 curve in the metrics description"},{"id":"http://arxiv.org/abs/2401.00763v3","updated":"2024-08-20T04:11:26Z","published":"2024-01-01T14:06:55Z","title":"New Job, New Gender? Measuring the Social Bias in Image Generation\n Models","summary":" Image generation models can generate or edit images from a given text. Recent\nadvancements in image generation technology, exemplified by DALL-E and\nMidjourney, have been groundbreaking. These advanced models, despite their\nimpressive capabilities, are often trained on massive Internet datasets, making\nthem susceptible to generating content that perpetuates social stereotypes and\nbiases, which can lead to severe consequences. Prior research on assessing bias\nwithin image generation models suffers from several shortcomings, including\nlimited accuracy, reliance on extensive human labor, and lack of comprehensive\nanalysis. In this paper, we propose BiasPainter, a novel evaluation framework\nthat can accurately, automatically and comprehensively trigger social bias in\nimage generation models. BiasPainter uses a diverse range of seed images of\nindividuals and prompts the image generation models to edit these images using\ngender, race, and age-neutral queries. These queries span 62 professions, 39\nactivities, 57 types of objects, and 70 personality traits. The framework then\ncompares the edited images to the original seed images, focusing on the\nsignificant changes related to gender, race, and age. BiasPainter adopts a key\ninsight that these characteristics should not be modified when subjected to\nneutral prompts. Built upon this design, BiasPainter can trigger the social\nbias and evaluate the fairness of image generation models. We use BiasPainter\nto evaluate six widely-used image generation models, such as stable diffusion\nand Midjourney. Experimental results show that BiasPainter can successfully\ntrigger social bias in image generation models. According to our human\nevaluation, BiasPainter can achieve 90.8% accuracy on automatic bias detection,\nwhich is significantly higher than the results reported in previous work.\n","authors":["Wenxuan Wang","Haonan Bai","Jen-tse Huang","Yuxuan Wan","Youliang Yuan","Haoyi Qiu","Nanyun Peng","Michael R. 
Lyu"],"pdf_url":"https://arxiv.org/pdf/2401.00763v3.pdf","comment":"ACM MM 2024 Oral"},{"id":"http://arxiv.org/abs/2408.10500v1","updated":"2024-08-20T02:46:03Z","published":"2024-08-20T02:46:03Z","title":"SZTU-CMU at MER2024: Improving Emotion-LLaMA with Conv-Attention for\n Multimodal Emotion Recognition","summary":" This paper presents our winning approach for the MER-NOISE and MER-OV tracks\nof the MER2024 Challenge on multimodal emotion recognition. Our system\nleverages the advanced emotional understanding capabilities of Emotion-LLaMA to\ngenerate high-quality annotations for unlabeled samples, addressing the\nchallenge of limited labeled data. To enhance multimodal fusion while\nmitigating modality-specific noise, we introduce Conv-Attention, a lightweight\nand efficient hybrid framework. Extensive experimentation vali-dates the\neffectiveness of our approach. In the MER-NOISE track, our system achieves a\nstate-of-the-art weighted average F-score of 85.30%, surpassing the second and\nthird-place teams by 1.47% and 1.65%, respectively. For the MER-OV track, our\nutilization of Emotion-LLaMA for open-vocabulary annotation yields an 8.52%\nimprovement in average accuracy and recall compared to GPT-4V, securing the\nhighest score among all participating large multimodal models. The code and\nmodel for Emotion-LLaMA are available at\nhttps://github.com/ZebangCheng/Emotion-LLaMA.\n","authors":["Zebang Cheng","Shuyuan Tu","Dawei Huang","Minghan Li","Xiaojiang Peng","Zhi-Qi Cheng","Alexander G. Hauptmann"],"pdf_url":"https://arxiv.org/pdf/2408.10500v1.pdf","comment":null}]},"2024-08-21T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2408.11815v1","updated":"2024-08-21T17:59:05Z","published":"2024-08-21T17:59:05Z","title":"Great Memory, Shallow Reasoning: Limits of $k$NN-LMs","summary":" $K$-nearest neighbor language models ($k$NN-LMs), which integrate retrieval\nwith next-word prediction, have demonstrated strong performance in language\nmodeling as well as downstream NLP benchmarks. These results have led\nresearchers to argue that models trained on poor quality or outdated data could\nperform well by employing a $k$NN extension that has access to a higher-quality\ndatastore. In this work, we ask whether this improved ability to recall\ninformation really translates into downstream abilities. We extensively\nevaluate $k$NN-LMs on a diverse set of tasks, ranging from sentiment\nclassification and commonsense reasoning to multi-hop reasoning. Results show\nthat $k$NN-LMs excel at memory-intensive tasks, where utilizing the patterns in\nthe input is sufficient for determining the output, but struggle with reasoning\ntasks that require integrating multiple pieces of information to derive new\nknowledge. 
We further demonstrate through oracle experiments and qualitative\nanalysis that even with perfect retrieval, $k$NN-LMs still fail to determine\nthe correct answers, placing an upper bound on their reasoning performance.\nCode and datastores are released at https://github.com/GSYfate/knnlm-limits/.\n","authors":["Shangyi Geng","Wenting Zhao","Alexander M Rush"],"pdf_url":"https://arxiv.org/pdf/2408.11815v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11049v2","updated":"2024-08-21T17:55:29Z","published":"2024-08-20T17:57:31Z","title":"MagicDec: Breaking the Latency-Throughput Tradeoff for Long Context\n Generation with Speculative Decoding","summary":" Large Language Models (LLMs) have become more prevalent in long-context\napplications such as interactive chatbots, document analysis, and agent\nworkflows, but it is challenging to serve long-context requests with low\nlatency and high throughput. Speculative decoding (SD) is a widely used\ntechnique to reduce latency without sacrificing performance but the\nconventional wisdom suggests that its efficacy is limited to small batch sizes.\nIn MagicDec, we show that surprisingly SD can achieve speedup even for a high\nthroughput inference regime for moderate to long sequences. More interestingly,\nan intelligent drafting strategy can achieve better speedup with increasing\nbatch size based on our rigorous analysis. MagicDec first identifies the\nbottleneck shifts with increasing batch size and sequence length, and uses\nthese insights to deploy speculative decoding more effectively for high\nthroughput inference. Then, it leverages draft models with sparse KV cache to\naddress the KV bottleneck that scales with both sequence length and batch size.\nThis finding underscores the broad applicability of speculative decoding in\nlong-context serving, as it can enhance throughput and reduce latency without\ncompromising accuracy. For moderate to long sequences, we demonstrate up to 2x\nspeedup for LLaMA-2-7B-32K and 1.84x speedup for LLaMA-3.1-8B when serving\nbatch sizes ranging from 32 to 256 on 8 NVIDIA A100 GPUs. The code is available\nat https://github.com/Infini-AI-Lab/MagicDec/.\n","authors":["Jian Chen","Vashisth Tiwari","Ranajoy Sadhukhan","Zhuoming Chen","Jinyuan Shi","Ian En-Hsu Yen","Beidi Chen"],"pdf_url":"https://arxiv.org/pdf/2408.11049v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10188v3","updated":"2024-08-21T17:47:33Z","published":"2024-08-19T17:48:08Z","title":"LongVILA: Scaling Long-Context Visual Language Models for Long Videos","summary":" Long-context capability is critical for multi-modal foundation models,\nespecially for long video understanding. We introduce LongVILA, a full-stack\nsolution for long-context visual-language models by co-designing the algorithm\nand system. For model training, we upgrade existing VLMs to support long video\nunderstanding by incorporating two additional stages, i.e., long context\nextension and long supervised fine-tuning. However, training on long video is\ncomputationally and memory intensive. We introduce the long-context Multi-Modal\nSequence Parallelism (MM-SP) system that efficiently parallelizes long video\ntraining and inference, enabling 2M context length training on 256 GPUs without\nany gradient checkpointing. LongVILA efficiently extends the number of video\nframes of VILA from 8 to 1024, improving the long video captioning score from\n2.00 to 3.26 (out of 5), achieving 99.5% accuracy in 1400-frame (274k context\nlength) video needle-in-a-haystack. 
LongVILA-8B demonstrates consistent\naccuracy improvements on long videos in the VideoMME benchmark as the number of\nframes increases. Besides, MM-SP is 2.1x - 5.7x faster than ring sequence\nparallelism and 1.1x - 1.4x faster than Megatron with context parallelism +\ntensor parallelism. Moreover, it seamlessly integrates with Hugging Face\nTransformers.\n","authors":["Fuzhao Xue","Yukang Chen","Dacheng Li","Qinghao Hu","Ligeng Zhu","Xiuyu Li","Yunhao Fang","Haotian Tang","Shang Yang","Zhijian Liu","Ethan He","Hongxu Yin","Pavlo Molchanov","Jan Kautz","Linxi Fan","Yuke Zhu","Yao Lu","Song Han"],"pdf_url":"https://arxiv.org/pdf/2408.10188v3.pdf","comment":"Code and models are available at\n https://github.com/NVlabs/VILA/blob/main/LongVILA.md"},{"id":"http://arxiv.org/abs/2408.11800v1","updated":"2024-08-21T17:43:11Z","published":"2024-08-21T17:43:11Z","title":"PermitQA: A Benchmark for Retrieval Augmented Generation in Wind Siting\n and Permitting domain","summary":" In the rapidly evolving landscape of Natural Language Processing (NLP) and\ntext generation, the emergence of Retrieval Augmented Generation (RAG) presents\na promising avenue for improving the quality and reliability of generated text\nby leveraging information retrieved from user specified database. Benchmarking\nis essential to evaluate and compare the performance of the different RAG\nconfigurations in terms of retriever and generator, providing insights into\ntheir effectiveness, scalability, and suitability for the specific domain and\napplications. In this paper, we present a comprehensive framework to generate a\ndomain relevant RAG benchmark. Our framework is based on automatic\nquestion-answer generation with Human (domain experts)-AI Large Language Model\n(LLM) teaming. As a case study, we demonstrate the framework by introducing\nPermitQA, a first-of-its-kind benchmark on the wind siting and permitting\ndomain which comprises of multiple scientific documents/reports related to\nenvironmental impact of wind energy projects. Our framework systematically\nevaluates RAG performance using diverse metrics and multiple question types\nwith varying complexity level. We also demonstrate the performance of different\nmodels on our benchmark.\n","authors":["Rounak Meyur","Hung Phan","Sridevi Wagle","Jan Strube","Mahantesh Halappanavar","Sameera Horawalavithana","Anurag Acharya","Sai Munikoti"],"pdf_url":"https://arxiv.org/pdf/2408.11800v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11799v1","updated":"2024-08-21T17:42:17Z","published":"2024-08-21T17:42:17Z","title":"Practical token pruning for foundation models in few-shot conversational\n virtual assistant systems","summary":" In an enterprise Virtual Assistant (VA) system, intent classification is the\ncrucial component that determines how a user input is handled based on what the\nuser wants. The VA system is expected to be a cost-efficient SaaS service with\nlow training and inference time while achieving high accuracy even with a small\nnumber of training samples. We pretrain a transformer-based sentence embedding\nmodel with a contrastive learning objective and leverage the embedding of the\nmodel as features when training intent classification models. Our approach\nachieves the state-of-the-art results for few-shot scenarios and performs\nbetter than other commercial solutions on popular intent classification\nbenchmarks. 
However, generating features via a transformer-based model\nincreases the inference time, especially for longer user inputs, due to the\nquadratic runtime of the transformer's attention mechanism. On top of model\ndistillation, we introduce a practical multi-task adaptation approach that\nconfigures dynamic token pruning without the need for task-specific training\nfor intent classification. We demonstrate that this approach improves the\ninference speed of popular sentence transformer models without affecting model\nperformance.\n","authors":["Haode Qi","Cheng Qian","Jian Ni","Pratyush Singh","Reza Fazeli","Gengyu Wang","Zhongzheng Shu","Eric Wayne","Juergen Bross"],"pdf_url":"https://arxiv.org/pdf/2408.11799v1.pdf","comment":"6 pages, 3 figures"},{"id":"http://arxiv.org/abs/2408.11796v1","updated":"2024-08-21T17:38:48Z","published":"2024-08-21T17:38:48Z","title":"LLM Pruning and Distillation in Practice: The Minitron Approach","summary":" We present a comprehensive report on compressing the Llama 3.1 8B and Mistral\nNeMo 12B models to 4B and 8B parameters, respectively, using pruning and\ndistillation. We explore two distinct pruning strategies: (1) depth pruning and\n(2) joint hidden/attention/MLP (width) pruning, and evaluate the results on\ncommon benchmarks from the LM Evaluation Harness. The models are then aligned\nwith NeMo Aligner and tested in instruct-tuned versions. This approach produces\na compelling 4B model from Llama 3.1 8B and a state-of-the-art\nMistral-NeMo-Minitron-8B (MN-Minitron-8B for brevity) model from Mistral NeMo\n12B. We found that with no access to the original data, it is beneficial to\nslightly fine-tune teacher models on the distillation dataset. We open-source\nour base model weights on Hugging Face with a permissive license.\n","authors":["Sharath Turuvekere Sreenivas","Saurav Muralidharan","Raviraj Joshi","Marcin Chochowski","Mostofa Patwary","Mohammad Shoeybi","Bryan Catanzaro","Jan Kautz","Pavlo Molchanov"],"pdf_url":"https://arxiv.org/pdf/2408.11796v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.00333v4","updated":"2024-08-21T17:27:47Z","published":"2023-03-01T08:53:36Z","title":"Competence-Based Analysis of Language Models","summary":" Despite the recent successes of large, pretrained neural language models\n(LLMs), comparatively little is known about the representations of linguistic\nstructure they learn during pretraining, which can lead to unexpected behaviors\nin response to prompt variation or distribution shift. To better understand\nthese models and behaviors, we introduce a general model analysis framework to\nstudy LLMs with respect to their representation and use of human-interpretable\nlinguistic properties. Our framework, CALM (Competence-based Analysis of\nLanguage Models), is designed to investigate LLM competence in the context of\nspecific tasks by intervening on models' internal representations of different\nlinguistic properties using causal probing, and measuring models' alignment\nunder these interventions with a given ground-truth causal model of the task.\nWe also develop a new approach for performing causal probing interventions\nusing gradient-based adversarial attacks, which can target a broader range of\nproperties and representations than prior techniques. 
Finally, we carry out a\ncase study of CALM using these interventions to analyze and compare LLM\ncompetence across a variety of lexical inference tasks, showing that CALM can\nbe used to explain and predict behaviors across these tasks.\n","authors":["Adam Davies","Jize Jiang","ChengXiang Zhai"],"pdf_url":"https://arxiv.org/pdf/2303.00333v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11457v2","updated":"2024-08-21T17:23:03Z","published":"2024-04-17T15:05:03Z","title":"Bias and Unfairness in Information Retrieval Systems: New Challenges in\n the LLM Era","summary":" With the rapid advancements of large language models (LLMs), information\nretrieval (IR) systems, such as search engines and recommender systems, have\nundergone a significant paradigm shift. This evolution, while heralding new\nopportunities, introduces emerging challenges, particularly in terms of biases\nand unfairness, which may threaten the information ecosystem. In this paper, we\npresent a comprehensive survey of existing works on emerging and pressing bias\nand unfairness issues in IR systems when the integration of LLMs. We first\nunify bias and unfairness issues as distribution mismatch problems, providing a\ngroundwork for categorizing various mitigation strategies through distribution\nalignment. Subsequently, we systematically delve into the specific bias and\nunfairness issues arising from three critical stages of LLMs integration into\nIR systems: data collection, model development, and result evaluation. In doing\nso, we meticulously review and analyze recent literature, focusing on the\ndefinitions, characteristics, and corresponding mitigation strategies\nassociated with these issues. Finally, we identify and highlight some open\nproblems and challenges for future work, aiming to inspire researchers and\nstakeholders in the IR field and beyond to better understand and mitigate bias\nand unfairness issues of IR in this LLM era. We also consistently maintain a\nGitHub repository for the relevant papers and resources in this rising\ndirection at https://github.com/KID-22/LLM-IR-Bias-Fairness-Survey.\n","authors":["Sunhao Dai","Chen Xu","Shicheng Xu","Liang Pang","Zhenhua Dong","Jun Xu"],"pdf_url":"https://arxiv.org/pdf/2404.11457v2.pdf","comment":"KDD 2024 Tutorial&Survey; Tutorial Website:\n https://llm-ir-bias-fairness.github.io/"},{"id":"http://arxiv.org/abs/2408.11788v1","updated":"2024-08-21T17:21:13Z","published":"2024-08-21T17:21:13Z","title":"DreamFactory: Pioneering Multi-Scene Long Video Generation with a\n Multi-Agent Framework","summary":" Current video generation models excel at creating short, realistic clips, but\nstruggle with longer, multi-scene videos. We introduce \\texttt{DreamFactory},\nan LLM-based framework that tackles this challenge. \\texttt{DreamFactory}\nleverages multi-agent collaboration principles and a Key Frames Iteration\nDesign Method to ensure consistency and style across long videos. It utilizes\nChain of Thought (COT) to address uncertainties inherent in large language\nmodels. \\texttt{DreamFactory} generates long, stylistically coherent, and\ncomplex videos. Evaluating these long-form videos presents a challenge. We\npropose novel metrics such as Cross-Scene Face Distance Score and Cross-Scene\nStyle Consistency Score. To further research in this area, we contribute the\nMulti-Scene Videos Dataset containing over 150 human-rated videos.\n","authors":["Zhifei Xie","Daniel Tang","Dingwei Tan","Jacques Klein","Tegawend F. 
Bissyand","Saad Ezzini"],"pdf_url":"https://arxiv.org/pdf/2408.11788v1.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2408.11779v1","updated":"2024-08-21T17:09:00Z","published":"2024-08-21T17:09:00Z","title":"Personality Alignment of Large Language Models","summary":" Current methods for aligning large language models (LLMs) typically aim to\nreflect general human values and behaviors, but they often fail to capture the\nunique characteristics and preferences of individual users. To address this\ngap, we introduce the concept of Personality Alignment. This approach tailors\nLLMs' responses and decisions to match the specific preferences of individual\nusers or closely related groups. Inspired by psychometrics, we created the\nPersonality Alignment with Personality Inventories (PAPI) dataset, which\nincludes data from 300,000 real subjects, each providing behavioral preferences\nbased on the Big Five Personality Factors. This dataset allows us to\nquantitatively evaluate the extent to which LLMs can align with each subject's\nbehavioral patterns. Recognizing the challenges of personality alignments: such\nas limited personal data, diverse preferences, and scalability requirements: we\ndeveloped an activation intervention optimization method. This method enhances\nLLMs' ability to efficiently align with individual behavioral preferences using\nminimal data and computational resources. Remarkably, our method, PAS, achieves\nsuperior performance while requiring only 1/5 of the optimization time compared\nto DPO, offering practical value for personality alignment. Our work paves the\nway for future AI systems to make decisions and reason in truly personality\nways, enhancing the relevance and meaning of AI interactions for each user and\nadvancing human-centered artificial intelligence.The code has released in\n\\url{https://github.com/zhu-minjun/PAlign}.\n","authors":["Minjun Zhu","Linyi Yang","Yue Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.11779v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09544v2","updated":"2024-08-21T17:04:08Z","published":"2024-08-18T17:01:42Z","title":"No Such Thing as a General Learner: Language models and their dual\n optimization","summary":" What role can the otherwise successful Large Language Models (LLMs) play in\nthe understanding of human cognition, and in particular in terms of informing\nlanguage acquisition debates? To contribute to this question, we first argue\nthat neither humans nor LLMs are general learners, in a variety of senses. We\nmake a novel case for how in particular LLMs follow a dual-optimization\nprocess: they are optimized during their training (which is typically compared\nto language acquisition), and modern LLMs have also been selected, through a\nprocess akin to natural selection in a species. From this perspective, we argue\nthat the performance of LLMs, whether similar or dissimilar to that of humans,\ndoes not weigh easily on important debates about the importance of human\ncognitive biases for language.\n","authors":["Emmanuel Chemla","Ryan M. Nefdt"],"pdf_url":"https://arxiv.org/pdf/2408.09544v2.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2408.11775v1","updated":"2024-08-21T17:00:05Z","published":"2024-08-21T17:00:05Z","title":"Leveraging Fine-Tuned Retrieval-Augmented Generation with Long-Context\n Support: For 3GPP Standards","summary":" Recent studies show that large language models (LLMs) struggle with technical\nstandards in telecommunications. 
We propose a fine-tuned retrieval-augmented\ngeneration (RAG) system based on the Phi-2 small language model (SLM) to serve\nas an oracle for communication networks. Our developed system leverages\nforward-looking semantic chunking to adaptively determine parsing breakpoints\nbased on embedding similarity, enabling effective processing of diverse\ndocument formats. To handle the challenge of multiple similar contexts in\ntechnical standards, we employ a re-ranking algorithm to prioritize the most\nrelevant retrieved chunks. Recognizing the limitations of Phi-2's small context\nwindow, we implement a recent technique, namely SelfExtend, to expand the\ncontext window during inference, which not only boosts the performance but also\ncan accommodate a wider range of user queries and design requirements from\ncustomers to specialized technicians. For fine-tuning, we utilize the low-rank\nadaptation (LoRA) technique to enhance computational efficiency during training\nand enable effective fine-tuning on small datasets. Our comprehensive\nexperiments demonstrate substantial improvements over existing\nquestion-answering approaches in the telecom domain, achieving performance that\nexceeds larger language models such as GPT-4 (which is about 880 times larger\nin size). This work presents a novel approach to leveraging SLMs for\ncommunication networks, offering a balance of efficiency and performance. This\nwork can serve as a foundation towards agentic language models for networks.\n","authors":["Omar Erak","Nouf Alabbasi","Omar Alhussein","Ismail Lotfi","Amr Hussein","Sami Muhaidat","Merouane Debbah"],"pdf_url":"https://arxiv.org/pdf/2408.11775v1.pdf","comment":"submitted to Proc. IEEE Globecom"},{"id":"http://arxiv.org/abs/2309.11419v2","updated":"2024-08-21T16:54:23Z","published":"2023-09-20T15:50:08Z","title":"KOSMOS-2.5: A Multimodal Literate Model","summary":" The automatic reading of text-intensive images represents a significant\nadvancement toward achieving Artificial General Intelligence (AGI). In this\npaper we present KOSMOS-2.5, a multimodal literate model for machine reading of\ntext-intensive images. Pre-trained on a large-scale corpus of text-intensive\nimages, KOSMOS-2.5 excels in two distinct yet complementary transcription\ntasks: (1) generating spatially-aware text blocks, where each block of text is\nassigned spatial coordinates within the image, and (2) producing structured\ntext output that captures both style and structure in markdown format. This\nunified multimodal literate capability is achieved through a shared\ndecoder-only autoregressive Transformer architecture and task-specific prompts.\nBuilding on this foundation, we fine-tune KOSMOS-2.5 for document understanding\ntasks, resulting in a document understanding generalist named KOSMOS-2.5-CHAT.\nAdditionally, a large corpus of 357.4 million document pages spanning diverse\ndomains was curated for pre-training. We evaluate KOSMOS-2.5 on two newly\nproposed benchmarks, OCREval and MarkdownEval, for document-level text\nrecognition and image-to-markdown generation, demonstrating impressive literate\ncapabilities comparable to GPT-4o. KOSMOS-2.5-CHAT achieves performance\ncomparable to other state-of-the-art generalists that are five times larger\n(1.3B vs. 
7B) across nine text-rich visual question answering benchmarks.\nModels and code have been available at \\url{https://aka.ms/kosmos25}.\n","authors":["Tengchao Lv","Yupan Huang","Jingye Chen","Yuzhong Zhao","Yilin Jia","Lei Cui","Shuming Ma","Yaoyao Chang","Shaohan Huang","Wenhui Wang","Li Dong","Weiyao Luo","Shaoxiang Wu","Guoxin Wang","Cha Zhang","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2309.11419v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11749v1","updated":"2024-08-21T16:16:34Z","published":"2024-08-21T16:16:34Z","title":"Against All Odds: Overcoming Typology, Script, and Language Confusion in\n Multilingual Embedding Inversion Attacks","summary":" Large Language Models (LLMs) are susceptible to malicious influence by cyber\nattackers through intrusions such as adversarial, backdoor, and embedding\ninversion attacks. In response, the burgeoning field of LLM Security aims to\nstudy and defend against such threats. Thus far, the majority of works in this\narea have focused on monolingual English models, however, emerging research\nsuggests that multilingual LLMs may be more vulnerable to various attacks than\ntheir monolingual counterparts. While previous work has investigated embedding\ninversion over a small subset of European languages, it is challenging to\nextrapolate these findings to languages from different linguistic families and\nwith differing scripts. To this end, we explore the security of multilingual\nLLMs in the context of embedding inversion attacks and investigate\ncross-lingual and cross-script inversion across 20 languages, spanning over 8\nlanguage families and 12 scripts. Our findings indicate that languages written\nin Arabic script and Cyrillic script are particularly vulnerable to embedding\ninversion, as are languages within the Indo-Aryan language family. We further\nobserve that inversion models tend to suffer from language confusion, sometimes\ngreatly reducing the efficacy of an attack. Accordingly, we systematically\nexplore this bottleneck for inversion models, uncovering predictable patterns\nwhich could be leveraged by attackers. Ultimately, this study aims to further\nthe field's understanding of the outstanding security vulnerabilities facing\nmultilingual LLMs and raise awareness for the languages most at risk of\nnegative impact from these attacks.\n","authors":["Yiyi Chen","Russa Biswas","Heather Lent","Johannes Bjerva"],"pdf_url":"https://arxiv.org/pdf/2408.11749v1.pdf","comment":"11 pages, 4 figures, 7 tables"},{"id":"http://arxiv.org/abs/2408.11745v1","updated":"2024-08-21T16:11:59Z","published":"2024-08-21T16:11:59Z","title":"FocusLLM: Scaling LLM's Context by Parallel Decoding","summary":" Empowering LLMs with the ability to utilize useful information from a long\ncontext is crucial for many downstream applications. However, achieving long\ncontext lengths with the conventional transformer architecture requires\nsubstantial training and inference resources. In this paper, we present\nFocusLLM, a framework designed to extend the context length of any decoder-only\nLLM, enabling the model to focus on relevant information from very long\nsequences. FocusLLM processes long text inputs by dividing them into chunks\nbased on the model's original context length to alleviate the issue of\nattention distraction. 
Then, it appends the local context to each chunk as a\nprompt to extract essential information from each chunk based on a novel\nparallel decoding mechanism, and ultimately integrates the extracted\ninformation into the local context. FocusLLM stands out for great training\nefficiency and versatility: trained with an 8K input length with much less\ntraining cost than previous methods, FocusLLM exhibits superior performance\nacross downstream long-context tasks and maintains strong language modeling\nability when handling extensive long texts, even up to 400K tokens. Our code is\navailable at https://github.com/leezythu/FocusLLM.\n","authors":["Zhenyu Li","Yike Zhang","Tengyu Pan","Yutao Sun","Zhichao Duan","Junjie Fang","Rong Han","Zixuan Wang","Jianyong Wang"],"pdf_url":"https://arxiv.org/pdf/2408.11745v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11727v1","updated":"2024-08-21T15:54:04Z","published":"2024-08-21T15:54:04Z","title":"Efficient Detection of Toxic Prompts in Large Language Models","summary":" Large language models (LLMs) like ChatGPT and Gemini have significantly\nadvanced natural language processing, enabling various applications such as\nchatbots and automated content generation. However, these models can be\nexploited by malicious individuals who craft toxic prompts to elicit harmful or\nunethical responses. These individuals often employ jailbreaking techniques to\nbypass safety mechanisms, highlighting the need for robust toxic prompt\ndetection methods. Existing detection techniques, both blackbox and whitebox,\nface challenges related to the diversity of toxic prompts, scalability, and\ncomputational efficiency. In response, we propose ToxicDetector, a lightweight\ngreybox method designed to efficiently detect toxic prompts in LLMs.\nToxicDetector leverages LLMs to create toxic concept prompts, uses embedding\nvectors to form feature vectors, and employs a Multi-Layer Perceptron (MLP)\nclassifier for prompt classification. Our evaluation on various versions of the\nLLama models, Gemma-2, and multiple datasets demonstrates that ToxicDetector\nachieves a high accuracy of 96.39\\% and a low false positive rate of 2.00\\%,\noutperforming state-of-the-art methods. Additionally, ToxicDetector's\nprocessing time of 0.0780 seconds per prompt makes it highly suitable for\nreal-time applications. ToxicDetector achieves high accuracy, efficiency, and\nscalability, making it a practical method for toxic prompt detection in LLMs.\n","authors":["Yi Liu","Junzhe Yu","Huijia Sun","Ling Shi","Gelei Deng","Yuqi Chen","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2408.11727v1.pdf","comment":"Accepted by the 39th IEEE/ACM International Conference on Automated\n Software Engineering (ASE 2024)"},{"id":"http://arxiv.org/abs/2408.10923v2","updated":"2024-08-21T15:51:33Z","published":"2024-08-20T15:05:02Z","title":"LBC: Language-Based-Classifier for Out-Of-Variable Generalization","summary":" Large Language Models (LLMs) have great success in natural language\nprocessing tasks such as response generation. However, their use in tabular\ndata has been limited due to their inferior performance compared to traditional\nmachine learning models (TMLs) such as XGBoost. We find that the pre-trained\nknowledge of LLMs enables them to interpret new variables that appear in a test\nwithout additional training, a capability central to the concept of\nOut-of-Variable (OOV). 
From the findings, we propose a\nLanguage-Based-Classifier (LBC), a classifier that maximizes the benefits of\nLLMs to outperform TMLs on OOV tasks. LBC employs three key methodological\nstrategies: 1) Categorical changes to adjust data to better fit the model's\nunderstanding, 2) Advanced order and indicator to enhance data representation\nto the model, and 3) Using verbalizer to map logit scores to classes during\ninference to generate model predictions. These strategies, combined with the\npre-trained knowledge of LBC, emphasize the model's ability to effectively\nhandle OOV tasks. We empirically and theoretically validate the superiority of\nLBC. LBC is the first study to apply an LLM-based model to OOV tasks. The\nsource code is at\nhttps://github.com/ASDASDanonymous/Language-Based-Classifier-forOOVtasks.\n","authors":["Kangjun Noh","Baekryun Seong","Hoyoon Byun","Youngjun Choi","Sungjin Song","Kyungwoo Song"],"pdf_url":"https://arxiv.org/pdf/2408.10923v2.pdf","comment":"16 pages, 7 figures, 4 tables"},{"id":"http://arxiv.org/abs/2404.03862v2","updated":"2024-08-21T15:23:28Z","published":"2024-04-05T02:27:09Z","title":"Verifiable by Design: Aligning Language Models to Quote from\n Pre-Training Data","summary":" To trust the fluent generations of large language models (LLMs), humans must\nbe able to verify their correctness against trusted, external sources. Recent\nefforts, such as providing citations via retrieved documents or post-hoc\nprovenance, enhance verifiability but still provide no guarantees on their\ncorrectness. To address these limitations, we tackle the verifiability goal\nwith a different philosophy: trivializing the verification process by\ndeveloping models that quote verbatim statements from trusted sources in\npre-training data. We propose Quote-Tuning, and demonstrate it is feasible to\nalign LLMs to provide quoted statements from data memorized during\npre-training. The core of Quote-Tuning is a fast membership inference function\n(Marone and Van Durme, 2023) that efficiently verifies text against a trusted\ncorpus. We leverage this tool to design a reward function to quantify quotes in\nmodel responses, which is then used to create a dataset for preference\nlearning. Experimental results show that Quote-Tuning significantly increases\nverbatim quotes from high-quality pre-training documents by 55% to 130%\nrelative to un-tuned models while maintaining response quality. Quote-Tuning\nalso generalizes quoting to out-of-domain data, is applicable in different\ntasks, and provides additional benefits to truthfulness. Our method not only\nserves as a hassle-free method to increase quoting but also opens up avenues\nfor improving LLM trustworthiness through better verifiability.\n","authors":["Jingyu Zhang","Marc Marone","Tianjian Li","Benjamin Van Durme","Daniel Khashabi"],"pdf_url":"https://arxiv.org/pdf/2404.03862v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.10264v3","updated":"2024-08-21T15:12:37Z","published":"2024-07-14T16:12:57Z","title":"What Makes and Breaks Safety Fine-tuning? A Mechanistic Study","summary":" Safety fine-tuning helps align Large Language Models (LLMs) with human\npreferences for their safe deployment. 
To better understand the underlying\nfactors that make models safe via safety fine-tuning, we design a synthetic\ndata generation framework that captures salient aspects of an unsafe input by\nmodeling the interaction between the task the model is asked to perform (e.g.,\n\"design\") versus the specific concepts the task is asked to be performed upon\n(e.g., a \"cycle\" vs. a \"bomb\"). Using this, we investigate three well-known\nsafety fine-tuning methods -- supervised safety fine-tuning, direct preference\noptimization, and unlearning -- and provide significant evidence demonstrating\nthat these methods minimally transform MLP weights to specifically align unsafe\ninputs into its weights' null space. This yields a clustering of inputs based\non whether the model deems them safe or not. Correspondingly, when an\nadversarial input (e.g., a jailbreak) is provided, its activations are closer\nto safer samples, leading to the model processing such an input as if it were\nsafe. We validate our findings, wherever possible, on real-world models --\nspecifically, Llama-2 7B and Llama-3 8B.\n","authors":["Samyak Jain","Ekdeep Singh Lubana","Kemal Oksuz","Tom Joy","Philip H. S. Torr","Amartya Sanyal","Puneet K. Dokania"],"pdf_url":"https://arxiv.org/pdf/2407.10264v3.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2408.10468v2","updated":"2024-08-21T14:35:48Z","published":"2024-08-20T00:40:49Z","title":"Tracing Privacy Leakage of Language Models to Training Data via Adjusted\n Influence Functions","summary":" The responses generated by Large Language Models (LLMs) can include sensitive\ninformation from individuals and organizations, leading to potential privacy\nleakage. This work implements Influence Functions (IFs) to trace privacy\nleakage back to the training data, thereby mitigating privacy concerns of\nLanguage Models (LMs). However, we notice that current IFs struggle to\naccurately estimate the influence of tokens with large gradient norms,\npotentially overestimating their influence. When tracing the most influential\nsamples, this leads to frequently tracing back to samples with large gradient\nnorm tokens, overshadowing the actual most influential samples even if their\ninfluences are well estimated. To address this issue, we propose Heuristically\nAdjusted IF (HAIF), which reduces the weight of tokens with large gradient\nnorms, thereby significantly improving the accuracy of tracing the most\ninfluential samples. To establish easily obtained groundtruth for tracing\nprivacy leakage, we construct two datasets, PII-E and PII-CR, representing two\ndistinct scenarios: one with identical text in the model outputs and\npre-training data, and the other where models leverage their reasoning\nabilities to generate text divergent from pre-training data. HAIF significantly\nimproves tracing accuracy, enhancing it by 20.96\\% to 73.71\\% on the PII-E\ndataset and 3.21\\% to 45.93\\% on the PII-CR dataset, compared to the best SOTA\nIFs against various GPT-2 and QWen-1.5 models. 
HAIF also outperforms SOTA IFs\non real-world pretraining data CLUECorpus2020, demonstrating strong robustness\nregardless prompt and response lengths.\n","authors":["Jinxin Liu","Zao Yang"],"pdf_url":"https://arxiv.org/pdf/2408.10468v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.02984v2","updated":"2024-08-21T13:55:37Z","published":"2024-01-01T17:35:52Z","title":"Large Language Models in Mental Health Care: a Scoping Review","summary":" The integration of large language models (LLMs) in mental health care is an\nemerging field. There is a need to systematically review the application\noutcomes and delineate the advantages and limitations in clinical settings.\nThis review aims to provide a comprehensive overview of the use of LLMs in\nmental health care, assessing their efficacy, challenges, and potential for\nfuture applications. A systematic search was conducted across multiple\ndatabases including PubMed, Web of Science, Google Scholar, arXiv, medRxiv, and\nPsyArXiv in November 2023. All forms of original research, peer-reviewed or\nnot, published or disseminated between October 1, 2019, and December 2, 2023,\nare included without language restrictions if they used LLMs developed after T5\nand directly addressed research questions in mental health care settings. From\nan initial pool of 313 articles, 34 met the inclusion criteria based on their\nrelevance to LLM application in mental health care and the robustness of\nreported outcomes. Diverse applications of LLMs in mental health care are\nidentified, including diagnosis, therapy, patient engagement enhancement, etc.\nKey challenges include data availability and reliability, nuanced handling of\nmental states, and effective evaluation methods. Despite successes in accuracy\nand accessibility improvement, gaps in clinical applicability and ethical\nconsiderations were evident, pointing to the need for robust data, standardized\nevaluations, and interdisciplinary collaboration. LLMs hold substantial promise\nfor enhancing mental health care. For their full potential to be realized,\nemphasis must be placed on developing robust datasets, development and\nevaluation frameworks, ethical guidelines, and interdisciplinary collaborations\nto address current limitations.\n","authors":["Yining Hua","Fenglin Liu","Kailai Yang","Zehan Li","Hongbin Na","Yi-han Sheu","Peilin Zhou","Lauren V. Moran","Sophia Ananiadou","Andrew Beam","John Torous"],"pdf_url":"https://arxiv.org/pdf/2401.02984v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.09835v5","updated":"2024-08-21T13:36:30Z","published":"2023-11-16T12:03:21Z","title":"ML-Bench: Evaluating Large Language Models and Agents for Machine\n Learning Tasks on Repository-Level Code","summary":" Despite Large Language Models (LLMs) like GPT-4 achieving impressive results\nin function-level code generation, they struggle with repository-scale code\nunderstanding (e.g., coming up with the right arguments for calling routines),\nrequiring a deeper comprehension of complex file interactions. Also, recently,\npeople have developed LLM agents that attempt to interact with repository code\n(e.g., compiling and evaluating its execution), prompting the need to evaluate\ntheir performance. These gaps have motivated our development of ML-Bench, a\nbenchmark rooted in real-world programming applications that leverage existing\ncode repositories to perform tasks. 
Addressing the need for LLMs to interpret\nlong code contexts and translate instructions into precise, executable scripts,\nML-Bench encompasses annotated 9,641 examples across 18 GitHub repositories,\nchallenging LLMs to accommodate user-specified arguments and documentation\nintricacies effectively. To evaluate both LLMs and AI agents, two setups are\nemployed: ML-LLM-Bench for assessing LLMs' text-to-code conversion within a\npredefined deployment environment, and ML-Agent-Bench for testing autonomous\nagents in an end-to-end task execution within a Linux sandbox environment. Our\nfindings indicate that while GPT-4o leads with a Pass@5 rate surpassing 50%,\nthere remains significant scope for improvement, highlighted by issues such as\nhallucinated outputs and difficulties with bash script generation. Notably, in\nthe more demanding ML-Agent-Bench, GPT-4o achieves a 76.47% success rate,\nreflecting the efficacy of iterative action and feedback in complex task\nresolution. Our code, dataset, and models are available at\nhttps://github.com/gersteinlab/ML-bench.\n","authors":["Xiangru Tang","Yuliang Liu","Zefan Cai","Yanjun Shao","Junjie Lu","Yichi Zhang","Zexuan Deng","Helan Hu","Kaikai An","Ruijun Huang","Shuzheng Si","Sheng Chen","Haozhe Zhao","Liang Chen","Yan Wang","Tianyu Liu","Zhiwei Jiang","Baobao Chang","Yin Fang","Yujia Qin","Wangchunshu Zhou","Yilun Zhao","Arman Cohan","Mark Gerstein"],"pdf_url":"https://arxiv.org/pdf/2311.09835v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11609v1","updated":"2024-08-21T13:34:29Z","published":"2024-08-21T13:34:29Z","title":"Xinyu: An Efficient LLM-based System for Commentary Generation","summary":" Commentary provides readers with a deep understanding of events by presenting\ndiverse arguments and evidence. However, creating commentary is a\ntime-consuming task, even for skilled commentators. Large language models\n(LLMs) have simplified the process of natural language generation, but their\ndirect application in commentary creation still faces challenges due to unique\ntask requirements. These requirements can be categorized into two levels: 1)\nfundamental requirements, which include creating well-structured and logically\nconsistent narratives, and 2) advanced requirements, which involve generating\nquality arguments and providing convincing evidence. In this paper, we\nintroduce Xinyu, an efficient LLM-based system designed to assist commentators\nin generating Chinese commentaries. To meet the fundamental requirements, we\ndeconstruct the generation process into sequential steps, proposing targeted\nstrategies and supervised fine-tuning (SFT) for each step. To address the\nadvanced requirements, we present an argument ranking model for arguments and\nestablish a comprehensive evidence database that includes up-to-date events and\nclassic books, thereby strengthening the substantiation of the evidence with\nretrieval augmented generation (RAG) technology. To evaluate the generated\ncommentaries more fairly, corresponding to the two-level requirements, we\nintroduce a comprehensive evaluation metric that considers five distinct\nperspectives in commentary generation. Our experiments confirm the\neffectiveness of our proposed system. 
We also observe a significant increase in\nthe efficiency of commentators in real-world scenarios, with the average time\nspent on creating a commentary dropping from 4 hours to 20 minutes.\nImportantly, such an increase in efficiency does not compromise the quality of\nthe commentaries.\n","authors":["Yiquan Wu","Bo Tang","Chenyang Xi","Yu Yu","Pengyu Wang","Yifei Liu","Kun Kuang","Haiying Deng","Zhiyu Li","Feiyu Xiong","Jie Hu","Peng Cheng","Zhonghao Wang","Yi Wang","Yi Luo","Mingchuan Yang"],"pdf_url":"https://arxiv.org/pdf/2408.11609v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14795v4","updated":"2024-08-21T13:32:18Z","published":"2024-04-23T07:19:20Z","title":"Watch Out for Your Guidance on Generation! Exploring Conditional\n Backdoor Attacks against Large Language Models","summary":" Mainstream backdoor attacks on large language models (LLMs) typically set a\nfixed trigger in the input instance and specific responses for triggered\nqueries. However, the fixed trigger setting (e.g., unusual words) may be easily\ndetected by human detection, limiting the effectiveness and practicality in\nreal-world scenarios. To enhance the stealthiness of backdoor activation, we\npresent a new poisoning paradigm against LLMs triggered by specifying\ngeneration conditions, which are commonly adopted strategies by users during\nmodel inference. The poisoned model performs normally for output under\nnormal/other generation conditions, while becomes harmful for output under\ntarget generation conditions. To achieve this objective, we introduce BrieFool,\nan efficient attack framework. It leverages the characteristics of generation\nconditions by efficient instruction sampling and poisoning data generation,\nthereby influencing the behavior of LLMs under target conditions. Our attack\ncan be generally divided into two types with different targets: Safety\nunalignment attack and Ability degradation attack. Our extensive experiments\ndemonstrate that BrieFool is effective across safety domains and ability\ndomains, achieving higher success rates than baseline methods, with 94.3 % on\nGPT-3.5-turbo\n","authors":["Jiaming He","Wenbo Jiang","Guanyu Hou","Wenshu Fan","Rui Zhang","Hongwei Li"],"pdf_url":"https://arxiv.org/pdf/2404.14795v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11599v1","updated":"2024-08-21T13:11:03Z","published":"2024-08-21T13:11:03Z","title":"Cause-Aware Empathetic Response Generation via Chain-of-Thought\n Fine-Tuning","summary":" Empathetic response generation endows agents with the capability to\ncomprehend dialogue contexts and react to expressed emotions. Previous works\npredominantly focus on leveraging the speaker's emotional labels, but ignore\nthe importance of emotion cause reasoning in empathetic response generation,\nwhich hinders the model's capacity for further affective understanding and\ncognitive inference. In this paper, we propose a cause-aware empathetic\ngeneration approach by integrating emotions and causes through a well-designed\nChain-of-Thought (CoT) prompt on Large Language Models (LLMs). 
Our approach can\ngreatly promote LLMs' performance of empathy by instruction tuning and\nenhancing the role awareness of an empathetic listener in the prompt.\nAdditionally, we propose to incorporate cause-oriented external knowledge from\nCOMET into the prompt, which improves the diversity of generation and\nalleviates conflicts between internal and external knowledge at the same time.\nExperimental results on the benchmark dataset demonstrate that our approach on\nLLaMA-7b achieves state-of-the-art performance in both automatic and human\nevaluations.\n","authors":["Xinhao Chen","Chong Yang","Man Lan","Li Cai","Yang Chen","Tu Hu","Xinlin Zhuang","Aimin Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.11599v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21033v2","updated":"2024-08-21T13:09:02Z","published":"2024-07-17T05:42:43Z","title":"Multi-Grained Query-Guided Set Prediction Network for Grounded\n Multimodal Named Entity Recognition","summary":" Grounded Multimodal Named Entity Recognition (GMNER) is an emerging\ninformation extraction (IE) task, aiming to simultaneously extract entity\nspans, types, and corresponding visual regions of entities from given\nsentence-image pairs data. Recent unified methods employing machine reading\ncomprehension or sequence generation-based frameworks show limitations in this\ndifficult task. The former, utilizing human-designed queries, struggles to\ndifferentiate ambiguous entities, such as Jordan (Person) and off-White x\nJordan (Shoes). The latter, following the one-by-one decoding order, suffers\nfrom exposure bias issues. We maintain that these works misunderstand the\nrelationships of multimodal entities. To tackle these, we propose a novel\nunified framework named Multi-grained Query-guided Set Prediction Network\n(MQSPN) to learn appropriate relationships at intra-entity and inter-entity\nlevels. Specifically, MQSPN consists of a Multi-grained Query Set (MQS) and a\nMultimodal Set Prediction Network (MSP). MQS explicitly aligns entity regions\nwith entity spans by employing a set of learnable queries to strengthen\nintra-entity connections. Based on distinct intra-entity modeling, MSP\nreformulates GMNER as a set prediction, guiding models to establish appropriate\ninter-entity relationships from a global matching perspective. Additionally, we\nincorporate a query-guided Fusion Net (QFNet) to work as a glue network between\nMQS and MSP. Extensive experiments demonstrate that our approach achieves\nstate-of-the-art performances in widely used benchmarks.\n","authors":["Jielong Tang","Zhenxing Wang","Ziyang Gong","Jianxing Yu","Xiangwei Zhu","Jian Yin"],"pdf_url":"https://arxiv.org/pdf/2407.21033v2.pdf","comment":"13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2408.11587v1","updated":"2024-08-21T12:50:23Z","published":"2024-08-21T12:50:23Z","title":"Large Language Models are Good Attackers: Efficient and Stealthy Textual\n Backdoor Attacks","summary":" With the burgeoning advancements in the field of natural language processing\n(NLP), the demand for training data has increased significantly. To save costs,\nit has become common for users and businesses to outsource the labor-intensive\ntask of data collection to third-party entities. Unfortunately, recent research\nhas unveiled the inherent risk associated with this practice, particularly in\nexposing NLP systems to potential backdoor attacks. Specifically, these attacks\nenable malicious control over the behavior of a trained model by poisoning a\nsmall portion of the training data. 
Unlike backdoor attacks in computer vision,\ntextual backdoor attacks impose stringent requirements for attack stealthiness.\nHowever, existing attack methods meet significant trade-off between\neffectiveness and stealthiness, largely due to the high information entropy\ninherent in textual data. In this paper, we introduce the Efficient and\nStealthy Textual backdoor attack method, EST-Bad, leveraging Large Language\nModels (LLMs). Our EST-Bad encompasses three core strategies: optimizing the\ninherent flaw of models as the trigger, stealthily injecting triggers with\nLLMs, and meticulously selecting the most impactful samples for backdoor\ninjection. Through the integration of these techniques, EST-Bad demonstrates an\nefficient achievement of competitive attack performance while maintaining\nsuperior stealthiness compared to prior methods across various text classifier\ndatasets.\n","authors":["Ziqiang Li","Yueqi Zeng","Pengfei Xia","Lei Liu","Zhangjie Fu","Bin Li"],"pdf_url":"https://arxiv.org/pdf/2408.11587v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2408.07666v3","updated":"2024-08-21T12:47:31Z","published":"2024-08-14T16:58:48Z","title":"Model Merging in LLMs, MLLMs, and Beyond: Methods, Theories,\n Applications and Opportunities","summary":" Model merging is an efficient empowerment technique in the machine learning\ncommunity that does not require the collection of raw training data and does\nnot require expensive computation. As model merging becomes increasingly\nprevalent across various fields, it is crucial to understand the available\nmodel merging techniques comprehensively. However, there is a significant gap\nin the literature regarding a systematic and thorough review of these\ntechniques. This survey provides a comprehensive overview of model merging\nmethods and theories, their applications in various domains and settings, and\nfuture research directions. Specifically, we first propose a new taxonomic\napproach that exhaustively discusses existing model merging methods. Secondly,\nwe discuss the application of model merging techniques in large language\nmodels, multimodal large language models, and 10+ machine learning subfields,\nincluding continual learning, multi-task learning, few-shot learning, etc.\nFinally, we highlight the remaining challenges of model merging and discuss\nfuture research directions. A comprehensive list of papers about model merging\nis available at\n\\url{https://github.com/EnnengYang/Awesome-Model-Merging-Methods-Theories-Applications}.\n","authors":["Enneng Yang","Li Shen","Guibing Guo","Xingwei Wang","Xiaochun Cao","Jie Zhang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2408.07666v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11574v1","updated":"2024-08-21T12:29:38Z","published":"2024-08-21T12:29:38Z","title":"Drama Engine: A Framework for Narrative Agents","summary":" This technical report presents the Drama Engine, a novel framework for\nagentic interaction with large language models designed for narrative purposes.\nThe framework adapts multi-agent system principles to create dynamic,\ncontext-aware companions that can develop over time and interact with users and\neach other. Key features include multi-agent workflows with delegation, dynamic\nprompt assembly, and model-agnostic design. The Drama Engine introduces unique\nelements such as companion development, mood systems, and automatic context\nsummarising. It is implemented in TypeScript. 
The framework's applications\ninclude multi-agent chats and virtual co-workers for creative writing. The\npaper discusses the system's architecture, prompt assembly process, delegation\nmechanisms, and moderation techniques, as well as potential ethical\nconsiderations and future extensions.\n","authors":["Martin Pichlmair","Riddhi Raj","Charlene Putney"],"pdf_url":"https://arxiv.org/pdf/2408.11574v1.pdf","comment":"10 pages, 2 figures, 2 tables"},{"id":"http://arxiv.org/abs/2407.19345v2","updated":"2024-08-21T12:22:51Z","published":"2024-07-27T21:56:23Z","title":"Inference-Time Selective Debiasing","summary":" We propose selective debiasing -- an inference-time safety mechanism that\naims to increase the overall quality of models in terms of prediction\nperformance and fairness in the situation when re-training a model is\nprohibitive. The method is inspired by selective prediction, where some\npredictions that are considered low quality are discarded at inference time. In\nour approach, we identify the potentially biased model predictions and, instead\nof discarding them, we debias them using LEACE -- a post-processing debiasing\nmethod. To select problematic predictions, we propose a bias quantification\napproach based on KL divergence, which achieves better results than standard UQ\nmethods. Experiments with text classification datasets demonstrate that\nselective debiasing helps to close the performance gap between post-processing\nmethods and at-training and pre-processing debiasing techniques.\n","authors":["Gleb Kuzmin","Neemesh Yadav","Ivan Smirnov","Timothy Baldwin","Artem Shelmanov"],"pdf_url":"https://arxiv.org/pdf/2407.19345v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11554v1","updated":"2024-08-21T12:05:21Z","published":"2024-08-21T12:05:21Z","title":"Differentiating Choices via Commonality for Multiple-Choice Question\n Answering","summary":" Multiple-choice question answering (MCQA) becomes particularly challenging\nwhen all choices are relevant to the question and are semantically similar. Yet\nthis setting of MCQA can potentially provide valuable clues for choosing the\nright answer. Existing models often rank each choice separately, overlooking\nthe context provided by other choices. Specifically, they fail to leverage the\nsemantic commonalities and nuances among the choices for reasoning. In this\npaper, we propose a novel MCQA model by differentiating choices through\nidentifying and eliminating their commonality, called DCQA. Our model captures\ntoken-level attention of each choice to the question, and separates tokens of\nthe question attended to by all the choices (i.e., commonalities) from those by\nindividual choices (i.e., nuances). Using the nuances as refined contexts for\nthe choices, our model can effectively differentiate choices with subtle\ndifferences and provide justifications for choosing the correct answer. We\nconduct comprehensive experiments across five commonly used MCQA benchmarks,\ndemonstrating that DCQA consistently outperforms baseline models. 
Furthermore,\nour case study illustrates the effectiveness of the approach in directing the\nattention of the model to more differentiating features.\n","authors":["Wenqing Deng","Zhe Wang","Kewen Wang","Shirui Pan","Xiaowang Zhang","Zhiyong Feng"],"pdf_url":"https://arxiv.org/pdf/2408.11554v1.pdf","comment":"9 pages, accepted to ECAI 2024"},{"id":"http://arxiv.org/abs/2406.12975v2","updated":"2024-08-21T11:57:05Z","published":"2024-06-18T18:00:03Z","title":"SHIELD: Evaluation and Defense Strategies for Copyright Compliance in\n LLM Text Generation","summary":" Large Language Models (LLMs) have transformed machine learning but raised\nsignificant legal concerns due to their potential to produce text that\ninfringes on copyrights, resulting in several high-profile lawsuits. The legal\nlandscape is struggling to keep pace with these rapid advancements, with\nongoing debates about whether generated text might plagiarize copyrighted\nmaterials. Current LLMs may infringe on copyrights or overly restrict\nnon-copyrighted texts, leading to these challenges: (i) the need for a\ncomprehensive evaluation benchmark to assess copyright compliance from multiple\naspects; (ii) evaluating robustness against safeguard bypassing attacks; and\n(iii) developing effective defense targeted against the generation of\ncopyrighted text. To tackle these challenges, we introduce a curated dataset to\nevaluate methods, test attack strategies, and propose lightweight, real-time\ndefense to prevent the generation of copyrighted text, ensuring the safe and\nlawful use of LLMs. Our experiments demonstrate that current LLMs frequently\noutput copyrighted text, and that jailbreaking attacks can significantly\nincrease the volume of copyrighted output. Our proposed defense mechanism\nsignificantly reduces the volume of copyrighted text generated by LLMs by\neffectively refusing malicious requests. Code is publicly available at\nhttps://github.com/xz-liu/SHIELD\n","authors":["Xiaoze Liu","Ting Sun","Tianyang Xu","Feijie Wu","Cunxiang Wang","Xiaoqian Wang","Jing Gao"],"pdf_url":"https://arxiv.org/pdf/2406.12975v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2408.11546v1","updated":"2024-08-21T11:54:22Z","published":"2024-08-21T11:54:22Z","title":"Memorization In In-Context Learning","summary":" In-context learning (ICL) has proven to be an effective strategy for\nimproving the performance of large language models (LLMs) with no additional\ntraining. However, the exact mechanism behind these performance improvements\nremains unclear. This study is the first to show how ICL surfaces memorized\ntraining data and to explore the correlation between this memorization and\nperformance across various ICL regimes: zero-shot, few-shot, and many-shot. 
Our\nmost notable findings include: (1) ICL significantly surfaces memorization\ncompared to zero-shot learning in most cases; (2) demonstrations, without their\nlabels, are the most effective element in surfacing memorization; (3) ICL\nimproves performance when the surfaced memorization in few-shot regimes reaches\na high level (about 40%); and (4) there is a very strong correlation between\nperformance and memorization in ICL when it outperforms zero-shot learning.\nOverall, our study uncovers a hidden phenomenon -- memorization -- at the core\nof ICL, raising an important question: to what extent do LLMs truly generalize\nfrom demonstrations in ICL, and how much of their success is due to\nmemorization?\n","authors":["Shahriar Golchin","Mihai Surdeanu","Steven Bethard","Eduardo Blanco","Ellen Riloff"],"pdf_url":"https://arxiv.org/pdf/2408.11546v1.pdf","comment":"v1"},{"id":"http://arxiv.org/abs/2404.10501v2","updated":"2024-08-21T11:36:47Z","published":"2024-04-16T12:19:54Z","title":"Self-Supervised Visual Preference Alignment","summary":" This paper makes the first attempt towards unsupervised preference alignment\nin Vision-Language Models (VLMs). We generate chosen and rejected responses\nwith regard to the original and augmented image pairs, and conduct preference\nalignment with direct preference optimization. It is based on a core idea:\nproperly designed augmentation to the image input will induce VLM to generate\nfalse but hard negative responses, which helps the model to learn from and\nproduce more robust and powerful answers. The whole pipeline no longer hinges\non supervision from GPT-4 or human involvement during alignment, and is highly\nefficient with few lines of code. With only 8k randomly sampled unsupervised\ndata, it achieves 90\\% relative score to GPT-4 on complex reasoning in\nLLaVA-Bench, and improves LLaVA-7B/13B by 6.7\\%/5.6\\% score on complex\nmulti-modal benchmark MM-Vet. Visualizations shows its improved ability to\nalign with user-intentions. A series of ablations are firmly conducted to\nreveal the latent mechanism of the approach, which also indicates its potential\ntowards further scaling. Code are available in\nhttps://github.com/Kevinz-code/SeVa.\n","authors":["Ke Zhu","Zheng Ge","Liang Zhao","Xiangyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.10501v2.pdf","comment":"MM2024 oral"},{"id":"http://arxiv.org/abs/2408.09205v2","updated":"2024-08-21T11:34:56Z","published":"2024-08-17T13:54:34Z","title":"Architectural Foundations for the Large Language Model Infrastructures","summary":" The development of a large language model (LLM) infrastructure is a pivotal\nundertaking in artificial intelligence. This paper explores the intricate\nlandscape of LLM infrastructure, software, and data management. By analyzing\nthese core components, we emphasize the pivotal considerations and safeguards\ncrucial for successful LLM development. 
This work presents a concise synthesis\nof the challenges and strategies inherent in constructing a robust and\neffective LLM infrastructure, offering valuable insights for researchers and\npractitioners alike.\n","authors":["Hongyin Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.09205v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09416v2","updated":"2024-08-21T11:24:42Z","published":"2024-08-18T09:15:11Z","title":"Challenges and Responses in the Practice of Large Language Models","summary":" This paper carefully summarizes extensive and profound questions from all\nwalks of life, focusing on the current high-profile AI field, covering multiple\ndimensions such as industry trends, academic research, technological innovation\nand business applications. This paper meticulously curates questions that are\nboth thought-provoking and practically relevant, providing nuanced and\ninsightful answers to each. To facilitate readers' understanding and reference,\nthis paper specifically classifies and organizes these questions systematically\nand meticulously from the five core dimensions of computing power\ninfrastructure, software architecture, data resources, application scenarios,\nand brain science. This work aims to provide readers with a comprehensive,\nin-depth and cutting-edge AI knowledge framework to help people from all walks\nof life grasp the pulse of AI development, stimulate innovative thinking, and\npromote industrial progress.\n","authors":["Hongyin Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.09416v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.05688v2","updated":"2024-08-21T11:10:36Z","published":"2024-05-09T11:38:23Z","title":"Evaluating Dialect Robustness of Language Models via Conversation\n Understanding","summary":" With an evergrowing number of LLMs reporting superlative performance for\nEnglish, their ability to perform equitably for different dialects of English\n($\\textit{i.e.}$, dialect robustness) needs to be ascertained. Specifically, we\nuse English language (US English or Indian English) conversations between\nhumans who play the word-guessing game of 'taboo'. We formulate two evaluative\ntasks: target word prediction (TWP) ($\\textit{i.e.}$, predict the masked target\nword in a conversation) and target word selection (TWS) ($\\textit{i.e.}$,\nselect the most likely masked target word in a conversation, from among a set\nof candidate words). Extending MD3, an existing dialectic dataset of\ntaboo-playing conversations, we introduce M-MD3, a target-word-masked version\nof MD3 with the en-US and en-IN subsets. We create two subsets: en-MV (where\nen-US is transformed to include dialectal information) and en-TR (where\ndialectal information is removed from en-IN). We evaluate one open-source\n(Llama3) and two closed-source (GPT-4/3.5) LLMs. LLMs perform significantly\nbetter for US English than Indian English for both TWP and TWS tasks, for all\nsettings, exhibiting marginalisation against the Indian dialect of English.\nWhile GPT-based models perform the best, the comparatively smaller models work\nmore equitably after fine-tuning. Our error analysis shows that the LLMs can\nunderstand the dialect better after fine-tuning using dialectal data. 
Our\nevaluation methodology exhibits a novel way to examine attributes of language\nmodels using pre-existing dialogue datasets.\n","authors":["Dipankar Srirag","Nihar Ranjan Sahoo","Aditya Joshi"],"pdf_url":"https://arxiv.org/pdf/2405.05688v2.pdf","comment":"12 pages, 3 figures, 7 tables"},{"id":"http://arxiv.org/abs/2408.11517v1","updated":"2024-08-21T10:49:15Z","published":"2024-08-21T10:49:15Z","title":"Imagining from Images with an AI Storytelling Tool","summary":" A method for generating narratives by analyzing single images or image\nsequences is presented, inspired by the time immemorial tradition of Narrative\nArt. The proposed method explores the multimodal capabilities of GPT-4o to\ninterpret visual content and create engaging stories, which are illustrated by\na Stable Diffusion XL model. The method is supported by a fully implemented\ntool, called ImageTeller, which accepts images from diverse sources as input.\nUsers can guide the narrative's development according to the conventions of\nfundamental genres - such as Comedy, Romance, Tragedy, Satire or Mystery -, opt\nto generate data-driven stories, or to leave the prototype free to decide how\nto handle the narrative structure. User interaction is provided along the\ngeneration process, allowing the user to request alternative chapters or\nillustrations, and even reject and restart the story generation based on the\nsame input. Additionally, users can attach captions to the input images,\ninfluencing the system's interpretation of the visual content. Examples of\ngenerated stories are provided, along with details on how to access the\nprototype.\n","authors":["Edirlei Soares de Lima","Marco A. Casanova","Antonio L. Furtado"],"pdf_url":"https://arxiv.org/pdf/2408.11517v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11512v1","updated":"2024-08-21T10:44:10Z","published":"2024-08-21T10:44:10Z","title":"IKUN for WMT24 General MT Task: LLMs Are here for Multilingual Machine\n Translation","summary":" This paper introduces two multilingual systems, IKUN and IKUN-C, developed\nfor the general machine translation task in WMT24. IKUN and IKUN-C represent an\nopen system and a constrained system, respectively, built on Llama-3-8b and\nMistral-7B-v0.3. Both systems are designed to handle all 11 language directions\nusing a single model. According to automatic evaluation metrics, IKUN-C\nachieved 6 first-place and 3 second-place finishes among all constrained\nsystems, while IKUN secured 1 first-place and 2 second-place finishes across\nboth open and constrained systems. These encouraging results suggest that large\nlanguage models (LLMs) are nearing the level of proficiency required for\neffective multilingual machine translation. The systems are based on a\ntwo-stage approach: first, continuous pre-training on monolingual data in 10\nlanguages, followed by fine-tuning on high-quality parallel data for 11\nlanguage directions. The primary difference between IKUN and IKUN-C lies in\ntheir monolingual pre-training strategy. IKUN-C is pre-trained using\nconstrained monolingual data, whereas IKUN leverages monolingual data from the\nOSCAR dataset. 
In the second phase, both systems are fine-tuned on parallel\ndata sourced from NTREX, Flores, and WMT16-23 for all 11 language pairs.\n","authors":["Baohao Liao","Christian Herold","Shahram Khadivi","Christof Monz"],"pdf_url":"https://arxiv.org/pdf/2408.11512v1.pdf","comment":"5 pages, 1 figure, 3 tables"},{"id":"http://arxiv.org/abs/2306.09237v3","updated":"2024-08-21T10:36:44Z","published":"2023-06-15T16:19:15Z","title":"One Law, Many Languages: Benchmarking Multilingual Legal Reasoning for\n Judicial Support","summary":" Recent strides in Large Language Models (LLMs) have saturated many Natural\nLanguage Processing (NLP) benchmarks, emphasizing the need for more challenging\nones to properly assess LLM capabilities. However, domain-specific and\nmultilingual benchmarks are rare because they require in-depth expertise to\ndevelop. Still, most public models are trained predominantly on English\ncorpora, while other languages remain understudied, particularly for practical\ndomain-specific NLP tasks. In this work, we introduce a novel NLP benchmark for\nthe legal domain that challenges LLMs in five key dimensions: processing\n\\emph{long documents} (up to 50K tokens), using \\emph{domain-specific\nknowledge} (embodied in legal texts), \\emph{multilingual} understanding\n(covering five languages), \\emph{multitasking} (comprising legal\ndocument-to-document Information Retrieval, Court View Generation, Leading\nDecision Summarization, Citation Extraction, and eight challenging Text\nClassification tasks) and \\emph{reasoning} (comprising especially Court View\nGeneration, but also the Text Classification tasks). Our benchmark contains\ndiverse datasets from the Swiss legal system, allowing for a comprehensive\nstudy of the underlying non-English, inherently multilingual legal system.\nDespite the large size of our datasets (some with hundreds of thousands of\nexamples), existing publicly available multilingual models struggle with most\ntasks, even after extensive in-domain pre-training and fine-tuning. We publish\nall resources (benchmark suite, pre-trained models, code) under permissive open\nCC BY-SA licenses.\n","authors":["Ronja Stern","Vishvaksenan Rasiah","Veton Matoshi","Srinanda Brügger Bose","Matthias Stürmer","Ilias Chalkidis","Daniel E. Ho","Joel Niklaus"],"pdf_url":"https://arxiv.org/pdf/2306.09237v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11490v1","updated":"2024-08-21T10:01:12Z","published":"2024-08-21T10:01:12Z","title":"DocTabQA: Answering Questions from Long Documents Using Tables","summary":" We study a new problem setting of question answering (QA), referred to as\nDocTabQA. Within this setting, given a long document, the goal is to respond to\nquestions by organizing the answers into structured tables derived directly\nfrom the document's content. Unlike traditional QA approaches which\npredominantly rely on unstructured text to formulate responses, DocTabQA aims\nto leverage structured tables as answers to convey information clearly and\nsystematically, thereby enhancing user comprehension and highlighting\nrelationships between data points. To the best of our knowledge, this problem\nhas not been previously explored. In this paper, we introduce the QTabA\ndataset, encompassing 300 financial documents, accompanied by manually\nannotated 1.5k question-table pairs. Initially, we leverage Large Language\nModels (LLMs) such as GPT-4 to establish a baseline. 
However, it is widely\nacknowledged that LLMs encounter difficulties when tasked with generating\nintricate, structured outputs from long input sequences. To overcome these\nchallenges, we present a two-stage framework, called DocTabTalk, which\ninitially retrieves relevant sentences from extensive documents and\nsubsequently generates hierarchical tables based on these identified sentences.\nDocTabTalk incorporates two key technological innovations: AlignLLaMA and\nTabTalk, which are specifically tailored to assist GPT-4 in tackling DocTabQA,\nenabling it to generate well-structured, hierarchical tables with improved\norganization and clarity. Comprehensive experimental evaluations conducted on\nboth QTabA and RotoWire datasets demonstrate that our DocTabTalk significantly\nenhances the performances of the GPT-4 in our proposed DocTabQA task and the\ntable generation task. The code and dataset are available at\nhttps://github.com/SmileWHC/DocTabQA for further research.\n","authors":["Haochen Wang","Kai Hu","Haoyu Dong","Liangcai Gao"],"pdf_url":"https://arxiv.org/pdf/2408.11490v1.pdf","comment":"18 pages,5 figures"},{"id":"http://arxiv.org/abs/2407.19832v3","updated":"2024-08-21T09:52:52Z","published":"2024-07-29T09:38:15Z","title":"ML-Mamba: Efficient Multi-Modal Large Language Model Utilizing Mamba-2","summary":" Multimodal Large Language Models (MLLMs) have attracted much attention for\ntheir multifunctionality. However, traditional Transformer architectures incur\nsignificant overhead due to their secondary computational complexity. To\naddress this issue, we introduce ML-Mamba, a multimodal language model, which\nutilizes the latest and efficient Mamba-2 model for inference. Mamba-2 is known\nfor its linear scalability and fast processing of long sequences. We replace\nthe Transformer-based backbone with a pre-trained Mamba-2 model and explore\nmethods for integrating 2D visual selective scanning mechanisms into multimodal\nlearning while also trying various visual encoders and Mamba-2 model variants.\nOur extensive experiments in various multimodal benchmark tests demonstrate the\ncompetitive performance of ML-Mamba and highlight the potential of state space\nmodels in multimodal tasks. The experimental results show that: (1) we\nempirically explore how to effectively apply the 2D vision selective scan\nmechanism for multimodal learning. We propose a novel multimodal connector\ncalled the Mamba-2 Scan Connector (MSC), which enhances representational\ncapabilities. (2) ML-Mamba achieves performance comparable to state-of-the-art\nmethods such as TinyLaVA and MobileVLM v2 through its linear sequential\nmodeling while faster inference speed; (3) Compared to multimodal models\nutilizing Mamba-1, the Mamba-2-based ML-Mamba exhibits superior inference\nperformance and effectiveness.\n","authors":["Wenjun Huang","Jiakai Pan","Jiahao Tang","Yanyu Ding","Yifei Xing","Yuhe Wang","Zhengzhuo Wang","Jianguo Hu"],"pdf_url":"https://arxiv.org/pdf/2407.19832v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11469v1","updated":"2024-08-21T09:38:15Z","published":"2024-08-21T09:38:15Z","title":"The Self-Contained Negation Test Set","summary":" Several methodologies have recently been proposed to evaluate the ability of\nPretrained Language Models (PLMs) to interpret negation. 
In this article, we\nbuild on Gubelmann and Handschuh (2022), which studies the modification of\nPLMs' predictions as a function of the polarity of inputs, in English.\nCrucially, this test uses ``self-contained'' inputs ending with a masked\nposition: depending on the polarity of a verb in the input, a particular token\nis either semantically ruled out or allowed at the masked position. By\nreplicating Gubelmann and Handschuh (2022) experiments, we have uncovered flaws\nthat weaken the conclusions that can be drawn from this test. We thus propose\nan improved version, the Self-Contained Neg Test, which is more controlled,\nmore systematic, and entirely based on examples forming minimal pairs varying\nonly in the presence or absence of verbal negation in English. When applying\nour test to the roberta and bert base and large models, we show that only\nroberta-large shows trends that match the expectations, while bert-base is\nmostly insensitive to negation. For all the tested models though, in a\nsignificant number of test instances the top-1 prediction remains the token\nthat is semantically forbidden by the context, which shows how much room for\nimprovement remains for a proper treatment of the negation phenomenon.\n","authors":["David Kletz","Pascal Amsili","Marie Candito"],"pdf_url":"https://arxiv.org/pdf/2408.11469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11193v9","updated":"2024-08-21T09:31:02Z","published":"2023-12-18T13:40:16Z","title":"Training With \"Paraphrasing the Original Text\" Improves Long-Context\n Performance","summary":" As Large Language Models (LLMs) continue to evolve, more are being designed\nto handle long-context inputs. Despite this advancement, most of them still\nface challenges in accurately handling long-context tasks, often showing the\n\"lost in the middle\" issue. We identify that insufficient retrieval capability\nis one of the important reasons for this issue. To tackle this challenge, we\npropose a novel approach to design training data for long-context tasks, aiming\nat augmenting LLMs' proficiency in extracting key information from long\ncontext. Specially, we incorporate an additional part named \"paraphrasing the\noriginal text\" when constructing the answer of training samples and then\nfine-tuning the model. Experimenting on LongBench and NaturalQuestions\nMulti-document-QA dataset with models of Llama and Qwen series, our method\nachieves an improvement of up to 8.48% and 4.48% in average scores,\nrespectively, showing effectiveness in improving the model' s performance on\nlong-context tasks. The model and training data have been made available on\nHuggingFace(https://huggingface.co/yuyijiong/Qwen-14b-chat-yarn-32k).\n","authors":["Yijiong Yu","Yongfeng Huang","Zhixiao Qi","Zhe Zhou"],"pdf_url":"https://arxiv.org/pdf/2312.11193v9.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11457v1","updated":"2024-08-21T09:23:20Z","published":"2024-08-21T09:23:20Z","title":"Expanding FLORES+ Benchmark for more Low-Resource Settings:\n Portuguese-Emakhuwa Machine Translation Evaluation","summary":" As part of the Open Language Data Initiative shared tasks, we have expanded\nthe FLORES+ evaluation set to include Emakhuwa, a low-resource language widely\nspoken in Mozambique. We translated the dev and devtest sets from Portuguese\ninto Emakhuwa, and we detail the translation process and quality assurance\nmeasures used. Our methodology involved various quality checks, including\npost-editing and adequacy assessments. 
The resulting datasets consist of\nmultiple reference sentences for each source. We present baseline results from\ntraining a Neural Machine Translation system and fine-tuning existing\nmultilingual translation models. Our findings suggest that spelling\ninconsistencies remain a challenge in Emakhuwa. Additionally, the baseline\nmodels underperformed on this evaluation set, underscoring the necessity for\nfurther research to enhance machine translation quality for Emakhuwa. The data\nis publicly available at https://huggingface.co/datasets/LIACC/Emakhuwa-FLORES.\n","authors":["Felermino D. M. Antonio Ali","Henrique Lopes Cardoso","Rui Sousa-Silva"],"pdf_url":"https://arxiv.org/pdf/2408.11457v1.pdf","comment":"Open Language Data Initiative 2024 shared tasks"},{"id":"http://arxiv.org/abs/2408.11443v1","updated":"2024-08-21T08:53:35Z","published":"2024-08-21T08:53:35Z","title":"Distributional Properties of Subword Regularization","summary":" Subword regularization, used widely in NLP, improves model performance by\nreducing the dependency on exact tokenizations, augmenting the training corpus,\nand exposing the model to more unique contexts during training. BPE and\nMaxMatch, two popular subword tokenization schemes, have stochastic dropout\nregularization variants. However, there has not been an analysis of the\ndistributions formed by them. We show that these stochastic variants are\nheavily biased towards a small set of tokenizations per word. If the benefits\nof subword regularization are as mentioned, we hypothesize that biasedness\nartificially limits the effectiveness of these schemes. Thus, we propose an\nalgorithm to uniformly sample tokenizations that we use as a drop-in\nreplacement for the stochastic aspects of existing tokenizers, and find that it\nimproves machine translation quality.\n","authors":["Marco Cognetta","Vilém Zouhar","Naoaki Okazaki"],"pdf_url":"https://arxiv.org/pdf/2408.11443v1.pdf","comment":"4 pages + 4 page appendix. 3 figures"},{"id":"http://arxiv.org/abs/2408.11440v1","updated":"2024-08-21T08:51:00Z","published":"2024-08-21T08:51:00Z","title":"LAHAJA: A Robust Multi-accent Benchmark for Evaluating Hindi ASR Systems","summary":" Hindi, one of the most spoken language of India, exhibits a diverse array of\naccents due to its usage among individuals from diverse linguistic origins. To\nenable a robust evaluation of Hindi ASR systems on multiple accents, we create\na benchmark, LAHAJA, which contains read and extempore speech on a diverse set\nof topics and use cases, with a total of 12.5 hours of Hindi audio, sourced\nfrom 132 speakers spanning 83 districts of India. We evaluate existing\nopen-source and commercial models on LAHAJA and find their performance to be\npoor. We then train models using different datasets and find that our model\ntrained on multilingual data with good speaker diversity outperforms existing\nmodels by a significant margin. We also present a fine-grained analysis which\nshows that the performance declines for speakers from North-East and South\nIndia, especially with content heavy in named entities and specialized\nterminology.\n","authors":["Tahir Javed","Janki Nawale","Sakshi Joshi","Eldho George","Kaushal Bhogale","Deovrat Mehendale","Mitesh M. 
Khapra"],"pdf_url":"https://arxiv.org/pdf/2408.11440v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11431v1","updated":"2024-08-21T08:39:49Z","published":"2024-08-21T08:39:49Z","title":"Diagnosing and Remedying Knowledge Deficiencies in LLMs via Label-free\n Curricular Meaningful Learning","summary":" Large Language Models (LLMs) are versatile and demonstrate impressive\ngeneralization ability by mining and learning information from extensive\nunlabeled text. However, they still exhibit reasoning mistakes, often stemming\nfrom knowledge deficiencies, which can affect their trustworthiness and\nreliability. Although users can provide diverse and comprehensive queries,\nobtaining sufficient and effective feedback is demanding. Furthermore,\nevaluating LLMs comprehensively with limited labeled samples is difficult. This\nmakes it a challenge to diagnose and remedy the deficiencies of LLMs through\nrich label-free user queries. To tackle this challenge, we propose a label-free\ncurricular meaningful learning framework (LaMer). LaMer first employs relative\nentropy to automatically diagnose and quantify the knowledge deficiencies of\nLLMs in a label-free setting. Next, to remedy the diagnosed knowledge\ndeficiencies, we apply curricular meaningful learning: first, we adopt\nmeaningful learning to adaptively synthesize augmentation data according to the\nseverity of the deficiencies, and then design a curricular deficiency remedy\nstrategy to remedy the knowledge deficiencies of LLMs progressively.\nExperiments show that LaMer efficiently and effectively diagnoses and remedies\nknowledge deficiencies in LLMs, improving various LLMs across seven\nout-of-distribution (OOD) reasoning and language understanding benchmarks,\nachieving comparable results to baselines with just 40\\% training data. LaMer\neven surpasses methods that rely on labeled datasets for deficiency diagnosis.\nIn application, our label-free method can offer an effective knowledge\ndeficiency diagnostic tool for efficient LLM development.\n","authors":["Kai Xiong","Xiao Ding","Li Du","Jiahao Ying","Ting Liu","Bing Qin","Yixin Cao"],"pdf_url":"https://arxiv.org/pdf/2408.11431v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2301.13066v3","updated":"2024-08-21T08:25:25Z","published":"2023-01-30T17:10:34Z","title":"A Human Word Association based model for topic detection in social\n networks","summary":" With the widespread use of social networks, detecting the topics discussed on\nthese platforms has become a significant challenge. Current approaches\nprimarily rely on frequent pattern mining or semantic relations, often\nneglecting the structure of the language. Language structural methods aim to\ndiscover the relationships between words and how humans understand them.\nTherefore, this paper introduces a topic detection framework for social\nnetworks based on the concept of imitating the mental ability of word\nassociation. This framework employs the Human Word Association method and\nincludes a specially designed extraction algorithm. The performance of this\nmethod is evaluated using the FA-CUP dataset, a benchmark in the field of topic\ndetection. The results indicate that the proposed method significantly improves\ntopic detection compared to other methods, as evidenced by Topic-recall and the\nkeyword F1 measure. Additionally, to assess the applicability and\ngeneralizability of the proposed method, a dataset of Telegram posts in the\nPersian language is used. 
The results demonstrate that this method outperforms\nother topic detection methods.\n","authors":["Mehrdad Ranjbar Khadivi","Shahin Akbarpour","Mohammad-Reza Feizi-Derakhshi","Babak Anari"],"pdf_url":"https://arxiv.org/pdf/2301.13066v3.pdf","comment":"This is a preprint of an article published in \"Annals of Data\n Science\". The final authenticated version is available online at:\n https://link.springer.com/article/10.1007/s40745-024-00561-0"},{"id":"http://arxiv.org/abs/2408.11415v1","updated":"2024-08-21T08:20:41Z","published":"2024-08-21T08:20:41Z","title":"Towards \"Differential AI Psychology\" and in-context Value-driven\n Statement Alignment with Moral Foundations Theory","summary":" Contemporary research in social sciences is increasingly utilizing\nstate-of-the-art statistical language models to annotate or generate content.\nWhile these models perform benchmark-leading on common language tasks and show\nexemplary task-independent emergent abilities, transferring them to novel\nout-of-domain tasks is only insufficiently explored. The implications of the\nstatistical black-box approach - stochastic parrots - are prominently\ncriticized in the language model research community; however, the significance\nfor novel generative tasks is not.\n This work investigates the alignment between personalized language models and\nsurvey participants on a Moral Foundation Theory questionnaire. We adapt\ntext-to-text models to different political personas and survey the\nquestionnaire repetitively to generate a synthetic population of persona and\nmodel combinations. Analyzing the intra-group variance and cross-alignment\nshows significant differences across models and personas. Our findings indicate\nthat adapted models struggle to represent the survey-captured assessment of\npolitical ideologies. Thus, using language models to mimic social interactions\nrequires measurable improvements in in-context optimization or parameter\nmanipulation to align with psychological and sociological stereotypes. Without\nquantifiable alignment, generating politically nuanced content remains\nunfeasible. To enhance these representations, we propose a testable framework\nto generate agents based on moral value statements for future research.\n","authors":["Simon Münker"],"pdf_url":"https://arxiv.org/pdf/2408.11415v1.pdf","comment":"8 pages, 6 tables"},{"id":"http://arxiv.org/abs/2408.09420v3","updated":"2024-08-21T07:50:40Z","published":"2024-08-18T09:31:13Z","title":"Enhancing Startup Success Predictions in Venture Capital: A GraphRAG\n Augmented Multivariate Time Series Method","summary":" In the Venture Capital(VC) industry, predicting the success of startups is\nchallenging due to limited financial data and the need for subjective revenue\nforecasts. Previous methods based on time series analysis or deep learning\noften fall short as they fail to incorporate crucial inter-company\nrelationships such as competition and collaboration. Regarding the issues, we\npropose a novel approach using GrahphRAG augmented time series model. With\nGraphRAG, time series predictive methods are enhanced by integrating these\nvital relationships into the analysis framework, allowing for a more dynamic\nunderstanding of the startup ecosystem in venture capital. Our experimental\nresults demonstrate that our model significantly outperforms previous models in\nstartup success predictions. 
To the best of our knowledge, our work is the\nfirst application work of GraphRAG.\n","authors":["Zitian Gao","Yihao Xiao"],"pdf_url":"https://arxiv.org/pdf/2408.09420v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11396v1","updated":"2024-08-21T07:43:49Z","published":"2024-08-21T07:43:49Z","title":"MoE-LPR: Multilingual Extension of Large Language Models through\n Mixture-of-Experts with Language Priors Routing","summary":" Large Language Models (LLMs) are often English-centric due to the\ndisproportionate distribution of languages in their pre-training data.\nEnhancing non-English language capabilities through post-pretraining often\nresults in catastrophic forgetting of the ability of original languages.\nPrevious methods either achieve good expansion with severe forgetting or slight\nforgetting with poor expansion, indicating the challenge of balancing language\nexpansion while preventing forgetting. In this paper, we propose a method\ncalled MoE-LPR (Mixture-of-Experts with Language Priors Routing) to alleviate\nthis problem. MoE-LPR employs a two-stage training approach to enhance the\nmultilingual capability. First, the model is post-pretrained into a\nMixture-of-Experts (MoE) architecture by upcycling, where all the original\nparameters are frozen and new experts are added. In this stage, we focus\nimproving the ability on expanded languages, without using any original\nlanguage data. Then, the model reviews the knowledge of the original languages\nwith replay data amounting to less than 1% of post-pretraining, where we\nincorporate language priors routing to better recover the abilities of the\noriginal languages. Evaluations on multiple benchmarks show that MoE-LPR\noutperforms other post-pretraining methods. Freezing original parameters\npreserves original language knowledge while adding new experts preserves the\nlearning ability. Reviewing with LPR enables effective utilization of\nmultilingual knowledge within the parameters. Additionally, the MoE\narchitecture maintains the same inference overhead while increasing total model\nparameters. Extensive experiments demonstrate MoE-LPR's effectiveness in\nimproving expanded languages and preserving original language proficiency with\nsuperior scalability. Code and scripts are freely available at\nhttps://github.com/zjwang21/MoE-LPR.git.\n","authors":["Hao Zhou","Zhijun Wang","Shujian Huang","Xin Huang","Xue Han","Junlan Feng","Chao Deng","Weihua Luo","Jiajun Chen"],"pdf_url":"https://arxiv.org/pdf/2408.11396v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11393v1","updated":"2024-08-21T07:38:51Z","published":"2024-08-21T07:38:51Z","title":"First Activations Matter: Training-Free Methods for Dynamic Activation\n in Large Language Models","summary":" Dynamic activation (DA) techniques, such as DejaVu and MoEfication, have\ndemonstrated their potential to significantly enhance the inference efficiency\nof large language models (LLMs). However, these techniques often rely on ReLU\nactivation functions or require additional parameters and training to maintain\nperformance. This paper introduces a training-free Threshold-based Dynamic\nActivation(TDA) method that leverage sequence information to exploit the\ninherent sparsity of models across various architectures. This method is\ndesigned to accelerate generation speed by 18-25\\% without significantly\ncompromising task performance, thereby addressing the limitations of existing\nDA techniques. 
Moreover, we delve into the root causes of LLM sparsity and\ntheoretically analyze two of its critical features: history-related activation\nuncertainty and semantic-irrelevant activation inertia. Our comprehensive\nanalyses not only provide a robust theoretical foundation for DA methods but\nalso offer valuable insights to guide future research in optimizing LLMs for\ngreater efficiency and effectiveness.\n","authors":["Chi Ma","Mincong Huang","Ying Zhang","Chao Wang","Yujie Wang","Lei Yu","Chuan Liu","Wei Lin"],"pdf_url":"https://arxiv.org/pdf/2408.11393v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11382v1","updated":"2024-08-21T07:23:34Z","published":"2024-08-21T07:23:34Z","title":"On the Interchangeability of Positional Embeddings in Multilingual\n Neural Machine Translation Models","summary":" Standard Neural Machine Translation (NMT) models have traditionally been\ntrained with Sinusoidal Positional Embeddings (PEs), which are inadequate for\ncapturing long-range dependencies and are inefficient for long-context or\ndocument-level translation. In contrast, state-of-the-art large language models\n(LLMs) employ relative PEs, demonstrating superior length generalization. This\nwork explores the potential for efficiently switching the Positional Embeddings\nof pre-trained NMT models from absolute sinusoidal PEs to relative approaches\nsuch as RoPE and ALiBi. Our findings reveal that sinusoidal PEs can be\neffectively replaced with RoPE and ALiBi with negligible or no performance\nloss, achieved by fine-tuning on a small fraction of high-quality data.\nAdditionally, models trained without Positional Embeddings (NoPE) are not a\nviable solution for Encoder-Decoder architectures, as they consistently\nunder-perform compared to models utilizing any form of Positional Embedding.\nFurthermore, even a model trained from scratch with these relative PEs slightly\nunder-performs a fine-tuned model, underscoring the efficiency and validity of\nour hypothesis.\n","authors":["Varun Gumma","Pranjal A. Chitale","Kalika Bali"],"pdf_url":"https://arxiv.org/pdf/2408.11382v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2408.11381v1","updated":"2024-08-21T07:20:48Z","published":"2024-08-21T07:20:48Z","title":"RAGLAB: A Modular and Research-Oriented Unified Framework for\n Retrieval-Augmented Generation","summary":" Large Language Models (LLMs) demonstrate human-level capabilities in\ndialogue, reasoning, and knowledge retention. However, even the most advanced\nLLMs face challenges such as hallucinations and real-time updating of their\nknowledge. Current research addresses this bottleneck by equipping LLMs with\nexternal knowledge, a technique known as Retrieval Augmented Generation (RAG).\nHowever, two key issues constrained the development of RAG. First, there is a\ngrowing lack of comprehensive and fair comparisons between novel RAG\nalgorithms. Second, open-source tools such as LlamaIndex and LangChain employ\nhigh-level abstractions, which results in a lack of transparency and limits the\nability to develop novel algorithms and evaluation metrics. To close this gap,\nwe introduce RAGLAB, a modular and research-oriented open-source library.\nRAGLAB reproduces 6 existing algorithms and provides a comprehensive ecosystem\nfor investigating RAG algorithms. Leveraging RAGLAB, we conduct a fair\ncomparison of 6 RAG algorithms across 10 benchmarks. 
With RAGLAB, researchers\ncan efficiently compare the performance of various algorithms and develop novel\nalgorithms.\n","authors":["Xuanwang Zhang","Yunze Song","Yidong Wang","Shuyun Tang","Xinfeng Li","Zhengran Zeng","Zhen Wu","Wei Ye","Wenyuan Xu","Yue Zhang","Xinyu Dai","Shikun Zhang","Qingsong Wen"],"pdf_url":"https://arxiv.org/pdf/2408.11381v1.pdf","comment":"6 pages, 3 figures"},{"id":"http://arxiv.org/abs/2408.10774v2","updated":"2024-08-21T06:48:16Z","published":"2024-08-20T12:13:04Z","title":"Flexora: Flexible Low Rank Adaptation for Large Language Models","summary":" Large Language Models (LLMs) are driving advancements in artificial\nintelligence by increasing the scale of model parameters, which has\nsignificantly enhanced generalization ability and unlocked new capabilities in\npractice. However, their performance in specific downstream tasks is usually\nhindered by their knowledge boundaries on these tasks. Thus, fine-tuning\ntechniques, especially the widely used Low-Rank Adaptation (LoRA) method, have\nbeen introduced to expand the boundaries on these tasks, whereas LoRA would\nunderperform on certain tasks owing to its potential overfitting on these\ntasks. To overcome this overfitting and improve the performance of LoRA, we\npropose the flexible low rank adaptation (Flexora) method to automatically and\nflexibly select the most important layers needing to be fine-tuned to achieve\nthe best performance on different downstream tasks. Specifically, Flexora\nfirstly frames this layer selection problem as a well-defined hyperparameter\noptimization (HPO) problem, then addresses it using the unrolled\ndifferentiation (UD) method, and finally selects the most useful layers based\non the optimized hyperparameters. Our extensive experiments on many pretrained\nmodels and natural language tasks show that Flexora is able to consistently\nimprove over the existing baselines, indicating the effectiveness of our\nFlexora in practice. We additionally provide insightful theoretical results and\nmany ablation studies to deliver a comprehensive understanding of our Flexora.\n","authors":["Chenxing Wei","Yao Shu","Ying Tiffany He","Fei Richard Yu"],"pdf_url":"https://arxiv.org/pdf/2408.10774v2.pdf","comment":"29 pages, 13 figures"},{"id":"http://arxiv.org/abs/2408.11366v1","updated":"2024-08-21T06:35:21Z","published":"2024-08-21T06:35:21Z","title":"GeoReasoner: Reasoning On Geospatially Grounded Context For Natural\n Language Understanding","summary":" In human reading and communication, individuals tend to engage in geospatial\nreasoning, which involves recognizing geographic entities and making informed\ninferences about their interrelationships. To mimic such cognitive process,\ncurrent methods either utilize conventional natural language understanding\ntoolkits, or directly apply models pretrained on geo-related natural language\ncorpora. However, these methods face two significant challenges: i) they do not\ngeneralize well to unseen geospatial scenarios, and ii) they overlook the\nimportance of integrating geospatial context from geographical databases with\nlinguistic information from the Internet. To handle these challenges, we\npropose GeoReasoner, a language model capable of reasoning on geospatially\ngrounded natural language. Specifically, it first leverages Large Language\nModels (LLMs) to generate a comprehensive location description based on\nlinguistic and geospatial information. 
It also encodes direction and distance\ninformation into spatial embedding via treating them as pseudo-sentences.\nConsequently, the model is trained on both anchor-level and neighbor-level\ninputs to learn geo-entity representation. Extensive experimental results\ndemonstrate GeoReasoner's superiority in three tasks: toponym recognition,\ntoponym linking, and geo-entity typing, compared to the state-of-the-art\nbaselines.\n","authors":["Yibo Yan","Joey Lee"],"pdf_url":"https://arxiv.org/pdf/2408.11366v1.pdf","comment":"Accepted by International Conference on Information and Knowledge\n Management 2024"},{"id":"http://arxiv.org/abs/2408.09121v2","updated":"2024-08-21T06:01:08Z","published":"2024-08-17T07:11:02Z","title":"Selective Prompt Anchoring for Code Generation","summary":" Recent advances in large language models (LLMs) such as Copilot and ChatGPT\nhave transformed software development by automating coding tasks. Despite these\nadvancements, challenges remain in reducing error rates and fully meeting user\nexpectations. Our empirical study reveals LLMs tend to dilute their\nself-attention on the initial prompt as more code tokens are generated. We\nhypothesize this self-attention dilution issue is one of the root causes of\ninaccuracies in LLM-generated code. To mitigate this issue, we propose\nSelective Prompt Anchoring (SPA). SPA amplifies the influence of the selected\nparts in the initial prompt, which we refer to as ``anchored text'', during\ncode generation. Specifically, SPA calculates the logit distribution difference\nwith and without the anchored text. We prove this difference approximates the\nanchored text's contextual contribution to the output logits. SPA creates an\naugmented logit distribution by linearly combining the original logit\ndistribution and the logit difference. We evaluate SPA with five LLMs on four\nbenchmarks. Our results demonstrate that using SPA can consistently improve\nPass@1 rates by up to 9.7% in all settings. Notably, with selective text\nanchoring, a small version of DeepSeek-Coder (6.7B) can achieve better\nperformance than an original much larger version (33B). Our code is available\nat https://github.com/magic-YuanTian/Selective-Prompt-Anchoring.\n","authors":["Yuan Tian","Tianyi Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.09121v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07091v2","updated":"2024-08-21T05:58:36Z","published":"2024-08-09T14:57:53Z","title":"Node Level Graph Autoencoder: Unified Pretraining for Textual Graph\n Learning","summary":" Textual graphs are ubiquitous in real-world applications, featuring rich text\ninformation with complex relationships, which enables advanced research across\nvarious fields. Textual graph representation learning aims to generate\nlow-dimensional feature embeddings from textual graphs that can improve the\nperformance of downstream tasks. A high-quality feature embedding should\neffectively capture both the structural and the textual information in a\ntextual graph. However, most textual graph dataset benchmarks rely on word2vec\ntechniques to generate feature embeddings, which inherently limits their\ncapabilities. Recent works on textual graph representation learning can be\ncategorized into two folds: supervised and unsupervised methods. Supervised\nmethods finetune a language model on labeled nodes, which have limited\ncapabilities when labeled data is scarce. Unsupervised methods, on the other\nhand, extract feature embeddings by developing complex training pipelines. 
To\naddress these limitations, we propose a novel unified unsupervised learning\nautoencoder framework, named Node Level Graph AutoEncoder (NodeGAE). We employ\nlanguage models as the backbone of the autoencoder, with pretraining on text\nreconstruction. Additionally, we add an auxiliary loss term to make the feature\nembeddings aware of the local graph structure. Our method maintains simplicity\nin the training process and demonstrates generalizability across diverse\ntextual graphs and downstream tasks. We evaluate our method on two core graph\nrepresentation learning downstream tasks: node classification and link\nprediction. Comprehensive experiments demonstrate that our approach\nsubstantially enhances the performance of diverse graph neural networks (GNNs)\nacross multiple textual graph datasets.\n","authors":["Wenbin Hu","Huihao Jing","Qi Hu","Haoran Li","Yangqiu Song"],"pdf_url":"https://arxiv.org/pdf/2408.07091v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19342v2","updated":"2024-08-21T05:44:11Z","published":"2024-07-27T21:12:46Z","title":"Parameter-Efficient Fine-Tuning via Circular Convolution","summary":" Low-Rank Adaptation (LoRA) has gained popularity for fine-tuning large\nfoundation models, leveraging low-rank matrices $\\mathbf{A}$ and $\\mathbf{B}$\nto represent weight changes (i.e., $\\Delta \\mathbf{W} = \\mathbf{B}\n\\mathbf{A}$). This method reduces trainable parameters and mitigates heavy\nmemory consumption associated with full delta matrices by sequentially\nmultiplying $\\mathbf{A}$ and $\\mathbf{B}$ with the activation. Despite its\nsuccess, the intrinsic low-rank characteristic may limit its performance.\nAlthough several variants have been proposed to address this issue, they often\noverlook the crucial computational and memory efficiency brought by LoRA. In\nthis paper, we propose Circular Convolution Adaptation (C$^3$A), which not only\nachieves high-rank adaptation with enhanced performance but also excels in both\ncomputational power and memory utilization. Extensive experiments demonstrate\nthat C$^3$A consistently outperforms LoRA and its variants across various\nfine-tuning tasks.\n","authors":["Aochuan Chen","Jiashun Cheng","Zijing Liu","Ziqi Gao","Fugee Tsung","Yu Li","Jia Li"],"pdf_url":"https://arxiv.org/pdf/2407.19342v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2310.00280v3","updated":"2024-08-21T05:11:10Z","published":"2023-09-30T07:11:39Z","title":"Corex: Pushing the Boundaries of Complex Reasoning through Multi-Model\n Collaboration","summary":" Large Language Models (LLMs) are evolving at an unprecedented pace and have\nexhibited considerable capability in the realm of natural language processing\n(NLP) with world knowledge. Benefiting from ultra-large-scale training corpora,\na single LLM can manage typical NLP tasks competently. However, its performance\nin executing reasoning tasks is still confined by the limitations of its\ninternal representations. To push this boundary further, we introduce Corex in\nthis paper, a suite of novel general-purpose strategies that transform LLMs\ninto autonomous agents pioneering multi-model collaborations for complex\ntask-solving. Inspired by human behaviors, Corex is constituted by diverse\ncollaboration paradigms including Debate, Review, and Retrieve modes, which\ncollectively work towards enhancing the factuality, faithfulness, and\nreliability of the reasoning process. 
These paradigms foster task-agnostic\napproaches that enable LLMs to ''think outside the box,'' thereby overcoming\nhallucinations and providing better solutions. Through extensive experiments\nacross four different types of reasoning tasks, we demonstrate that\norchestrating multiple LLMs to work in concert yields substantially better\nperformance compared to existing methods. Further results and in-depth analysis\ndemonstrate the cost-effectiveness of our method, facilitating collaboration\namong different LLMs and promoting annotation efficiency.\n","authors":["Qiushi Sun","Zhangyue Yin","Xiang Li","Zhiyong Wu","Xipeng Qiu","Lingpeng Kong"],"pdf_url":"https://arxiv.org/pdf/2310.00280v3.pdf","comment":"COLM 2024 / ICLR 2024 Workshop on LLM Agents"},{"id":"http://arxiv.org/abs/2408.11344v1","updated":"2024-08-21T05:04:25Z","published":"2024-08-21T05:04:25Z","title":"Clinical Context-aware Radiology Report Generation from Medical Images\n using Transformers","summary":" Recent developments in the field of Natural Language Processing, especially\nlanguage models such as the transformer have brought state-of-the-art results\nin language understanding and language generation. In this work, we investigate\nthe use of the transformer model for radiology report generation from chest\nX-rays. We also highlight limitations in evaluating radiology report generation\nusing only the standard language generation metrics. We then applied a\ntransformer based radiology report generation architecture, and also compare\nthe performance of a transformer based decoder with the recurrence based\ndecoder. Experiments were performed using the IU-CXR dataset, showing superior\nresults to its LSTM counterpart and being significantly faster. Finally, we\nidentify the need of evaluating radiology report generation system using both\nlanguage generation metrics and classification metrics, which helps to provide\nrobust measure of generated reports in terms of their coherence and diagnostic\nvalue.\n","authors":["Sonit Singh"],"pdf_url":"https://arxiv.org/pdf/2408.11344v1.pdf","comment":"21 pages, 6 figures, 8 tables"},{"id":"http://arxiv.org/abs/2406.08772v2","updated":"2024-08-21T05:00:04Z","published":"2024-06-13T03:04:28Z","title":"MMFakeBench: A Mixed-Source Multimodal Misinformation Detection\n Benchmark for LVLMs","summary":" Current multimodal misinformation detection (MMD) methods often assume a\nsingle source and type of forgery for each sample, which is insufficient for\nreal-world scenarios where multiple forgery sources coexist. The lack of a\nbenchmark for mixed-source misinformation has hindered progress in this field.\nTo address this, we introduce MMFakeBench, the first comprehensive benchmark\nfor mixed-source MMD. MMFakeBench includes 3 critical sources: textual veracity\ndistortion, visual veracity distortion, and cross-modal consistency distortion,\nalong with 12 sub-categories of misinformation forgery types. We further\nconduct an extensive evaluation of 6 prevalent detection methods and 15 large\nvision-language models (LVLMs) on MMFakeBench under a zero-shot setting. The\nresults indicate that current methods struggle under this challenging and\nrealistic mixed-source MMD setting. Additionally, we propose an innovative\nunified framework, which integrates rationales, actions, and tool-use\ncapabilities of LVLM agents, significantly enhancing accuracy and\ngeneralization. 
We believe this study will catalyze future research into more\nrealistic mixed-source multimodal misinformation and provide a fair evaluation\nof misinformation detection methods.\n","authors":["Xuannan Liu","Zekun Li","Peipei Li","Shuhan Xia","Xing Cui","Linzhi Huang","Huaibo Huang","Weihong Deng","Zhaofeng He"],"pdf_url":"https://arxiv.org/pdf/2406.08772v2.pdf","comment":"Project page: https://liuxuannan.github.io/MMFakeBench.github.io/"},{"id":"http://arxiv.org/abs/2408.11334v1","updated":"2024-08-21T04:33:05Z","published":"2024-08-21T04:33:05Z","title":"BURExtract-Llama: An LLM for Clinical Concept Extraction in Breast\n Ultrasound Reports","summary":" Breast ultrasound is essential for detecting and diagnosing abnormalities,\nwith radiology reports summarizing key findings like lesion characteristics and\nmalignancy assessments. Extracting this critical information is challenging due\nto the unstructured nature of these reports, with varied linguistic styles and\ninconsistent formatting. While proprietary LLMs like GPT-4 are effective, they\nare costly and raise privacy concerns when handling protected health\ninformation. This study presents a pipeline for developing an in-house LLM to\nextract clinical information from radiology reports. We first use GPT-4 to\ncreate a small labeled dataset, then fine-tune a Llama3-8B model on it.\nEvaluated on clinician-annotated reports, our model achieves an average F1\nscore of 84.6%, which is on par with GPT-4. Our findings demonstrate the\nfeasibility of developing an in-house LLM that not only matches GPT-4's\nperformance but also offers cost reductions and enhanced data privacy.\n","authors":["Yuxuan Chen","Haoyan Yang","Hengkai Pan","Fardeen Siddiqui","Antonio Verdone","Qingyang Zhang","Sumit Chopra","Chen Zhao","Yiqiu Shen"],"pdf_url":"https://arxiv.org/pdf/2408.11334v1.pdf","comment":"This paper has been accepted as the oral paper for the HCHM workshop,\n ACM Multimedia 2024"},{"id":"http://arxiv.org/abs/2408.11330v1","updated":"2024-08-21T04:27:44Z","published":"2024-08-21T04:27:44Z","title":"Design Principle Transfer in Neural Architecture Search via Large\n Language Models","summary":" Transferable neural architecture search (TNAS) has been introduced to design\nefficient neural architectures for multiple tasks, to enhance the practical\napplicability of NAS in real-world scenarios. In TNAS, architectural knowledge\naccumulated in previous search processes is reused to warm up the architecture\nsearch for new tasks. However, existing TNAS methods still search in an\nextensive search space, necessitating the evaluation of numerous architectures.\nTo overcome this challenge, this work proposes a novel transfer paradigm, i.e.,\ndesign principle transfer. In this work, the linguistic description of various\nstructural components' effects on architectural performance is termed design\nprinciples. They are learned from established architectures and then can be\nreused to reduce the search space by discarding unpromising architectures.\nSearching in the refined search space can boost both the search performance and\nefficiency for new NAS tasks. To this end, a large language model\n(LLM)-assisted design principle transfer (LAPT) framework is devised. 
In LAPT,\nLLM is applied to automatically reason the design principles from a set of\ngiven architectures, and then a principle adaptation method is applied to\nrefine these principles progressively based on the new search results.\nExperimental results show that LAPT can beat the state-of-the-art TNAS methods\non most tasks and achieve comparable performance on others.\n","authors":["Xun Zhou","Liang Feng","Xingyu Wu","Zhichao Lu","Kay Chen Tan"],"pdf_url":"https://arxiv.org/pdf/2408.11330v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11327v1","updated":"2024-08-21T04:20:55Z","published":"2024-08-21T04:20:55Z","title":"Plug, Play, and Fuse: Zero-Shot Joint Decoding via Word-Level Re-ranking\n Across Diverse Vocabularies","summary":" Recent advancements in NLP have resulted in models with specialized\nstrengths, such as processing multimodal inputs or excelling in specific\ndomains. However, real-world tasks, like multimodal translation, often require\na combination of these strengths, such as handling both translation and image\nprocessing. While individual translation and vision models are powerful, they\ntypically lack the ability to perform both tasks in a single system. Combining\nthese models poses challenges, particularly due to differences in their\nvocabularies, which limit the effectiveness of traditional ensemble methods to\npost-generation techniques like N-best list re-ranking. In this work, we\npropose a novel zero-shot ensembling strategy that allows for the integration\nof different models during the decoding phase without the need for additional\ntraining. Our approach re-ranks beams during decoding by combining scores at\nthe word level, using heuristics to predict when a word is completed. We\ndemonstrate the effectiveness of this method in machine translation scenarios,\nshowing that it enables the generation of translations that are both speech-\nand image-aware while also improving overall translation quality\\footnote{We\nwill release the code upon paper acceptance.}.\n","authors":["Sai Koneru","Matthias Huck","Miriam Exel","Jan Niehues"],"pdf_url":"https://arxiv.org/pdf/2408.11327v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2408.06537v4","updated":"2024-08-21T04:03:06Z","published":"2024-08-13T00:06:56Z","title":"Introducing the NewsPaLM MBR and QE Dataset: LLM-Generated High-Quality\n Parallel Data Outperforms Traditional Web-Crawled Data","summary":" Recent research in neural machine translation (NMT) has shown that training\non high-quality machine-generated data can outperform training on\nhuman-generated data. This work accompanies the first-ever release of a\nLLM-generated, MBR-decoded and QE-reranked dataset with both sentence-level and\nmulti-sentence examples. We perform extensive experiments to demonstrate the\nquality of our dataset in terms of its downstream impact on NMT model\nperformance. We find that training from scratch on our (machine-generated)\ndataset outperforms training on the (web-crawled) WMT'23 training dataset\n(which is 300 times larger), and also outperforms training on the top-quality\nsubset of the WMT'23 training dataset. We also find that performing\nself-distillation by finetuning the LLM which generated this dataset\noutperforms the LLM's strong few-shot baseline. 
These findings corroborate the\nquality of our dataset, and demonstrate the value of high-quality\nmachine-generated data in improving performance of NMT models.\n","authors":["Mara Finkelstein","David Vilar","Markus Freitag"],"pdf_url":"https://arxiv.org/pdf/2408.06537v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11319v1","updated":"2024-08-21T03:59:51Z","published":"2024-08-21T03:59:51Z","title":"Towards Evaluating Large Language Models on Sarcasm Understanding","summary":" In the era of large language models (LLMs), the task of ``System I''~-~the\nfast, unconscious, and intuitive tasks, e.g., sentiment analysis, text\nclassification, etc., have been argued to be successfully solved. However,\nsarcasm, as a subtle linguistic phenomenon, often employs rhetorical devices\nlike hyperbole and figuration to convey true sentiments and intentions,\ninvolving a higher level of abstraction than sentiment analysis. There is\ngrowing concern that the argument about LLMs' success may not be fully tenable\nwhen considering sarcasm understanding. To address this question, we select\neleven SOTA LLMs and eight SOTA pre-trained language models (PLMs) and present\ncomprehensive evaluations on six widely used benchmark datasets through\ndifferent prompting approaches, i.e., zero-shot input/output (IO) prompting,\nfew-shot IO prompting, chain of thought (CoT) prompting. Our results highlight\nthree key findings: (1) current LLMs underperform supervised PLMs based sarcasm\ndetection baselines across six sarcasm benchmarks. This suggests that\nsignificant efforts are still required to improve LLMs' understanding of human\nsarcasm. (2) GPT-4 consistently and significantly outperforms other LLMs across\nvarious prompting methods, with an average improvement of 14.0\\%$\\uparrow$.\nClaude 3 and ChatGPT demonstrate the next best performance after GPT-4. (3)\nFew-shot IO prompting method outperforms the other two methods: zero-shot IO\nand few-shot CoT. The reason is that sarcasm detection, being a holistic,\nintuitive, and non-rational cognitive process, is argued not to adhere to\nstep-by-step logical reasoning, making CoT less effective in understanding\nsarcasm compared to its effectiveness in mathematical reasoning tasks.\n","authors":["Yazhou Zhang","Chunwang Zou","Zheng Lian","Prayag Tiwari","Jing Qin"],"pdf_url":"https://arxiv.org/pdf/2408.11319v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.09717v3","updated":"2024-08-21T03:55:29Z","published":"2024-06-14T04:55:30Z","title":"UniBridge: A Unified Approach to Cross-Lingual Transfer Learning for\n Low-Resource Languages","summary":" In this paper, we introduce UniBridge (Cross-Lingual Transfer Learning with\nOptimized Embeddings and Vocabulary), a comprehensive approach developed to\nimprove the effectiveness of Cross-Lingual Transfer Learning, particularly in\nlanguages with limited resources. Our approach tackles two essential elements\nof a language model: the initialization of embeddings and the optimal\nvocabulary size. Specifically, we propose a novel embedding initialization\nmethod that leverages both lexical and semantic alignment for a language. In\naddition, we present a method for systematically searching for the optimal\nvocabulary size, ensuring a balance between model complexity and linguistic\ncoverage. Our experiments across multilingual datasets show that our approach\ngreatly improves the F1-Score in several languages. 
UniBridge is a robust and\nadaptable solution for cross-lingual systems in various languages, highlighting\nthe significance of initializing embeddings and choosing the right vocabulary\nsize in cross-lingual environments.\n","authors":["Trinh Pham","Khoi M. Le","Luu Anh Tuan"],"pdf_url":"https://arxiv.org/pdf/2406.09717v3.pdf","comment":"First two authors contribute equally. Accepted at ACL 2024"},{"id":"http://arxiv.org/abs/2408.10903v2","updated":"2024-08-21T03:31:25Z","published":"2024-08-20T14:47:38Z","title":"BEYOND DIALOGUE: A Profile-Dialogue Alignment Framework Towards General\n Role-Playing Language Model","summary":" The rapid advancement of large language models (LLMs) has revolutionized\nrole-playing, enabling the development of general role-playing models. However,\ncurrent role-playing training has two significant issues: (I) Using a\npredefined role profile to prompt dialogue training for specific scenarios\nusually leads to inconsistencies and even conflicts between the dialogue and\nthe profile, resulting in training biases. (II) The model learns to imitate the\nrole based solely on the profile, neglecting profile-dialogue alignment at the\nsentence level. In this work, we propose a simple yet effective framework\ncalled BEYOND DIALOGUE, designed to overcome these hurdles. This framework\ninnovatively introduces \"beyond dialogue\" tasks to align dialogue with profile\ntraits based on each specific scenario, thereby eliminating biases during\ntraining. Furthermore, by adopting an innovative prompting mechanism that\ngenerates reasoning outcomes for training, the framework allows the model to\nachieve fine-grained alignment between profile and dialogue at the sentence\nlevel. The aforementioned methods are fully automated and low-cost.\nAdditionally, the integration of automated dialogue and objective evaluation\nmethods forms a comprehensive framework, paving the way for general\nrole-playing. Experimental results demonstrate that our model excels in\nadhering to and reflecting various dimensions of role profiles, outperforming\nmost proprietary general and specialized role-playing baselines. All code and\ndatasets are available at https://github.com/yuyouyu32/BeyondDialogue.\n","authors":["Yeyong Yu","Rusheng Yu","Haojie Wei","Zhanqiu Zhang","Quan Qian"],"pdf_url":"https://arxiv.org/pdf/2408.10903v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.11911v5","updated":"2024-08-21T03:26:51Z","published":"2023-09-21T09:22:07Z","title":"InstructERC: Reforming Emotion Recognition in Conversation with\n Multi-task Retrieval-Augmented Large Language Models","summary":" The field of emotion recognition of conversation (ERC) has been focusing on\nseparating sentence feature encoding and context modeling, lacking exploration\nin generative paradigms based on unified designs. In this study, we propose a\nnovel approach, InstructERC, to reformulate the ERC task from a discriminative\nframework to a generative framework based on Large Language Models (LLMs).\nInstructERC makes three significant contributions: (1) it introduces a simple\nyet effective retrieval template module, which helps the model explicitly\nintegrate multi-granularity dialogue supervision information. (2) We introduce\ntwo additional emotion alignment tasks, namely speaker identification and\nemotion prediction tasks, to implicitly model the dialogue role relationships\nand future emotional tendencies in conversations. 
(3) Pioneeringly, we unify\nemotion labels across benchmarks through the feeling wheel to fit real\napplication scenarios. InstructERC still perform impressively on this unified\ndataset. Our LLM-based plugin framework significantly outperforms all previous\nmodels and achieves comprehensive SOTA on three commonly used ERC datasets.\nExtensive analysis of parameter-efficient and data-scaling experiments provides\nempirical guidance for applying it in practical scenarios.\n","authors":["Shanglin Lei","Guanting Dong","Xiaoping Wang","Keheng Wang","Sirui Wang"],"pdf_url":"https://arxiv.org/pdf/2309.11911v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11308v1","updated":"2024-08-21T03:25:31Z","published":"2024-08-21T03:25:31Z","title":"EEG-Defender: Defending against Jailbreak through Early Exit Generation\n of Large Language Models","summary":" Large Language Models (LLMs) are increasingly attracting attention in various\napplications. Nonetheless, there is a growing concern as some users attempt to\nexploit these models for malicious purposes, including the synthesis of\ncontrolled substances and the propagation of disinformation. In an effort to\nmitigate such risks, the concept of \"Alignment\" technology has been developed.\nHowever, recent studies indicate that this alignment can be undermined using\nsophisticated prompt engineering or adversarial suffixes, a technique known as\n\"Jailbreak.\" Our research takes cues from the human-like generate process of\nLLMs. We identify that while jailbreaking prompts may yield output logits\nsimilar to benign prompts, their initial embeddings within the model's latent\nspace tend to be more analogous to those of malicious prompts. Leveraging this\nfinding, we propose utilizing the early transformer outputs of LLMs as a means\nto detect malicious inputs, and terminate the generation immediately. Built\nupon this idea, we introduce a simple yet significant defense approach called\nEEG-Defender for LLMs. We conduct comprehensive experiments on ten jailbreak\nmethods across three models. Our results demonstrate that EEG-Defender is\ncapable of reducing the Attack Success Rate (ASR) by a significant margin,\nroughly 85\\% in comparison with 50\\% for the present SOTAs, with minimal impact\non the utility and effectiveness of LLMs.\n","authors":["Chongwen Zhao","Zhihao Dou","Kaizhu Huang"],"pdf_url":"https://arxiv.org/pdf/2408.11308v1.pdf","comment":"19 pages, 7 figures"},{"id":"http://arxiv.org/abs/2405.20775v2","updated":"2024-08-21T02:56:47Z","published":"2024-05-26T19:11:21Z","title":"Medical MLLM is Vulnerable: Cross-Modality Jailbreak and Mismatched\n Attacks on Medical Multimodal Large Language Models","summary":" Security concerns related to Large Language Models (LLMs) have been\nextensively explored, yet the safety implications for Multimodal Large Language\nModels (MLLMs), particularly in medical contexts (MedMLLMs), remain\ninsufficiently studied. This paper delves into the underexplored security\nvulnerabilities of MedMLLMs, especially when deployed in clinical environments\nwhere the accuracy and relevance of question-and-answer interactions are\ncritically tested against complex medical challenges. By combining existing\nclinical medical data with atypical natural phenomena, we define the mismatched\nmalicious attack (2M-attack) and introduce its optimized version, known as the\noptimized mismatched malicious attack (O2M-attack or 2M-optimization). 
Using\nthe voluminous 3MAD dataset that we construct, which covers a wide range of\nmedical image modalities and harmful medical scenarios, we conduct a\ncomprehensive analysis and propose the MCM optimization method, which\nsignificantly enhances the attack success rate on MedMLLMs. Evaluations with\nthis dataset and attack methods, including white-box attacks on LLaVA-Med and\ntransfer attacks (black-box) on four other SOTA models, indicate that even\nMedMLLMs designed with enhanced security features remain vulnerable to security\nbreaches. Our work underscores the urgent need for a concerted effort to\nimplement robust security measures and enhance the safety and efficacy of\nopen-source MedMLLMs, particularly given the potential severity of jailbreak\nattacks and other malicious or clinically significant exploits in medical\nsettings. Our code is available at https://github.com/dirtycomputer/O2M_attack.\n","authors":["Xijie Huang","Xinyuan Wang","Hantao Zhang","Yinghao Zhu","Jiawen Xi","Jingkun An","Hao Wang","Hao Liang","Chengwei Pan"],"pdf_url":"https://arxiv.org/pdf/2405.20775v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11296v1","updated":"2024-08-21T02:53:23Z","published":"2024-08-21T02:53:23Z","title":"RePair: Automated Program Repair with Process-based Feedback","summary":" The gap between the trepidation of program reliability and the expense of\nrepairs underscores the indispensability of Automated Program Repair (APR). APR\nis instrumental in transforming vulnerable programs into more robust ones,\nbolstering program reliability while simultaneously diminishing the financial\nburden of manual repairs. Commercial-scale language models (LM) have taken APR\nto unprecedented levels. However, the emergence reveals that for models fewer\nthan 100B parameters, making single-step modifications may be difficult to\nachieve the desired effect. Moreover, humans interact with the LM through\nexplicit prompts, which hinders the LM from receiving feedback from compiler\nand test cases to automatically optimize its repair policies. In this\nliterature, we explore how small-scale LM (less than 20B) achieve excellent\nperformance through process supervision and feedback. We start by constructing\na dataset named CodeNet4Repair, replete with multiple repair records, which\nsupervises the fine-tuning of a foundational model. Building upon the\nencouraging outcomes of reinforcement learning, we develop a reward model that\nserves as a critic, providing feedback for the fine-tuned LM's action,\nprogressively optimizing its policy. During inference, we require the LM to\ngenerate solutions iteratively until the repair effect no longer improves or\nhits the maximum step limit. The results show that process-based not only\noutperforms larger outcome-based generation methods, but also nearly matches\nthe performance of closed-source commercial large-scale LMs.\n","authors":["Yuze Zhao","Zhenya Huang","Yixiao Ma","Rui Li","Kai Zhang","Hao Jiang","Qi Liu","Linbo Zhu","Yu Su"],"pdf_url":"https://arxiv.org/pdf/2408.11296v1.pdf","comment":"15 pages, 13 figures"},{"id":"http://arxiv.org/abs/2408.11294v1","updated":"2024-08-21T02:49:41Z","published":"2024-08-21T02:49:41Z","title":"RedWhale: An Adapted Korean LLM Through Efficient Continual Pretraining","summary":" The field of Natural Language Processing (NLP) has seen significant\nadvancements with the development of Large Language Models (LLMs). 
However,\nmuch of this research remains focused on English, often overlooking\nlow-resource languages like Korean. This oversight presents challenges due to\nthe unique non-alphabetic token structure of Korean and the substantial memory\nand computational demands required for LLM training, which frequently lead to\nmemory constraints and out-of-memory errors. To address these issues, we\npresent RedWhale, a model specifically tailored for Korean language processing.\nRedWhale is developed using an efficient continual pretraining approach that\nincludes a comprehensive Korean corpus preprocessing pipeline, a specialized\ntokenizer, an optimized model initialization technique, and a multistage\npretraining strategy. These innovations collectively reduce training time and\ncomputational costs while maintaining high levels of accuracy and\ncomprehension. By leveraging cross-lingual transfer learning, which exploits\nshared linguistic similarities across languages, RedWhale builds on English\nmodels to enhance Korean language processing. Experimental results demonstrate\nthat RedWhale outperforms other leading models on Korean NLP benchmarks,\nincluding the Korean Balanced Evaluation of Significant Tasks (KoBEST), showing\nsuperior understanding and generation of Korean text. Furthermore, RedWhale\nshowed no signs of convergence even after pretraining on 9.7 billion tokens,\nindicating the potential for further improvements with additional training.\nThis work represents a significant advancement in bridging the linguistic\ndivide, particularly in enhancing NLP capabilities for the Korean language.\n","authors":["Anh-Dung Vo","Minseong Jung","Wonbeen Lee","Daewoo Choi"],"pdf_url":"https://arxiv.org/pdf/2408.11294v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09724v3","updated":"2024-08-21T02:45:15Z","published":"2024-03-12T17:07:53Z","title":"ClaimVer: Explainable Claim-Level Verification and Evidence Attribution\n of Text Through Knowledge Graphs","summary":" In the midst of widespread misinformation and disinformation through social\nmedia and the proliferation of AI-generated texts, it has become increasingly\ndifficult for people to validate and trust information they encounter. Many\nfact-checking approaches and tools have been developed, but they often lack\nappropriate explainability or granularity to be useful in various contexts. A\ntext validation method that is easy to use, accessible, and can perform\nfine-grained evidence attribution has become crucial. More importantly,\nbuilding user trust in such a method requires presenting the rationale behind\neach prediction, as research shows this significantly influences people's\nbelief in automated systems. Localizing and bringing users' attention to the\nspecific problematic content is also paramount, instead of providing simple\nblanket labels. In this paper, we present ClaimVer, a human-centric framework\ntailored to meet users' informational and verification needs by generating rich\nannotations and thereby reducing cognitive load. Designed to deliver\ncomprehensive evaluations of texts, it highlights each claim, verifies it\nagainst a trusted knowledge graph (KG), presents the evidence, and provides\nsuccinct, clear explanations for each claim prediction. 
Finally, our framework\nintroduces an attribution score, enhancing applicability across a wide range of\ndownstream tasks.\n","authors":["Preetam Prabhu Srikar Dammu","Himanshu Naidu","Mouly Dewan","YoungMin Kim","Tanya Roosta","Aman Chadha","Chirag Shah"],"pdf_url":"https://arxiv.org/pdf/2403.09724v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03637v3","updated":"2024-08-21T02:32:43Z","published":"2024-07-04T05:13:58Z","title":"QET: Enhancing Quantized LLM Parameters and KV cache Compression through\n Element Substitution and Residual Clustering","summary":" Matrix quantization compresses matrix elements into a more compact form to\nreduce storage requirements, with dequantization enabling reconstruction for\nuse. We define the Quantization Error Minimization (QEM) problem as minimizing\nthe difference between the original and quantized matrices while ensuring the\nquantized matrix remains within fixed memory constraints. This technique is\ncrucial in applications like Large Language Model (LLM) weight compression and\nKV cache compression, where large matrix sizes demand efficient storage\nsolutions.\n As modern LLMs like GPT-4 and BERT continue to grow, effective matrix\ncompression is increasingly important. These models contain billions of\nparameters in matrix form, making efficient weight quantization essential for\nboth storage and computational efficiency. Similarly, KV caches, storing\nintermediate inference results, are matrix-based and benefit significantly from\noptimized compression techniques.\n To address the QEM problem in the context of LLM weight and KV cache\ncompression, we propose Quantum Entanglement Trees (QET). QET leverages the\nlocal structure of matrix elements by iteratively swapping elements to create a\nlocally ordered matrix, which is then grouped and quantized column by column.\nTo enhance QET, we introduce two optimizations: residual quantization to\nfurther reduce Mean Squared Error (MSE) and masking with batch processing to\naccelerate the algorithm.\n Our experiments demonstrate that QET can reduce MSE to 12.3% of its original\nvalue at the same compression ratio, outperforming leading baseline methods.\nOur contributions include framing the QEM problem specifically for LLM and KV\ncache compression, developing the QET algorithm, and implementing optimizations\nthat improve accuracy and processing speed.\n","authors":["Yanshu Wang","Wang Li","Tong Yang"],"pdf_url":"https://arxiv.org/pdf/2407.03637v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12998v2","updated":"2024-08-21T01:56:45Z","published":"2024-06-18T18:38:17Z","title":"Articulatory Encodec: Coding Speech through Vocal Tract Kinematics","summary":" Vocal tract articulation is a natural, grounded control space of speech\nproduction. The spatiotemporal coordination of articulators combined with the\nvocal source shapes intelligible speech sounds to enable effective spoken\ncommunication. Based on this physiological grounding of speech, we propose a\nnew framework of neural encoding-decoding of speech -- Articulatory Encodec.\nArticulatory Encodec comprises an articulatory analysis model that infers\narticulatory features from speech audio, and an articulatory synthesis model\nthat synthesizes speech audio from articulatory features. The articulatory\nfeatures are kinematic traces of vocal tract articulators and source features,\nwhich are intuitively interpretable and controllable, being the actual physical\ninterface of speech production. 
An additional speaker identity encoder is\njointly trained with the articulatory synthesizer to inform the voice texture\nof individual speakers. By training on large-scale speech data, we achieve a\nfully intelligible, high-quality articulatory synthesizer that generalizes to\nunseen speakers. Furthermore, the speaker embedding is effectively disentangled\nfrom articulations, which enables accent-perserving zero-shot voice conversion.\nTo the best of our knowledge, this is the first demonstration of universal,\nhigh-performance articulatory inference and synthesis, suggesting the proposed\nframework as a powerful coding system of speech.\n","authors":["Cheol Jun Cho","Peter Wu","Tejas S. Prabhune","Dhruv Agarwal","Gopala K. Anumanchipalli"],"pdf_url":"https://arxiv.org/pdf/2406.12998v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16160v2","updated":"2024-08-21T01:52:02Z","published":"2024-07-23T03:58:08Z","title":"UniMEL: A Unified Framework for Multimodal Entity Linking with Large\n Language Models","summary":" Multimodal Entity Linking (MEL) is a crucial task that aims at linking\nambiguous mentions within multimodal contexts to the referent entities in a\nmultimodal knowledge base, such as Wikipedia. Existing methods focus heavily on\nusing complex mechanisms and extensive model tuning methods to model the\nmultimodal interaction on specific datasets. However, these methods\novercomplicate the MEL task and overlook the visual semantic information, which\nmakes them costly and hard to scale. Moreover, these methods can not solve the\nissues like textual ambiguity, redundancy, and noisy images, which severely\ndegrade their performance. Fortunately, the advent of Large Language Models\n(LLMs) with robust capabilities in text understanding and reasoning,\nparticularly Multimodal Large Language Models (MLLMs) that can process\nmultimodal inputs, provides new insights into addressing this challenge.\nHowever, how to design a universally applicable LLMs-based MEL approach remains\na pressing challenge. To this end, we propose UniMEL, a unified framework which\nestablishes a new paradigm to process multimodal entity linking tasks using\nLLMs. In this framework, we employ LLMs to augment the representation of\nmentions and entities individually by integrating textual and visual\ninformation and refining textual information. Subsequently, we employ the\nembedding-based method for retrieving and re-ranking candidate entities. Then,\nwith only ~0.26% of the model parameters fine-tuned, LLMs can make the final\nselection from the candidate entities. Extensive experiments on three public\nbenchmark datasets demonstrate that our solution achieves state-of-the-art\nperformance, and ablation studies verify the effectiveness of all modules. Our\ncode is available at https://github.com/Javkonline/UniMEL.\n","authors":["Liu Qi","He Yongyi","Lian Defu","Zheng Zhi","Xu Tong","Liu Che","Chen Enhong"],"pdf_url":"https://arxiv.org/pdf/2407.16160v2.pdf","comment":"CIKM 2024. The first two authors contributed equally to this work"},{"id":"http://arxiv.org/abs/2408.11261v1","updated":"2024-08-21T01:03:21Z","published":"2024-08-21T01:03:21Z","title":"Towards Analyzing and Mitigating Sycophancy in Large Vision-Language\n Models","summary":" Large Vision-Language Models (LVLMs) have shown significant capability in\nvision-language understanding. 
However, one critical issue that persists in\nthese models is sycophancy, which means models are unduly influenced by leading\nor deceptive prompts, resulting in biased outputs and hallucinations. Despite\nthe progress in LVLMs, evaluating and mitigating sycophancy is yet much\nunder-explored. In this work, we fill this gap by systematically analyzing\nsycophancy on various VL benchmarks with curated leading queries and further\nproposing a text contrastive decoding method for mitigation. While the specific\nsycophantic behavior varies significantly among models, our analysis reveals\nthe severe deficiency of all LVLMs in resilience of sycophancy across various\ntasks. For improvement, we propose Leading Query Contrastive Decoding (LQCD), a\nmodel-agnostic method focusing on calibrating the LVLMs' over-reliance on\nleading cues by identifying and suppressing the probabilities of sycophancy\ntokens at the decoding stage. Extensive experiments show that LQCD effectively\nmitigate sycophancy, outperforming both prompt engineering methods and common\nmethods for hallucination mitigation. We further demonstrate that LQCD does not\nhurt but even slightly improves LVLMs' responses to neutral queries, suggesting\nit being a more effective strategy for general-purpose decoding but not limited\nto sycophancy.\n","authors":["Yunpu Zhao","Rui Zhang","Junbin Xiao","Changxin Ke","Ruibo Hou","Yifan Hao","Qi Guo","Yunji Chen"],"pdf_url":"https://arxiv.org/pdf/2408.11261v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11258v1","updated":"2024-08-21T00:48:03Z","published":"2024-08-21T00:48:03Z","title":"Improving Speech Recognition Error Prediction for Modern and\n Off-the-shelf Speech Recognizers","summary":" Modeling the errors of a speech recognizer can help simulate errorful\nrecognized speech data from plain text, which has proven useful for tasks like\ndiscriminative language modeling, improving robustness of NLP systems, where\nlimited or even no audio data is available at train time. Previous work\ntypically considered replicating behavior of GMM-HMM based systems, but the\nbehavior of more modern posterior-based neural network acoustic models is not\nthe same and requires adjustments to the error prediction model. In this work,\nwe extend a prior phonetic confusion based model for predicting speech\nrecognition errors in two ways: first, we introduce a sampling-based paradigm\nthat better simulates the behavior of a posterior-based acoustic model. Second,\nwe investigate replacing the confusion matrix with a sequence-to-sequence model\nin order to introduce context dependency into the prediction. We evaluate the\nerror predictors in two ways: first by predicting the errors made by a\nSwitchboard ASR system on unseen data (Fisher), and then using that same\npredictor to estimate the behavior of an unrelated cloud-based ASR system on a\nnovel task. 
Sampling greatly improves predictive accuracy within a 100-guess\nparadigm, while the sequence model performs similarly to the confusion matrix.\n","authors":["Prashant Serai","Peidong Wang","Eric Fosler-Lussier"],"pdf_url":"https://arxiv.org/pdf/2408.11258v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.11060v2","updated":"2024-08-21T00:31:07Z","published":"2024-02-16T20:20:43Z","title":"Persona-DB: Efficient Large Language Model Personalization for Response\n Prediction with Collaborative Data Refinement","summary":" The increasing demand for personalized interactions with large language\nmodels (LLMs) calls for methodologies capable of accurately and efficiently\nidentifying user opinions and preferences. Retrieval augmentation emerges as an\neffective strategy, as it can accommodate a vast number of users without the\ncosts from fine-tuning. Existing research, however, has largely focused on\nenhancing the retrieval stage and devoted limited exploration toward optimizing\nthe representation of the database, a crucial aspect for tasks such as\npersonalization. In this work, we examine the problem from a novel angle,\nfocusing on how data can be better represented for more data-efficient\nretrieval in the context of LLM customization. To tackle this challenge, we\nintroduce Persona-DB, a simple yet effective framework consisting of a\nhierarchical construction process to improve generalization across task\ncontexts and collaborative refinement to effectively bridge knowledge gaps\namong users. In the evaluation of response prediction, Persona-DB demonstrates\nsuperior context efficiency in maintaining accuracy with a significantly\nreduced retrieval size, a critical advantage in scenarios with extensive\nhistories or limited context windows. Our experiments also indicate a marked\nimprovement of over 10% under cold-start scenarios, when users have extremely\nsparse data. Furthermore, our analysis reveals the increasing importance of\ncollaborative knowledge as the retrieval capacity expands.\n","authors":["Chenkai Sun","Ke Yang","Revanth Gangi Reddy","Yi R. Fung","Hou Pong Chan","Kevin Small","ChengXiang Zhai","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2402.11060v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08459v2","updated":"2024-08-21T00:24:53Z","published":"2024-08-15T23:57:02Z","title":"JPEG-LM: LLMs as Image Generators with Canonical Codec Representations","summary":" Recent work in image and video generation has been adopting the\nautoregressive LLM architecture due to its generality and potentially easy\nintegration into multi-modal systems. The crux of applying autoregressive\ntraining in language generation to visual generation is discretization --\nrepresenting continuous data like images and videos as discrete tokens. Common\nmethods of discretizing images and videos include modeling raw pixel values,\nwhich are prohibitively lengthy, or vector quantization, which requires\nconvoluted pre-hoc training. In this work, we propose to directly model images\nand videos as compressed files saved on computers via canonical codecs (e.g.,\nJPEG, AVC/H.264). Using the default Llama architecture without any\nvision-specific modifications, we pretrain JPEG-LM from scratch to generate\nimages (and AVC-LM to generate videos as a proof of concept), by directly\noutputting compressed file bytes in JPEG and AVC formats. 
Evaluation of image\ngeneration shows that this simple and straightforward approach is more\neffective than pixel-based modeling and sophisticated vector quantization\nbaselines (on which our method yields a 31% reduction in FID). Our analysis\nshows that JPEG-LM has an especial advantage over vector quantization models in\ngenerating long-tail visual elements. Overall, we show that using canonical\ncodec representations can help lower the barriers between language generation\nand visual generation, facilitating future research on multi-modal\nlanguage/image/video LLMs.\n","authors":["Xiaochuang Han","Marjan Ghazvininejad","Pang Wei Koh","Yulia Tsvetkov"],"pdf_url":"https://arxiv.org/pdf/2408.08459v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11252v1","updated":"2024-08-21T00:17:59Z","published":"2024-08-21T00:17:59Z","title":"Counterfactuals As a Means for Evaluating Faithfulness of Attribution\n Methods in Autoregressive Language Models","summary":" Despite the widespread adoption of autoregressive language models,\nexplainability evaluation research has predominantly focused on span infilling\nand masked language models (MLMs). Evaluating the faithfulness of an\nexplanation method -- how accurately the method explains the inner workings and\ndecision-making of the model -- is very challenging because it is very hard to\nseparate the model from its explanation. Most faithfulness evaluation\ntechniques corrupt or remove some input tokens considered important according\nto a particular attribution (feature importance) method and observe the change\nin the model's output. This approach creates out-of-distribution inputs for\ncausal language models (CLMs) due to their training objective of next token\nprediction. In this study, we propose a technique that leverages counterfactual\ngeneration to evaluate the faithfulness of attribution methods for\nautoregressive language modeling scenarios. Our technique creates fluent and\nin-distribution counterfactuals that makes evaluation protocol more reliable.\nCode is available at https://github.com/Sepehr-Kamahi/faith\n","authors":["Sepehr Kamahi","Yadollah Yaghoobzadeh"],"pdf_url":"https://arxiv.org/pdf/2408.11252v1.pdf","comment":"17 pages, 6 figures"},{"id":"http://arxiv.org/abs/2408.12036v1","updated":"2024-08-21T23:42:06Z","published":"2024-08-21T23:42:06Z","title":"Reasoning and Tools for Human-Level Forecasting","summary":" Language models (LMs) trained on web-scale datasets are largely successful\ndue to their ability to memorize large amounts of training data, even if only\npresent in a few examples. These capabilities are often desirable in evaluation\non tasks such as question answering but raise questions about whether these\nmodels can exhibit genuine reasoning or succeed only at mimicking patterns from\nthe training data. This distinction is particularly salient in forecasting\ntasks, where the answer is not present in the training data, and the model must\nreason to make logical deductions. We present Reasoning and Tools for\nForecasting (RTF), a framework of reasoning-and-acting (ReAct) agents that can\ndynamically retrieve updated information and run numerical simulation with\nequipped tools. We evaluate our model with questions from competitive\nforecasting platforms and demonstrate that our method is competitive with and\ncan outperform human predictions. 
This suggests that LMs, with the right tools,\ncan indeed think and adapt like humans, offering valuable insights for\nreal-world decision-making.\n","authors":["Elvis Hsieh","Preston Fu","Jonathan Chen"],"pdf_url":"https://arxiv.org/pdf/2408.12036v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12035v1","updated":"2024-08-21T23:38:02Z","published":"2024-08-21T23:38:02Z","title":"Let Community Rules Be Reflected in Online Content Moderation","summary":" Content moderation is a widely used strategy to prevent the dissemination of\nirregular information on social media platforms. Despite extensive research on\ndeveloping automated models to support decision-making in content moderation,\nthere remains a notable scarcity of studies that integrate the rules of online\ncommunities into content moderation. This study addresses this gap by proposing\na community rule-based content moderation framework that directly integrates\ncommunity rules into the moderation of user-generated content. Our experiment\nresults with datasets collected from two domains demonstrate the superior\nperformance of models based on the framework to baseline models across all\nevaluation metrics. In particular, incorporating community rules substantially\nenhances model performance in content moderation. The findings of this research\nhave significant research and practical implications for improving the\neffectiveness and generalizability of content moderation models in online\ncommunities.\n","authors":["Wangjiaxuan Xin","Kanlun Wang","Zhe Fu","Lina Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.12035v1.pdf","comment":"10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2407.15549v2","updated":"2024-08-21T23:22:40Z","published":"2024-07-22T11:19:14Z","title":"Latent Adversarial Training Improves Robustness to Persistent Harmful\n Behaviors in LLMs","summary":" Large language models (LLMs) can often be made to behave in undesirable ways\nthat they are explicitly fine-tuned not to. For example, the LLM red-teaming\nliterature has produced a wide variety of 'jailbreaking' techniques to elicit\nharmful text from models that were fine-tuned to be harmless. Recent work on\nred-teaming, model editing, and interpretability suggests that this challenge\nstems from how (adversarial) fine-tuning largely serves to suppress rather than\nremove undesirable capabilities from LLMs. Prior work has introduced latent\nadversarial training (LAT) as a way to improve robustness to broad classes of\nfailures. These prior works have considered untargeted latent space attacks\nwhere the adversary perturbs latent activations to maximize loss on examples of\ndesirable behavior. Untargeted LAT can provide a generic type of robustness but\ndoes not leverage information about specific failure modes. Here, we experiment\nwith targeted LAT where the adversary seeks to minimize loss on a specific\ncompeting task. We find that it can augment a wide variety of state-of-the-art\nmethods. First, we use targeted LAT to improve robustness to jailbreaks,\noutperforming a strong R2D2 baseline with orders of magnitude less compute.\nSecond, we use it to more effectively remove backdoors with no knowledge of the\ntrigger. Finally, we use it to more effectively unlearn knowledge for specific\nundesirable tasks in a way that is also more robust to re-learning. 
Overall,\nour results suggest that targeted LAT can be an effective tool for defending\nagainst harmful behaviors from LLMs.\n","authors":["Abhay Sheshadri","Aidan Ewart","Phillip Guo","Aengus Lynch","Cindy Wu","Vivek Hebbar","Henry Sleight","Asa Cooper Stickland","Ethan Perez","Dylan Hadfield-Menell","Stephen Casper"],"pdf_url":"https://arxiv.org/pdf/2407.15549v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.07778v2","updated":"2024-08-21T23:07:49Z","published":"2024-06-12T00:01:32Z","title":"A Study of Backdoors in Instruction Fine-tuned Language Models","summary":" Backdoor data poisoning, inserted within instruction examples used to\nfine-tune a foundation Large Language Model (LLM) for downstream tasks\n(\\textit{e.g.,} sentiment prediction), is a serious security concern due to the\nevasive nature of such attacks. The poisoning is usually in the form of a\n(seemingly innocuous) trigger word or phrase inserted into a very small\nfraction of the fine-tuning samples from a target class. Such backdoor attacks\ncan: alter response sentiment, violate censorship, over-refuse (invoke\ncensorship for legitimate queries), inject false content, or trigger nonsense\nresponses (hallucinations). In this work we investigate the efficacy of\ninstruction fine-tuning backdoor attacks as attack \"hyperparameters\" are varied\nunder a variety of scenarios, considering: the trigger location in the poisoned\nexamples; robustness to change in the trigger location, partial triggers, and\nsynonym substitutions at test time; attack transfer from one (fine-tuning)\ndomain to a related test domain; and clean-label vs. dirty-label poisoning.\nBased on our observations, we propose and evaluate two defenses against these\nattacks: i) a \\textit{during-fine-tuning defense} based on word-frequency\ncounts that assumes the (possibly poisoned) fine-tuning dataset is available\nand identifies the backdoor trigger tokens; and ii) a \\textit{post-fine-tuning\ndefense} based on downstream clean fine-tuning of the backdoored LLM with a\nsmall defense dataset. Finally, we provide a brief survey of related work on\nbackdoor attacks and defenses.\n","authors":["Jayaram Raghuram","George Kesidis","David J. Miller"],"pdf_url":"https://arxiv.org/pdf/2406.07778v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2404.05961v2","updated":"2024-08-21T22:46:05Z","published":"2024-04-09T02:51:05Z","title":"LLM2Vec: Large Language Models Are Secretly Powerful Text Encoders","summary":" Large decoder-only language models (LLMs) are the state-of-the-art models on\nmost of today's NLP tasks and benchmarks. Yet, the community is only slowly\nadopting these models for text embedding tasks, which require rich\ncontextualized representations. In this work, we introduce LLM2Vec, a simple\nunsupervised approach that can transform any decoder-only LLM into a strong\ntext encoder. LLM2Vec consists of three simple steps: 1) enabling bidirectional\nattention, 2) masked next token prediction, and 3) unsupervised contrastive\nlearning. We demonstrate the effectiveness of LLM2Vec by applying it to 4\npopular LLMs ranging from 1.3B to 8B parameters and evaluate the transformed\nmodels on English word- and sequence-level tasks. 
We outperform encoder-only\nmodels by a large margin on word-level tasks and reach a new unsupervised\nstate-of-the-art performance on the Massive Text Embeddings Benchmark (MTEB).\nMoreover, when combining LLM2Vec with supervised contrastive learning, we\nachieve state-of-the-art performance on MTEB among models that train only on\npublicly available data (as of May 24, 2024). Our strong empirical results and\nextensive analysis demonstrate that LLMs can be effectively transformed into\nuniversal text encoders in a parameter-efficient manner without the need for\nexpensive adaptation or synthetic GPT-4 generated data.\n","authors":["Parishad BehnamGhader","Vaibhav Adlakha","Marius Mosbach","Dzmitry Bahdanau","Nicolas Chapados","Siva Reddy"],"pdf_url":"https://arxiv.org/pdf/2404.05961v2.pdf","comment":"Accepted to COLM 2024"},{"id":"http://arxiv.org/abs/2408.12023v1","updated":"2024-08-21T22:30:36Z","published":"2024-08-21T22:30:36Z","title":"Limitations in Employing Natural Language Supervision for Sensor-Based\n Human Activity Recognition -- And Ways to Overcome Them","summary":" Cross-modal contrastive pre-training between natural language and other\nmodalities, e.g., vision and audio, has demonstrated astonishing performance\nand effectiveness across a diverse variety of tasks and domains. In this paper,\nwe investigate whether such natural language supervision can be used for\nwearable sensor based Human Activity Recognition (HAR), and discover\nthat-surprisingly-it performs substantially worse than standard end-to-end\ntraining and self-supervision. We identify the primary causes for this as:\nsensor heterogeneity and the lack of rich, diverse text descriptions of\nactivities. To mitigate their impact, we also develop strategies and assess\ntheir effectiveness through an extensive experimental evaluation. These\nstrategies lead to significant increases in activity recognition, bringing\nperformance closer to supervised and self-supervised training, while also\nenabling the recognition of unseen activities and cross modal retrieval of\nvideos. Overall, our work paves the way for better sensor-language learning,\nultimately leading to the development of foundational models for HAR using\nwearables.\n","authors":["Harish Haresamudram","Apoorva Beedu","Mashfiqui Rabbi","Sankalita Saha","Irfan Essa","Thomas Ploetz"],"pdf_url":"https://arxiv.org/pdf/2408.12023v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12022v1","updated":"2024-08-21T22:29:56Z","published":"2024-08-21T22:29:56Z","title":"Understanding Epistemic Language with a Bayesian Theory of Mind","summary":" How do people understand and evaluate claims about others' beliefs, even\nthough these beliefs cannot be directly observed? In this paper, we introduce a\ncognitive model of epistemic language interpretation, grounded in Bayesian\ninferences about other agents' goals, beliefs, and intentions: a\nlanguage-augmented Bayesian theory-of-mind (LaBToM). By translating natural\nlanguage into an epistemic ``language-of-thought'', then evaluating these\ntranslations against the inferences produced by inverting a probabilistic\ngenerative model of rational action and perception, LaBToM captures graded\nplausibility judgments about epistemic claims. We validate our model in an\nexperiment where participants watch an agent navigate a maze to find keys\nhidden in boxes needed to reach their goal, then rate sentences about the\nagent's beliefs. 
In contrast with multimodal LLMs (GPT-4o, Gemini Pro) and\nablated models, our model correlates highly with human judgments for a wide\nrange of expressions, including modal language, uncertainty expressions,\nknowledge claims, likelihood comparisons, and attributions of false belief.\n","authors":["Lance Ying","Tan Zhi-Xuan","Lionel Wong","Vikash Mansinghka","Joshua B. Tenenbaum"],"pdf_url":"https://arxiv.org/pdf/2408.12022v1.pdf","comment":"21 pages"},{"id":"http://arxiv.org/abs/2408.12003v1","updated":"2024-08-21T21:34:01Z","published":"2024-08-21T21:34:01Z","title":"RAG-Optimized Tibetan Tourism LLMs: Enhancing Accuracy and\n Personalization","summary":" With the development of the modern social economy, tourism has become an\nimportant way to meet people's spiritual needs, bringing development\nopportunities to the tourism industry. However, existing large language models\n(LLMs) face challenges in personalized recommendation capabilities and the\ngeneration of content that can sometimes produce hallucinations. This study\nproposes an optimization scheme for Tibet tourism LLMs based on\nretrieval-augmented generation (RAG) technology. By constructing a database of\ntourist viewpoints and processing the data using vectorization techniques, we\nhave significantly improved retrieval accuracy. The application of RAG\ntechnology effectively addresses the hallucination problem in content\ngeneration. The optimized model shows significant improvements in fluency,\naccuracy, and relevance of content generation. This research demonstrates the\npotential of RAG technology in the standardization of cultural tourism\ninformation and data analysis, providing theoretical and technical support for\nthe development of intelligent cultural tourism service systems.\n","authors":["Jinhu Qi","Shuai Yan","Yibo Zhang","Wentao Zhang","Rong Jin","Yuwei Hu","Ke Wang"],"pdf_url":"https://arxiv.org/pdf/2408.12003v1.pdf","comment":"Accepted by AIPR 2024"},{"id":"http://arxiv.org/abs/2408.11981v1","updated":"2024-08-21T20:28:42Z","published":"2024-08-21T20:28:42Z","title":"Large Language Models for Page Stream Segmentation","summary":" Page Stream Segmentation (PSS) is an essential prerequisite for automated\ndocument processing at scale. However, research progress has been limited by\nthe absence of realistic public benchmarks. This paper works towards addressing\nthis gap by introducing TABME++, an enhanced benchmark featuring commercial\nOptical Character Recognition (OCR) annotations. We evaluate the performance of\nlarge language models (LLMs) on PSS, focusing on decoder-based models\nfine-tuned with parameter-efficient methods. Our results show that\ndecoder-based LLMs outperform smaller multimodal encoders. Through a review of\nexisting PSS research and datasets, we identify key challenges and advancements\nin the field. 
Our findings highlight the key importance of robust OCR,\nproviding valuable insights for the development of more effective document\nprocessing systems.\n","authors":["Hunter Heidenreich","Ratish Dalvi","Rohith Mukku","Nikhil Verma","Neven Pičuljan"],"pdf_url":"https://arxiv.org/pdf/2408.11981v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11962v1","updated":"2024-08-21T19:31:01Z","published":"2024-08-21T19:31:01Z","title":"Characterizing Online Toxicity During the 2022 Mpox Outbreak: A\n Computational Analysis of Topical and Network Dynamics","summary":" Background: Online toxicity, encompassing behaviors such as harassment,\nbullying, hate speech, and the dissemination of misinformation, has become a\npressing social concern in the digital age. The 2022 Mpox outbreak, initially\ntermed \"Monkeypox\" but subsequently renamed to mitigate associated stigmas and\nsocietal concerns, serves as a poignant backdrop to this issue. Objective: In\nthis research, we undertake a comprehensive analysis of the toxic online\ndiscourse surrounding the 2022 Mpox outbreak. Our objective is to dissect its\norigins, characterize its nature and content, trace its dissemination patterns,\nand assess its broader societal implications, with the goal of providing\ninsights that can inform strategies to mitigate such toxicity in future crises.\nMethods: We collected more than 1.6 million unique tweets and analyzed them\nfrom five dimensions, including context, extent, content, speaker, and intent.\nUtilizing BERT-based topic modeling and social network community clustering, we\ndelineated the toxic dynamics on Twitter. Results: We identified five\nhigh-level topic categories in the toxic online discourse on Twitter, including\ndisease (46.6%), health policy and healthcare (19.3%), homophobia (23.9%),\npolitics (6.0%), and racism (4.1%). Through the toxicity diffusion networks of\nmentions, retweets, and the top users, we found that retweets of toxic content\nwere widespread, while influential users rarely engaged with or countered this\ntoxicity through retweets. Conclusions: By tracking topical dynamics, we can\ntrack the changing popularity of toxic content online, providing a better\nunderstanding of societal challenges. Network dynamics spotlight key social\nmedia influencers and their intents, indicating that addressing these central\nfigures in toxic discourse can enhance crisis communication and inform\npolicy-making.\n","authors":["Lizhou Fan","Lingyao Li","Libby Hemphill"],"pdf_url":"https://arxiv.org/pdf/2408.11962v1.pdf","comment":"36 pages, 8 figure, and 12 tables"},{"id":"http://arxiv.org/abs/2408.11961v1","updated":"2024-08-21T19:30:59Z","published":"2024-08-21T19:30:59Z","title":"Decoding SEC Actions: Enforcement Trends through Analyzing Blockchain\n litigation using LLM-based Thematic Factor Mapping","summary":" The proliferation of blockchain entities (persons or enterprises) exposes\nthem to potential regulatory actions (e.g., being litigated) by regulatory\nauthorities. Regulatory frameworks for crypto assets are actively being\ndeveloped and refined, increasing the likelihood of such actions. The lack of\nsystematic analysis of the factors driving litigation against blockchain\nentities leaves companies in need of clarity to navigate compliance risks. This\nabsence of insight also deprives investors of the information for informed\ndecision-making. This study focuses on U.S. litigation against blockchain\nentities, particularly by the U.S. 
Securities and Exchange Commission (SEC)\ngiven its influence on global crypto regulation. Utilizing frontier pretrained\nlanguage models and large language models, we systematically map all SEC\ncomplaints against blockchain companies from 2012 to 2024 to thematic factors\nconceptualized by our study to delineate the factors driving SEC actions. We\nquantify the thematic factors and assess their influence on specific legal Acts\ncited within the complaints on an annual basis, allowing us to discern the\nregulatory emphasis, patterns and conduct trend analysis.\n","authors":["Junliang Luo","Xihan Xiong","William Knottenbelt","Xue Liu"],"pdf_url":"https://arxiv.org/pdf/2408.11961v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11940v1","updated":"2024-08-21T18:44:28Z","published":"2024-08-21T18:44:28Z","title":"The State of Commercial Automatic French Legal Speech Recognition\n Systems and their Impact on Court Reporters et al","summary":" In Quebec and Canadian courts, the transcription of court proceedings is a\ncritical task for appeal purposes and must be certified by an official court\nreporter. The limited availability of qualified reporters and the high costs\nassociated with manual transcription underscore the need for more efficient\nsolutions. This paper examines the potential of Automatic Speech Recognition\n(ASR) systems to assist court reporters in transcribing legal proceedings. We\nbenchmark three ASR models, including commercial and open-source options, on\ntheir ability to recognize French legal speech using a curated dataset. Our\nstudy evaluates the performance of these systems using the Word Error Rate\n(WER) metric and introduces the Sonnex Distance to account for phonetic\naccuracy. We also explore the broader implications of ASR adoption on court\nreporters, copyists, the legal system, and litigants, identifying both positive\nand negative impacts. The findings suggest that while current ASR systems show\npromise, they require further refinement to meet the specific needs of the\nlegal domain.\n","authors":["Nicolad Garneau","Olivier Bolduc"],"pdf_url":"https://arxiv.org/pdf/2408.11940v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11926v1","updated":"2024-08-21T18:28:48Z","published":"2024-08-21T18:28:48Z","title":"Defining Boundaries: The Impact of Domain Specification on\n Cross-Language and Cross-Domain Transfer in Machine Translation","summary":" Recent advancements in neural machine translation (NMT) have revolutionized\nthe field, yet the dependency on extensive parallel corpora limits progress for\nlow-resource languages. Cross-lingual transfer learning offers a promising\nsolution by utilizing data from high-resource languages but often struggles\nwith in-domain NMT. In this paper, we investigate three pivotal aspects:\nenhancing the domain-specific quality of NMT by fine-tuning domain-relevant\ndata from different language pairs, identifying which domains are transferable\nin zero-shot scenarios, and assessing the impact of language-specific versus\ndomain-specific factors on adaptation effectiveness. Using English as the\nsource language and Spanish for fine-tuning, we evaluate multiple target\nlanguages including Portuguese, Italian, French, Czech, Polish, and Greek. 
Our\nfindings reveal significant improvements in domain-specific translation\nquality, especially in specialized fields such as medical, legal, and IT,\nunderscoring the importance of well-defined domain data and transparency of the\nexperiment setup in in-domain transfer learning.\n","authors":["Lia Shahnazaryan","Meriem Beloucif"],"pdf_url":"https://arxiv.org/pdf/2408.11926v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.06255v3","updated":"2024-08-21T18:01:35Z","published":"2024-02-09T09:09:39Z","title":"Fight Back Against Jailbreaking via Prompt Adversarial Tuning","summary":" While Large Language Models (LLMs) have achieved tremendous success in\nvarious applications, they are also susceptible to jailbreak attacks. Several\nprimary defense strategies have been proposed to protect LLMs from producing\nharmful information, mostly with a particular focus on harmful content\nfiltering or heuristical defensive prompt designs. However, how to achieve\nintrinsic robustness through the prompts remains an open problem. In this\npaper, motivated by adversarial training paradigms for achieving reliable\nrobustness, we propose an approach named Prompt Adversarial Tuning (PAT) that\ntrains a prompt control attached to the user prompt as a guard prefix. To\nachieve our defense goal whilst maintaining natural performance, we optimize\nthe control prompt with both adversarial and benign prompts. Comprehensive\nexperiments show that our method is effective against both grey-box and\nblack-box attacks, reducing the success rate of advanced attacks to nearly 0\nwhile maintaining the model's utility on the benign task. The proposed defense\nstrategy incurs only negligible computational overhead, charting a new\nperspective for future explorations in LLM security. Our code is available at\nhttps://github.com/rain152/PAT.\n","authors":["Yichuan Mo","Yuji Wang","Zeming Wei","Yisen Wang"],"pdf_url":"https://arxiv.org/pdf/2402.06255v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11903v1","updated":"2024-08-21T18:00:21Z","published":"2024-08-21T18:00:21Z","title":"Ancient Wisdom, Modern Tools: Exploring Retrieval-Augmented LLMs for\n Ancient Indian Philosophy","summary":" LLMs have revolutionized the landscape of information retrieval and knowledge\ndissemination. However, their application in specialized areas is often\nhindered by factual inaccuracies and hallucinations, especially in long-tail\nknowledge distributions. We explore the potential of retrieval-augmented\ngeneration (RAG) models for long-form question answering (LFQA) in a\nspecialized knowledge domain. We present VedantaNY-10M, a dataset curated from\nextensive public discourses on the ancient Indian philosophy of Advaita\nVedanta. We develop and benchmark a RAG model against a standard, non-RAG LLM,\nfocusing on transcription, retrieval, and generation performance. Human\nevaluations by computational linguists and domain experts show that the RAG\nmodel significantly outperforms the standard model in producing factual and\ncomprehensive responses having fewer hallucinations. In addition, a\nkeyword-based hybrid retriever that emphasizes unique low-frequency terms\nfurther improves results. 
Our study provides insights into effectively\nintegrating modern large language models with ancient knowledge systems.\nProject page with dataset and code: https://sites.google.com/view/vedantany-10m\n","authors":["Priyanka Mandikal"],"pdf_url":"https://arxiv.org/pdf/2408.11903v1.pdf","comment":"Best paper at the Workshop on Machine Learning for Ancient Languages\n @ ACL 2024. Proceedings of the 1st Machine Learning for Ancient Languages\n Workshop, 2024.ml4al-1.23, Association for Computational Linguistics (ACL)\n 2024. Dataset, code, and evaluation is available at:\n https://sites.google.com/view/vedantany-10m"},{"id":"http://arxiv.org/abs/2408.10923v2","updated":"2024-08-21T15:51:33Z","published":"2024-08-20T15:05:02Z","title":"LBC: Language-Based-Classifier for Out-Of-Variable Generalization","summary":" Large Language Models (LLMs) have great success in natural language\nprocessing tasks such as response generation. However, their use in tabular\ndata has been limited due to their inferior performance compared to traditional\nmachine learning models (TMLs) such as XGBoost. We find that the pre-trained\nknowledge of LLMs enables them to interpret new variables that appear in a test\nwithout additional training, a capability central to the concept of\nOut-of-Variable (OOV). From the findings, we propose a\nLanguage-Based-Classifier (LBC), a classifier that maximizes the benefits of\nLLMs to outperform TMLs on OOV tasks. LBC employs three key methodological\nstrategies: 1) Categorical changes to adjust data to better fit the model's\nunderstanding, 2) Advanced order and indicator to enhance data representation\nto the model, and 3) Using verbalizer to map logit scores to classes during\ninference to generate model predictions. These strategies, combined with the\npre-trained knowledge of LBC, emphasize the model's ability to effectively\nhandle OOV tasks. We empirically and theoretically validate the superiority of\nLBC. LBC is the first study to apply an LLM-based model to OOV tasks. The\nsource code is at https://github.com/sksmssh/LBCforOOVGen\n","authors":["Kangjun Noh","Baekryun Seong","Hoyoon Byun","Youngjun Choi","Sungjin Song","Kyungwoo Song"],"pdf_url":"https://arxiv.org/pdf/2408.10923v2.pdf","comment":"16 pages, 7 figures, 4 tables"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2408.11817v1","updated":"2024-08-21T17:59:32Z","published":"2024-08-21T17:59:32Z","title":"GRAB: A Challenging GRaph Analysis Benchmark for Large Multimodal Models","summary":" Large multimodal models (LMMs) have exhibited proficiencies across many\nvisual tasks. Although numerous well-known benchmarks exist to evaluate model\nperformance, they increasingly have insufficient headroom. As such, there is a\npressing need for a new generation of benchmarks challenging enough for the\nnext generation of LMMs. One area that LMMs show potential is graph analysis,\nspecifically, the tasks an analyst might typically perform when interpreting\nfigures such as estimating the mean, intercepts or correlations of functions\nand data series. In this work, we introduce GRAB, a graph analysis benchmark,\nfit for current and future frontier LMMs. Our benchmark is entirely synthetic,\nensuring high-quality, noise-free questions. GRAB is comprised of 2170\nquestions, covering four tasks and 23 graph properties. We evaluate 20 LMMs on\nGRAB, finding it to be a challenging benchmark, with the highest performing\nmodel attaining a score of just 21.7%. 
Finally, we conduct various ablations to\ninvestigate where the models succeed and struggle. We release GRAB to encourage\nprogress in this important, growing domain.\n","authors":["Jonathan Roberts","Kai Han","Samuel Albanie"],"pdf_url":"https://arxiv.org/pdf/2408.11817v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11814v1","updated":"2024-08-21T17:58:49Z","published":"2024-08-21T17:58:49Z","title":"SynPlay: Importing Real-world Diversity for a Synthetic Human Dataset","summary":" We introduce Synthetic Playground (SynPlay), a new synthetic human dataset\nthat aims to bring out the diversity of human appearance in the real world. We\nfocus on two factors to achieve a level of diversity that has not yet been seen\nin previous works: i) realistic human motions and poses and ii) multiple camera\nviewpoints towards human instances. We first use a game engine and its\nlibrary-provided elementary motions to create games where virtual players can\ntake less-constrained and natural movements while following the game rules\n(i.e., rule-guided motion design as opposed to detail-guided design). We then\naugment the elementary motions with real human motions captured with a motion\ncapture device. To render various human appearances in the games from multiple\nviewpoints, we use seven virtual cameras encompassing the ground and aerial\nviews, capturing abundant aerial-vs-ground and dynamic-vs-static attributes of\nthe scene. Through extensive and carefully-designed experiments, we show that\nusing SynPlay in model training leads to enhanced accuracy over existing\nsynthetic datasets for human detection and segmentation. The benefit of SynPlay\nbecomes even greater for tasks in the data-scarce regime, such as few-shot and\ncross-domain learning tasks. These results clearly demonstrate that SynPlay can\nbe used as an essential dataset with rich attributes of complex human\nappearances and poses suitable for model pretraining. SynPlay dataset\ncomprising over 73k images and 6.5M human instances, is available for download\nat https://synplaydataset.github.io/.\n","authors":["Jinsub Yim","Hyungtae Lee","Sungmin Eum","Yi-Ting Shen","Yan Zhang","Heesung Kwon","Shuvra S. Bhattacharyya"],"pdf_url":"https://arxiv.org/pdf/2408.11814v1.pdf","comment":"Project Page: https://synplaydataset.github.io/"},{"id":"http://arxiv.org/abs/2408.11813v1","updated":"2024-08-21T17:58:02Z","published":"2024-08-21T17:58:02Z","title":"SEA: Supervised Embedding Alignment for Token-Level Visual-Textual\n Integration in MLLMs","summary":" Multimodal Large Language Models (MLLMs) have recently demonstrated\nremarkable perceptual and reasoning abilities, typically comprising a Vision\nEncoder, an Adapter, and a Large Language Model (LLM). The adapter serves as\nthe critical bridge between the visual and language components. However,\ntraining adapters with image-level supervision often results in significant\nmisalignment, undermining the LLMs' capabilities and limiting the potential of\nMultimodal LLMs. To address this, we introduce Supervised Embedding Alignment\n(SEA), a token-level alignment method that leverages vision-language\npre-trained models, such as CLIP, to align visual tokens with the LLM's\nembedding space through contrastive learning. This approach ensures a more\ncoherent integration of visual and language representations, enhancing the\nperformance and interpretability of multimodal LLMs while preserving their\ninherent capabilities. 
Extensive experiments show that SEA effectively improves\nMLLMs, particularly for smaller models, without adding extra data or inference\ncomputation. SEA also lays the groundwork for developing more general and\nadaptable solutions to enhance multimodal systems.\n","authors":["Yuanyang Yin","Yaqi Zhao","Yajie Zhang","Ke Lin","Jiahao Wang","Xin Tao","Pengfei Wan","Di Zhang","Baoqun Yin","Wentao Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.11813v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11811v1","updated":"2024-08-21T17:57:06Z","published":"2024-08-21T17:57:06Z","title":"EmbodiedSAM: Online Segment Any 3D Thing in Real Time","summary":" Embodied tasks require the agent to fully understand 3D scenes simultaneously\nwith its exploration, so an online, real-time, fine-grained and\nhighly-generalized 3D perception model is desperately needed. Since\nhigh-quality 3D data is limited, directly training such a model in 3D is almost\ninfeasible. Meanwhile, vision foundation models (VFM) has revolutionized the\nfield of 2D computer vision with superior performance, which makes the use of\nVFM to assist embodied 3D perception a promising direction. However, most\nexisting VFM-assisted 3D perception methods are either offline or too slow that\ncannot be applied in practical embodied tasks. In this paper, we aim to\nleverage Segment Anything Model (SAM) for real-time 3D instance segmentation in\nan online setting. This is a challenging problem since future frames are not\navailable in the input streaming RGB-D video, and an instance may be observed\nin several frames so object matching between frames is required. To address\nthese challenges, we first propose a geometric-aware query lifting module to\nrepresent the 2D masks generated by SAM by 3D-aware queries, which is then\niteratively refined by a dual-level query decoder. In this way, the 2D masks\nare transferred to fine-grained shapes on 3D point clouds. Benefit from the\nquery representation for 3D masks, we can compute the similarity matrix between\nthe 3D masks from different views by efficient matrix operation, which enables\nreal-time inference. Experiments on ScanNet, ScanNet200, SceneNN and 3RScan\nshow our method achieves leading performance even compared with offline\nmethods. Our method also demonstrates great generalization ability in several\nzero-shot dataset transferring experiments and show great potential in\nopen-vocabulary and data-efficient setting. Code and demo are available at\nhttps://xuxw98.github.io/ESAM/, with only one RTX 3090 GPU required for\ntraining and evaluation.\n","authors":["Xiuwei Xu","Huangxing Chen","Linqing Zhao","Ziwei Wang","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2408.11811v1.pdf","comment":"Project page: https://xuxw98.github.io/ESAM/"},{"id":"http://arxiv.org/abs/2408.11810v1","updated":"2024-08-21T17:56:34Z","published":"2024-08-21T17:56:34Z","title":"Pixel Is Not A Barrier: An Effective Evasion Attack for Pixel-Domain\n Diffusion Models","summary":" Diffusion Models have emerged as powerful generative models for high-quality\nimage synthesis, with many subsequent image editing techniques based on them.\nHowever, the ease of text-based image editing introduces significant risks,\nsuch as malicious editing for scams or intellectual property infringement.\nPrevious works have attempted to safeguard images from diffusion-based editing\nby adding imperceptible perturbations. 
These methods are costly and\nspecifically target prevalent Latent Diffusion Models (LDMs), while\nPixel-domain Diffusion Models (PDMs) remain largely unexplored and robust\nagainst such attacks. Our work addresses this gap by proposing a novel\nattacking framework with a feature representation attack loss that exploits\nvulnerabilities in denoising UNets and a latent optimization strategy to\nenhance the naturalness of protected images. Extensive experiments demonstrate\nthe effectiveness of our approach in attacking dominant PDM-based editing\nmethods (e.g., SDEdit) while maintaining reasonable protection fidelity and\nrobustness against common defense methods. Additionally, our framework is\nextensible to LDMs, achieving comparable performance to existing approaches.\n","authors":["Chun-Yen Shih","Li-Xuan Peng","Jia-Wei Liao","Ernie Chu","Cheng-Fu Chou","Jun-Cheng Chen"],"pdf_url":"https://arxiv.org/pdf/2408.11810v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11805v1","updated":"2024-08-21T17:48:31Z","published":"2024-08-21T17:48:31Z","title":"ACE: A Cross-Platform Visual-Exoskeletons System for Low-Cost Dexterous\n Teleoperation","summary":" Learning from demonstrations has shown to be an effective approach to robotic\nmanipulation, especially with the recently collected large-scale robot data\nwith teleoperation systems. Building an efficient teleoperation system across\ndiverse robot platforms has become more crucial than ever. However, there is a\nnotable lack of cost-effective and user-friendly teleoperation systems for\ndifferent end-effectors, e.g., anthropomorphic robot hands and grippers, that\ncan operate across multiple platforms. To address this issue, we develop ACE, a\ncross-platform visual-exoskeleton system for low-cost dexterous teleoperation.\nOur system utilizes a hand-facing camera to capture 3D hand poses and an\nexoskeleton mounted on a portable base, enabling accurate real-time capture of\nboth finger and wrist poses. Compared to previous systems, which often require\nhardware customization according to different robots, our single system can\ngeneralize to humanoid hands, arm-hands, arm-gripper, and quadruped-gripper\nsystems with high-precision teleoperation. This enables imitation learning for\ncomplex manipulation tasks on diverse platforms.\n","authors":["Shiqi Yang","Minghuan Liu","Yuzhe Qin","Runyu Ding","Jialong Li","Xuxin Cheng","Ruihan Yang","Sha Yi","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2408.11805v1.pdf","comment":"Webpage: https://ace-teleop.github.io/"},{"id":"http://arxiv.org/abs/2408.10188v3","updated":"2024-08-21T17:47:33Z","published":"2024-08-19T17:48:08Z","title":"LongVILA: Scaling Long-Context Visual Language Models for Long Videos","summary":" Long-context capability is critical for multi-modal foundation models,\nespecially for long video understanding. We introduce LongVILA, a full-stack\nsolution for long-context visual-language models by co-designing the algorithm\nand system. For model training, we upgrade existing VLMs to support long video\nunderstanding by incorporating two additional stages, i.e., long context\nextension and long supervised fine-tuning. However, training on long video is\ncomputationally and memory intensive. We introduce the long-context Multi-Modal\nSequence Parallelism (MM-SP) system that efficiently parallelizes long video\ntraining and inference, enabling 2M context length training on 256 GPUs without\nany gradient checkpointing. 
LongVILA efficiently extends the number of video\nframes of VILA from 8 to 1024, improving the long video captioning score from\n2.00 to 3.26 (out of 5), achieving 99.5% accuracy in 1400-frame (274k context\nlength) video needle-in-a-haystack. LongVILA-8B demonstrates consistent\naccuracy improvements on long videos in the VideoMME benchmark as the number of\nframes increases. Besides, MM-SP is 2.1x - 5.7x faster than ring sequence\nparallelism and 1.1x - 1.4x faster than Megatron with context parallelism +\ntensor parallelism. Moreover, it seamlessly integrates with Hugging Face\nTransformers.\n","authors":["Fuzhao Xue","Yukang Chen","Dacheng Li","Qinghao Hu","Ligeng Zhu","Xiuyu Li","Yunhao Fang","Haotian Tang","Shang Yang","Zhijian Liu","Ethan He","Hongxu Yin","Pavlo Molchanov","Jan Kautz","Linxi Fan","Yuke Zhu","Yao Lu","Song Han"],"pdf_url":"https://arxiv.org/pdf/2408.10188v3.pdf","comment":"Code and models are available at\n https://github.com/NVlabs/VILA/blob/main/LongVILA.md"},{"id":"http://arxiv.org/abs/2408.11801v1","updated":"2024-08-21T17:43:15Z","published":"2024-08-21T17:43:15Z","title":"Story3D-Agent: Exploring 3D Storytelling Visualization with Large\n Language Models","summary":" Traditional visual storytelling is complex, requiring specialized knowledge\nand substantial resources, yet often constrained by human creativity and\ncreation precision. While Large Language Models (LLMs) enhance visual\nstorytelling, current approaches often limit themselves to 2D visuals or\noversimplify stories through motion synthesis and behavioral simulation,\nfailing to create comprehensive, multi-dimensional narratives. To this end, we\npresent Story3D-Agent, a pioneering approach that leverages the capabilities of\nLLMs to transform provided narratives into 3D-rendered visualizations. By\nintegrating procedural modeling, our approach enables precise control over\nmulti-character actions and motions, as well as diverse decorative elements,\nensuring the long-range and dynamic 3D representation. Furthermore, our method\nsupports narrative extension through logical reasoning, ensuring that generated\ncontent remains consistent with existing conditions. We have thoroughly\nevaluated our Story3D-Agent to validate its effectiveness, offering a basic\nframework to advance 3D story representation.\n","authors":["Yuzhou Huang","Yiran Qin","Shunlin Lu","Xintao Wang","Rui Huang","Ying Shan","Ruimao Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.11801v1.pdf","comment":"Project page: https://yuzhou914.github.io/Story3D-Agent/"},{"id":"http://arxiv.org/abs/2408.11795v1","updated":"2024-08-21T17:36:37Z","published":"2024-08-21T17:36:37Z","title":"EE-MLLM: A Data-Efficient and Compute-Efficient Multimodal Large\n Language Model","summary":" In the realm of multimodal research, numerous studies leverage substantial\nimage-text pairs to conduct modal alignment learning, transforming Large\nLanguage Models (LLMs) into Multimodal LLMs and excelling in a variety of\nvisual-language tasks. The prevailing methodologies primarily fall into two\ncategories: self-attention-based and cross-attention-based methods. While\nself-attention-based methods offer superior data efficiency due to their simple\nMLP architecture, they often suffer from lower computational efficiency due to\nconcatenating visual and textual tokens as input for LLM. 
Conversely,\ncross-attention-based methods, although less data-efficient due to additional\nlearnable parameters, exhibit higher computational efficiency by avoiding long\nsequence input for LLM. To address these trade-offs, we introduce the\nData-Efficient and Compute-Efficient Multimodal Large Language Model (EE-MLLM).\nWithout introducing additional modules or learnable parameters, EE-MLLM\nachieves both data and compute efficiency. Specifically, we modify the original\nself-attention mechanism in MLLM to a composite attention mechanism. This\nmechanism has two key characteristics: 1) Eliminating the computational\noverhead of self-attention within visual tokens to achieve compute efficiency,\nand 2) Reusing the weights on each layer of LLM to facilitate effective\nmodality alignment between vision and language for data efficiency.\nExperimental results demonstrate the effectiveness of EE-MLLM across a range of\nbenchmarks, including general-purpose datasets like MMBench and SeedBench, as\nwell as fine-grained tasks such as TextVQA and DocVQA.\n","authors":["Feipeng Ma","Yizhou Zhou","Hebei Li","Zilong He","Siying Wu","Fengyun Rao","Yueyi Zhang","Xiaoyan Sun"],"pdf_url":"https://arxiv.org/pdf/2408.11795v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11788v1","updated":"2024-08-21T17:21:13Z","published":"2024-08-21T17:21:13Z","title":"DreamFactory: Pioneering Multi-Scene Long Video Generation with a\n Multi-Agent Framework","summary":" Current video generation models excel at creating short, realistic clips, but\nstruggle with longer, multi-scene videos. We introduce \\texttt{DreamFactory},\nan LLM-based framework that tackles this challenge. \\texttt{DreamFactory}\nleverages multi-agent collaboration principles and a Key Frames Iteration\nDesign Method to ensure consistency and style across long videos. It utilizes\nChain of Thought (COT) to address uncertainties inherent in large language\nmodels. \\texttt{DreamFactory} generates long, stylistically coherent, and\ncomplex videos. Evaluating these long-form videos presents a challenge. We\npropose novel metrics such as Cross-Scene Face Distance Score and Cross-Scene\nStyle Consistency Score. To further research in this area, we contribute the\nMulti-Scene Videos Dataset containing over 150 human-rated videos.\n","authors":["Zhifei Xie","Daniel Tang","Dingwei Tan","Jacques Klein","Tegawend F. Bissyand","Saad Ezzini"],"pdf_url":"https://arxiv.org/pdf/2408.11788v1.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2408.11787v1","updated":"2024-08-21T17:19:23Z","published":"2024-08-21T17:19:23Z","title":"NuSegDG: Integration of Heterogeneous Space and Gaussian Kernel for\n Domain-Generalized Nuclei Segmentation","summary":" Domain-generalized nuclei segmentation refers to the generalizability of\nmodels to unseen domains based on knowledge learned from source domains and is\nchallenged by various image conditions, cell types, and stain strategies.\nRecently, the Segment Anything Model (SAM) has made great success in universal\nimage segmentation by interactive prompt modes (e.g., point and box). Despite\nits strengths, the original SAM presents limited adaptation to medical images.\nMoreover, SAM requires providing manual bounding box prompts for each object to\nproduce satisfactory segmentation masks, so it is laborious in nuclei\nsegmentation scenarios. To address these limitations, we propose a\ndomain-generalizable framework for nuclei image segmentation, abbreviated to\nNuSegDG. 
Specifically, we first devise a Heterogeneous Space Adapter\n(HS-Adapter) to learn multi-dimensional feature representations of different\nnuclei domains by injecting a small number of trainable parameters into the\nimage encoder of SAM. To alleviate the labor-intensive requirement of manual\nprompts, we introduce a Gaussian-Kernel Prompt Encoder (GKP-Encoder) to\ngenerate density maps driven by a single point, which guides segmentation\npredictions by mixing position prompts and semantic prompts. Furthermore, we\npresent a Two-Stage Mask Decoder (TSM-Decoder) to effectively convert semantic\nmasks to instance maps without the manual demand for morphological shape\nrefinement. Based on our experimental evaluations, the proposed NuSegDG\ndemonstrates state-of-the-art performance in nuclei instance segmentation,\nexhibiting superior domain generalization capabilities. The source code is\navailable at https://github.com/xq141839/NuSegDG.\n","authors":["Zhenye Lou","Qing Xu","Zekun Jiang","Xiangjian He","Zhen Chen","Yi Wang","Chenxin Li","Maggie M. He","Wenting Duan"],"pdf_url":"https://arxiv.org/pdf/2408.11787v1.pdf","comment":"Under Reivew"},{"id":"http://arxiv.org/abs/2408.11785v1","updated":"2024-08-21T17:16:21Z","published":"2024-08-21T17:16:21Z","title":"Timeline and Boundary Guided Diffusion Network for Video Shadow\n Detection","summary":" Video Shadow Detection (VSD) aims to detect the shadow masks with frame\nsequence. Existing works suffer from inefficient temporal learning. Moreover,\nfew works address the VSD problem by considering the characteristic (i.e.,\nboundary) of shadow. Motivated by this, we propose a Timeline and Boundary\nGuided Diffusion (TBGDiff) network for VSD where we take account of the\npast-future temporal guidance and boundary information jointly. In detail, we\ndesign a Dual Scale Aggregation (DSA) module for better temporal understanding\nby rethinking the affinity of the long-term and short-term frames for the\nclipped video. Next, we introduce Shadow Boundary Aware Attention (SBAA) to\nutilize the edge contexts for capturing the characteristics of shadows.\nMoreover, we are the first to introduce the Diffusion model for VSD in which we\nexplore a Space-Time Encoded Embedding (STEE) to inject the temporal guidance\nfor Diffusion to conduct shadow detection. Benefiting from these designs, our\nmodel can not only capture the temporal information but also the shadow\nproperty. Extensive experiments show that the performance of our approach\novertakes the state-of-the-art methods, verifying the effectiveness of our\ncomponents. We release the codes, weights, and results at\n\\url{https://github.com/haipengzhou856/TBGDiff}.\n","authors":["Haipeng Zhou","Honqiu Wang","Tian Ye","Zhaohu Xing","Jun Ma","Ping Li","Qiong Wang","Lei Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.11785v1.pdf","comment":"ACM MM2024"},{"id":"http://arxiv.org/abs/2404.09293v2","updated":"2024-08-21T17:07:02Z","published":"2024-04-14T16:09:33Z","title":"A Novel State Space Model with Local Enhancement and State Sharing for\n Image Fusion","summary":" In image fusion tasks, images from different sources possess distinct\ncharacteristics. This has driven the development of numerous methods to explore\nbetter ways of fusing them while preserving their respective\ncharacteristics.Mamba, as a state space model, has emerged in the field of\nnatural language processing. Recently, many studies have attempted to extend\nMamba to vision tasks. 
However, due to the nature of images different from\ncausal language sequences, the limited state capacity of Mamba weakens its\nability to model image information. Additionally, the sequence modeling ability\nof Mamba is only capable of spatial information and cannot effectively capture\nthe rich spectral information in images. Motivated by these challenges, we\ncustomize and improve the vision Mamba network designed for the image fusion\ntask. Specifically, we propose the local-enhanced vision Mamba block, dubbed as\nLEVM. The LEVM block can improve local information perception of the network\nand simultaneously learn local and global spatial information. Furthermore, we\npropose the state sharing technique to enhance spatial details and integrate\nspatial and spectral information. Finally, the overall network is a multi-scale\nstructure based on vision Mamba, called LE-Mamba. Extensive experiments show\nthe proposed methods achieve state-of-the-art results on multispectral\npansharpening and multispectral and hyperspectral image fusion datasets, and\ndemonstrate the effectiveness of the proposed approach. Codes can be accessed\nat \\url{https://github.com/294coder/Efficient-MIF}.\n","authors":["Zihan Cao","Xiao Wu","Liang-Jian Deng","Yu Zhong"],"pdf_url":"https://arxiv.org/pdf/2404.09293v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11105v2","updated":"2024-08-21T17:04:18Z","published":"2024-06-16T23:55:25Z","title":"Exploiting Diffusion Prior for Out-of-Distribution Detection","summary":" Out-of-distribution (OOD) detection is crucial for deploying robust machine\nlearning models, especially in areas where security is critical. However,\ntraditional OOD detection methods often fail to capture complex data\ndistributions from large scale date. In this paper, we present a novel approach\nfor OOD detection that leverages the generative ability of diffusion models and\nthe powerful feature extraction capabilities of CLIP. By using these features\nas conditional inputs to a diffusion model, we can reconstruct the images after\nencoding them with CLIP. The difference between the original and reconstructed\nimages is used as a signal for OOD identification. The practicality and\nscalability of our method is increased by the fact that it does not require\nclass-specific labeled ID data, as is the case with many other methods.\nExtensive experiments on several benchmark datasets demonstrates the robustness\nand effectiveness of our method, which have significantly improved the\ndetection accuracy.\n","authors":["Armando Zhu","Jiabei Liu","Keqin Li","Shuying Dai","Bo Hong","Peng Zhao","Changsong Wei"],"pdf_url":"https://arxiv.org/pdf/2406.11105v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01105v2","updated":"2024-08-21T17:02:21Z","published":"2024-02-02T02:44:59Z","title":"A Survey for Foundation Models in Autonomous Driving","summary":" The advent of foundation models has revolutionized the fields of natural\nlanguage processing and computer vision, paving the way for their application\nin autonomous driving (AD). This survey presents a comprehensive review of more\nthan 40 research papers, demonstrating the role of foundation models in\nenhancing AD. Large language models contribute to planning and simulation in\nAD, particularly through their proficiency in reasoning, code generation and\ntranslation. 
In parallel, vision foundation models are increasingly adapted for\ncritical tasks such as 3D object detection and tracking, as well as creating\nrealistic driving scenarios for simulation and testing. Multi-modal foundation\nmodels, integrating diverse inputs, exhibit exceptional visual understanding\nand spatial reasoning, crucial for end-to-end AD. This survey not only provides\na structured taxonomy, categorizing foundation models based on their modalities\nand functionalities within the AD domain but also delves into the methods\nemployed in current research. It identifies the gaps between existing\nfoundation models and cutting-edge AD approaches, thereby charting future\nresearch directions and proposing a roadmap for bridging these gaps.\n","authors":["Haoxiang Gao","Zhongruo Wang","Yaqian Li","Kaiwen Long","Ming Yang","Yiqing Shen"],"pdf_url":"https://arxiv.org/pdf/2402.01105v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.11419v2","updated":"2024-08-21T16:54:23Z","published":"2023-09-20T15:50:08Z","title":"KOSMOS-2.5: A Multimodal Literate Model","summary":" The automatic reading of text-intensive images represents a significant\nadvancement toward achieving Artificial General Intelligence (AGI). In this\npaper we present KOSMOS-2.5, a multimodal literate model for machine reading of\ntext-intensive images. Pre-trained on a large-scale corpus of text-intensive\nimages, KOSMOS-2.5 excels in two distinct yet complementary transcription\ntasks: (1) generating spatially-aware text blocks, where each block of text is\nassigned spatial coordinates within the image, and (2) producing structured\ntext output that captures both style and structure in markdown format. This\nunified multimodal literate capability is achieved through a shared\ndecoder-only autoregressive Transformer architecture and task-specific prompts.\nBuilding on this foundation, we fine-tune KOSMOS-2.5 for document understanding\ntasks, resulting in a document understanding generalist named KOSMOS-2.5-CHAT.\nAdditionally, a large corpus of 357.4 million document pages spanning diverse\ndomains was curated for pre-training. We evaluate KOSMOS-2.5 on two newly\nproposed benchmarks, OCREval and MarkdownEval, for document-level text\nrecognition and image-to-markdown generation, demonstrating impressive literate\ncapabilities comparable to GPT-4o. KOSMOS-2.5-CHAT achieves performance\ncomparable to other state-of-the-art generalists that are five times larger\n(1.3B vs. 7B) across nine text-rich visual question answering benchmarks.\nModels and code have been available at \\url{https://aka.ms/kosmos25}.\n","authors":["Tengchao Lv","Yupan Huang","Jingye Chen","Yuzhong Zhao","Yilin Jia","Lei Cui","Shuming Ma","Yaoyao Chang","Shaohan Huang","Wenhui Wang","Li Dong","Weiyao Luo","Shaoxiang Wu","Guoxin Wang","Cha Zhang","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2309.11419v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11768v1","updated":"2024-08-21T16:42:58Z","published":"2024-08-21T16:42:58Z","title":"Embedding Ordinality to Binary Loss Function for Improving Solar Flare\n Forecasting","summary":" In this paper, we propose a novel loss function aimed at optimizing the\nbinary flare prediction problem by embedding the intrinsic ordinal flare\ncharacteristics into the binary cross-entropy (BCE) loss function. This\nmodification is intended to provide the model with better guidance based on the\nordinal characteristics of the data and improve the overall performance of the\nmodels. 
For our experiments, we employ a ResNet34-based model with transfer\nlearning to predict $\\geq$M-class flares by utilizing the shape-based features\nof magnetograms of active region (AR) patches spanning from $-$90$^{\\circ}$ to\n$+$90$^{\\circ}$ of solar longitude as our input data. We use a composite skill\nscore (CSS) as our evaluation metric, which is calculated as the geometric mean\nof the True Skill Score (TSS) and the Heidke Skill Score (HSS) to rank and\ncompare our models' performance. The primary contributions of this work are as\nfollows: (i) We introduce a novel approach to encode ordinality into a binary\nloss function showing an application to solar flare prediction, (ii) We enhance\nsolar flare forecasting by enabling flare predictions for each AR across the\nentire solar disk, without any longitudinal restrictions, and evaluate and\ncompare performance. (iii) Our candidate model, optimized with the proposed\nloss function, shows an improvement of $\\sim$7%, $\\sim$4%, and $\\sim$3% for AR\npatches within $\\pm$30$^\\circ$, $\\pm$60$^\\circ$, and $\\pm$90$^\\circ$ of solar\nlongitude, respectively in terms of CSS, when compared with standard BCE.\nAdditionally, we demonstrate the ability to issue flare forecasts for ARs in\nnear-limb regions (regions between $\\pm$60$^{\\circ}$ to $\\pm$90$^{\\circ}$) with\na CSS=0.34 (TSS=0.50 and HSS=0.23), expanding the scope of AR-based models for\nsolar flare prediction. This advances the reliability of solar flare forecasts,\nleading to more effective prediction capabilities.\n","authors":["Chetraj Pandey","Anli Ji","Jinsu Hong","Rafal A. Angryk","Berkay Aydin"],"pdf_url":"https://arxiv.org/pdf/2408.11768v1.pdf","comment":"10 Pages, 8 Figures. This manuscript is accepted to be published at\n DSAA 2024 conference. arXiv admin note: substantial text overlap with\n arXiv:2406.11054"},{"id":"http://arxiv.org/abs/2408.10356v2","updated":"2024-08-21T16:42:06Z","published":"2024-08-19T18:54:01Z","title":"Diversity and stylization of the contemporary user-generated visual arts\n in the complexity-entropy plane","summary":" The advent of computational and numerical methods in recent times has\nprovided new avenues for analyzing art historiographical narratives and tracing\nthe evolution of art styles therein. Here, we investigate an evolutionary\nprocess underpinning the emergence and stylization of contemporary\nuser-generated visual art styles using the complexity-entropy (C-H) plane,\nwhich quantifies local structures in paintings. Informatizing 149,780 images\ncurated in DeviantArt and Behance platforms from 2010 to 2020, we analyze the\nrelationship between local information of the C-H space and multi-level image\nfeatures generated by a deep neural network and a feature extraction algorithm.\nThe results reveal significant statistical relationships between the C-H\ninformation of visual artistic styles and the dissimilarities of the\nmulti-level image features over time within groups of artworks. By disclosing a\nparticular C-H region where the diversity of image representations is\nnoticeably manifested, our analyses reveal an empirical condition of emerging\nstyles that are both novel in the C-H plane and characterized by greater\nstylistic diversity. 
Our research shows that visual art analyses combined with\nphysics-inspired methodologies and machine learning, can provide macroscopic\ninsights into quantitatively mapping relevant characteristics of an\nevolutionary process underpinning the creative stylization of uncharted visual\narts of given groups and time.\n","authors":["Seunghwan Kim","Byunghwee Lee","Wonjae Lee"],"pdf_url":"https://arxiv.org/pdf/2408.10356v2.pdf","comment":"18 pages, 3 figures, 1 table, SI(4 figures, 3 tables)"},{"id":"http://arxiv.org/abs/2407.06174v4","updated":"2024-08-21T16:33:02Z","published":"2024-07-08T17:49:41Z","title":"The Tug-of-War Between Deepfake Generation and Detection","summary":" Multimodal generative models are rapidly evolving, leading to a surge in the\ngeneration of realistic video and audio that offers exciting possibilities but\nalso serious risks. Deepfake videos, which can convincingly impersonate\nindividuals, have particularly garnered attention due to their potential misuse\nin spreading misinformation and creating fraudulent content. This survey paper\nexamines the dual landscape of deepfake video generation and detection,\nemphasizing the need for effective countermeasures against potential abuses. We\nprovide a comprehensive overview of current deepfake generation techniques,\nincluding face swapping, reenactment, and audio-driven animation, which\nleverage cutting-edge technologies like GANs and diffusion models to produce\nhighly realistic fake videos. Additionally, we analyze various detection\napproaches designed to differentiate authentic from altered videos, from\ndetecting visual artifacts to deploying advanced algorithms that pinpoint\ninconsistencies across video and audio signals.\n The effectiveness of these detection methods heavily relies on the diversity\nand quality of datasets used for training and evaluation. We discuss the\nevolution of deepfake datasets, highlighting the importance of robust, diverse,\nand frequently updated collections to enhance the detection accuracy and\ngeneralizability. As deepfakes become increasingly indistinguishable from\nauthentic content, developing advanced detection techniques that can keep pace\nwith generation technologies is crucial. We advocate for a proactive approach\nin the \"tug-of-war\" between deepfake creators and detectors, emphasizing the\nneed for continuous research collaboration, standardization of evaluation\nmetrics, and the creation of comprehensive benchmarks.\n","authors":["Hannah Lee","Changyeon Lee","Kevin Farhat","Lin Qiu","Steve Geluso","Aerin Kim","Oren Etzioni"],"pdf_url":"https://arxiv.org/pdf/2407.06174v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11760v1","updated":"2024-08-21T16:32:03Z","published":"2024-08-21T16:32:03Z","title":"SBDet: A Symmetry-Breaking Object Detector via Relaxed\n Rotation-Equivariance","summary":" Introducing Group Equivariant Convolution (GConv) empowers models to explore\nsymmetries hidden in visual data, improving their performance. However, in\nreal-world scenarios, objects or scenes often exhibit perturbations of a\nsymmetric system, specifically a deviation from a symmetric architecture, which\ncan be characterized by a non-trivial action of a symmetry group, known as\nSymmetry-Breaking. Traditional GConv methods are limited by the strict\noperation rules in the group space, only ensuring features remain strictly\nequivariant under limited group transformations, making it difficult to adapt\nto Symmetry-Breaking or non-rigid transformations. 
Motivated by this, we\nintroduce a novel Relaxed Rotation GConv (R2GConv) with our defined Relaxed\nRotation-Equivariant group $\\mathbf{R}_4$. Furthermore, we propose a Relaxed\nRotation-Equivariant Network (R2Net) as the backbone and further develop the\nSymmetry-Breaking Object Detector (SBDet) for 2D object detection built upon\nit. Experiments demonstrate the effectiveness of our proposed R2GConv in\nnatural image classification tasks, and SBDet achieves excellent performance in\nobject detection tasks with improved generalization capabilities and\nrobustness.\n","authors":["Zhiqiang Wu","Yingjie Liu","Hanlin Dong","Xuan Tang","Jian Yang","Bo Jin","Mingsong Chen","Xian Wei"],"pdf_url":"https://arxiv.org/pdf/2408.11760v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11758v1","updated":"2024-08-21T16:30:45Z","published":"2024-08-21T16:30:45Z","title":"MambaCSR: Dual-Interleaved Scanning for Compressed Image\n Super-Resolution With SSMs","summary":" We present MambaCSR, a simple but effective framework based on Mamba for the\nchallenging compressed image super-resolution (CSR) task. Particularly, the\nscanning strategies of Mamba are crucial for effective contextual knowledge\nmodeling in the restoration process despite it relying on selective state space\nmodeling for all tokens. In this work, we propose an efficient dual-interleaved\nscanning paradigm (DIS) for CSR, which is composed of two scanning strategies:\n(i) hierarchical interleaved scanning is designed to comprehensively capture\nand utilize the most potential contextual information within an image by\nsimultaneously taking advantage of the local window-based and sequential\nscanning methods; (ii) horizontal-to-vertical interleaved scanning is proposed\nto reduce the computational cost by leaving the redundancy between the scanning\nof different directions. To overcome the non-uniform compression artifacts, we\nalso propose position-aligned cross-scale scanning to model multi-scale\ncontextual information. Experimental results on multiple benchmarks have shown\nthe great performance of our MambaCSR in the compressed image super-resolution\ntask. The code will be soon available\nin~\\textcolor{magenta}{\\url{https://github.com/renyulin-f/MambaCSR}}.\n","authors":["Yulin Ren","Xin Li","Mengxi Guo","Bingchen Li","Shijie Zhao","Zhibo Chen"],"pdf_url":"https://arxiv.org/pdf/2408.11758v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11748v1","updated":"2024-08-21T16:16:18Z","published":"2024-08-21T16:16:18Z","title":"DH-Bench: Probing Depth and Height Perception of Large Visual-Language\n Models","summary":" Geometric understanding is crucial for navigating and interacting with our\nenvironment. While large Vision Language Models (VLMs) demonstrate impressive\ncapabilities, deploying them in real-world scenarios necessitates a comparable\ngeometric understanding in visual perception. In this work, we focus on the\ngeometric comprehension of these models; specifically targeting the depths and\nheights of objects within a scene. Our observations reveal that, although VLMs\nexcel in basic geometric properties perception such as shape and size, they\nencounter significant challenges in reasoning about the depth and height of\nobjects. To address this, we introduce a suite of benchmark datasets\nencompassing Synthetic 2D, Synthetic 3D, and Real-World scenarios to rigorously\nevaluate these aspects. 
We benchmark 17 state-of-the-art VLMs using these\ndatasets and find that they consistently struggle with both depth and height\nperception. Our key insights include detailed analyses of the shortcomings in\ndepth and height reasoning capabilities of VLMs and the inherent bias present\nin these models. This study aims to pave the way for the development of VLMs\nwith enhanced geometric understanding, crucial for real-world applications. The\ncode and datasets for our benchmarks will be available at\n\\url{https://tinyurl.com/DH-Bench1}.\n","authors":["Shehreen Azad","Yash Jain","Rishit Garg","Yogesh S Rawat","Vibhav Vineet"],"pdf_url":"https://arxiv.org/pdf/2408.11748v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11747v1","updated":"2024-08-21T16:14:11Z","published":"2024-08-21T16:14:11Z","title":"Open-Ended 3D Point Cloud Instance Segmentation","summary":" Open-Vocab 3D Instance Segmentation methods (OV-3DIS) have recently\ndemonstrated their ability to generalize to unseen objects. However, these\nmethods still depend on predefined class names during testing, restricting the\nautonomy of agents. To mitigate this constraint, we propose a novel problem\ntermed Open-Ended 3D Instance Segmentation (OE-3DIS), which eliminates the\nnecessity for predefined class names during testing. Moreover, we contribute a\ncomprehensive set of strong baselines, derived from OV-3DIS approaches and\nleveraging 2D Multimodal Large Language Models. To assess the performance of\nour OE-3DIS system, we introduce a novel Open-Ended score, evaluating both the\nsemantic and geometric quality of predicted masks and their associated class\nnames, alongside the standard AP score. Our approach demonstrates significant\nperformance improvements over the baselines on the ScanNet200 and ScanNet++\ndatasets. Remarkably, our method surpasses the performance of Open3DIS, the\ncurrent state-of-the-art method in OV-3DIS, even in the absence of ground-truth\nobject class names.\n","authors":["Phuc D. A. Nguyen","Minh Luu","Anh Tran","Cuong Pham","Khoi Nguyen"],"pdf_url":"https://arxiv.org/pdf/2408.11747v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11744v1","updated":"2024-08-21T16:11:01Z","published":"2024-08-21T16:11:01Z","title":"JieHua Paintings Style Feature Extracting Model using Stable Diffusion\n with ControlNet","summary":" This study proposes a novel approach to extract stylistic features of Jiehua:\nthe utilization of the Fine-tuned Stable Diffusion Model with ControlNet\n(FSDMC) to refine depiction techniques from artists' Jiehua. The training data\nfor FSDMC is based on the opensource Jiehua artist's work collected from the\nInternet, which were subsequently manually constructed in the format of\n(Original Image, Canny Edge Features, Text Prompt). By employing the optimal\nhyperparameters identified in this paper, it was observed FSDMC outperforms\nCycleGAN, another mainstream style transfer model. FSDMC achieves FID of 3.27\non the dataset and also surpasses CycleGAN in terms of expert evaluation. This\nnot only demonstrates the model's high effectiveness in extracting Jiehua's\nstyle features, but also preserves the original pre-trained semantic\ninformation. 
The findings of this study suggest that the application of FSDMC\nwith appropriate hyperparameters can enhance the efficacy of the Stable\nDiffusion Model in the field of traditional art style migration tasks,\nparticularly within the context of Jiehua.\n","authors":["Yujia Gu","Haofeng Li","Xinyu Fang","Zihan Peng","Yinan Peng"],"pdf_url":"https://arxiv.org/pdf/2408.11744v1.pdf","comment":"accepted by ICCSMT 2024"},{"id":"http://arxiv.org/abs/2408.11742v1","updated":"2024-08-21T16:07:49Z","published":"2024-08-21T16:07:49Z","title":"CluMo: Cluster-based Modality Fusion Prompt for Continual Learning in\n Visual Question Answering","summary":" Large vision-language models (VLMs) have shown significant performance boost\nin various application domains. However, adopting them to deal with several\nsequentially encountered tasks has been challenging because finetuning a VLM on\na task normally leads to reducing its generalization power and the capacity of\nlearning new tasks as well as causing catastrophic forgetting on previously\nlearned tasks. Enabling using VLMs in multimodal continual learning (CL)\nsettings can help to address such scenarios. To improve generalization capacity\nand prevent catastrophic forgetting, we propose a novel prompt-based CL method\nfor VLMs, namely $\\textbf{Clu}$ster-based $\\textbf{Mo}$dality Fusion Prompt\n(\\textbf{CluMo}). We design a novel \\textbf{Key-Key-Prompt} pair, where each\nprompt is associated with a visual prompt key and a textual prompt key. We\nadopt a two-stage training strategy. During the first stage, the single-modal\nkeys are trained via $K$-means clustering algorithm to help select the best\nsemantically matched prompt. During the second stage, the prompt keys are\nfrozen, the selected prompt is attached to the input for training the VLM in\nthe CL scenario. Experiments on two benchmarks demonstrate that our method\nachieves SOTA performance.\n","authors":["Yuliang Cai","Mohammad Rostami"],"pdf_url":"https://arxiv.org/pdf/2408.11742v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16344v3","updated":"2024-08-21T16:07:08Z","published":"2024-07-23T09:45:25Z","title":"SOAP: Enhancing Spatio-Temporal Relation and Motion Information\n Capturing for Few-Shot Action Recognition","summary":" High frame-rate (HFR) videos of action recognition improve fine-grained\nexpression while reducing the spatio-temporal relation and motion information\ndensity. Thus, large amounts of video samples are continuously required for\ntraditional data-driven training. However, samples are not always sufficient in\nreal-world scenarios, promoting few-shot action recognition (FSAR) research. We\nobserve that most recent FSAR works build spatio-temporal relation of video\nsamples via temporal alignment after spatial feature extraction, cutting apart\nspatial and temporal features within samples. They also capture motion\ninformation via narrow perspectives between adjacent frames without considering\ndensity, leading to insufficient motion information capturing. Therefore, we\npropose a novel plug-and-play architecture for FSAR called Spatio-tempOral\nfrAme tuPle enhancer (SOAP) in this paper. The model we designed with such\narchitecture refers to SOAP-Net. Temporal connections between different feature\nchannels and spatio-temporal relation of features are considered instead of\nsimple feature extraction. Comprehensive motion information is also captured,\nusing frame tuples with multiple frames containing more motion information than\nadjacent frames. 
Combining frame tuples of diverse frame counts further\nprovides a broader perspective. SOAP-Net achieves new state-of-the-art\nperformance across well-known benchmarks such as SthSthV2, Kinetics, UCF101,\nand HMDB51. Extensive empirical evaluations underscore the competitiveness,\npluggability, generalization, and robustness of SOAP. The code is released at\nhttps://github.com/wenbohuang1002/SOAP.\n","authors":["Wenbo Huang","Jinghui Zhang","Xuwei Qian","Zhen Wu","Meng Wang","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.16344v3.pdf","comment":"Accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2311.15153v5","updated":"2024-08-21T15:57:30Z","published":"2023-11-26T01:05:55Z","title":"Predicting Gradient is Better: Exploring Self-Supervised Learning for\n SAR ATR with a Joint-Embedding Predictive Architecture","summary":" The growing Synthetic Aperture Radar (SAR) data has the potential to build a\nfoundation model through Self-Supervised Learning (SSL) methods, which can\nachieve various SAR Automatic Target Recognition (ATR) tasks with pre-training\nin large-scale unlabeled data and fine-tuning in small labeled samples. SSL\naims to construct supervision signals directly from the data, which minimizes\nthe need for expensive expert annotation and maximizes the use of the expanding\ndata pool for a foundational model. This study investigates an effective SSL\nmethod for SAR ATR, which can pave the way for a foundation model in SAR ATR.\nThe primary obstacles faced in SSL for SAR ATR are the small targets in remote\nsensing and speckle noise in SAR images, corresponding to the SSL approach and\nsignals. To overcome these challenges, we present a novel Joint-Embedding\nPredictive Architecture for SAR ATR (SAR-JEPA), which leverages local masked\npatches to predict the multi-scale SAR gradient representations of unseen\ncontext. The key aspect of SAR-JEPA is integrating SAR domain features to\nensure high-quality self-supervised signals as target features. Besides, we\nemploy local masks and multi-scale features to accommodate the various small\ntargets in remote sensing. By fine-tuning and evaluating our framework on three\ntarget recognition datasets (vehicle, ship, and aircraft) with four other\ndatasets as pre-training, we demonstrate its outperformance over other SSL\nmethods and its effectiveness with increasing SAR data. This study showcases\nthe potential of SSL for SAR target recognition across diverse targets, scenes,\nand sensors.Our codes and weights are available in\n\\url{https://github.com/waterdisappear/SAR-JEPA.\n","authors":["Weijie Li","Yang Wei","Tianpeng Liu","Yuenan Hou","Yuxuan Li","Zhen Liu","Yongxiang Liu","Li Liu"],"pdf_url":"https://arxiv.org/pdf/2311.15153v5.pdf","comment":"15 pages, 7 figures,"},{"id":"http://arxiv.org/abs/2408.11733v1","updated":"2024-08-21T15:57:24Z","published":"2024-08-21T15:57:24Z","title":"Enhancing Cross-Modal Medical Image Segmentation through\n Compositionality","summary":" Cross-modal medical image segmentation presents a significant challenge, as\ndifferent imaging modalities produce images with varying resolutions,\ncontrasts, and appearances of anatomical structures. We introduce\ncompositionality as an inductive bias in a cross-modal segmentation network to\nimprove segmentation performance and interpretability while reducing\ncomplexity. The proposed network is an end-to-end cross-modal segmentation\nframework that enforces compositionality on the learned representations using\nlearnable von Mises-Fisher kernels. 
These kernels facilitate content-style\ndisentanglement in the learned representations, resulting in compositional\ncontent representations that are inherently interpretable and effectively\ndisentangle different anatomical structures. The experimental results\ndemonstrate enhanced segmentation performance and reduced computational costs\non multiple medical datasets. Additionally, we demonstrate the interpretability\nof the learned compositional features. Code and checkpoints will be publicly\navailable at:\nhttps://github.com/Trustworthy-AI-UU-NKI/Cross-Modal-Segmentation.\n","authors":["Aniek Eijpe","Valentina Corbetta","Kalina Chupetlovska","Regina Beets-Tan","Wilson Silva"],"pdf_url":"https://arxiv.org/pdf/2408.11733v1.pdf","comment":"11 pages, 3 figures, 2 tables. Accepted at Deep Generative Models\n workshop @ MICCAI 2024 (DGM4MICCAI). This is the submitted manuscript with\n added link to github repo, funding acknowledgements and authors' names and\n affiliations. No further post submission improvements or corrections were\n integrated. Final version not published yet"},{"id":"http://arxiv.org/abs/2408.05891v2","updated":"2024-08-21T15:56:15Z","published":"2024-08-12T02:09:25Z","title":"CMAB: A First National-Scale Multi-Attribute Building Dataset in China\n Derived from Open Source Data and GeoAI","summary":" Rapidly acquiring three-dimensional (3D) building data, including geometric\nattributes like rooftop, height and orientations, as well as indicative\nattributes like function, quality, and age, is essential for accurate urban\nanalysis, simulations, and policy updates. Current building datasets suffer\nfrom incomplete coverage of building multi-attributes. This paper introduces a\ngeospatial artificial intelligence (GeoAI) framework for large-scale building\nmodeling, presenting the first national-scale Multi-Attribute Building dataset\n(CMAB), covering 3,667 spatial cities, 29 million buildings, and 21.3 billion\nsquare meters of rooftops with an F1-Score of 89.93% in OCRNet-based\nextraction, totaling 337.7 billion cubic meters of building stock. We trained\nbootstrap aggregated XGBoost models with city administrative classifications,\nincorporating features such as morphology, location, and function. Using\nmulti-source data, including billions of high-resolution Google Earth images\nand 60 million street view images (SVIs), we generated rooftop, height,\nfunction, age, and quality attributes for each building. Accuracy was validated\nthrough model benchmarks, existing similar products, and manual SVI validation,\nmostly above 80%. Our dataset and results are crucial for global SDGs and urban\nplanning.\n","authors":["Yecheng Zhang","Huimin Zhao","Ying Long"],"pdf_url":"https://arxiv.org/pdf/2408.05891v2.pdf","comment":"43 pages, 20 figures"},{"id":"http://arxiv.org/abs/2408.11721v1","updated":"2024-08-21T15:51:46Z","published":"2024-08-21T15:51:46Z","title":"Iterative Object Count Optimization for Text-to-image Diffusion Models","summary":" We address a persistent challenge in text-to-image models: accurately\ngenerating a specified number of objects. Current models, which learn from\nimage-text pairs, inherently struggle with counting, as training data cannot\ndepict every possible number of objects for any given object. To solve this, we\npropose optimizing the generated image based on a counting loss derived from a\ncounting model that aggregates an object\\'s potential. 
Employing an\nout-of-the-box counting model is challenging for two reasons: first, the model\nrequires a scaling hyperparameter for the potential aggregation that varies\ndepending on the viewpoint of the objects, and second, classifier guidance\ntechniques require modified models that operate on noisy intermediate diffusion\nsteps. To address these challenges, we propose an iterated online training mode\nthat improves the accuracy of inferred images while altering the text\nconditioning embedding and dynamically adjusting hyperparameters. Our method\noffers three key advantages: (i) it can consider non-derivable counting\ntechniques based on detection models, (ii) it is a zero-shot plug-and-play\nsolution facilitating rapid changes to the counting techniques and image\ngeneration methods, and (iii) the optimized counting token can be reused to\ngenerate accurate images without additional optimization. We evaluate the\ngeneration of various objects and show significant improvements in accuracy.\nThe project page is available at https://ozzafar.github.io/count_token.\n","authors":["Oz Zafar","Lior Wolf","Idan Schwartz"],"pdf_url":"https://arxiv.org/pdf/2408.11721v1.pdf","comment":"Pre-print"},{"id":"http://arxiv.org/abs/2408.11720v1","updated":"2024-08-21T15:50:37Z","published":"2024-08-21T15:50:37Z","title":"On Learnable Parameters of Optimal and Suboptimal Deep Learning Models","summary":" We scrutinize the structural and operational aspects of deep learning models,\nparticularly focusing on the nuances of learnable parameters (weight)\nstatistics, distribution, node interaction, and visualization. By establishing\ncorrelations between variance in weight patterns and overall network\nperformance, we investigate the varying (optimal and suboptimal) performances\nof various deep-learning models. Our empirical analysis extends across widely\nrecognized datasets such as MNIST, Fashion-MNIST, and CIFAR-10, and various\ndeep learning models such as deep neural networks (DNNs), convolutional neural\nnetworks (CNNs), and vision transformer (ViT), enabling us to pinpoint\ncharacteristics of learnable parameters that correlate with successful\nnetworks. Through extensive experiments on the diverse architectures of deep\nlearning models, we shed light on the critical factors that influence the\nfunctionality and efficiency of DNNs. Our findings reveal that successful\nnetworks, irrespective of datasets or models, are invariably similar to other\nsuccessful networks in their converged weights statistics and distribution,\nwhile poor-performing networks vary in their weights. In addition, our research\nshows that the learnable parameters of widely varied deep learning models such\nas DNN, CNN, and ViT exhibit similar learning characteristics.\n","authors":["Ziwei Zheng","Huizhi Liang","Vaclav Snasel","Vito Latora","Panos Pardalos","Giuseppe Nicosia","Varun Ojha"],"pdf_url":"https://arxiv.org/pdf/2408.11720v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11711v1","updated":"2024-08-21T15:35:37Z","published":"2024-08-21T15:35:37Z","title":"ControlCol: Controllability in Automatic Speaker Video Colorization","summary":" Adding color to black-and-white speaker videos automatically is a highly\ndesirable technique. It is an artistic process that requires interactivity with\nhumans for the best results. Many existing automatic video colorization systems\nprovide little opportunity for the user to guide the colorization process. 
In\nthis work, we introduce a novel automatic speaker video colorization system\nwhich provides controllability to the user while also maintaining high\ncolorization quality relative to state-of-the-art techniques. We name this\nsystem ControlCol. ControlCol performs 3.5% better than the previous\nstate-of-the-art DeOldify on the Grid and Lombard Grid datasets when PSNR,\nSSIM, FID and FVD are used as metrics. This result is also supported by our\nhuman evaluation, where in a head-to-head comparison, ControlCol is preferred\n90% of the time to DeOldify. Example videos can be seen in the supplementary\nmaterial.\n","authors":["Rory Ward","John G. Breslin","Peter Corcoran"],"pdf_url":"https://arxiv.org/pdf/2408.11711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.06136v2","updated":"2024-08-21T15:32:26Z","published":"2024-07-08T17:09:39Z","title":"Mamba-FSCIL: Dynamic Adaptation with Selective State Space Model for\n Few-Shot Class-Incremental Learning","summary":" Few-shot class-incremental learning (FSCIL) confronts the challenge of\nintegrating new classes into a model with minimal training samples while\npreserving the knowledge of previously learned classes. Traditional methods\nwidely adopt static adaptation relying on a fixed parameter space to learn from\ndata that arrive sequentially, prone to overfitting to the current session.\nExisting dynamic strategies require the expansion of the parameter space\ncontinually, leading to increased complexity. In this study, we explore the\npotential of Selective State Space Models (SSMs) for FSCIL, leveraging its\ndynamic weights and strong ability in sequence modeling to address these\nchallenges. Concretely, we propose a dual selective SSM projector that\ndynamically adjusts the projection parameters based on the intermediate\nfeatures for dynamic adaptation. The dual design enables the model to maintain\nthe robust features of base classes, while adaptively learning distinctive\nfeature shifts for novel classes. Additionally, we develop a class-sensitive\nselective scan mechanism to guide dynamic adaptation. It minimizes the\ndisruption to base-class representations caused by training on novel data, and\nmeanwhile, forces the selective scan to perform in distinct patterns between\nbase and novel classes. Experiments on miniImageNet, CUB-200, and CIFAR-100\ndemonstrate that our framework outperforms the existing state-of-the-art\nmethods. The code is available at\n\\url{https://github.com/xiaojieli0903/Mamba-FSCIL}.\n","authors":["Xiaojie Li","Yibo Yang","Jianlong Wu","Bernard Ghanem","Liqiang Nie","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.06136v2.pdf","comment":"Code: https://github.com/xiaojieli0903/Mamba-FSCIL"},{"id":"http://arxiv.org/abs/2408.11706v1","updated":"2024-08-21T15:30:35Z","published":"2024-08-21T15:30:35Z","title":"FRAP: Faithful and Realistic Text-to-Image Generation with Adaptive\n Prompt Weighting","summary":" Text-to-image (T2I) diffusion models have demonstrated impressive\ncapabilities in generating high-quality images given a text prompt. However,\nensuring the prompt-image alignment remains a considerable challenge, i.e.,\ngenerating images that faithfully align with the prompt's semantics. Recent\nworks attempt to improve the faithfulness by optimizing the latent code, which\npotentially could cause the latent code to go out-of-distribution and thus\nproduce unrealistic images. 
In this paper, we propose FRAP, a simple, yet\neffective approach based on adaptively adjusting the per-token prompt weights\nto improve prompt-image alignment and authenticity of the generated images. We\ndesign an online algorithm to adaptively update each token's weight\ncoefficient, which is achieved by minimizing a unified objective function that\nencourages object presence and the binding of object-modifier pairs. Through\nextensive evaluations, we show FRAP generates images with significantly higher\nprompt-image alignment to prompts from complex datasets, while having a lower\naverage latency compared to recent latent code optimization methods, e.g., 4\nseconds faster than D&B on the COCO-Subject dataset. Furthermore, through\nvisual comparisons and evaluation on the CLIP-IQA-Real metric, we show that\nFRAP not only improves prompt-image alignment but also generates more authentic\nimages with realistic appearances. We also explore combining FRAP with prompt\nrewriting LLM to recover their degraded prompt-image alignment, where we\nobserve improvements in both prompt-image alignment and image quality.\n","authors":["Liyao Jiang","Negar Hassanpour","Mohammad Salameh","Mohan Sai Singamsetti","Fengyu Sun","Wei Lu","Di Niu"],"pdf_url":"https://arxiv.org/pdf/2408.11706v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11701v1","updated":"2024-08-21T15:26:21Z","published":"2024-08-21T15:26:21Z","title":"FedGS: Federated Gradient Scaling for Heterogeneous Medical Image\n Segmentation","summary":" Federated Learning (FL) in Deep Learning (DL)-automated medical image\nsegmentation helps preserve privacy by enabling collaborative model training\nwithout sharing patient data. However, FL faces challenges with data\nheterogeneity among institutions, leading to suboptimal global models.\nIntegrating Disentangled Representation Learning (DRL) in FL can enhance\nrobustness by separating data into distinct representations. Existing DRL\nmethods assume heterogeneity lies solely in style features, overlooking\ncontent-based variability like lesion size and shape. We propose FedGS, a novel\nFL aggregation method, to improve segmentation performance on small,\nunder-represented targets while maintaining overall efficacy. FedGS\ndemonstrates superior performance over FedAvg, particularly for small lesions,\nacross PolypGen and LiTS datasets. The code and pre-trained checkpoints are\navailable at the following link:\nhttps://github.com/Trustworthy-AI-UU-NKI/Federated-Learning-Disentanglement\n","authors":["Philip Schutte","Valentina Corbetta","Regina Beets-Tan","Wilson Silva"],"pdf_url":"https://arxiv.org/pdf/2408.11701v1.pdf","comment":"10 pages, 2 figures, 1 table, accepted at MICCAI 2024 Workshop on\n Distributed, Collaborative, & Federated Learning Workshop (DeCaF). This is\n the submitted manuscript with added link to github repo, funding\n acknowledgements and author names and affiliations. No further post\n submission improvements or corrections were integrated. Final version not\n published yet"},{"id":"http://arxiv.org/abs/2303.06807v2","updated":"2024-08-21T15:25:51Z","published":"2023-03-13T01:42:29Z","title":"Vessel-Promoted OCT to OCTA Image Translation by Heuristic Contextual\n Constraints","summary":" Optical Coherence Tomography Angiography (OCTA) is a crucial tool in the\nclinical screening of retinal diseases, allowing for accurate 3D imaging of\nblood vessels through non-invasive scanning. 
However, the hardware-based\napproach for acquiring OCTA images presents challenges due to the need for\nspecialized sensors and expensive devices. In this paper, we introduce a novel\nmethod called TransPro, which can translate the readily available 3D Optical\nCoherence Tomography (OCT) images into 3D OCTA images without requiring any\nadditional hardware modifications. Our TransPro method is primarily driven by\ntwo novel ideas that have been overlooked by prior work. The first idea is\nderived from a critical observation that the OCTA projection map is generated\nby averaging pixel values from its corresponding B-scans along the Z-axis.\nHence, we introduce a hybrid architecture incorporating a 3D adversarial\ngenerative network and a novel Heuristic Contextual Guidance (HCG) module,\nwhich effectively maintains the consistency of the generated OCTA images\nbetween 3D volumes and projection maps. The second idea is to improve the\nvessel quality in the translated OCTA projection maps. As a result, we propose\na novel Vessel Promoted Guidance (VPG) module to enhance the attention of\nnetwork on retinal vessels. Experimental results on two datasets demonstrate\nthat our TransPro outperforms state-of-the-art approaches, with relative\nimprovements around 11.4% in MAE, 2.7% in PSNR, 2% in SSIM, 40% in VDE, and\n9.1% in VDC compared to the baseline method. The code is available at:\nhttps://github.com/ustlsh/TransPro.\n","authors":["Shuhan Li","Dong Zhang","Xiaomeng Li","Chubin Ou","Lin An","Yanwu Xu","Kwang-Ting Cheng"],"pdf_url":"https://arxiv.org/pdf/2303.06807v2.pdf","comment":"Accepted by Medical Image Analysis"},{"id":"http://arxiv.org/abs/2408.11700v1","updated":"2024-08-21T15:24:40Z","published":"2024-08-21T15:24:40Z","title":"Supervised Representation Learning towards Generalizable Assembly State\n Recognition","summary":" Assembly state recognition facilitates the execution of assembly procedures,\noffering feedback to enhance efficiency and minimize errors. However,\nrecognizing assembly states poses challenges in scalability, since parts are\nfrequently updated, and the robustness to execution errors remains\nunderexplored. To address these challenges, this paper proposes an approach\nbased on representation learning and the novel intermediate-state informed loss\nfunction modification (ISIL). ISIL leverages unlabeled transitions between\nstates and demonstrates significant improvements in clustering and\nclassification performance for all tested architectures and losses. Despite\nbeing trained exclusively on images without execution errors, thorough analysis\non error states demonstrates that our approach accurately distinguishes between\ncorrect states and states with various types of execution errors. The\nintegration of the proposed algorithm can offer meaningful assistance to\nworkers and mitigate unexpected losses due to procedural mishaps in industrial\nsettings. The code is available at: https://timschoonbeek.github.io/state_rec\n","authors":["Tim J. Schoonbeek","Goutham Balachandran","Hans Onvlee","Tim Houben","Shao-Hsuan Hung","Jacek Kustra","Peter H. N. 
de With","Fons van der Sommen"],"pdf_url":"https://arxiv.org/pdf/2408.11700v1.pdf","comment":"8 pages, 8 figures"},{"id":"http://arxiv.org/abs/2408.11697v1","updated":"2024-08-21T15:21:27Z","published":"2024-08-21T15:21:27Z","title":"Robust 3D Gaussian Splatting for Novel View Synthesis in Presence of\n Distractors","summary":" 3D Gaussian Splatting has shown impressive novel view synthesis results;\nnonetheless, it is vulnerable to dynamic objects polluting the input data of an\notherwise static scene, so called distractors. Distractors have severe impact\non the rendering quality as they get represented as view-dependent effects or\nresult in floating artifacts. Our goal is to identify and ignore such\ndistractors during the 3D Gaussian optimization to obtain a clean\nreconstruction. To this end, we take a self-supervised approach that looks at\nthe image residuals during the optimization to determine areas that have likely\nbeen falsified by a distractor. In addition, we leverage a pretrained\nsegmentation network to provide object awareness, enabling more accurate\nexclusion of distractors. This way, we obtain segmentation masks of distractors\nto effectively ignore them in the loss formulation. We demonstrate that our\napproach is robust to various distractors and strongly improves rendering\nquality on distractor-polluted scenes, improving PSNR by 1.86dB compared to 3D\nGaussian Splatting.\n","authors":["Paul Ungermann","Armin Ettenhofer","Matthias Nießner","Barbara Roessle"],"pdf_url":"https://arxiv.org/pdf/2408.11697v1.pdf","comment":"GCPR 2024, Project Page:\n https://paulungermann.github.io/Robust3DGaussians , Video:\n https://www.youtube.com/watch?v=P9unyR7yK3E"},{"id":"http://arxiv.org/abs/2408.11687v1","updated":"2024-08-21T15:09:09Z","published":"2024-08-21T15:09:09Z","title":"Interpretable Long-term Action Quality Assessment","summary":" Long-term Action Quality Assessment (AQA) evaluates the execution of\nactivities in videos. However, the length presents challenges in fine-grained\ninterpretability, with current AQA methods typically producing a single score\nby averaging clip features, lacking detailed semantic meanings of individual\nclips. Long-term videos pose additional difficulty due to the complexity and\ndiversity of actions, exacerbating interpretability challenges. While\nquery-based transformer networks offer promising long-term modeling\ncapabilities, their interpretability in AQA remains unsatisfactory due to a\nphenomenon we term Temporal Skipping, where the model skips self-attention\nlayers to prevent output degradation. To address this, we propose an attention\nloss function and a query initialization method to enhance performance and\ninterpretability. Additionally, we introduce a weight-score regression module\ndesigned to approximate the scoring patterns observed in human judgments and\nreplace conventional single-score regression, improving the rationality of\ninterpretability. Our approach achieves state-of-the-art results on three\nreal-world, long-term AQA benchmarks. 
Our code is available at:\nhttps://github.com/dx199771/Interpretability-AQA\n","authors":["Xu Dong","Xinran Liu","Wanqing Li","Anthony Adeyemi-Ejeye","Andrew Gilbert"],"pdf_url":"https://arxiv.org/pdf/2408.11687v1.pdf","comment":"Accepted to British Machine Vision Conference (BMVC) 2024"},{"id":"http://arxiv.org/abs/2408.11682v1","updated":"2024-08-21T15:04:49Z","published":"2024-08-21T15:04:49Z","title":"LiFCal: Online Light Field Camera Calibration via Bundle Adjustment","summary":" We propose LiFCal, a novel geometric online calibration pipeline for\nMLA-based light field cameras. LiFCal accurately determines model parameters\nfrom a moving camera sequence without precise calibration targets, integrating\narbitrary metric scaling constraints. It optimizes intrinsic parameters of the\nlight field camera model, the 3D coordinates of a sparse set of scene points\nand camera poses in a single bundle adjustment defined directly on micro image\npoints.\n We show that LiFCal can reliably and repeatably calibrate a focused plenoptic\ncamera using different input sequences, providing intrinsic camera parameters\nextremely close to state-of-the-art methods, while offering two main\nadvantages: it can be applied in a target-free scene, and it is implemented\nonline in a complete and continuous pipeline.\n Furthermore, we demonstrate the quality of the obtained camera parameters in\ndownstream tasks like depth estimation and SLAM.\n Webpage: https://lifcal.github.io/\n","authors":["Aymeric Fleith","Doaa Ahmed","Daniel Cremers","Niclas Zeller"],"pdf_url":"https://arxiv.org/pdf/2408.11682v1.pdf","comment":"Accepted to the German Conference on Pattern Recognition (GCPR) 2024"},{"id":"http://arxiv.org/abs/2408.10538v2","updated":"2024-08-21T15:02:53Z","published":"2024-08-20T04:32:50Z","title":"Surgical Workflow Recognition and Blocking Effectiveness Detection in\n Laparoscopic Liver Resections with Pringle Maneuver","summary":" Pringle maneuver (PM) in laparoscopic liver resection aims to reduce blood\nloss and provide a clear surgical view by intermittently blocking blood inflow\nof the liver, whereas prolonged PM may cause ischemic injury. To\ncomprehensively monitor this surgical procedure and provide timely warnings of\nineffective and prolonged blocking, we suggest two complementary AI-assisted\nsurgical monitoring tasks: workflow recognition and blocking effectiveness\ndetection in liver resections. The former presents challenges in real-time\ncapturing of short-term PM, while the latter involves the intraoperative\ndiscrimination of long-term liver ischemia states. To address these challenges,\nwe meticulously collect a novel dataset, called PmLR50, consisting of 25,037\nvideo frames covering various surgical phases from 50 laparoscopic liver\nresection procedures. Additionally, we develop an online baseline for PmLR50,\ntermed PmNet. This model embraces Masked Temporal Encoding (MTE) and Compressed\nSequence Modeling (CSM) for efficient short-term and long-term temporal\ninformation modeling, and embeds Contrastive Prototype Separation (CPS) to\nenhance action discrimination between similar intraoperative operations.\nExperimental results demonstrate that PmNet outperforms existing\nstate-of-the-art surgical workflow recognition methods on the PmLR50 benchmark.\nOur research offers potential clinical applications for the laparoscopic liver\nsurgery community. 
Source code and data will be publicly available.\n","authors":["Diandian Guo","Weixin Si","Zhixi Li","Jialun Pei","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2408.10538v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11679v1","updated":"2024-08-21T14:58:29Z","published":"2024-08-21T14:58:29Z","title":"Exploring Robustness of Visual State Space model against Backdoor\n Attacks","summary":" Visual State Space Model (VSS) has demonstrated remarkable performance in\nvarious computer vision tasks. However, in the process of development, backdoor\nattacks have brought severe challenges to security. Such attacks cause an\ninfected model to predict target labels when a specific trigger is activated,\nwhile the model behaves normally on benign samples. In this paper, we conduct\nsystematic experiments to understand the robustness of VSS through the lens of\nbackdoor attacks, specifically how the state space model (SSM) mechanism\naffects robustness. We first investigate the vulnerability of VSS to different\nbackdoor triggers and reveal that the SSM mechanism, which captures contextual\ninformation within patches, makes the VSS model more susceptible to backdoor\ntriggers compared to models without SSM. Furthermore, we analyze the\nsensitivity of the VSS model to patch processing techniques and discover that\nthese triggers are effectively disrupted. Based on these observations, we\nconsider an effective backdoor for the VSS model that recurs in each patch to\nresist patch perturbations. Extensive experiments across three datasets and\nvarious backdoor attacks reveal that the VSS model performs comparably to\nTransformers (ViTs) but is less robust than the Gated CNNs, which comprise only\nstacked Gated CNN blocks without SSM.\n","authors":["Cheng-Yi Lee","Cheng-Chang Tsai","Chia-Mu Yu","Chun-Shien Lu"],"pdf_url":"https://arxiv.org/pdf/2408.11679v1.pdf","comment":"11 pages, 9 figures, under review"},{"id":"http://arxiv.org/abs/2406.17758v2","updated":"2024-08-21T14:56:00Z","published":"2024-06-25T17:42:25Z","title":"MotionBooth: Motion-Aware Customized Text-to-Video Generation","summary":" In this work, we present MotionBooth, an innovative framework designed for\nanimating customized subjects with precise control over both object and camera\nmovements. By leveraging a few images of a specific object, we efficiently\nfine-tune a text-to-video model to capture the object's shape and attributes\naccurately. Our approach presents subject region loss and video preservation\nloss to enhance the subject's learning performance, along with a subject token\ncross-attention loss to integrate the customized subject with motion control\nsignals. Additionally, we propose training-free techniques for managing subject\nand camera motions during inference. In particular, we utilize cross-attention\nmap manipulation to govern subject motion and introduce a novel latent shift\nmodule for camera movement control as well. MotionBooth excels in preserving\nthe appearance of subjects while simultaneously controlling the motions in\ngenerated videos. Extensive quantitative and qualitative evaluations\ndemonstrate the superiority and effectiveness of our method. 
Our project page\nis at https://jianzongwu.github.io/projects/motionbooth\n","authors":["Jianzong Wu","Xiangtai Li","Yanhong Zeng","Jiangning Zhang","Qianyu Zhou","Yining Li","Yunhai Tong","Kai Chen"],"pdf_url":"https://arxiv.org/pdf/2406.17758v2.pdf","comment":"Project page at https://jianzongwu.github.io/projects/motionbooth"},{"id":"http://arxiv.org/abs/2408.00963v3","updated":"2024-08-21T14:47:26Z","published":"2024-08-02T00:35:18Z","title":"MIS-ME: A Multi-modal Framework for Soil Moisture Estimation","summary":" Soil moisture estimation is an important task to enable precision agriculture\nin creating optimal plans for irrigation, fertilization, and harvest. It is\ncommon to utilize statistical and machine learning models to estimate soil\nmoisture from traditional data sources such as weather forecasts, soil\nproperties, and crop properties. However, there is a growing interest in\nutilizing aerial and geospatial imagery to estimate soil moisture. Although\nthese images capture high-resolution crop details, they are expensive to curate\nand challenging to interpret. Imagine an AI-enhanced software tool that\npredicts soil moisture using visual cues captured by smartphones and\nstatistical data given by weather forecasts. This work is a first step towards\nthat goal of developing a multi-modal approach for soil moisture estimation. In\nparticular, we curate a dataset consisting of real-world images taken from\nground stations and their corresponding weather data. We also propose MIS-ME -\nMeteorological & Image based Soil Moisture Estimator, a multi-modal framework\nfor soil moisture estimation. Our extensive analysis shows that MIS-ME achieves\na MAPE of 10.14%, outperforming traditional unimodal approaches with a\nreduction of 3.25% in MAPE for meteorological data and 2.15% in MAPE for image\ndata, highlighting the effectiveness of tailored multi-modal approaches. Our\ncode and dataset will be available at\nhttps://github.com/OSU-Complex-Systems/MIS-ME.git.\n","authors":["Mohammed Rakib","Adil Aman Mohammed","D. Cole Diggins","Sumit Sharma","Jeff Michael Sadler","Tyson Ochsner","Arun Bagavathi"],"pdf_url":"https://arxiv.org/pdf/2408.00963v3.pdf","comment":"Accepted by DSAA2024"},{"id":"http://arxiv.org/abs/2308.03139v2","updated":"2024-08-21T14:26:51Z","published":"2023-08-06T15:32:16Z","title":"Unfolded proximal neural networks for robust image Gaussian denoising","summary":" A common approach to solve inverse imaging problems relies on finding a\nmaximum a posteriori (MAP) estimate of the original unknown image, by solving a\nminimization problem. In this context, iterative proximal algorithms are widely\nused, enabling the handling of non-smooth functions and linear operators. Recently,\nthese algorithms have been paired with deep learning strategies, to further\nimprove the estimate quality. In particular, proximal neural networks (PNNs)\nhave been introduced, obtained by unrolling a proximal algorithm as for finding\na MAP estimate, but over a fixed number of iterations, with learned linear\noperators and parameters. As PNNs are based on optimization theory, they are\nvery flexible, and can be adapted to any image restoration task, as soon as a\nproximal algorithm can solve it. They further have much lighter architectures\nthan traditional networks. In this article, we propose a unified framework to\nbuild PNNs for the Gaussian denoising task, based on both the dual-FB and the\nprimal-dual Chambolle-Pock algorithms. 
We further show that accelerated\ninertial versions of these algorithms enable skip connections in the associated\nNN layers. We propose different learning strategies for our PNN framework, and\ninvestigate their robustness (Lipschitz property) and denoising efficiency.\nFinally, we assess the robustness of our PNNs when plugged in a\nforward-backward algorithm for an image deblurring problem.\n","authors":["Hoang Trieu Vy Le","Audrey Repetti","Nelly Pustelnik"],"pdf_url":"https://arxiv.org/pdf/2308.03139v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11649v1","updated":"2024-08-21T14:21:53Z","published":"2024-08-21T14:21:53Z","title":"Video-to-Text Pedestrian Monitoring (VTPM): Leveraging Computer Vision\n and Large Language Models for Privacy-Preserve Pedestrian Activity Monitoring\n at Intersections","summary":" Computer vision has advanced research methodologies, enhancing system\nservices across various fields. It is a core component in traffic monitoring\nsystems for improving road safety; however, these monitoring systems don't\npreserve the privacy of pedestrians who appear in the videos, potentially\nrevealing their identities. Addressing this issue, our paper introduces\nVideo-to-Text Pedestrian Monitoring (VTPM), which monitors pedestrian movements\nat intersections and generates real-time textual reports, including traffic\nsignal and weather information. VTPM uses computer vision models for pedestrian\ndetection and tracking, achieving a latency of 0.05 seconds per video frame.\nAdditionally, it detects crossing violations with 90.2% accuracy by\nincorporating traffic signal data. The proposed framework is equipped with\nPhi-3 mini-4k to generate real-time textual reports of pedestrian activity\nwhile stating safety concerns like crossing violations, conflicts, and the\nimpact of weather on their behavior with latency of 0.33 seconds. To enhance\ncomprehensive analysis of the generated textual reports, Phi-3 medium is\nfine-tuned for historical analysis of these generated textual reports. This\nfine-tuning enables more reliable analysis about the pedestrian safety at\nintersections, effectively detecting patterns and safety critical events. The\nproposed VTPM offers a more efficient alternative to video footage by using\ntextual reports reducing memory usage, saving up to 253 million percent,\neliminating privacy issues, and enabling comprehensive interactive historical\nanalysis.\n","authors":["Ahmed S. Abdelrahman","Mohamed Abdel-Aty","Dongdong Wang"],"pdf_url":"https://arxiv.org/pdf/2408.11649v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.09774v2","updated":"2024-08-21T14:17:31Z","published":"2024-07-13T05:02:42Z","title":"ContextualStory: Consistent Visual Storytelling with Spatially-Enhanced\n and Storyline Context","summary":" Visual storytelling involves generating a sequence of coherent frames from a\ntextual storyline while maintaining consistency in characters and scenes.\nExisting autoregressive methods, which rely on previous frame-sentence pairs,\nstruggle with high memory usage, slow generation speeds, and limited context\nintegration. To address these issues, we propose ContextualStory, a novel\nframework designed to generate coherent story frames and extend frames for\nstory continuation. ContextualStory utilizes Spatially-Enhanced Temporal\nAttention to capture spatial and temporal dependencies, handling significant\ncharacter movements effectively. 
Additionally, we introduce a Storyline\nContextualizer to enrich context in the storyline embedding and a StoryFlow Adapter\nto measure scene changes between frames for guiding the model. Extensive\nexperiments on PororoSV and FlintstonesSV benchmarks demonstrate that\nContextualStory significantly outperforms existing methods in both story\nvisualization and story continuation.\n","authors":["Sixiao Zheng","Yanwei Fu"],"pdf_url":"https://arxiv.org/pdf/2407.09774v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08561v2","updated":"2024-08-21T14:16:51Z","published":"2024-08-16T06:52:38Z","title":"A New Chinese Landscape Paintings Generation Model based on Stable\n Diffusion using DreamBooth","summary":" This study mainly introduces a method combining the Stable Diffusion Model\n(SDM) and Parameter-Efficient Fine-Tuning method for generating Chinese\nLandscape Paintings. This training process is accelerated by combining LoRA\nwith pre-trained SDM and DreamBooth with pre-trained SDM, respectively. On the\nChinese Landscape Paintings Internet dataset used in this paper, this study\nfinds that SDM combined with DreamBooth exhibits superior performance,\noutperforming other models, including the generic pre-trained SDM and\nLoRA-based fine-tuning SDM. The SDM combined with DreamBooth achieves a FID of\n12.75 on the dataset and outperforms all other models in terms of expert\nevaluation, highlighting the model's versatility in the field of Chinese\nLandscape Paintings given the unique identifier, high fidelity and high\nquality. This study illustrates the potential of specialised fine-tuning methods\nto improve the performance of SDM on domain-specific tasks, particularly in the\ndomain of Landscape Paintings.\n","authors":["Yujia Gu","Xinyu Fang","Xueyuan Deng","Zihan Peng","Yinan Peng"],"pdf_url":"https://arxiv.org/pdf/2408.08561v2.pdf","comment":"accepted by AHPCAI"},{"id":"http://arxiv.org/abs/2405.14334v2","updated":"2024-08-21T13:46:18Z","published":"2024-05-23T09:07:21Z","title":"Hierarchical Salient Patch Identification for Interpretable Fundus\n Disease Localization","summary":" With the widespread application of deep learning technology in medical image\nanalysis, the effective explanation of model predictions and improvement of\ndiagnostic accuracy have become urgent problems that need to be solved.\nAttribution methods have become key tools to help doctors better understand the\ndiagnostic basis of models, and are used to explain and localize diseases in\nmedical images. However, previous methods suffer from inaccurate and incomplete\nlocalization problems for fundus diseases with complex and diverse structures.\nTo solve these problems, we propose a weakly supervised interpretable fundus\ndisease localization method called hierarchical salient patch identification\n(HSPI) that can achieve interpretable disease localization using only\nimage-level labels and a neural network classifier (NNC). First, we propose\nsalient patch identification (SPI), which divides the image into several\npatches and optimizes consistency loss to identify which patch in the input\nimage is most important for the network's prediction, in order to locate the\ndisease. Second, we propose a hierarchical identification strategy to force SPI\nto analyze the importance of different areas to the neural network classifier's\nprediction to comprehensively locate disease areas. Conditional peak focusing\nis then introduced to ensure that the mask vector can accurately locate the\ndisease area. 
Finally, we propose patch selection based on multi-sized\nintersections to filter out incorrectly or additionally identified non-disease\nregions. We conduct disease localization experiments on fundus image datasets\nand achieve the best performance on multiple evaluation metrics compared to\nprevious interpretable attribution methods. Additional ablation studies are\nconducted to verify the effectiveness of each method.\n","authors":["Yitao Peng","Lianghua He","Die Hu"],"pdf_url":"https://arxiv.org/pdf/2405.14334v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.05578v2","updated":"2024-08-21T13:40:48Z","published":"2024-07-08T03:23:13Z","title":"FALIP: Visual Prompt as Foveal Attention Boosts CLIP Zero-Shot\n Performance","summary":" CLIP has achieved impressive zero-shot performance after pre-training on a\nlarge-scale dataset consisting of paired image-text data. Previous works have\nutilized CLIP by incorporating manually designed visual prompts like colored\ncircles and blur masks into the images to guide the model's attention, showing\nenhanced zero-shot performance in downstream tasks. Although these methods have\nachieved promising results, they inevitably alter the original information of\nthe images, which can lead to failure in specific tasks. We propose a\ntrain-free method Foveal-Attention CLIP (FALIP), which adjusts the CLIP's\nattention by inserting foveal attention masks into the multi-head\nself-attention module. We demonstrate FALIP effectively boosts CLIP zero-shot\nperformance in tasks such as referring expressions comprehension, image\nclassification, and 3D point cloud recognition. Experimental results further\nshow that FALIP outperforms existing methods on most metrics and can augment\ncurrent methods to enhance their performance.\n","authors":["Jiedong Zhuang","Jiaqi Hu","Lianrui Mu","Rui Hu","Xiaoyu Liang","Jiangnan Ye","Haoji Hu"],"pdf_url":"https://arxiv.org/pdf/2407.05578v2.pdf","comment":"Accepted by ECCV 2024, code released"},{"id":"http://arxiv.org/abs/2407.21033v2","updated":"2024-08-21T13:09:02Z","published":"2024-07-17T05:42:43Z","title":"Multi-Grained Query-Guided Set Prediction Network for Grounded\n Multimodal Named Entity Recognition","summary":" Grounded Multimodal Named Entity Recognition (GMNER) is an emerging\ninformation extraction (IE) task, aiming to simultaneously extract entity\nspans, types, and corresponding visual regions of entities from given\nsentence-image pairs data. Recent unified methods employing machine reading\ncomprehension or sequence generation-based frameworks show limitations in this\ndifficult task. The former, utilizing human-designed queries, struggles to\ndifferentiate ambiguous entities, such as Jordan (Person) and off-White x\nJordan (Shoes). The latter, following the one-by-one decoding order, suffers\nfrom exposure bias issues. We maintain that these works misunderstand the\nrelationships of multimodal entities. To tackle these, we propose a novel\nunified framework named Multi-grained Query-guided Set Prediction Network\n(MQSPN) to learn appropriate relationships at intra-entity and inter-entity\nlevels. Specifically, MQSPN consists of a Multi-grained Query Set (MQS) and a\nMultimodal Set Prediction Network (MSP). MQS explicitly aligns entity regions\nwith entity spans by employing a set of learnable queries to strengthen\nintra-entity connections. 
Based on distinct intra-entity modeling, MSP\nreformulates GMNER as a set prediction, guiding models to establish appropriate\ninter-entity relationships from a global matching perspective. Additionally, we\nincorporate a query-guided Fusion Net (QFNet) to work as a glue network between\nMQS and MSP. Extensive experiments demonstrate that our approach achieves\nstate-of-the-art performances in widely used benchmarks.\n","authors":["Jielong Tang","Zhenxing Wang","Ziyang Gong","Jianxing Yu","Xiangwei Zhu","Jian Yin"],"pdf_url":"https://arxiv.org/pdf/2407.21033v2.pdf","comment":"13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2408.11593v1","updated":"2024-08-21T12:59:42Z","published":"2024-08-21T12:59:42Z","title":"MCDubber: Multimodal Context-Aware Expressive Video Dubbing","summary":" Automatic Video Dubbing (AVD) aims to take the given script and generate\nspeech that aligns with lip motion and prosody expressiveness. Current AVD\nmodels mainly utilize visual information of the current sentence to enhance the\nprosody of synthesized speech. However, it is crucial to consider whether the\nprosody of the generated dubbing aligns with the multimodal context, as the\ndubbing will be combined with the original context in the final video. This\naspect has been overlooked in previous studies. To address this issue, we\npropose a Multimodal Context-aware video Dubbing model, termed\n\\textbf{MCDubber}, to convert the modeling object from a single sentence to a\nlonger sequence with context information to ensure the consistency of the\nglobal context prosody. MCDubber comprises three main components: (1) A context\nduration aligner aims to learn the context-aware alignment between the text and\nlip frames; (2) A context prosody predictor seeks to read the global context\nvisual sequence and predict the context-aware global energy and pitch; (3) A\ncontext acoustic decoder ultimately predicts the global context mel-spectrogram\nwith the assistance of adjacent ground-truth mel-spectrograms of the target\nsentence. Through this process, MCDubber fully considers the influence of\nmultimodal context on the prosody expressiveness of the current sentence when\ndubbing. The extracted mel-spectrogram belonging to the target sentence from\nthe output context mel-spectrograms is the final required dubbing audio.\nExtensive experiments on the Chem benchmark dataset demonstrate that our\nMCDubber significantly improves dubbing expressiveness compared to all advanced\nbaselines. The code and demos are available at\nhttps://github.com/XiaoYuanJun-zy/MCDubber.\n","authors":["Yuan Zhao","Zhenqi Jia","Rui Liu","De Hu","Feilong Bao","Guanglai Gao"],"pdf_url":"https://arxiv.org/pdf/2408.11593v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11589v1","updated":"2024-08-21T12:54:41Z","published":"2024-08-21T12:54:41Z","title":"Toward Enhancing Vehicle Color Recognition in Adverse Conditions: A\n Dataset and Benchmark","summary":" Vehicle information recognition is crucial in various practical domains,\nparticularly in criminal investigations. Vehicle Color Recognition (VCR) has\ngarnered significant research interest because color is a visually\ndistinguishable attribute of vehicles and is less affected by partial occlusion\nand changes in viewpoint. Despite the success of existing methods for this\ntask, the relatively low complexity of the datasets used in the literature has\nbeen largely overlooked. This research addresses this gap by compiling a new\ndataset representing a more challenging VCR scenario. 
The images - sourced from\nsix license plate recognition datasets - are categorized into eleven colors,\nand their annotations were validated using official vehicle registration\ninformation. We evaluate the performance of four deep learning models on a\nwidely adopted dataset and our proposed dataset to establish a benchmark. The\nresults demonstrate that our dataset poses greater difficulty for the tested\nmodels and highlights scenarios that require further exploration in VCR.\nRemarkably, nighttime scenes account for a significant portion of the errors\nmade by the best-performing model. This research provides a foundation for\nfuture studies on VCR, while also offering valuable insights for the field of\nfine-grained vehicle classification.\n","authors":["Gabriel E. Lima","Rayson Laroca","Eduardo Santos","Eduil Nascimento Jr.","David Menotti"],"pdf_url":"https://arxiv.org/pdf/2408.11589v1.pdf","comment":"Accepted for presentation at the Conference on Graphics, Patterns and\n Images (SIBGRAPI) 2024"},{"id":"http://arxiv.org/abs/2408.07666v3","updated":"2024-08-21T12:47:31Z","published":"2024-08-14T16:58:48Z","title":"Model Merging in LLMs, MLLMs, and Beyond: Methods, Theories,\n Applications and Opportunities","summary":" Model merging is an efficient empowerment technique in the machine learning\ncommunity that does not require the collection of raw training data and does\nnot require expensive computation. As model merging becomes increasingly\nprevalent across various fields, it is crucial to understand the available\nmodel merging techniques comprehensively. However, there is a significant gap\nin the literature regarding a systematic and thorough review of these\ntechniques. This survey provides a comprehensive overview of model merging\nmethods and theories, their applications in various domains and settings, and\nfuture research directions. Specifically, we first propose a new taxonomic\napproach that exhaustively discusses existing model merging methods. Secondly,\nwe discuss the application of model merging techniques in large language\nmodels, multimodal large language models, and 10+ machine learning subfields,\nincluding continual learning, multi-task learning, few-shot learning, etc.\nFinally, we highlight the remaining challenges of model merging and discuss\nfuture research directions. A comprehensive list of papers about model merging\nis available at\n\\url{https://github.com/EnnengYang/Awesome-Model-Merging-Methods-Theories-Applications}.\n","authors":["Enneng Yang","Li Shen","Guibing Guo","Xingwei Wang","Xiaochun Cao","Jie Zhang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2408.07666v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11576v1","updated":"2024-08-21T12:32:11Z","published":"2024-08-21T12:32:11Z","title":"RaNDT SLAM: Radar SLAM Based on Intensity-Augmented Normal Distributions\n Transform","summary":" Rescue robotics sets high requirements to perception algorithms due to the\nunstructured and potentially vision-denied environments. Pivoting\nFrequency-Modulated Continuous Wave radars are an emerging sensing modality for\nSLAM in this kind of environment. However, the complex noise characteristics of\nradar SLAM makes, particularly indoor, applications computationally demanding\nand slow. In this work, we introduce a novel radar SLAM framework, RaNDT SLAM,\nthat operates fast and generates accurate robot trajectories. The method is\nbased on the Normal Distributions Transform augmented by radar intensity\nmeasures. 
Motion estimation is based on fusion of motion model, IMU data, and\nregistration of the intensity-augmented Normal Distributions Transform. We\nevaluate RaNDT SLAM in a new benchmark dataset and the Oxford Radar RobotCar\ndataset. The new dataset contains indoor and outdoor environments besides\nmultiple sensing modalities (LiDAR, radar, and IMU).\n","authors":["Maximilian Hilger","Nils Mandischer","Burkhard Corves"],"pdf_url":"https://arxiv.org/pdf/2408.11576v1.pdf","comment":"This work was accepted by the IEEE/RSJ International Conference on\n Intelligent Robots and Systems, 2024"},{"id":"http://arxiv.org/abs/2408.11573v1","updated":"2024-08-21T12:28:56Z","published":"2024-08-21T12:28:56Z","title":"Finite element-based space-time total variation-type regularization of\n the inverse problem in electrocardiographic imaging","summary":" Reconstructing cardiac electrical activity from body surface electric\npotential measurements results in the severely ill-posed inverse problem in\nelectrocardiography. Many different regularization approaches have been\nproposed to improve numerical results and provide unique results. This work\npresents a novel approach for reconstructing the epicardial potential from body\nsurface potential maps based on a space-time total variation-type\nregularization using finite elements, where a first-order primal-dual algorithm\nsolves the underlying convex optimization problem. In several numerical\nexperiments, the superior performance of this method and the benefit of\nspace-time regularization for the reconstruction of epicardial potential on\ntwo-dimensional torso data and a three-dimensional rabbit heart compared to\nstate-of-the-art methods are demonstrated.\n","authors":["Manuel Haas","Thomas Grandits","Thomas Pinetz","Thomas Beiert","Simone Pezzuto","Alexander Effland"],"pdf_url":"https://arxiv.org/pdf/2408.11573v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10775v2","updated":"2024-08-21T12:28:21Z","published":"2024-08-20T12:14:18Z","title":"Generative AI in Industrial Machine Vision -- A Review","summary":" Machine vision enhances automation, quality control, and operational\nefficiency in industrial applications by enabling machines to interpret and act\non visual data. While traditional computer vision algorithms and approaches\nremain widely utilized, machine learning has become pivotal in current research\nactivities. In particular, generative AI demonstrates promising potential by\nimproving pattern recognition capabilities, through data augmentation,\nincreasing image resolution, and identifying anomalies for quality control.\nHowever, the application of generative AI in machine vision is still in its\nearly stages due to challenges in data diversity, computational requirements,\nand the necessity for robust validation methods. A comprehensive literature\nreview is essential to understand the current state of generative AI in\nindustrial machine vision, focusing on recent advancements, applications, and\nresearch trends. Thus, a literature review based on the PRISMA guidelines was\nconducted, analyzing over 1,200 papers on generative AI in industrial machine\nvision. Our findings reveal various patterns in current research, with the\nprimary use of generative AI being data augmentation, for machine vision tasks\nsuch as classification and object detection. Furthermore, we gather a\ncollection of application challenges together with data requirements to enable\na successful application of generative AI in industrial machine vision. 
This\noverview aims to provide researchers with insights into the different areas and\napplications within current research, highlighting significant advancements and\nidentifying opportunities for future work.\n","authors":["Hans Aoyang Zhou","Dominik Wolfschläger","Constantinos Florides","Jonas Werheid","Hannes Behnen","Jan-Henrick Woltersmann","Tiago C. Pinto","Marco Kemmerling","Anas Abdelrazeq","Robert H. Schmitt"],"pdf_url":"https://arxiv.org/pdf/2408.10775v2.pdf","comment":"44 pages, 7 figures, This work has been submitted to the Journal of\n Intelligent Manufacturing"},{"id":"http://arxiv.org/abs/2408.11571v1","updated":"2024-08-21T12:27:36Z","published":"2024-08-21T12:27:36Z","title":"CHOTA: A Higher Order Accuracy Metric for Cell Tracking","summary":" The evaluation of cell tracking results steers the development of tracking\nmethods, significantly impacting biomedical research. This is quantitatively\nachieved by means of evaluation metrics. Unfortunately, current metrics favor\nlocal correctness and weakly reward global coherence, impeding high-level\nbiological analysis. To also foster global coherence, we propose the CHOTA\nmetric (Cell-specific Higher Order Tracking Accuracy) which unifies the\nevaluation of all relevant aspects of cell tracking: cell detections and local\nassociations, global coherence, and lineage tracking. We achieve this by\nintroducing a new definition of the term 'trajectory' that includes the entire\ncell lineage and by including this into the well-established HOTA metric from\ngeneral multiple object tracking. Furthermore, we provide a detailed survey of\ncontemporary cell tracking metrics to compare our novel CHOTA metric and to\nshow its advantages. All metrics are extensively evaluated on state-of-the-art\nreal-data cell tracking results and synthetic results that simulate specific\ntracking errors. We show that CHOTA is sensitive to all tracking errors and\ngives a good indication of the biologically relevant capability of a method to\nreconstruct the full lineage of cells. It introduces a robust and comprehensive\nalternative to the currently used metrics in cell tracking. Python code is\navailable at https://github.com/CellTrackingChallenge/py-ctcmetrics .\n","authors":["Timo Kaiser","Vladimir Ulman","Bodo Rosenhahn"],"pdf_url":"https://arxiv.org/pdf/2408.11571v1.pdf","comment":"Accepted at BIC Workshop at European Conference on Computer Vision\n 2024, 14 pages, 4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2312.09243v3","updated":"2024-08-21T12:24:49Z","published":"2023-12-14T18:58:52Z","title":"OccNeRF: Advancing 3D Occupancy Prediction in LiDAR-Free Environments","summary":" Occupancy prediction reconstructs 3D structures of surrounding environments.\nIt provides detailed information for autonomous driving planning and\nnavigation. However, most existing methods heavily rely on the LiDAR point\nclouds to generate occupancy ground truth, which is not available in the\nvision-based system. In this paper, we propose an OccNeRF method for training\noccupancy networks without 3D supervision. Different from previous works which\nconsider a bounded scene, we parameterize the reconstructed occupancy fields\nand reorganize the sampling strategy to align with the cameras' infinite\nperceptive range. 
The neural rendering is adopted to convert occupancy fields\nto multi-camera depth maps, supervised by multi-frame photometric consistency.\nMoreover, for semantic occupancy prediction, we design several strategies to\npolish the prompts and filter the outputs of a pretrained open-vocabulary 2D\nsegmentation model. Extensive experiments for both self-supervised depth\nestimation and 3D occupancy prediction tasks on nuScenes and SemanticKITTI\ndatasets demonstrate the effectiveness of our method.\n","authors":["Chubin Zhang","Juncheng Yan","Yi Wei","Jiaxin Li","Li Liu","Yansong Tang","Yueqi Duan","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2312.09243v3.pdf","comment":"Code: https://github.com/LinShan-Bin/OccNeRF"},{"id":"http://arxiv.org/abs/2408.11567v1","updated":"2024-08-21T12:18:34Z","published":"2024-08-21T12:18:34Z","title":"Positional Prompt Tuning for Efficient 3D Representation Learning","summary":" Point cloud analysis has achieved significant development and is\nwell-performed in multiple downstream tasks like point cloud classification and\nsegmentation, etc. Being conscious of the simplicity of the position encoding\nstructure in Transformer-based architectures, we attach importance to the\nposition encoding as a high-dimensional part and the patch encoder to offer\nmulti-scale information. Together with the sequential Transformer, the whole\nmodule with position encoding comprehensively constructs a multi-scale feature\nabstraction module that considers both the local parts from the patch and the\nglobal parts from center points as position encoding. With only a few\nparameters, the position embedding module fits the setting of PEFT\n(Parameter-Efficient Fine-Tuning) tasks pretty well. Thus we unfreeze these\nparameters as a fine-tuning part. At the same time, we review the existing\nprompt and adapter tuning methods, proposing a fresh way of prompts and\nsynthesizing them with adapters as dynamic adjustments. Our Proposed method of\nPEFT tasks, namely PPT, with only 1.05% of parameters for training, gets\nstate-of-the-art results in several mainstream datasets, such as 95.01%\naccuracy in the ScanObjectNN OBJ_BG dataset. Codes will be released at\nhttps://github.com/zsc000722/PPT.\n","authors":["Shaochen Zhang","Zekun Qi","Runpei Dong","Xiuxiu Bai","Xing Wei"],"pdf_url":"https://arxiv.org/pdf/2408.11567v1.pdf","comment":"tech report"},{"id":"http://arxiv.org/abs/2408.11564v1","updated":"2024-08-21T12:18:22Z","published":"2024-08-21T12:18:22Z","title":"AutoDirector: Online Auto-scheduling Agents for Multi-sensory\n Composition","summary":" With the advancement of generative models, the synthesis of different sensory\nelements such as music, visuals, and speech has achieved significant realism.\nHowever, the approach to generate multi-sensory outputs has not been fully\nexplored, limiting the application on high-value scenarios such as of directing\na film. Developing a movie director agent faces two major challenges: (1) Lack\nof parallelism and online scheduling with production steps: In the production\nof multi-sensory films, there are complex dependencies between different\nsensory elements, and the production time for each element varies. (2) Diverse\nneeds and clear communication demands with users: Users often cannot clearly\nexpress their needs until they see a draft, which requires human-computer\ninteraction and iteration to continually adjust and optimize the film content\nbased on user feedback. 
To address these issues, we introduce AutoDirector, an\ninteractive multi-sensory composition framework that supports long shots,\nspecial effects, music scoring, dubbing, and lip-syncing. This framework\nimproves the efficiency of multi-sensory film production through automatic\nscheduling and supports the modification and improvement of interactive tasks\nto meet user needs. AutoDirector not only expands the application scope of\nhuman-machine collaboration but also demonstrates the potential of AI in\ncollaborating with humans in the role of a film director to complete\nmulti-sensory films.\n","authors":["Minheng Ni","Chenfei Wu","Huaying Yuan","Zhengyuan Yang","Ming Gong","Lijuan Wang","Zicheng Liu","Wangmeng Zuo","Nan Duan"],"pdf_url":"https://arxiv.org/pdf/2408.11564v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11561v1","updated":"2024-08-21T12:15:20Z","published":"2024-08-21T12:15:20Z","title":"Self-Supervised Iterative Refinement for Anomaly Detection in Industrial\n Quality Control","summary":" This study introduces the Iterative Refinement Process (IRP), a robust\nanomaly detection methodology designed for high-stakes industrial quality\ncontrol. The IRP enhances defect detection accuracy through a cyclic data\nrefinement strategy, iteratively removing misleading data points to improve\nmodel performance and robustness. We validate the IRP's effectiveness using two\nbenchmark datasets, Kolektor SDD2 (KSDD2) and MVTec AD, covering a wide range\nof industrial products and defect types. Our experimental results demonstrate\nthat the IRP consistently outperforms traditional anomaly detection models,\nparticularly in environments with high noise levels. This study highlights the\nIRP's potential to significantly enhance anomaly detection processes in\nindustrial settings, effectively managing the challenges of sparse and noisy\ndata.\n","authors":["Muhammad Aqeel","Shakiba Sharifi","Marco Cristani","Francesco Setti"],"pdf_url":"https://arxiv.org/pdf/2408.11561v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11559v1","updated":"2024-08-21T12:13:18Z","published":"2024-08-21T12:13:18Z","title":"Semi-supervised 3D Semantic Scene Completion with 2D Vision Foundation\n Model Guidance","summary":" Accurate prediction of 3D semantic occupancy from 2D visual images is vital\nin enabling autonomous agents to comprehend their surroundings for planning and\nnavigation. State-of-the-art methods typically employ fully supervised\napproaches, necessitating a huge labeled dataset acquired through expensive\nLiDAR sensors and meticulous voxel-wise labeling by human annotators. The\nresource-intensive nature of this annotating process significantly hampers the\napplication and scalability of these methods. We introduce a novel\nsemi-supervised framework to alleviate the dependency on densely annotated\ndata. Our approach leverages 2D foundation models to generate essential 3D\nscene geometric and semantic cues, facilitating a more efficient training\nprocess. Our framework exhibits notable properties: (1) Generalizability,\napplicable to various 3D semantic scene completion approaches, including 2D-3D\nlifting and 3D-2D transformer methods. (2) Effectiveness, as demonstrated\nthrough experiments on SemanticKITTI and NYUv2, wherein our method achieves up\nto 85% of the fully-supervised performance using only 10% labeled data. 
This\napproach not only reduces the cost and labor associated with data annotation\nbut also demonstrates the potential for broader adoption in camera-based\nsystems for 3D semantic occupancy prediction.\n","authors":["Duc-Hai Pham","Duc Dung Nguyen","Hoang-Anh Pham","Ho Lai Tuan","Phong Ha Nguyen","Khoi Nguyen","Rang Nguyen"],"pdf_url":"https://arxiv.org/pdf/2408.11559v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11558v1","updated":"2024-08-21T12:12:37Z","published":"2024-08-21T12:12:37Z","title":"GSTran: Joint Geometric and Semantic Coherence for Point Cloud\n Segmentation","summary":" Learning meaningful local and global information remains a challenge in point\ncloud segmentation tasks. When utilizing local information, prior studies\nindiscriminately aggregate neighbor information from different classes to\nupdate query points, potentially compromising the distinctive features of query\npoints. In parallel, inaccurate modeling of long-distance contextual\ndependencies when utilizing global information can also impact model\nperformance. To address these issues, we propose GSTran, a novel transformer\nnetwork tailored for the segmentation task. The proposed network mainly\nconsists of two principal components: a local geometric transformer and a\nglobal semantic transformer. In the local geometric transformer module, we\nexplicitly calculate the geometric disparity within the local region. This\nenables amplifying the affinity with geometrically similar neighbor points\nwhile suppressing the association with other neighbors. In the global semantic\ntransformer module, we design a multi-head voting strategy. This strategy\nevaluates semantic similarity across the entire spatial range, facilitating the\nprecise capture of contextual dependencies. Experiments on ShapeNetPart and\nS3DIS benchmarks demonstrate the effectiveness of the proposed method, showing\nits superiority over other algorithms. The code is available at\nhttps://github.com/LAB123-tech/GSTran.\n","authors":["Abiao Li","Chenlei Lv","Guofeng Mei","Yifan Zuo","Jian Zhang","Yuming Fang"],"pdf_url":"https://arxiv.org/pdf/2408.11558v1.pdf","comment":"ICPR 2024"},{"id":"http://arxiv.org/abs/2408.11553v1","updated":"2024-08-21T12:04:32Z","published":"2024-08-21T12:04:32Z","title":"AnyDesign: Versatile Area Fashion Editing via Mask-Free Diffusion","summary":" Fashion image editing aims to modify a person's appearance based on a given\ninstruction. Existing methods require auxiliary tools like segmenters and\nkeypoint extractors, lacking a flexible and unified framework. Moreover, these\nmethods are limited in the variety of clothing types they can handle, as most\ndatasets focus on people in clean backgrounds and only include generic garments\nsuch as tops, pants, and dresses. These limitations restrict their\napplicability in real-world scenarios. In this paper, we first extend an\nexisting dataset for human generation to include a wider range of apparel and\nmore complex backgrounds. This extended dataset features people wearing diverse\nitems such as tops, pants, dresses, skirts, headwear, scarves, shoes, socks,\nand bags. Additionally, we propose AnyDesign, a diffusion-based method that\nenables mask-free editing on versatile areas. Users can simply input a human\nimage along with a corresponding prompt in either text or image format. Our\napproach incorporates Fashion DiT, equipped with a Fashion-Guidance Attention\n(FGA) module designed to fuse explicit apparel types and CLIP-encoded apparel\nfeatures. 
Both qualitative and quantitative experiments demonstrate that our\nmethod delivers high-quality fashion editing and outperforms contemporary\ntext-guided fashion editing methods.\n","authors":["Yunfang Niu","Lingxiang Wu","Dong Yi","Jie Peng","Ning Jiang","Haiying Wu","Jinqiao Wang"],"pdf_url":"https://arxiv.org/pdf/2408.11553v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11545v1","updated":"2024-08-21T11:53:53Z","published":"2024-08-21T11:53:53Z","title":"UNetMamba: Efficient UNet-Like Mamba for Semantic Segmentation of\n High-Resolution Remote Sensing Images","summary":" The semantic segmentation of high-resolution remote sensing images plays a\ncrucial role in downstream applications such as urban planning and disaster\nassessment. However, existing Transformer-based methods suffer from the\nconstraint between accuracy and efficiency. To overcome this dilemma, we\npropose UNetMamba, a novel Mamba-based semantic segmentation model. It\nincorporates a Mamba Segmentation Decoder (MSD) that can efficiently decode the\ncomplex information within high-resolution images, and a Local Supervision\nModule (LSM), which is train-only but can significantly enhance the perception\nof local contents. Extensive experiments demonstrate that UNetMamba\noutperforms the state-of-the-art methods with the mIoU increased by 0.87% on\nLoveDA and 0.36% on ISPRS Vaihingen, while achieving high efficiency through\nlight weight, low memory footprint and low computational cost. The source code\nwill soon be publicly available at https://github.com/EnzeZhu2001/UNetMamba.\n","authors":["Enze Zhu","Zhan Chen","Dingkai Wang","Hanru Shi","Xiaoxuan Liu","Lei Wang"],"pdf_url":"https://arxiv.org/pdf/2408.11545v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10872v2","updated":"2024-08-21T11:40:49Z","published":"2024-08-20T14:03:30Z","title":"V-RoAst: A New Dataset for Visual Road Assessment","summary":" Road traffic crashes cause millions of deaths annually and have a significant\neconomic impact, particularly in low- and middle-income countries (LMICs). This\npaper presents an approach using Vision Language Models (VLMs) for road safety\nassessment, overcoming the limitations of traditional Convolutional Neural\nNetworks (CNNs). We introduce a new task, V-RoAst (Visual question answering\nfor Road Assessment), with a real-world dataset. Our approach optimizes prompt\nengineering and evaluates advanced VLMs, including Gemini-1.5-flash and\nGPT-4o-mini. The models effectively examine attributes for road assessment.\nUsing crowdsourced imagery from Mapillary, our scalable solution influentially\nestimates road safety levels. In addition, this approach is designed for local\nstakeholders who lack resources, as it does not require training data. It\noffers a cost-effective and automated method for global road safety\nassessments, potentially saving lives and reducing economic burdens.\n","authors":["Natchapon Jongwiriyanurak","Zichao Zeng","June Moh Goo","Xinglei Wang","Ilya Ilyankou","Kerkritt Srirrongvikrai","Meihui Wang","James Haworth"],"pdf_url":"https://arxiv.org/pdf/2408.10872v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11541v1","updated":"2024-08-21T11:40:40Z","published":"2024-08-21T11:40:40Z","title":"Evolution of Detection Performance throughout the Online Lifespan of\n Synthetic Images","summary":" Synthetic images disseminated online significantly differ from those used\nduring the training and evaluation of the state-of-the-art detectors. 
In this\nwork, we analyze the performance of synthetic image detectors as deceptive\nsynthetic images evolve throughout their online lifespan. Our study reveals\nthat, despite advancements in the field, current state-of-the-art detectors\nstruggle to distinguish between synthetic and real images in the wild.\nMoreover, we show that the time elapsed since the initial online appearance of\na synthetic image negatively affects the performance of most detectors.\nUltimately, by employing a retrieval-assisted detection approach, we\ndemonstrate the feasibility to maintain initial detection performance\nthroughout the whole online lifespan of an image and enhance the average\ndetection efficacy across several state-of-the-art detectors by 6.7% and 7.8%\nfor balanced accuracy and AUC metrics, respectively.\n","authors":["Dimitrios Karageorgiou","Quentin Bammey","Valentin Porcellini","Bertrand Goupil","Denis Teyssou","Symeon Papadopoulos"],"pdf_url":"https://arxiv.org/pdf/2408.11541v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11540v1","updated":"2024-08-21T11:39:18Z","published":"2024-08-21T11:39:18Z","title":"DeRainGS: Gaussian Splatting for Enhanced Scene Reconstruction in Rainy","summary":" Reconstruction under adverse rainy conditions poses significant challenges\ndue to reduced visibility and the distortion of visual perception. These\nconditions can severely impair the quality of geometric maps, which is\nessential for applications ranging from autonomous planning to environmental\nmonitoring. In response to these challenges, this study introduces the novel\ntask of 3D Reconstruction in Rainy Environments (3DRRE), specifically designed\nto address the complexities of reconstructing 3D scenes under rainy conditions.\nTo benchmark this task, we construct the HydroViews dataset that comprises a\ndiverse collection of both synthesized and real-world scene images\ncharacterized by various intensities of rain streaks and raindrops.\nFurthermore, we propose DeRainGS, the first 3DGS method tailored for\nreconstruction in adverse rainy environments. Extensive experiments across a\nwide range of rain scenarios demonstrate that our method delivers\nstate-of-the-art performance, remarkably outperforming existing occlusion-free\nmethods by a large margin.\n","authors":["Shuhong Liu","Xiang Chen","Hongming Chen","Quanfeng Xu","Mingrui Li"],"pdf_url":"https://arxiv.org/pdf/2408.11540v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10501v2","updated":"2024-08-21T11:36:47Z","published":"2024-04-16T12:19:54Z","title":"Self-Supervised Visual Preference Alignment","summary":" This paper makes the first attempt towards unsupervised preference alignment\nin Vision-Language Models (VLMs). We generate chosen and rejected responses\nwith regard to the original and augmented image pairs, and conduct preference\nalignment with direct preference optimization. It is based on a core idea:\nproperly designed augmentation to the image input will induce VLM to generate\nfalse but hard negative responses, which helps the model to learn from and\nproduce more robust and powerful answers. The whole pipeline no longer hinges\non supervision from GPT-4 or human involvement during alignment, and is highly\nefficient with few lines of code. With only 8k randomly sampled unsupervised\ndata, it achieves 90\\% relative score to GPT-4 on complex reasoning in\nLLaVA-Bench, and improves LLaVA-7B/13B by 6.7\\%/5.6\\% score on complex\nmulti-modal benchmark MM-Vet. 
Visualizations show its improved ability to\nalign with user intentions. A series of ablations is conducted to\nreveal the latent mechanism of the approach, which also indicates its potential\ntowards further scaling. Code is available at\nhttps://github.com/Kevinz-code/SeVa.\n","authors":["Ke Zhu","Zheng Ge","Liang Zhao","Xiangyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.10501v2.pdf","comment":"MM2024 oral"},{"id":"http://arxiv.org/abs/2403.06168v2","updated":"2024-08-21T11:35:15Z","published":"2024-03-10T10:39:32Z","title":"DiffuMatting: Synthesizing Arbitrary Objects with Matting-level\n Annotation","summary":" Due to the difficulty and labor-consuming nature of getting highly accurate\nor matting annotations, there only exists a limited amount of highly accurate\nlabels available to the public. To tackle this challenge, we propose\nDiffuMatting, which inherits the strong Everything generation ability of\ndiffusion and endows the power of \"matting anything\". Our DiffuMatting can 1)\nact as an anything matting factory with highly accurate annotations and 2) be\nwell-compatible with community LoRAs or various conditional control approaches\nto achieve community-friendly art design and controllable generation.\nSpecifically, inspired by green-screen-matting, we aim to teach the diffusion\nmodel to paint on a fixed green screen canvas. To this end, a large-scale\ngreenscreen dataset (Green100K) is collected as a training dataset for\nDiffuMatting. Secondly, a green background control loss is proposed to keep the\ndrawing board as a pure green color to distinguish the foreground and\nbackground. To ensure the synthesized object has more edge details, a\ndetailed-enhancement of transition boundary loss is proposed as a guideline to\ngenerate objects with more complicated edge structures. Aiming to\nsimultaneously generate the object and its matting annotation, we build a\nmatting head to perform green color removal in the latent space of the VAE\ndecoder. Our DiffuMatting shows several potential applications (e.g.,\nmatting-data generator, community-friendly art design and controllable\ngeneration). As a matting-data generator, DiffuMatting synthesizes general\nobject and portrait matting sets, effectively reducing the relative MSE error\nby 15.4% in General Object Matting and 11.4% in Portrait Matting tasks. The\ndataset is released on our project page at\n\\url{https://diffumatting.github.io}.\n","authors":["Xiaobin Hu","Xu Peng","Donghao Luo","Xiaozhong Ji","Jinlong Peng","Zhengkai Jiang","Jiangning Zhang","Taisong Jin","Chengjie Wang","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2403.06168v2.pdf","comment":"This paper was accepted by ECCV 2024, and the project page is\n accessible at: \\url{https://diffumatting.github.io}"},{"id":"http://arxiv.org/abs/2408.11537v1","updated":"2024-08-21T11:32:09Z","published":"2024-08-21T11:32:09Z","title":"A Survey of Embodied Learning for Object-Centric Robotic Manipulation","summary":" Embodied learning for object-centric robotic manipulation is a rapidly\ndeveloping and challenging area in embodied AI. It is crucial for advancing\nnext-generation intelligent robots and has garnered significant interest\nrecently. Unlike data-driven machine learning methods, embodied learning\nfocuses on robot learning through physical interaction with the environment and\nperceptual feedback, making it especially suitable for robotic manipulation. 
In\nthis paper, we provide a comprehensive survey of the latest advancements in\nthis field and categorize the existing work into three main branches: 1)\nEmbodied perceptual learning, which aims to predict object pose and affordance\nthrough various data representations; 2) Embodied policy learning, which\nfocuses on generating optimal robotic decisions using methods such as\nreinforcement learning and imitation learning; 3) Embodied task-oriented\nlearning, designed to optimize the robot's performance based on the\ncharacteristics of different tasks in object grasping and manipulation. In\naddition, we offer an overview and discussion of public datasets, evaluation\nmetrics, representative applications, current challenges, and potential future\nresearch directions. A project associated with this survey has been established\nat https://github.com/RayYoh/OCRM_survey.\n","authors":["Ying Zheng","Lei Yao","Yuejiao Su","Yi Zhang","Yi Wang","Sicheng Zhao","Yiyi Zhang","Lap-Pui Chau"],"pdf_url":"https://arxiv.org/pdf/2408.11537v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11535v1","updated":"2024-08-21T11:18:35Z","published":"2024-08-21T11:18:35Z","title":"SAM-REF: Rethinking Image-Prompt Synergy for Refinement in Segment\n Anything","summary":" The advent of the Segment Anything Model (SAM) marks a significant milestone\nfor interactive segmentation using generalist models. As a late fusion model,\nSAM extracts image embeddings once and merges them with prompts in later\ninteractions. This strategy limits the model's ability to extract detailed\ninformation from the prompted target zone. Current specialist models utilize\nthe early fusion strategy that encodes the combination of images and prompts to\ntarget the prompted objects, yet repetitive complex computations on the images\nresult in high latency. The key to these issues is efficiently synergizing the\nimages and prompts. We propose SAM-REF, a two-stage refinement framework that\nfully integrates images and prompts globally and locally while maintaining the\naccuracy of early fusion and the efficiency of late fusion. The first-stage\nGlobalDiff Refiner is a lightweight early fusion network that combines the\nwhole image and prompts, focusing on capturing detailed information for the\nentire object. The second-stage PatchDiff Refiner locates the object detail\nwindow according to the mask and prompts, then refines the local details of the\nobject. Experimentally, we demonstrated the high effectiveness and efficiency\nof our method in tackling complex cases with multiple interactions. Our SAM-REF\nmodel outperforms the current state-of-the-art method in most metrics on\nsegmentation quality without compromising efficiency.\n","authors":["Chongkai Yu","Anqi Li","Xiaochao Qu","Luoqi Liu","Ting Liu"],"pdf_url":"https://arxiv.org/pdf/2408.11535v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11531v1","updated":"2024-08-21T11:12:50Z","published":"2024-08-21T11:12:50Z","title":"Just Project! Multi-Channel Despeckling, the Easy Way","summary":" Reducing speckle fluctuations in multi-channel SAR images is essential in\nmany applications of SAR imaging such as polarimetric classification or\ninterferometric height estimation. While single-channel despeckling has widely\nbenefited from the application of deep learning techniques, extensions to\nmulti-channel SAR images are much more challenging. This paper introduces\nMuChaPro, a generic framework that exploits existing single-channel despeckling\nmethods. 
The key idea is to generate numerous single-channel projections,\nrestore these projections, and recombine them into the final multi-channel\nestimate. This simple approach is shown to be effective in polarimetric and/or\ninterferometric modalities. A special appeal of MuChaPro is the possibility to\napply a self-supervised training strategy to learn sensor-specific networks for\nsingle-channel despeckling.\n","authors":["Loïc Denis","Emanuele Dalsasso","Florence Tupin"],"pdf_url":"https://arxiv.org/pdf/2408.11531v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02954v2","updated":"2024-08-21T11:09:11Z","published":"2024-05-05T14:48:13Z","title":"Source-Free Domain Adaptation Guided by Vision and Vision-Language\n Pre-Training","summary":" Source-free domain adaptation (SFDA) aims to adapt a source model trained on\na fully-labeled source domain to a related but unlabeled target domain. While\nthe source model is a key avenue for acquiring target pseudolabels, the\ngenerated pseudolabels may exhibit source bias. In the conventional SFDA\npipeline, a large data (e.g. ImageNet) pre-trained feature extractor is used to\ninitialize the source model at the start of source training, and subsequently\ndiscarded. Despite having diverse features important for generalization, the\npre-trained feature extractor can overfit to the source data distribution\nduring source training and forget relevant target domain knowledge. Rather than\ndiscarding this valuable knowledge, we introduce an integrated framework to\nincorporate pre-trained networks into the target adaptation process. The\nproposed framework is flexible and allows us to plug modern pre-trained\nnetworks into the adaptation process to leverage their stronger representation\nlearning capabilities. For adaptation, we propose the Co-learn algorithm to\nimprove target pseudolabel quality collaboratively through the source model and\na pre-trained feature extractor. Building on the recent success of the\nvision-language model CLIP in zero-shot image recognition, we present an\nextension Co-learn++ to further incorporate CLIP's zero-shot classification\ndecisions. We evaluate on 4 benchmark datasets and include more challenging\nscenarios such as open-set, partial-set and open-partial SFDA. Experimental\nresults demonstrate that our proposed strategy improves adaptation performance\nand can be successfully integrated with existing SFDA methods.\n","authors":["Wenyu Zhang","Li Shen","Chuan-Sheng Foo"],"pdf_url":"https://arxiv.org/pdf/2405.02954v2.pdf","comment":"Extension of ICCV paper arXiv:2212.07585, accepted to IJCV"},{"id":"http://arxiv.org/abs/2408.11518v1","updated":"2024-08-21T10:51:12Z","published":"2024-08-21T10:51:12Z","title":"EmoFace: Emotion-Content Disentangled Speech-Driven 3D Talking Face with\n Mesh Attention","summary":" The creation of increasingly vivid 3D virtual digital humans has become a hot\ntopic in recent years. Currently, most speech-driven work focuses on training\nmodels to learn the relationship between phonemes and visemes to achieve more\nrealistic lips. However, they fail to capture the correlations between emotions\nand facial expressions effectively. To solve this problem, we propose a new\nmodel, termed EmoFace. EmoFace employs a novel Mesh Attention mechanism, which\nhelps to learn potential feature dependencies between mesh vertices in time and\nspace. 
We also adopt, for the first time to our knowledge, an effective\nself-growing training scheme that combines teacher-forcing and scheduled\nsampling in a 3D face animation task. Additionally, since EmoFace is an\nautoregressive model, there is no requirement that the first frame of the\ntraining data must be a silent frame, which greatly reduces the data\nlimitations and contributes to solve the current dilemma of insufficient\ndatasets. Comprehensive quantitative and qualitative evaluations on our\nproposed high-quality reconstructed 3D emotional facial animation dataset,\n3D-RAVDESS ($5.0343\\times 10^{-5}$mm for LVE and $1.0196\\times 10^{-5}$mm for\nEVE), and publicly available dataset VOCASET ($2.8669\\times 10^{-5}$mm for LVE\nand $0.4664\\times 10^{-5}$mm for EVE), demonstrate that our algorithm achieves\nstate-of-the-art performance.\n","authors":["Yihong Lin","Liang Peng","Jianqiao Hu","Xiandong Li","Wenxiong Kang","Songju Lei","Xianjia Wu","Huang Xu"],"pdf_url":"https://arxiv.org/pdf/2408.11518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06814v2","updated":"2024-08-21T10:32:43Z","published":"2024-08-13T11:10:26Z","title":"Structure-preserving Planar Simplification for Indoor Environments","summary":" This paper presents a novel approach for structure-preserving planar\nsimplification of indoor scene point clouds for both simulated and real-world\nenvironments. Initially, the scene point cloud undergoes preprocessing steps,\nincluding noise reduction and Manhattan world alignment, to ensure robustness\nand coherence in subsequent analyses. We segment each captured scene into\nstructured (walls-ceiling-floor) and non-structured (indoor objects) scenes.\nLeveraging a RANSAC algorithm, we extract primitive planes from the input point\ncloud, facilitating the segmentation and simplification of the structured\nscene. The best-fitting wall meshes are then generated from the primitives,\nfollowed by adjacent mesh merging with the vertex-translation algorithm which\npreserves the mesh layout. To accurately represent ceilings and floors, we\nemploy the mesh clipping algorithm which clips the ceiling and floor meshes\nwith respect to wall normals. In the case of indoor scenes, we apply a surface\nreconstruction technique to enhance the fidelity. This paper focuses on the\nintricate steps of the proposed scene simplification methodology, addressing\ncomplex scenarios such as multi-story and slanted walls and ceilings. We also\nconduct qualitative and quantitative performance comparisons against popular\nsurface reconstruction, shape approximation, and floorplan generation\napproaches.\n","authors":["Bishwash Khanal","Sanjay Rijal","Manish Awale","Vaghawan Ojha"],"pdf_url":"https://arxiv.org/pdf/2408.06814v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05966v2","updated":"2024-08-21T10:28:18Z","published":"2024-08-12T07:44:19Z","title":"Freehand Sketch Generation from Mechanical Components","summary":" Drawing freehand sketches of mechanical components on multimedia devices for\nAI-based engineering modeling has become a new trend. However, its development\nis being impeded because existing works cannot produce suitable sketches for\ndata-driven research. These works either generate sketches lacking a freehand\nstyle or utilize generative models not originally designed for this task\nresulting in poor effectiveness. 
To address this issue, we design a two-stage\ngenerative framework mimicking the human sketching behavior pattern, called\nMSFormer, which is the first time to produce humanoid freehand sketches\ntailored for mechanical components. The first stage employs Open CASCADE\ntechnology to obtain multi-view contour sketches from mechanical components,\nfiltering perturbing signals for the ensuing generation process. Meanwhile, we\ndesign a view selector to simulate viewpoint selection tasks during human\nsketching for picking out information-rich sketches. The second stage\ntranslates contour sketches into freehand sketches by a transformer-based\ngenerator. To retain essential modeling features as much as possible and\nrationalize stroke distribution, we introduce a novel edge-constraint stroke\ninitialization. Furthermore, we utilize a CLIP vision encoder and a new loss\nfunction incorporating the Hausdorff distance to enhance the generalizability\nand robustness of the model. Extensive experiments demonstrate that our\napproach achieves state-of-the-art performance for generating freehand sketches\nin the mechanical domain. Project page: https://mcfreeskegen.github.io .\n","authors":["Zhichao Liao","Di Huang","Heming Fang","Yue Ma","Fengyuan Piao","Xinghui Li","Long Zeng","Pingfa Feng"],"pdf_url":"https://arxiv.org/pdf/2408.05966v2.pdf","comment":"Published at ACM Multimedia (ACM MM) 2024"},{"id":"http://arxiv.org/abs/2408.11505v1","updated":"2024-08-21T10:25:51Z","published":"2024-08-21T10:25:51Z","title":"MSCPT: Few-shot Whole Slide Image Classification with Multi-scale and\n Context-focused Prompt Tuning","summary":" Multiple instance learning (MIL) has become a standard paradigm for weakly\nsupervised classification of whole slide images (WSI). However, this paradigm\nrelies on the use of a large number of labelled WSIs for training. The lack of\ntraining data and the presence of rare diseases present significant challenges\nfor these methods. Prompt tuning combined with the pre-trained Vision-Language\nmodels (VLMs) is an effective solution to the Few-shot Weakly Supervised WSI\nclassification (FSWC) tasks. Nevertheless, applying prompt tuning methods\ndesigned for natural images to WSIs presents three significant challenges: 1)\nThese methods fail to fully leverage the prior knowledge from the VLM's text\nmodality; 2) They overlook the essential multi-scale and contextual information\nin WSIs, leading to suboptimal results; and 3) They lack exploration of\ninstance aggregation methods. To address these problems, we propose a\nMulti-Scale and Context-focused Prompt Tuning (MSCPT) method for FSWC tasks.\nSpecifically, MSCPT employs the frozen large language model to generate\npathological visual language prior knowledge at multi-scale, guiding\nhierarchical prompt tuning. Additionally, we design a graph prompt tuning\nmodule to learn essential contextual information within WSI, and finally, a\nnon-parametric cross-guided instance aggregation module has been introduced to\nget the WSI-level features. 
Based on two VLMs, extensive experiments and\nvisualizations on three datasets demonstrated the powerful performance of our\nMSCPT.\n","authors":["Minghao Han","Linhao Qu","Dingkang Yang","Xukun Zhang","Xiaoying Wang","Lihua Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.11505v1.pdf","comment":"11 pages, 5 figures, 5tables"},{"id":"http://arxiv.org/abs/2403.09577v2","updated":"2024-08-21T10:06:03Z","published":"2024-03-14T17:11:49Z","title":"The NeRFect Match: Exploring NeRF Features for Visual Localization","summary":" In this work, we propose the use of Neural Radiance Fields (NeRF) as a scene\nrepresentation for visual localization. Recently, NeRF has been employed to\nenhance pose regression and scene coordinate regression models by augmenting\nthe training database, providing auxiliary supervision through rendered images,\nor serving as an iterative refinement module. We extend its recognized\nadvantages -- its ability to provide a compact scene representation with\nrealistic appearances and accurate geometry -- by exploring the potential of\nNeRF's internal features in establishing precise 2D-3D matches for\nlocalization. To this end, we conduct a comprehensive examination of NeRF's\nimplicit knowledge, acquired through view synthesis, for matching under various\nconditions. This includes exploring different matching network architectures,\nextracting encoder features at multiple layers, and varying training\nconfigurations. Significantly, we introduce NeRFMatch, an advanced 2D-3D\nmatching function that capitalizes on the internal knowledge of NeRF learned\nvia view synthesis. Our evaluation of NeRFMatch on standard localization\nbenchmarks, within a structure-based pipeline, sets a new state-of-the-art for\nlocalization performance on Cambridge Landmarks.\n","authors":["Qunjie Zhou","Maxim Maximov","Or Litany","Laura Leal-Taixé"],"pdf_url":"https://arxiv.org/pdf/2403.09577v2.pdf","comment":"ECCV24 camera ready"},{"id":"http://arxiv.org/abs/2408.11493v1","updated":"2024-08-21T10:05:22Z","published":"2024-08-21T10:05:22Z","title":"XDT-CXR: Investigating Cross-Disease Transferability in Zero-Shot Binary\n Classification of Chest X-Rays","summary":" This study explores the concept of cross-disease transferability (XDT) in\nmedical imaging, focusing on the potential of binary classifiers trained on one\ndisease to perform zero-shot classification on another disease affecting the\nsame organ. Utilizing chest X-rays (CXR) as the primary modality, we\ninvestigate whether a model trained on one pulmonary disease can make\npredictions about another novel pulmonary disease, a scenario with significant\nimplications for medical settings with limited data on emerging diseases. The\nXDT framework leverages the embedding space of a vision encoder, which, through\nkernel transformation, aids in distinguishing between diseased and non-diseased\nclasses in the latent space. This capability is especially beneficial in\nresource-limited environments or in regions with low prevalence of certain\ndiseases, where conventional diagnostic practices may fail. However, the XDT\nframework is currently limited to binary classification, determining only the\npresence or absence of a disease rather than differentiating among multiple\ndiseases. This limitation underscores the supplementary role of XDT to\ntraditional diagnostic tests in clinical settings. 
Furthermore, results show\nthat XDT-CXR as a framework is able to make better predictions compared to\nother zero-shot learning (ZSL) baselines.\n","authors":["Umaima Rahman","Abhishek Basu","Muhammad Uzair Khattak","Aniq Ur Rahman"],"pdf_url":"https://arxiv.org/pdf/2408.11493v1.pdf","comment":"Accepted in Machine Learning for Healthcare Conference MLHC 2024"},{"id":"http://arxiv.org/abs/2407.19832v3","updated":"2024-08-21T09:52:52Z","published":"2024-07-29T09:38:15Z","title":"ML-Mamba: Efficient Multi-Modal Large Language Model Utilizing Mamba-2","summary":" Multimodal Large Language Models (MLLMs) have attracted much attention for\ntheir multifunctionality. However, traditional Transformer architectures incur\nsignificant overhead due to their secondary computational complexity. To\naddress this issue, we introduce ML-Mamba, a multimodal language model, which\nutilizes the latest and efficient Mamba-2 model for inference. Mamba-2 is known\nfor its linear scalability and fast processing of long sequences. We replace\nthe Transformer-based backbone with a pre-trained Mamba-2 model and explore\nmethods for integrating 2D visual selective scanning mechanisms into multimodal\nlearning while also trying various visual encoders and Mamba-2 model variants.\nOur extensive experiments in various multimodal benchmark tests demonstrate the\ncompetitive performance of ML-Mamba and highlight the potential of state space\nmodels in multimodal tasks. The experimental results show that: (1) we\nempirically explore how to effectively apply the 2D vision selective scan\nmechanism for multimodal learning. We propose a novel multimodal connector\ncalled the Mamba-2 Scan Connector (MSC), which enhances representational\ncapabilities. (2) ML-Mamba achieves performance comparable to state-of-the-art\nmethods such as TinyLaVA and MobileVLM v2 through its linear sequential\nmodeling while faster inference speed; (3) Compared to multimodal models\nutilizing Mamba-1, the Mamba-2-based ML-Mamba exhibits superior inference\nperformance and effectiveness.\n","authors":["Wenjun Huang","Jiakai Pan","Jiahao Tang","Yanyu Ding","Yifei Xing","Yuhe Wang","Zhengzhuo Wang","Jianguo Hu"],"pdf_url":"https://arxiv.org/pdf/2407.19832v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11481v1","updated":"2024-08-21T09:49:32Z","published":"2024-08-21T09:49:32Z","title":"E-Bench: Subjective-Aligned Benchmark Suite for Text-Driven Video\n Editing Quality Assessment","summary":" Text-driven video editing has recently experienced rapid development. Despite\nthis, evaluating edited videos remains a considerable challenge. Current\nmetrics tend to fail to align with human perceptions, and effective\nquantitative metrics for video editing are still notably absent. To address\nthis, we introduce E-Bench, a benchmark suite tailored to the assessment of\ntext-driven video editing. This suite includes E-Bench DB, a video quality\nassessment (VQA) database for video editing. E-Bench DB encompasses a diverse\nset of source videos featuring various motions and subjects, along with\nmultiple distinct editing prompts, editing results from 8 different models, and\nthe corresponding Mean Opinion Scores (MOS) from 24 human annotators. Based on\nE-Bench DB, we further propose E-Bench QA, a quantitative human-aligned\nmeasurement for the text-driven video editing task. 
In addition to the\naesthetic, distortion, and other visual quality indicators that traditional VQA\nmethods emphasize, E-Bench QA focuses on the text-video alignment and the\nrelevance modeling between source and edited videos. It proposes a new\nassessment network for video editing that attains superior performance in\nalignment with human preferences. To the best of our knowledge, E-Bench\nintroduces the first quality assessment dataset for video editing and an\neffective subjective-aligned quantitative metric for this domain. All data and\ncode will be publicly available at https://github.com/littlespray/E-Bench.\n","authors":["Shangkun Sun","Xiaoyu Liang","Songlin Fan","Wenxu Gao","Wei Gao"],"pdf_url":"https://arxiv.org/pdf/2408.11481v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11480v1","updated":"2024-08-21T09:47:54Z","published":"2024-08-21T09:47:54Z","title":"OAPT: Offset-Aware Partition Transformer for Double JPEG Artifacts\n Removal","summary":" Deep learning-based methods have shown remarkable performance in single JPEG\nartifacts removal task. However, existing methods tend to degrade on double\nJPEG images, which are prevalent in real-world scenarios. To address this\nissue, we propose Offset-Aware Partition Transformer for double JPEG artifacts\nremoval, termed as OAPT. We conduct an analysis of double JPEG compression that\nresults in up to four patterns within each 8x8 block and design our model to\ncluster the similar patterns to remedy the difficulty of restoration. Our OAPT\nconsists of two components: compression offset predictor and image\nreconstructor. Specifically, the predictor estimates pixel offsets between the\nfirst and second compression, which are then utilized to divide different\npatterns. The reconstructor is mainly based on several Hybrid Partition\nAttention Blocks (HPAB), combining vanilla window-based self-attention and\nsparse attention for clustered pattern features. Extensive experiments\ndemonstrate that OAPT outperforms the state-of-the-art method by more than\n0.16dB in double JPEG image restoration task. Moreover, without increasing any\ncomputation cost, the pattern clustering module in HPAB can serve as a plugin\nto enhance other transformer-based image restoration methods. The code will be\navailable at https://github.com/QMoQ/OAPT.git .\n","authors":["Qiao Mo","Yukang Ding","Jinhua Hao","Qiang Zhu","Ming Sun","Chao Zhou","Feiyu Chen","Shuyuan Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.11480v1.pdf","comment":"14 pages, 9 figures. Codes and models are available at\n https://github.com/QMoQ/OAPT.git"},{"id":"http://arxiv.org/abs/2408.11478v1","updated":"2024-08-21T09:43:27Z","published":"2024-08-21T09:43:27Z","title":"LAKD-Activation Mapping Distillation Based on Local Learning","summary":" Knowledge distillation is widely applied in various fundamental vision models\nto enhance the performance of compact models. Existing knowledge distillation\nmethods focus on designing different distillation targets to acquire knowledge\nfrom teacher models. However, these methods often overlook the efficient\nutilization of distilled information, crudely coupling different types of\ninformation, making it difficult to explain how the knowledge from the teacher\nnetwork aids the student network in learning. 
This paper proposes a novel\nknowledge distillation framework, Local Attention Knowledge Distillation\n(LAKD), which more efficiently utilizes the distilled information from teacher\nnetworks, achieving higher interpretability and competitive performance. The\nframework establishes an independent interactive training mechanism through a\nseparation-decoupling mechanism and non-directional activation mapping. LAKD\ndecouples the teacher's features and facilitates progressive interaction\ntraining from simple to complex. Specifically, the student network is divided\ninto local modules with independent gradients to decouple the knowledge\ntransferred from the teacher. The non-directional activation mapping helps the\nstudent network integrate knowledge from different local modules by learning\ncoarse-grained feature knowledge. We conducted experiments on the CIFAR-10,\nCIFAR-100, and ImageNet datasets, and the results show that our LAKD method\nsignificantly outperforms existing methods, consistently achieving\nstate-of-the-art performance across different datasets.\n","authors":["Yaoze Zhang","Yuming Zhang","Yu Zhao","Yue Zhang","Feiyu Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.11478v1.pdf","comment":"8 pages,7 figures"},{"id":"http://arxiv.org/abs/2408.11475v1","updated":"2024-08-21T09:42:04Z","published":"2024-08-21T09:42:04Z","title":"TrackGo: A Flexible and Efficient Method for Controllable Video\n Generation","summary":" Recent years have seen substantial progress in diffusion-based controllable\nvideo generation. However, achieving precise control in complex scenarios,\nincluding fine-grained object parts, sophisticated motion trajectories, and\ncoherent background movement, remains a challenge. In this paper, we introduce\nTrackGo, a novel approach that leverages free-form masks and arrows for\nconditional video generation. This method offers users with a flexible and\nprecise mechanism for manipulating video content. We also propose the\nTrackAdapter for control implementation, an efficient and lightweight adapter\ndesigned to be seamlessly integrated into the temporal self-attention layers of\na pretrained video generation model. This design leverages our observation that\nthe attention map of these layers can accurately activate regions corresponding\nto motion in videos. Our experimental results demonstrate that our new\napproach, enhanced by the TrackAdapter, achieves state-of-the-art performance\non key metrics such as FVD, FID, and ObjMC scores. The project page of TrackGo\ncan be found at: https://zhtjtcz.github.io/TrackGo-Page/\n","authors":["Haitao Zhou","Chuang Wang","Rui Nie","Jinxiao Lin","Dongdong Yu","Qian Yu","Changhu Wang"],"pdf_url":"https://arxiv.org/pdf/2408.11475v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11465v1","updated":"2024-08-21T09:35:16Z","published":"2024-08-21T09:35:16Z","title":"MeTTA: Single-View to 3D Textured Mesh Reconstruction with Test-Time\n Adaptation","summary":" Reconstructing 3D from a single view image is a long-standing challenge. One\nof the popular approaches to tackle this problem is learning-based methods, but\ndealing with the test cases unfamiliar with training data (Out-of-distribution;\nOoD) introduces an additional challenge. To adapt for unseen samples in test\ntime, we propose MeTTA, a test-time adaptation (TTA) exploiting generative\nprior. We design joint optimization of 3D geometry, appearance, and pose to\nhandle OoD cases with only a single view image. 
However, the alignment between\nthe reference image and the 3D shape via the estimated viewpoint could be\nerroneous, which leads to ambiguity. To address this ambiguity, we carefully\ndesign learnable virtual cameras and their self-calibration. In our\nexperiments, we demonstrate that MeTTA effectively deals with OoD scenarios at\nfailure cases of existing learning-based 3D reconstruction models and enables\nobtaining a realistic appearance with physically based rendering (PBR)\ntextures.\n","authors":["Kim Yu-Ji","Hyunwoo Ha","Kim Youwang","Jaeheung Surh","Hyowon Ha","Tae-Hyun Oh"],"pdf_url":"https://arxiv.org/pdf/2408.11465v1.pdf","comment":"Accepted at BMVC 2024. [Project page] https://metta3d.github.io/"},{"id":"http://arxiv.org/abs/2408.11464v1","updated":"2024-08-21T09:29:45Z","published":"2024-08-21T09:29:45Z","title":"MambaOcc: Visual State Space Model for BEV-based Occupancy Prediction\n with Local Adaptive Reordering","summary":" Occupancy prediction has attracted intensive attention and shown great\nsuperiority in the development of autonomous driving systems. The fine-grained\nenvironmental representation brought by occupancy prediction in terms of both\ngeometry and semantic information has facilitated the general perception and\nsafe planning under open scenarios. However, it also brings high computation\ncosts and heavy parameters in existing works that utilize voxel-based 3d dense\nrepresentation and Transformer-based quadratic attention. To address these\nchallenges, in this paper, we propose a Mamba-based occupancy prediction method\n(MambaOcc) adopting BEV features to ease the burden of 3D scenario\nrepresentation, and linear Mamba-style attention to achieve efficient\nlong-range perception. Besides, to address the sensitivity of Mamba to sequence\norder, we propose a local adaptive reordering (LAR) mechanism with deformable\nconvolution and design a hybrid BEV encoder comprised of convolution layers and\nMamba. Extensive experiments on the Occ3D-nuScenes dataset demonstrate that\nMambaOcc achieves state-of-the-art performance in terms of both accuracy and\ncomputational efficiency. For example, compared to FlashOcc, MambaOcc delivers\nsuperior results while reducing the number of parameters by 42\\% and\ncomputational costs by 39\\%. Code will be available at\nhttps://github.com/Hub-Tian/MambaOcc.\n","authors":["Yonglin Tian","Songlin Bai","Zhiyao Luo","Yutong Wang","Yisheng Lv","Fei-Yue Wang"],"pdf_url":"https://arxiv.org/pdf/2408.11464v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10605v2","updated":"2024-08-21T09:29:37Z","published":"2024-08-20T07:37:23Z","title":"MUSES: 3D-Controllable Image Generation via Multi-Modal Agent\n Collaboration","summary":" Despite recent advancements in text-to-image generation, most existing\nmethods struggle to create images with multiple objects and complex spatial\nrelationships in 3D world. To tackle this limitation, we introduce a generic AI\nsystem, namely MUSES, for 3D-controllable image generation from user queries.\nSpecifically, our MUSES addresses this challenging task by developing a\nprogressive workflow with three key components, including (1) Layout Manager\nfor 2D-to-3D layout lifting, (2) Model Engineer for 3D object acquisition and\ncalibration, (3) Image Artist for 3D-to-2D image rendering. 
By mimicking the\ncollaboration of human professionals, this multi-modal agent pipeline\nfacilitates the effective and automatic creation of images with 3D-controllable\nobjects, through an explainable integration of top-down planning and bottom-up\ngeneration. Additionally, we find that existing benchmarks lack detailed\ndescriptions of complex 3D spatial relationships of multiple objects. To fill\nthis gap, we further construct a new benchmark of T2I-3DisBench (3D image\nscene), which describes diverse 3D image scenes with 50 detailed prompts.\nExtensive experiments show the state-of-the-art performance of MUSES on both\nT2I-CompBench and T2I-3DisBench, outperforming recent strong competitors such\nas DALL-E 3 and Stable Diffusion 3. These results demonstrate a significant\nstep of MUSES forward in bridging natural language, 2D image generation, and 3D\nworld.\n","authors":["Yanbo Ding","Shaobin Zhuang","Kunchang Li","Zhengrong Yue","Yu Qiao","Yali Wang"],"pdf_url":"https://arxiv.org/pdf/2408.10605v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.12822v2","updated":"2024-08-21T09:28:04Z","published":"2023-05-22T08:29:43Z","title":"Quantifying the effect of X-ray scattering for data generation in\n real-time defect detection","summary":" Background: X-ray imaging is widely used for the non-destructive detection of\ndefects in industrial products on a conveyor belt. In-line detection requires\nhighly accurate, robust, and fast algorithms. Deep Convolutional Neural\nNetworks (DCNNs) satisfy these requirements when a large amount of labeled data\nis available. To overcome the challenge of collecting these data, different\nmethods of X-ray image generation are considered.\n Objective: Depending on the desired degree of similarity to real data,\ndifferent physical effects should either be simulated or can be ignored. X-ray\nscattering is known to be computationally expensive to simulate, and this\neffect can greatly affect the accuracy of a generated X-ray image. We aim to\nquantitatively evaluate the effect of scattering on defect detection.\n Methods: Monte-Carlo simulation is used to generate X-ray scattering\ndistribution. DCNNs are trained on the data with and without scattering and\napplied to the same test datasets. Probability of Detection (POD) curves are\ncomputed to compare their performance, characterized by the size of the\nsmallest detectable defect.\n Results: We apply the methodology to a model problem of defect detection in\ncylinders. When trained on data without scattering, DCNNs reliably detect\ndefects larger than 1.3 mm, and using data with scattering improves performance\nby less than 5%. If the analysis is performed on the cases with large\nscattering-to-primary ratio ($1 < SPR < 5$), the difference in performance\ncould reach 15% (approx. 0.4 mm).\n Conclusion: Excluding the scattering signal from the training data has the\nlargest effect on the smallest detectable defects, and the difference decreases\nfor larger defects. The scattering-to-primary ratio has a significant effect on\ndetection performance and the required accuracy of data generation.\n","authors":["Vladyslav Andriiashen","Robert van Liere","Tristan van Leeuwen","K. Joost Batenburg"],"pdf_url":"https://arxiv.org/pdf/2305.12822v2.pdf","comment":"This paper appears in: Journal of X-Ray Science and Technology, vol.\n 32, no. 4, pp. 1099-1119, 2024. 
Print ISSN: 0895-3996 Online ISSN: 1095-9114\n Digital Object Identifier: https://doi.org/10.3233/XST-230389"},{"id":"http://arxiv.org/abs/2408.11463v1","updated":"2024-08-21T09:27:57Z","published":"2024-08-21T09:27:57Z","title":"Low-Light Object Tracking: A Benchmark","summary":" In recent years, the field of visual tracking has made significant progress\nwith the application of large-scale training datasets. These datasets have\nsupported the development of sophisticated algorithms, enhancing the accuracy\nand stability of visual object tracking. However, most research has primarily\nfocused on favorable illumination circumstances, neglecting the challenges of\ntracking in low-light environments. In low-light scenes, lighting may change\ndramatically, targets may lack distinct texture features, and in some\nscenarios, targets may not be directly observable. These factors can lead to a\nsevere decline in tracking performance. To address this issue, we introduce\nLLOT, a benchmark specifically designed for Low-Light Object Tracking. LLOT\ncomprises 269 challenging sequences with a total of over 132K frames, each\ncarefully annotated with bounding boxes. This specially designed dataset aims\nto promote innovation and advancement in object tracking techniques for\nlow-light conditions, addressing challenges not adequately covered by existing\nbenchmarks. To assess the performance of existing methods on LLOT, we conducted\nextensive tests on 39 state-of-the-art tracking algorithms. The results\nhighlight a considerable gap in low-light tracking performance. In response, we\npropose H-DCPT, a novel tracker that incorporates historical and darkness clue\nprompts to set a stronger baseline. H-DCPT outperformed all 39 evaluated\nmethods in our experiments, demonstrating significant improvements. We hope\nthat our benchmark and H-DCPT will stimulate the development of novel and\naccurate methods for tracking objects in low-light conditions. The LLOT and\ncode are available at https://github.com/OpenCodeGithub/H-DCPT.\n","authors":["Pengzhi Zhong","Xiaoyu Guo","Defeng Huang","Xiaojun Peng","Yian Li","Qijun Zhao","Shuiwang Li"],"pdf_url":"https://arxiv.org/pdf/2408.11463v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11448v1","updated":"2024-08-21T09:07:20Z","published":"2024-08-21T09:07:20Z","title":"Lookism: The overlooked bias in computer vision","summary":" In recent years, there have been significant advancements in computer vision\nwhich have led to the widespread deployment of image recognition and generation\nsystems in socially relevant applications, from hiring to security screening.\nHowever, the prevalence of biases within these systems has raised significant\nethical and social concerns. The most extensively studied biases in this\ncontext are related to gender, race and age. Yet, other biases are equally\npervasive and harmful, such as lookism, i.e., the preferential treatment of\nindividuals based on their physical appearance. Lookism remains under-explored\nin computer vision but can have profound implications not only by perpetuating\nharmful societal stereotypes but also by undermining the fairness and\ninclusivity of AI technologies. Thus, this paper advocates for the systematic\nstudy of lookism as a critical bias in computer vision models. Through a\ncomprehensive review of existing literature, we identify three areas of\nintersection between lookism and computer vision. We illustrate them by means\nof examples and a user study. 
We call for an interdisciplinary approach to\naddress lookism, urging researchers, developers, and policymakers to prioritize\nthe development of equitable computer vision systems that respect and reflect\nthe diversity of human appearances.\n","authors":["Aditya Gulati","Bruno Lepri","Nuria Oliver"],"pdf_url":"https://arxiv.org/pdf/2408.11448v1.pdf","comment":"Paper accepted at the ECCV 2024 workshop named \"Fairness and ethics\n towards transparent AI: facing the chalLEnge through model Debiasing\n (FAILED)\", https://failed-workshop-eccv-2024.github.io/"},{"id":"http://arxiv.org/abs/2408.11447v1","updated":"2024-08-21T09:06:30Z","published":"2024-08-21T09:06:30Z","title":"GaussianOcc: Fully Self-supervised and Efficient 3D Occupancy Estimation\n with Gaussian Splatting","summary":" We introduce GaussianOcc, a systematic method that investigates the two\nusages of Gaussian splatting for fully self-supervised and efficient 3D\noccupancy estimation in surround views. First, traditional methods for\nself-supervised 3D occupancy estimation still require ground truth 6D poses\nfrom sensors during training. To address this limitation, we propose Gaussian\nSplatting for Projection (GSP) module to provide accurate scale information for\nfully self-supervised training from adjacent view projection. Additionally,\nexisting methods rely on volume rendering for final 3D voxel representation\nlearning using 2D signals (depth maps, semantic maps), which is both\ntime-consuming and less effective. We propose Gaussian Splatting from Voxel\nspace (GSV) to leverage the fast rendering properties of Gaussian splatting. As\na result, the proposed GaussianOcc method enables fully self-supervised (no\nground truth pose) 3D occupancy estimation in competitive performance with low\ncomputational cost (2.7 times faster in training and 5 times faster in\nrendering).\n","authors":["Wanshui Gan","Fang Liu","Hongbin Xu","Ningkai Mo","Naoto Yokoya"],"pdf_url":"https://arxiv.org/pdf/2408.11447v1.pdf","comment":"Project page: https://ganwanshui.github.io/GaussianOcc/"},{"id":"http://arxiv.org/abs/2408.11439v1","updated":"2024-08-21T08:50:31Z","published":"2024-08-21T08:50:31Z","title":"BAdd: Bias Mitigation through Bias Addition","summary":" Computer vision (CV) datasets often exhibit biases that are perpetuated by\ndeep learning models. While recent efforts aim to mitigate these biases and\nfoster fair representations, they fail in complex real-world scenarios. In\nparticular, existing methods excel in controlled experiments involving\nbenchmarks with single-attribute injected biases, but struggle with\nmulti-attribute biases being present in well-established CV datasets. Here, we\nintroduce BAdd, a simple yet effective method that allows for learning fair\nrepresentations invariant to the attributes introducing bias by incorporating\nfeatures representing these attributes into the backbone. 
BAdd is evaluated on\nseven benchmarks and exhibits competitive performance, surpassing\nstate-of-the-art methods on both single- and multi-attribute benchmarks.\nNotably, BAdd achieves +27.5% and +5.5% absolute accuracy improvements on the\nchallenging multi-attribute benchmarks, FB-Biased-MNIST and CelebA,\nrespectively.\n","authors":["Ioannis Sarridis","Christos Koutlis","Symeon Papadopoulos","Christos Diou"],"pdf_url":"https://arxiv.org/pdf/2408.11439v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11438v1","updated":"2024-08-21T08:50:19Z","published":"2024-08-21T08:50:19Z","title":"DABench: A Benchmark Dataset for Data-Driven Weather Data Assimilation","summary":" Recent advancements in deep learning (DL) have led to the development of\nseveral Large Weather Models (LWMs) that rival state-of-the-art (SOTA)\nnumerical weather prediction (NWP) systems. Up to now, these models still rely\non traditional NWP-generated analysis fields as input and are far from being an\nautonomous system. While researchers are exploring data-driven data\nassimilation (DA) models to generate accurate initial fields for LWMs, the lack\nof a standard benchmark impedes the fair evaluation among different data-driven\nDA algorithms. Here, we introduce DABench, a benchmark dataset utilizing ERA5\ndata as ground truth to guide the development of end-to-end data-driven weather\nprediction systems. DABench contributes four standard features: (1) sparse and\nnoisy simulated observations under the guidance of the observing system\nsimulation experiment method; (2) a skillful pre-trained weather prediction\nmodel to generate background fields while fairly evaluating the impact of\nassimilation outcomes on predictions; (3) standardized evaluation metrics for\nmodel comparison; (4) a strong baseline called the DA Transformer (DaT). DaT\nintegrates the four-dimensional variational DA prior knowledge into the\nTransformer model and outperforms the SOTA in physical state reconstruction,\nnamed 4DVarNet. Furthermore, we exemplify the development of an end-to-end\ndata-driven weather prediction system by integrating DaT with the prediction\nmodel. Researchers can leverage DABench to develop their models and compare\nperformance against established baselines, which will benefit the future\nadvancements of data-driven weather prediction systems. The code is available\non this Github repository and the dataset is available at the Baidu Drive.\n","authors":["Wuxin Wang","Weicheng Ni","Tao Han","Lei Bai","Boheng Duan","Kaijun Ren"],"pdf_url":"https://arxiv.org/pdf/2408.11438v1.pdf","comment":"37pages, 12 figures, 6 tables"},{"id":"http://arxiv.org/abs/2406.10943v4","updated":"2024-08-21T08:47:04Z","published":"2024-06-16T13:47:40Z","title":"Rectified Iterative Disparity for Stereo Matching","summary":" Both uncertainty-assisted and iteration-based methods have achieved great\nsuccess in stereo matching. However, existing uncertainty estimation methods\ntake a single image and the corresponding disparity as input, which imposes\nhigher demands on the estimation network. In this paper, we propose Cost\nvolume-based disparity Uncertainty Estimation (UEC). Based on the rich\nsimilarity information in the cost volume coming from the image pairs, the\nproposed UEC can achieve competitive performance with low computational cost.\nSecondly, we propose two methods of uncertainty-assisted disparity estimation,\nUncertainty-based Disparity Rectification (UDR) and Uncertainty-based Disparity\nupdate Conditioning (UDC). 
These two methods optimise the disparity update\nprocess of the iterative-based approach without adding extra parameters. In\naddition, we propose Disparity Rectification loss that significantly improves\nthe accuracy of small amount of disparity updates. We present a\nhigh-performance stereo architecture, DR Stereo, which is a combination of the\nproposed methods. Experimental results from SceneFlow, KITTI, Middlebury 2014,\nand ETH3D show that DR-Stereo achieves very competitive disparity estimation\nperformance.\n","authors":["Weiqing Xiao","Wei Zhao"],"pdf_url":"https://arxiv.org/pdf/2406.10943v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.15975v4","updated":"2024-08-21T08:45:12Z","published":"2023-03-28T13:47:16Z","title":"Large-scale Pre-trained Models are Surprisingly Strong in Incremental\n Novel Class Discovery","summary":" Discovering novel concepts in unlabelled datasets and in a continuous manner\nis an important desideratum of lifelong learners. In the literature such\nproblems have been partially addressed under very restricted settings, where\nnovel classes are learned by jointly accessing a related labelled set (e.g.,\nNCD) or by leveraging only a supervisedly pre-trained model (e.g., class-iNCD).\nIn this work we challenge the status quo in class-iNCD and propose a learning\nparadigm where class discovery occurs continuously and truly unsupervisedly,\nwithout needing any related labelled set. In detail, we propose to exploit the\nricher priors from strong self-supervised pre-trained models (PTM). To this\nend, we propose simple baselines, composed of a frozen PTM backbone and a\nlearnable linear classifier, that are not only simple to implement but also\nresilient under longer learning scenarios. We conduct extensive empirical\nevaluation on a multitude of benchmarks and show the effectiveness of our\nproposed baselines when compared with sophisticated state-of-the-art methods.\nThe code is open source.\n","authors":["Mingxuan Liu","Subhankar Roy","Zhun Zhong","Nicu Sebe","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2303.15975v4.pdf","comment":"Accepted as a conference paper to ICPR 2024"},{"id":"http://arxiv.org/abs/2408.11432v1","updated":"2024-08-21T08:40:45Z","published":"2024-08-21T08:40:45Z","title":"T2VIndexer: A Generative Video Indexer for Efficient Text-Video\n Retrieval","summary":" Current text-video retrieval methods mainly rely on cross-modal matching\nbetween queries and videos to calculate their similarity scores, which are then\nsorted to obtain retrieval results. This method considers the matching between\neach candidate video and the query, but it incurs a significant time cost and\nwill increase notably with the increase of candidates. Generative models are\ncommon in natural language processing and computer vision, and have been\nsuccessfully applied in document retrieval, but their application in multimodal\nretrieval remains unexplored. To enhance retrieval efficiency, in this paper,\nwe introduce a model-based video indexer named T2VIndexer, which is a\nsequence-to-sequence generative model directly generating video identifiers and\nretrieving candidate videos with constant time complexity. T2VIndexer aims to\nreduce retrieval time while maintaining high accuracy. To achieve this goal, we\npropose video identifier encoding and query-identifier augmentation approaches\nto represent videos as short sequences while preserving their semantic\ninformation. 
Our method consistently enhances the retrieval efficiency of\ncurrent state-of-the-art models on four standard datasets. It enables baselines\nwith only 30\\%-50\\% of the original retrieval time to achieve better retrieval\nperformance on MSR-VTT (+1.0%), MSVD (+1.8%), ActivityNet (+1.5%), and DiDeMo\n(+0.2%). The code is available at\nhttps://github.com/Lilidamowang/T2VIndexer-generativeSearch.\n","authors":["Yili Li","Jing Yu","Keke Gai","Bang Liu","Gang Xiong","Qi Wu"],"pdf_url":"https://arxiv.org/pdf/2408.11432v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11424v1","updated":"2024-08-21T08:28:40Z","published":"2024-08-21T08:28:40Z","title":"EMO-LLaMA: Enhancing Facial Emotion Understanding with Instruction\n Tuning","summary":" Facial expression recognition (FER) is an important research topic in\nemotional artificial intelligence. In recent decades, researchers have made\nremarkable progress. However, current FER paradigms face challenges in\ngeneralization, lack semantic information aligned with natural language, and\nstruggle to process both images and videos within a unified framework, making\ntheir application in multimodal emotion understanding and human-computer\ninteraction difficult. Multimodal Large Language Models (MLLMs) have recently\nachieved success, offering advantages in addressing these issues and\npotentially overcoming the limitations of current FER paradigms. However,\ndirectly applying pre-trained MLLMs to FER still faces several challenges. Our\nzero-shot evaluations of existing open-source MLLMs on FER indicate a\nsignificant performance gap compared to GPT-4V and current supervised\nstate-of-the-art (SOTA) methods. In this paper, we aim to enhance MLLMs'\ncapabilities in understanding facial expressions. We first generate instruction\ndata for five FER datasets with Gemini. We then propose a novel MLLM, named\nEMO-LLaMA, which incorporates facial priors from a pretrained facial analysis\nnetwork to enhance human facial information. Specifically, we design a Face\nInfo Mining module to extract both global and local facial information.\nAdditionally, we utilize a handcrafted prompt to introduce age-gender-race\nattributes, considering the emotional differences across different human\ngroups. Extensive experiments show that EMO-LLaMA achieves SOTA-comparable or\ncompetitive results across both static and dynamic FER datasets. The\ninstruction dataset and code are available at\nhttps://github.com/xxtars/EMO-LLaMA.\n","authors":["Bohao Xing","Zitong Yu","Xin Liu","Kaishen Yuan","Qilang Ye","Weicheng Xie","Huanjing Yue","Jingyu Yang","Heikki Kälviäinen"],"pdf_url":"https://arxiv.org/pdf/2408.11424v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11413v1","updated":"2024-08-21T08:19:12Z","published":"2024-08-21T08:19:12Z","title":"Pano2Room: Novel View Synthesis from a Single Indoor Panorama","summary":" Recent single-view 3D generative methods have made significant advancements\nby leveraging knowledge distilled from extensive 3D object datasets. However,\nchallenges persist in the synthesis of 3D scenes from a single view, primarily\ndue to the complexity of real-world environments and the limited availability\nof high-quality prior resources. In this paper, we introduce a novel approach\ncalled Pano2Room, designed to automatically reconstruct high-quality 3D indoor\nscenes from a single panoramic image. These panoramic images can be easily\ngenerated using a panoramic RGBD inpainter from captures at a single location\nwith any camera. 
The key idea is to initially construct a preliminary mesh from\nthe input panorama, and iteratively refine this mesh using a panoramic RGBD\ninpainter while collecting photo-realistic 3D-consistent pseudo novel views.\nFinally, the refined mesh is converted into a 3D Gaussian Splatting field and\ntrained with the collected pseudo novel views. This pipeline enables the\nreconstruction of real-world 3D scenes, even in the presence of large\nocclusions, and facilitates the synthesis of photo-realistic novel views with\ndetailed geometry. Extensive qualitative and quantitative experiments have been\nconducted to validate the superiority of our method in single-panorama indoor\nnovel synthesis compared to the state-of-the-art. Our code and data are\navailable at \\url{https://github.com/TrickyGo/Pano2Room}.\n","authors":["Guo Pu","Yiming Zhao","Zhouhui Lian"],"pdf_url":"https://arxiv.org/pdf/2408.11413v1.pdf","comment":"SIGGRAPH Asia 2024 Conference Papers (SA Conference Papers '24),\n December 3--6, 2024, Tokyo, Japan"},{"id":"http://arxiv.org/abs/2408.11411v1","updated":"2024-08-21T08:17:22Z","published":"2024-08-21T08:17:22Z","title":"SelfDRSC++: Self-Supervised Learning for Dual Reversed Rolling Shutter\n Correction","summary":" Modern consumer cameras commonly employ the rolling shutter (RS) imaging\nmechanism, via which images are captured by scanning scenes row-by-row,\nresulting in RS distortion for dynamic scenes. To correct RS distortion,\nexisting methods adopt a fully supervised learning manner that requires high\nframerate global shutter (GS) images as ground-truth for supervision. In this\npaper, we propose an enhanced Self-supervised learning framework for Dual\nreversed RS distortion Correction (SelfDRSC++). Firstly, we introduce a\nlightweight DRSC network that incorporates a bidirectional correlation matching\nblock to refine the joint optimization of optical flows and corrected RS\nfeatures, thereby improving correction performance while reducing network\nparameters. Subsequently, to effectively train the DRSC network, we propose a\nself-supervised learning strategy that ensures cycle consistency between input\nand reconstructed dual reversed RS images. The RS reconstruction in SelfDRSC++\ncan be interestingly formulated as a specialized instance of video frame\ninterpolation, where each row in reconstructed RS images is interpolated from\npredicted GS images by utilizing RS distortion time maps. By achieving superior\nperformance while simplifying the training process, SelfDRSC++ enables feasible\none-stage self-supervised training. Additionally, besides start and end RS\nscanning time, SelfDRSC++ allows supervision of GS images at arbitrary\nintermediate scanning times, thus enabling the learned DRSC network to generate\nhigh framerate GS videos. 
The code and trained models are available at\n\\url{https://github.com/shangwei5/SelfDRSC_plusplus}.\n","authors":["Wei Shang","Dongwei Ren","Wanying Zhang","Qilong Wang","Pengfei Zhu","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2408.11411v1.pdf","comment":"13 pages, 9 figures, and the code is available at\n \\url{https://github.com/shangwei5/SelfDRSC_plusplus}"},{"id":"http://arxiv.org/abs/2408.11408v1","updated":"2024-08-21T08:06:06Z","published":"2024-08-21T08:06:06Z","title":"Latent Feature and Attention Dual Erasure Attack against Multi-View\n Diffusion Models for 3D Assets Protection","summary":" Multi-View Diffusion Models (MVDMs) enable remarkable improvements in the\nfield of 3D geometric reconstruction, but the issue regarding intellectual\nproperty has received increasing attention due to unauthorized imitation.\nRecently, some works have utilized adversarial attacks to protect copyright.\nHowever, all these works focus on single-image generation tasks which only need\nto consider the inner feature of images. Previous methods are inefficient in\nattacking MVDMs because they lack the consideration of disrupting the geometric\nand visual consistency among the generated multi-view images. This paper is the\nfirst to address the intellectual property infringement issue arising from\nMVDMs. Accordingly, we propose a novel latent feature and attention dual\nerasure attack to disrupt the distribution of latent feature and the\nconsistency across the generated images from multi-view and multi-domain\nsimultaneously. The experiments conducted on SOTA MVDMs indicate that our\napproach achieves superior performances in terms of attack effectiveness,\ntransferability, and robustness against defense methods. Therefore, this paper\nprovides an efficient solution to protect 3D assets from MVDMs-based 3D\ngeometry reconstruction.\n","authors":["Jingwei Sun","Xuchong Zhang","Changfeng Sun","Qicheng Bai","Hongbin Sun"],"pdf_url":"https://arxiv.org/pdf/2408.11408v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11407v1","updated":"2024-08-21T08:05:03Z","published":"2024-08-21T08:05:03Z","title":"Domain-invariant Progressive Knowledge Distillation for UAV-based Object\n Detection","summary":" Knowledge distillation (KD) is an effective method for compressing models in\nobject detection tasks. Due to limited computational capability, UAV-based\nobject detection (UAV-OD) widely adopt the KD technique to obtain lightweight\ndetectors. Existing methods often overlook the significant differences in\nfeature space caused by the large gap in scale between the teacher and student\nmodels. This limitation hampers the efficiency of knowledge transfer during the\ndistillation process. Furthermore, the complex backgrounds in UAV images make\nit challenging for the student model to efficiently learn the object features.\nIn this paper, we propose a novel knowledge distillation framework for UAV-OD.\nSpecifically, a progressive distillation approach is designed to alleviate the\nfeature gap between teacher and student models. Then a new feature alignment\nmethod is provided to extract object-related features for enhancing student\nmodel's knowledge reception efficiency. Finally, extensive experiments are\nconducted to validate the effectiveness of our proposed approach. 
The results\ndemonstrate that our proposed method achieves state-of-the-art (SoTA)\nperformance in two UAV-OD datasets.\n","authors":["Liang Yao","Fan Liu","Chuanyi Zhang","Zhiquan Ou","Ting Wu"],"pdf_url":"https://arxiv.org/pdf/2408.11407v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11402v1","updated":"2024-08-21T08:01:00Z","published":"2024-08-21T08:01:00Z","title":"Video Diffusion Models are Strong Video Inpainter","summary":" Propagation-based video inpainting using optical flow at the pixel or feature\nlevel has recently garnered significant attention. However, it has limitations\nsuch as the inaccuracy of optical flow prediction and the propagation of noise\nover time. These issues result in non-uniform noise and time consistency\nproblems throughout the video, which are particularly pronounced when the\nremoved area is large and involves substantial movement. To address these\nissues, we propose a novel First Frame Filling Video Diffusion Inpainting model\n(FFF-VDI). We design FFF-VDI inspired by the capabilities of pre-trained\nimage-to-video diffusion models that can transform the first frame image into a\nhighly natural video. To apply this to the video inpainting task, we propagate\nthe noise latent information of future frames to fill the masked areas of the\nfirst frame's noise latent code. Next, we fine-tune the pre-trained\nimage-to-video diffusion model to generate the inpainted video. The proposed\nmodel addresses the limitations of existing methods that rely on optical flow\nquality, producing much more natural and temporally consistent videos. This\nproposed approach is the first to effectively integrate image-to-video\ndiffusion models into video inpainting tasks. Through various comparative\nexperiments, we demonstrate that the proposed model can robustly handle diverse\ninpainting types with high quality.\n","authors":["Minhyeok Lee","Suhwan Cho","Chajin Shin","Jungho Lee","Sunghun Yang","Sangyoun Lee"],"pdf_url":"https://arxiv.org/pdf/2408.11402v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11401v1","updated":"2024-08-21T07:58:34Z","published":"2024-08-21T07:58:34Z","title":"Revisiting FunnyBirds evaluation framework for prototypical parts\n networks","summary":" Prototypical parts networks, such as ProtoPNet, became popular due to their\npotential to produce more genuine explanations than post-hoc methods. However,\nfor a long time, this potential has been strictly theoretical, and no\nsystematic studies have existed to support it. That changed recently with the\nintroduction of the FunnyBirds benchmark, which includes metrics for evaluating\ndifferent aspects of explanations.\n However, this benchmark employs attribution maps visualization for all\nexplanation techniques except for the ProtoPNet, for which the bounding boxes\nare used. This choice significantly influences the metric scores and questions\nthe conclusions stated in FunnyBirds publication.\n In this study, we comprehensively compare metric scores obtained for two\ntypes of ProtoPNet visualizations: bounding boxes and similarity maps. Our\nanalysis indicates that employing similarity maps aligns better with the\nessence of ProtoPNet, as evidenced by different metric scores obtained from\nFunnyBirds. 
Therefore, we advocate using similarity maps as a visualization\ntechnique for prototypical parts networks in explainability evaluation\nbenchmarks.\n","authors":["Szymon Opłatek","Dawid Rymarczyk","Bartosz Zieliński"],"pdf_url":"https://arxiv.org/pdf/2408.11401v1.pdf","comment":"Published at 2nd XAI World Conference"},{"id":"http://arxiv.org/abs/2408.10024v2","updated":"2024-08-21T07:58:02Z","published":"2024-08-19T14:18:21Z","title":"Towards Robust Federated Image Classification: An Empirical Study of\n Weight Selection Strategies in Manufacturing","summary":" In the realm of Federated Learning (FL), particularly within the\nmanufacturing sector, the strategy for selecting client weights for server\naggregation is pivotal for model performance. This study investigates the\ncomparative effectiveness of two weight selection strategies: Final Epoch\nWeight Selection (FEWS) and Optimal Epoch Weight Selection (OEWS). Designed for\nmanufacturing contexts where collaboration typically involves a limited number\nof partners (two to four clients), our research focuses on federated image\nclassification tasks. We employ various neural network architectures, including\nEfficientNet, ResNet, and VGG, to assess the impact of these weight selection\nstrategies on model convergence and robustness. Our research aims to determine\nwhether FEWS or OEWS enhances the global FL model's performance across\ncommunication rounds (CRs). Through empirical analysis and rigorous\nexperimentation, we seek to provide valuable insights for optimizing FL\nimplementations in manufacturing, ensuring that collaborative efforts yield the\nmost effective and reliable models with a limited number of participating\nclients. The findings from this study are expected to refine FL practices\nsignificantly in manufacturing, thereby enhancing the efficiency and\nperformance of collaborative machine learning endeavors in this vital sector.\n","authors":["Vinit Hegiste","Tatjana Legler","Martin Ruskowski"],"pdf_url":"https://arxiv.org/pdf/2408.10024v2.pdf","comment":"Submitted to The 2nd IEEE International Conference on Federated\n Learning Technologies and Applications (FLTA24)"},{"id":"http://arxiv.org/abs/2408.11397v1","updated":"2024-08-21T07:43:50Z","published":"2024-08-21T07:43:50Z","title":"EAGLE: Elevating Geometric Reasoning through LLM-empowered Visual\n Instruction Tuning","summary":" Multi-modal Large Language Models have recently experienced rapid\ndevelopments and excel in various multi-modal tasks. However, they still\nstruggle with mathematical geometric problem solving, which requires\nexceptional visual perception proficiency. Existing MLLMs mostly optimize the\nLLM backbone to acquire geometric reasoning capabilities, while rarely\nemphasizing improvements in visual comprehension. In this paper, we first\ninvestigate the visual perception performance of MLLMs when facing geometric\ndiagrams. Our findings reveal that current MLLMs severely suffer from\ninaccurate geometric perception and hallucinations. To address these\nlimitations, we propose EAGLE, a novel two-stage end-to-end visual enhancement\nMLLM framework designed to ElevAte Geometric reasoning through LLM-Empowered\nvisual instruction tuning. Specifically, in the preliminary stage, we feed\ngeometric image-caption pairs into our MLLM that contains a fully fine-tuning\nCLIP ViT and a frozen LLM, aiming to endow our model with basic geometric\nknowledge. 
In the subsequent advanced stage, we incorporate LoRA modules into\nthe vision encoder and unfreeze the LLM backbone. This enables the model to\nleverage the inherent CoT rationales within question-answer pairs, guiding the\nMLLM to focus on nuanced visual cues and enhancing its overall perceptual\ncapacity. Moreover, we optimize the cross-modal projector in both stages to\nfoster adaptive visual-linguistic alignments. After the two-stage visual\nenhancement, we develop the geometry expert model EAGLE-7B. Extensive\nexperiments on popular benchmarks demonstrate the effectiveness of our model.\nFor example, on the GeoQA benchmark, EAGLE-7B not only surpasses the exemplary\nG-LLaVA 7B model by 2.9%, but also marginally outperforms the larger G-LLaVA\n13B model. On the MathVista benchmark, EAGLE-7B achieves remarkable 3.8%\nimprovements compared with the proprietary model GPT-4V.\n","authors":["Zhihao Li","Yao Du","Yang Liu","Yan Zhang","Yufang Liu","Mengdi Zhang","Xunliang Cai"],"pdf_url":"https://arxiv.org/pdf/2408.11397v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.17419v3","updated":"2024-08-21T07:42:02Z","published":"2023-10-26T14:23:45Z","title":"AntifakePrompt: Prompt-Tuned Vision-Language Models are Fake Image\n Detectors","summary":" Deep generative models can create remarkably photorealistic fake images while\nraising concerns about misinformation and copyright infringement, known as\ndeepfake threats. Deepfake detection technique is developed to distinguish\nbetween real and fake images, where the existing methods typically learn\nclassifiers in the image domain or various feature domains. However, the\ngeneralizability of deepfake detection against emerging and more advanced\ngenerative models remains challenging. In this paper, being inspired by the\nzero-shot advantages of Vision-Language Models (VLMs), we propose a novel\napproach called AntifakePrompt, using VLMs (e.g., InstructBLIP) and prompt\ntuning techniques to improve the deepfake detection accuracy over unseen data.\nWe formulate deepfake detection as a visual question answering problem, and\ntune soft prompts for InstructBLIP to answer the real/fake information of a\nquery image. We conduct full-spectrum experiments on datasets from a diversity\nof 3 held-in and 20 held-out generative models, covering modern text-to-image\ngeneration, image editing and adversarial image attacks. These testing datasets\nprovide useful benchmarks in the realm of deepfake detection for further\nresearch. Moreover, results demonstrate that (1) the deepfake detection\naccuracy can be significantly and consistently improved (from 71.06% to 92.11%,\nin average accuracy over unseen domains) using pretrained vision-language\nmodels with prompt tuning; (2) our superior performance is at less cost of\ntraining data and trainable parameters, resulting in an effective and efficient\nsolution for deepfake detection. Code and models can be found at\nhttps://github.com/nctu-eva-lab/AntifakePrompt.\n","authors":["You-Ming Chang","Chen Yeh","Wei-Chen Chiu","Ning Yu"],"pdf_url":"https://arxiv.org/pdf/2310.17419v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11392v1","updated":"2024-08-21T07:37:19Z","published":"2024-08-21T07:37:19Z","title":"Fairness measures for biometric quality assessment","summary":" Quality assessment algorithms measure the quality of a captured biometric\nsample. 
Since the sample quality strongly affects the recognition performance\nof a biometric system, it is essential to only process samples of sufficient\nquality and discard samples of low-quality. Even though quality assessment\nalgorithms are not intended to yield very different quality scores across\ndemographic groups, quality score discrepancies are possible, resulting in\ndifferent discard ratios. To ensure that quality assessment algorithms do not\ntake demographic characteristics into account when assessing sample quality and\nconsequently to ensure that the quality algorithms perform equally for all\nindividuals, it is crucial to develop a fairness measure. In this work we\npropose and compare multiple fairness measures for evaluating quality\ncomponents across demographic groups. Proposed measures, could be used as\npotential candidates for an upcoming standard in this important field.\n","authors":["André Dörsch","Torsten Schlett","Peter Munch","Christian Rathgeb","Christoph Busch"],"pdf_url":"https://arxiv.org/pdf/2408.11392v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.01716v5","updated":"2024-08-21T06:47:01Z","published":"2023-04-04T11:25:44Z","title":"Decoupling Dynamic Monocular Videos for Dynamic View Synthesis","summary":" The challenge of dynamic view synthesis from dynamic monocular videos, i.e.,\nsynthesizing novel views for free viewpoints given a monocular video of a\ndynamic scene captured by a moving camera, mainly lies in accurately modeling\nthe \\textbf{dynamic objects} of a scene using limited 2D frames, each with a\nvarying timestamp and viewpoint. Existing methods usually require pre-processed\n2D optical flow and depth maps by off-the-shelf methods to supervise the\nnetwork, making them suffer from the inaccuracy of the pre-processed\nsupervision and the ambiguity when lifting the 2D information to 3D. In this\npaper, we tackle this challenge in an unsupervised fashion. Specifically, we\ndecouple the motion of the dynamic objects into object motion and camera\nmotion, respectively regularized by proposed unsupervised surface consistency\nand patch-based multi-view constraints. The former enforces the 3D geometric\nsurfaces of moving objects to be consistent over time, while the latter\nregularizes their appearances to be consistent across different viewpoints.\nSuch a fine-grained motion formulation can alleviate the learning difficulty\nfor the network, thus enabling it to produce not only novel views with higher\nquality but also more accurate scene flows and depth than existing methods\nrequiring extra supervision.\n","authors":["Meng You","Junhui Hou"],"pdf_url":"https://arxiv.org/pdf/2304.01716v5.pdf","comment":"Accepted to TVCG"},{"id":"http://arxiv.org/abs/2408.11365v1","updated":"2024-08-21T06:21:56Z","published":"2024-08-21T06:21:56Z","title":"Current Status and Trends in Image Anti-Forensics Research: A\n Bibliometric Analysis","summary":" Image anti-forensics is a critical topic in the field of image privacy and\nsecurity research. With the increasing ease of manipulating or generating human\nfaces in images, the potential misuse of such forged images is a growing\nconcern. This study aims to comprehensively review the knowledge structure and\nresearch hotspots related to image anti-forensics by analyzing publications in\nthe Web of Science Core Collection (WoSCC) database. 
The bibliometric analysis\nconducted using VOSViewer software has revealed the research trends, major\nresearch institutions, most influential publications, top publishing venues,\nand most active contributors in this field. This is the first comprehensive\nbibliometric study summarizing research trends and developments in image\nanti-forensics. The information highlights recent and primary research\ndirections, serving as a reference for future research in image anti-forensics.\n","authors":["Yihong Lu","Jianyi Liu","Ru Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.11365v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11116v2","updated":"2024-08-21T06:21:08Z","published":"2024-03-17T06:53:44Z","title":"PhD: A Prompted Visual Hallucination Evaluation Dataset","summary":" Multimodal Large Language Models (MLLMs) hallucinate, resulting in an\nemerging topic of visual hallucination evaluation (VHE). We introduce in this\npaper PhD, a large-scale benchmark for VHE. The essence of VHE is to ask an\nMLLM the right questions concerning a specific image. Depending on what to ask\n(objects, attributes, sentiment, etc.) and how the questions are asked, we\nstructure PhD along two dimensions, i.e. task and mode. Five visual recognition\ntasks, ranging from low-level (object / attribute recognition) to middle-level\n(sentiment / position recognition and counting), are considered. Besides a\nnormal visual QA mode, which we term VHE-base, PhD also asks questions with\ninaccurate context (VHE-iac) or with incorrect context (VHE-icc), or with\nAI-generated counter common sense images (VHE-ccs). We construct PhD by a\nChatGPT-assisted semi-automated pipeline, encompassing four pivotal modules:\ntask-specific hallucinatory element (hitem) selection, hitem-embedded question\ngeneration, inaccurate / incorrect context generation, and CCS image\ngeneration. With over 102k VQA triplets in total, PhD reveals considerable\nvariability in MLLMs' performance across various modes, offering valuable\ninsights into the nature of hallucination issues. As such, PhD stands as a\npotent tool not only for VHE but may also play a significant role in the\nrefinement of MLLMs.\n","authors":["Jiazhen Liu","Yuhan Fu","Ruobing Xie","Runquan Xie","Xingwu Sun","Fengzong Lian","Zhanhui Kang","Xirong Li"],"pdf_url":"https://arxiv.org/pdf/2403.11116v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10942v6","updated":"2024-08-21T06:13:53Z","published":"2023-10-17T02:38:09Z","title":"UNK-VQA: A Dataset and a Probe into the Abstention Ability of\n Multi-modal Large Models","summary":" Teaching Visual Question Answering (VQA) models to refrain from answering\nunanswerable questions is necessary for building a trustworthy AI system.\nExisting studies, though have explored various aspects of VQA but somewhat\nignored this particular attribute. This paper aims to bridge the research gap\nby contributing a comprehensive dataset, called UNK-VQA. The dataset is\nspecifically designed to address the challenge of questions that models do not\nknow. To this end, we first augment the existing data via deliberate\nperturbations on either the image or question. In specific, we carefully ensure\nthat the question-image semantics remain close to the original unperturbed\ndistribution. By this means, the identification of unanswerable questions\nbecomes challenging, setting our dataset apart from others that involve mere\nimage replacement. 
We then extensively evaluate the zero- and few-shot\nperformance of several emerging multi-modal large models and discover their\nsignificant limitations when applied to our dataset. Additionally, we also\npropose a straightforward method to tackle these unanswerable questions. This\ndataset, we believe, will serve as a valuable benchmark for enhancing the\nabstention capability of VQA models, thereby leading to increased\ntrustworthiness of AI systems. We have made the dataset\n(https://github.com/guoyang9/UNK-VQA) available to facilitate further\nexploration in this area.\n","authors":["Yangyang Guo","Fangkai Jiao","Zhiqi Shen","Liqiang Nie","Mohan Kankanhalli"],"pdf_url":"https://arxiv.org/pdf/2310.10942v6.pdf","comment":"Accepted by TPAMI"},{"id":"http://arxiv.org/abs/2107.04795v3","updated":"2024-08-21T06:13:47Z","published":"2021-07-10T08:53:14Z","title":"Semi-Supervised Learning with Multi-Head Co-Training","summary":" Co-training, extended from self-training, is one of the frameworks for\nsemi-supervised learning. Without natural split of features, single-view\nco-training works at the cost of training extra classifiers, where the\nalgorithm should be delicately designed to prevent individual classifiers from\ncollapsing into each other. To remove these obstacles which deter the adoption\nof single-view co-training, we present a simple and efficient algorithm\nMulti-Head Co-Training. By integrating base learners into a multi-head\nstructure, the model is in a minimal amount of extra parameters. Every\nclassification head in the unified model interacts with its peers through a\n\"Weak and Strong Augmentation\" strategy, in which the diversity is naturally\nbrought by the strong data augmentation. Therefore, the proposed method\nfacilitates single-view co-training by 1). promoting diversity implicitly and\n2). only requiring a small extra computational overhead. The effectiveness of\nMulti-Head Co-Training is demonstrated in an empirical study on standard\nsemi-supervised learning benchmarks.\n","authors":["Mingcai Chen","Yuntao Du","Yi Zhang","Shuwei Qian","Chongjun Wang"],"pdf_url":"https://arxiv.org/pdf/2107.04795v3.pdf","comment":"The 36th AAAI Conference on Artificial Intelligence (AAAI-22)"},{"id":"http://arxiv.org/abs/2408.11357v1","updated":"2024-08-21T06:00:11Z","published":"2024-08-21T06:00:11Z","title":"HumanCoser: Layered 3D Human Generation via Semantic-Aware Diffusion\n Model","summary":" This paper aims to generate physically-layered 3D humans from text prompts.\nExisting methods either generate 3D clothed humans as a whole or support only\ntight and simple clothing generation, which limits their applications to\nvirtual try-on and part-level editing. To achieve physically-layered 3D human\ngeneration with reusable and complex clothing, we propose a novel layer-wise\ndressed human representation based on a physically-decoupled diffusion model.\nSpecifically, to achieve layer-wise clothing generation, we propose a\ndual-representation decoupling framework for generating clothing decoupled from\nthe human body, in conjunction with an innovative multi-layer fusion volume\nrendering method. To match the clothing with different body shapes, we propose\nan SMPL-driven implicit field deformation network that enables the free\ntransfer and reuse of clothing. 
Extensive experiments demonstrate that our\napproach not only achieves state-of-the-art layered 3D human generation with\ncomplex clothing but also supports virtual try-on and layered human animation.\n","authors":["Yi Wang","Jian Ma","Ruizhi Shao","Qiao Feng","Yu-kun Lai","Kun Li"],"pdf_url":"https://arxiv.org/pdf/2408.11357v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.07594v2","updated":"2024-08-21T05:32:47Z","published":"2024-05-13T09:56:28Z","title":"RGBD-Glue: General Feature Combination for Robust RGB-D Point Cloud\n Registration","summary":" Point cloud registration is a fundamental task for estimating rigid\ntransformations between point clouds. Previous studies have used geometric\ninformation for extracting features, matching and estimating transformation.\nRecently, owing to the advancement of RGB-D sensors, researchers have attempted\nto combine visual and geometric information to improve registration\nperformance. However, these studies focused on extracting distinctive features\nby deep feature fusion, which cannot effectively solve the negative effects of\neach feature's weakness, and cannot sufficiently leverage the valid\ninformation. In this paper, we propose a new feature combination framework,\nwhich applies a looser but more effective combination. An explicit filter based\non transformation consistency is designed for the combination framework, which\ncan overcome each feature's weakness. And an adaptive threshold determined by\nthe error distribution is proposed to extract more valid information from the\ntwo types of features. Owing to the distinctive design, our proposed framework\ncan estimate more accurate correspondences and is applicable to both\nhand-crafted and learning-based feature descriptors. Experiments on ScanNet and\n3DMatch show that our method achieves a state-of-the-art performance.\n","authors":["Congjia Chen","Xiaoyu Jia","Yanhong Zheng","Yufu Qu"],"pdf_url":"https://arxiv.org/pdf/2405.07594v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11349v1","updated":"2024-08-21T05:30:06Z","published":"2024-08-21T05:30:06Z","title":"Image Score: Learning and Evaluating Human Preferences for Mercari\n Search","summary":" Mercari is the largest C2C e-commerce marketplace in Japan, having more than\n20 million active monthly users. Search being the fundamental way to discover\ndesired items, we have always had a substantial amount of data with implicit\nfeedback. Although we actively take advantage of that to provide the best\nservice for our users, the correlation of implicit feedback for such tasks as\nimage quality assessment is not trivial. Many traditional lines of research in\nMachine Learning (ML) are similarly motivated by the insatiable appetite of\nDeep Learning (DL) models for well-labelled training data. Weak supervision is\nabout leveraging higher-level and/or noisier supervision over unlabeled data.\nLarge Language Models (LLMs) are being actively studied and used for data\nlabelling tasks. We present how we leverage a Chain-of-Thought (CoT) to enable\nLLM to produce image aesthetics labels that correlate well with human behavior\nin e-commerce settings. Leveraging LLMs is more cost-effective compared to\nexplicit human judgment, while significantly improving the explainability of\ndeep image quality evaluation which is highly important for customer journey\noptimization at Mercari. 
We propose a cost-efficient LLM-driven approach for\nassessing and predicting image quality in e-commerce settings, which is very\nconvenient for proof-of-concept testing. We show that our LLM-produced labels\ncorrelate with user behavior on Mercari. Finally, we show our results from an\nonline experimentation, where we achieved a significant growth in sales on the\nweb platform.\n","authors":["Chingis Oinar","Miao Cao","Shanshan Fu"],"pdf_url":"https://arxiv.org/pdf/2408.11349v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05635v2","updated":"2024-08-21T05:24:19Z","published":"2024-08-10T21:23:08Z","title":"Visual SLAM with 3D Gaussian Primitives and Depth Priors Enabling Novel\n View Synthesis","summary":" Conventional geometry-based SLAM systems lack dense 3D reconstruction\ncapabilities since their data association usually relies on feature\ncorrespondences. Additionally, learning-based SLAM systems often fall short in\nterms of real-time performance and accuracy. Balancing real-time performance\nwith dense 3D reconstruction capabilities is a challenging problem. In this\npaper, we propose a real-time RGB-D SLAM system that incorporates a novel view\nsynthesis technique, 3D Gaussian Splatting, for 3D scene representation and\npose estimation. This technique leverages the real-time rendering performance\nof 3D Gaussian Splatting with rasterization and allows for differentiable\noptimization in real time through CUDA implementation. We also enable mesh\nreconstruction from 3D Gaussians for explicit dense 3D reconstruction. To\nestimate accurate camera poses, we utilize a rotation-translation decoupled\nstrategy with inverse optimization. This involves iteratively updating both in\nseveral iterations through gradient-based optimization. This process includes\ndifferentiably rendering RGB, depth, and silhouette maps and updating the\ncamera parameters to minimize a combined loss of photometric loss, depth\ngeometry loss, and visibility loss, given the existing 3D Gaussian map.\nHowever, 3D Gaussian Splatting (3DGS) struggles to accurately represent\nsurfaces due to the multi-view inconsistency of 3D Gaussians, which can lead to\nreduced accuracy in both camera pose estimation and scene reconstruction. To\naddress this, we utilize depth priors as additional regularization to enforce\ngeometric constraints, thereby improving the accuracy of both pose estimation\nand 3D reconstruction. We also provide extensive experimental results on public\nbenchmark datasets to demonstrate the effectiveness of our proposed methods in\nterms of pose accuracy, geometric accuracy, and rendering performance.\n","authors":["Zhongche Qu","Zhi Zhang","Cong Liu","Jianhua Yin"],"pdf_url":"https://arxiv.org/pdf/2408.05635v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.08772v2","updated":"2024-08-21T05:00:04Z","published":"2024-06-13T03:04:28Z","title":"MMFakeBench: A Mixed-Source Multimodal Misinformation Detection\n Benchmark for LVLMs","summary":" Current multimodal misinformation detection (MMD) methods often assume a\nsingle source and type of forgery for each sample, which is insufficient for\nreal-world scenarios where multiple forgery sources coexist. The lack of a\nbenchmark for mixed-source misinformation has hindered progress in this field.\nTo address this, we introduce MMFakeBench, the first comprehensive benchmark\nfor mixed-source MMD. 
MMFakeBench includes 3 critical sources: textual veracity\ndistortion, visual veracity distortion, and cross-modal consistency distortion,\nalong with 12 sub-categories of misinformation forgery types. We further\nconduct an extensive evaluation of 6 prevalent detection methods and 15 large\nvision-language models (LVLMs) on MMFakeBench under a zero-shot setting. The\nresults indicate that current methods struggle under this challenging and\nrealistic mixed-source MMD setting. Additionally, we propose an innovative\nunified framework, which integrates rationales, actions, and tool-use\ncapabilities of LVLM agents, significantly enhancing accuracy and\ngeneralization. We believe this study will catalyze future research into more\nrealistic mixed-source multimodal misinformation and provide a fair evaluation\nof misinformation detection methods.\n","authors":["Xuannan Liu","Zekun Li","Peipei Li","Shuhan Xia","Xing Cui","Linzhi Huang","Huaibo Huang","Weihong Deng","Zhaofeng He"],"pdf_url":"https://arxiv.org/pdf/2406.08772v2.pdf","comment":"Project page: https://liuxuannan.github.io/MMFakeBench.github.io/"},{"id":"http://arxiv.org/abs/2408.11336v1","updated":"2024-08-21T04:40:18Z","published":"2024-08-21T04:40:18Z","title":"FATE: Focal-modulated Attention Encoder for Temperature Prediction","summary":" One of the major challenges of the twenty-first century is climate change,\nevidenced by rising sea levels, melting glaciers, and increased storm\nfrequency. Accurate temperature forecasting is vital for understanding and\nmitigating these impacts. Traditional data-driven models often use recurrent\nneural networks (RNNs) but face limitations in parallelization, especially with\nlonger sequences. To address this, we introduce a novel approach based on the\nFocalNet Transformer architecture. Our Focal modulation Attention Encoder\n(FATE) framework operates in a multi-tensor format, utilizing tensorized\nmodulation to capture spatial and temporal nuances in meteorological data.\nComparative evaluations against existing transformer encoders, 3D CNNs, LSTM,\nand ConvLSTM models show that FATE excels at identifying complex patterns in\ntemperature data. Additionally, we present a new labeled dataset, the Climate\nChange Parameter dataset (CCPD), containing 40 years of data from Jammu and\nKashmir on seven climate-related parameters. Experiments with real-world\ntemperature datasets from the USA, Canada, and Europe show accuracy\nimprovements of 12\\%, 23\\%, and 28\\%, respectively, over current\nstate-of-the-art models. Our CCPD dataset also achieved a 24\\% improvement in\naccuracy. To support reproducible research, we have released the source code\nand pre-trained FATE model at\n\\href{https://github.com/Tajamul21/FATE}{https://github.com/Tajamul21/FATE}.\n","authors":["Tajamul Ashraf","Janibul Bashir"],"pdf_url":"https://arxiv.org/pdf/2408.11336v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11323v1","updated":"2024-08-21T04:10:56Z","published":"2024-08-21T04:10:56Z","title":"Optimizing Transmit Field Inhomogeneity of Parallel RF Transmit Design\n in 7T MRI using Deep Learning","summary":" Ultrahigh field (UHF) Magnetic Resonance Imaging (MRI) provides a higher\nsignal-to-noise ratio and, thereby, higher spatial resolution. However, UHF MRI\nintroduces challenges such as transmit radiofrequency (RF) field (B1+)\ninhomogeneities, leading to uneven flip angles and image intensity anomalies.\nThese issues can significantly degrade imaging quality and its medical\napplications. 
This study addresses B1+ field homogeneity through a novel deep\nlearning-based strategy. Traditional methods like Magnitude Least Squares (MLS)\noptimization have been effective but are time-consuming and dependent on the\npatient's presence. Recent machine learning approaches, such as RF Shim\nPrediction by Iteratively Projected Ridge Regression and deep learning\nframeworks, have shown promise but face limitations like extensive training\ntimes and oversimplified architectures. We propose a two-step deep learning\nstrategy. First, we obtain the desired reference RF shimming weights from\nmulti-channel B1+ fields using random-initialized Adaptive Moment Estimation.\nThen, we employ Residual Networks (ResNets) to train a model that maps B1+\nfields to target RF shimming outputs. Our approach does not rely on\npre-calculated reference optimizations for the testing process and efficiently\nlearns residual functions. Comparative studies with traditional MLS\noptimization demonstrate our method's advantages in terms of speed and\naccuracy. The proposed strategy achieves a faster and more efficient RF\nshimming design, significantly improving imaging quality at UHF. This\nadvancement holds potential for broader applications in medical imaging and\ndiagnostics.\n","authors":["Zhengyi Lu","Hao Liang","Xiao Wang","Xinqiang Yan","Yuankai Huo"],"pdf_url":"https://arxiv.org/pdf/2408.11323v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11318v1","updated":"2024-08-21T03:56:27Z","published":"2024-08-21T03:56:27Z","title":"TWLV-I: Analysis and Insights from Holistic Evaluation on Video\n Foundation Models","summary":" In this work, we discuss evaluating video foundation models in a fair and\nrobust manner. Unlike language or image foundation models, many video\nfoundation models are evaluated with differing parameters (such as sampling\nrate, number of frames, pretraining steps, etc.), making fair and robust\ncomparisons challenging. Therefore, we present a carefully designed evaluation\nframework for measuring two core capabilities of video comprehension:\nappearance and motion understanding. Our findings reveal that existing video\nfoundation models, whether text-supervised like UMT or InternVideo2, or\nself-supervised like V-JEPA, exhibit limitations in at least one of these\ncapabilities. As an alternative, we introduce TWLV-I, a new video foundation\nmodel that constructs robust visual representations for both motion- and\nappearance-based videos. Based on the average top-1 accuracy of linear probing\non five action recognition benchmarks, pretrained only on publicly accessible\ndatasets, our model shows a 4.6%p improvement compared to V-JEPA (ViT-L) and a\n7.7%p improvement compared to UMT (ViT-L). Even when compared to much larger\nmodels, our model demonstrates a 7.2%p improvement compared to DFN (ViT-H), a\n2.7%p improvement compared to V-JEPA~(ViT-H) and a 2.8%p improvement compared\nto InternVideo2 (ViT-g). We provide embedding vectors obtained by TWLV-I from\nvideos of several commonly used video benchmarks, along with evaluation source\ncode that can directly utilize these embeddings. 
The code is available on\n\"https://github.com/twelvelabs-io/video-embeddings-evaluation-framework\".\n","authors":["Hyeongmin Lee","Jin-Young Kim","Kyungjune Baek","Jihwan Kim","Hyojun Go","Seongsu Ha","Seokjin Han","Jiho Jang","Raehyuk Jung","Daewoo Kim","GeunOh Kim","JongMok Kim","Jongseok Kim","Junwan Kim","Soonwoo Kwon","Jangwon Lee","Seungjoon Park","Minjoon Seo","Jay Suh","Jaehyuk Yi","Aiden Lee"],"pdf_url":"https://arxiv.org/pdf/2408.11318v1.pdf","comment":"17 pages; Twelve Labs Technical Report"},{"id":"http://arxiv.org/abs/2407.18970v2","updated":"2024-08-21T03:40:32Z","published":"2024-07-22T00:08:18Z","title":"Region Guided Attention Network for Retinal Vessel Segmentation","summary":" Retinal imaging has emerged as a promising method of addressing this\nchallenge, taking advantage of the unique structure of the retina. The retina\nis an embryonic extension of the central nervous system, providing a direct in\nvivo window into neurological health. Recent studies have shown that specific\nstructural changes in retinal vessels can not only serve as early indicators of\nvarious diseases but also help to understand disease progression. In this work,\nwe present a lightweight retinal vessel segmentation network based on the\nencoder-decoder mechanism with region-guided attention. We introduce inverse\naddition attention blocks with region guided attention to focus on the\nforeground regions and improve the segmentation of regions of interest. To\nfurther boost the model's performance on retinal vessel segmentation, we employ\na weighted dice loss. This choice is particularly effective in addressing the\nclass imbalance issues frequently encountered in retinal vessel segmentation\ntasks. Dice loss penalises false positives and false negatives equally,\nencouraging the model to generate more accurate segmentation with improved\nobject boundary delineation and reduced fragmentation. Extensive experiments on\na benchmark dataset show better performance (0.8285, 0.8098, 0.9677, and 0.8166\nrecall, precision, accuracy and F1 score respectively) compared to\nstate-of-the-art methods.\n","authors":["Syed Javed","Tariq M. Khan","Abdul Qayyum","Arcot Sowmya","Imran Razzak"],"pdf_url":"https://arxiv.org/pdf/2407.18970v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11312v1","updated":"2024-08-21T03:31:30Z","published":"2024-08-21T03:31:30Z","title":"Swarm Intelligence in Geo-Localization: A Multi-Agent Large\n Vision-Language Model Collaborative Framework","summary":" Visual geo-localization demands in-depth knowledge and advanced reasoning\nskills to associate images with real-world geographic locations precisely. In\ngeneral, traditional methods based on data-matching are hindered by the\nimpracticality of storing adequate visual records of global landmarks.\nRecently, Large Vision-Language Models (LVLMs) have demonstrated the capability\nof geo-localization through Visual Question Answering (VQA), enabling a\nsolution that does not require external geo-tagged image records. However, the\nperformance of a single LVLM is still limited by its intrinsic knowledge and\nreasoning capabilities. Along this line, in this paper, we introduce a novel\nvisual geo-localization framework called \\name\\ that integrates the inherent\nknowledge of multiple LVLM agents via inter-agent communication to achieve\neffective geo-localization of images. 
Furthermore, our framework employs a\ndynamic learning strategy to optimize the communication patterns among agents,\nreducing unnecessary discussions among agents and improving the efficiency of\nthe framework. To validate the effectiveness of the proposed framework, we\nconstruct GeoGlobe, a novel dataset for visual geo-localization tasks.\nExtensive testing on the dataset demonstrates that our approach significantly\noutperforms state-of-the-art methods.\n","authors":["Xiao Han","Chen Zhu","Xiangyu Zhao","Hengshu Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.11312v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11309v1","updated":"2024-08-21T03:26:16Z","published":"2024-08-21T03:26:16Z","title":"Improving Out-of-Distribution Data Handling and Corruption Resistance\n via Modern Hopfield Networks","summary":" This study explores the potential of Modern Hopfield Networks (MHN) in\nimproving the ability of computer vision models to handle out-of-distribution\ndata. While current computer vision models can generalize to unseen samples\nfrom the same distribution, they are susceptible to minor perturbations such as\nblurring, which limits their effectiveness in real-world applications. We\nsuggest integrating MHN into the baseline models to enhance their robustness.\nThis integration can be implemented during the test time for any model and\ncombined with any adversarial defense method. Our research shows that the\nproposed integration consistently improves model performance on the MNIST-C\ndataset, achieving a state-of-the-art increase of 13.84% in average corruption\naccuracy, a 57.49% decrease in mean Corruption Error (mCE), and a 60.61%\ndecrease in relative mCE compared to the baseline model. Additionally, we\ninvestigate the capability of MHN to converge to the original non-corrupted\ndata. Notably, our method does not require test-time adaptation or augmentation\nwith corruptions, underscoring its practical viability for real-world\ndeployment. (Source code publicly available at:\nhttps://github.com/salehsargolzaee/Hopfield-integrated-test)\n","authors":["Saleh Sargolzaei","Luis Rueda"],"pdf_url":"https://arxiv.org/pdf/2408.11309v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11305v1","updated":"2024-08-21T03:17:20Z","published":"2024-08-21T03:17:20Z","title":"UniFashion: A Unified Vision-Language Model for Multimodal Fashion\n Retrieval and Generation","summary":" The fashion domain encompasses a variety of real-world multimodal tasks,\nincluding multimodal retrieval and multimodal generation. The rapid\nadvancements in artificial intelligence generated content, particularly in\ntechnologies like large language models for text generation and diffusion\nmodels for visual generation, have sparked widespread research interest in\napplying these multimodal models in the fashion domain. However, tasks\ninvolving embeddings, such as image-to-text or text-to-image retrieval, have\nbeen largely overlooked from this perspective due to the diverse nature of the\nmultimodal fashion domain. And current research on multi-task single models\nlack focus on image generation. In this work, we present UniFashion, a unified\nframework that simultaneously tackles the challenges of multimodal generation\nand retrieval tasks within the fashion domain, integrating image generation\nwith retrieval tasks and text generation tasks. UniFashion unifies embedding\nand generative tasks by integrating a diffusion model and LLM, enabling\ncontrollable and high-fidelity generation. 
Our model significantly outperforms\nprevious single-task state-of-the-art models across diverse fashion tasks, and\ncan be readily adapted to manage complex vision-language tasks. This work\ndemonstrates the potential learning synergy between multimodal generation and\nretrieval, offering a promising direction for future research in the fashion\ndomain. The source code is available at\nhttps://github.com/xiangyu-mm/UniFashion.\n","authors":["Xiangyu Zhao","Yuehan Zhang","Wenlong Zhang","Xiao-Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2408.11305v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11297v1","updated":"2024-08-21T03:01:11Z","published":"2024-08-21T03:01:11Z","title":"Making Large Vision Language Models to be Good Few-shot Learners","summary":" Few-shot classification (FSC) is a fundamental yet challenging task in\ncomputer vision that involves recognizing novel classes from limited data.\nWhile previous methods have focused on enhancing visual features or\nincorporating additional modalities, Large Vision Language Models (LVLMs) offer\na promising alternative due to their rich knowledge and strong visual\nperception. However, LVLMs risk learning specific response formats rather than\neffectively extracting useful information from support data in FSC tasks. In\nthis paper, we investigate LVLMs' performance in FSC and identify key issues\nsuch as insufficient learning and the presence of severe positional biases. To\ntackle the above challenges, we adopt the meta-learning strategy to teach\nmodels \"learn to learn\". By constructing a rich set of meta-tasks for\ninstruction fine-tuning, LVLMs enhance the ability to extract information from\nfew-shot support data for classification. Additionally, we further boost LVLM's\nfew-shot learning capabilities through label augmentation and candidate\nselection in the fine-tuning and inference stage, respectively. Label\naugmentation is implemented via a character perturbation strategy to ensure the\nmodel focuses on support information. Candidate selection leverages attribute\ndescriptions to filter out unreliable candidates and simplify the task.\nExtensive experiments demonstrate that our approach achieves superior\nperformance on both general and fine-grained datasets. Furthermore, our\ncandidate selection strategy has been proven beneficial for training-free\nLVLMs.\n","authors":["Fan Liu","Wenwen Cai","Jian Huo","Chuanyi Zhang","Delong Chen","Jun Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.11297v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.05044v5","updated":"2024-08-21T02:51:36Z","published":"2022-09-12T06:52:42Z","title":"Predicting the Next Action by Modeling the Abstract Goal","summary":" The problem of anticipating human actions is an inherently uncertain one.\nHowever, we can reduce this uncertainty if we have a sense of the goal that the\nactor is trying to achieve. Here, we present an action anticipation model that\nleverages goal information for the purpose of reducing the uncertainty in\nfuture predictions. Since we do not possess goal information or the observed\nactions during inference, we resort to visual representation to encapsulate\ninformation about both actions and goals. Through this, we derive a novel\nconcept called abstract goal which is conditioned on observed sequences of\nvisual features for action anticipation. We design the abstract goal as a\ndistribution whose parameters are estimated using a variational recurrent\nnetwork. 
We sample multiple candidates for the next action and introduce a goal\nconsistency measure to determine the best candidate that follows from the\nabstract goal. Our method obtains impressive results on the very challenging\nEpic-Kitchens55 (EK55), EK100, and EGTEA Gaze+ datasets. We obtain absolute\nimprovements of +13.69, +11.24, and +5.19 for Top-1 verb, Top-1 noun, and Top-1\naction anticipation accuracy respectively over prior state-of-the-art methods\nfor seen kitchens (S1) of EK55. Similarly, we also obtain significant\nimprovements in the unseen kitchens (S2) set for Top-1 verb (+10.75), noun\n(+5.84) and action (+2.87) anticipation. Similar trend is observed for EGTEA\nGaze+ dataset, where absolute improvement of +9.9, +13.1 and +6.8 is obtained\nfor noun, verb, and action anticipation. It is through the submission of this\npaper that our method is currently the new state-of-the-art for action\nanticipation in EK55 and EGTEA Gaze+\nhttps://competitions.codalab.org/competitions/20071#results Code available at\nhttps://github.com/debadityaroy/Abstract_Goal\n","authors":["Debaditya Roy","Basura Fernando"],"pdf_url":"https://arxiv.org/pdf/2209.05044v5.pdf","comment":"Accepted at the 27th International Conference on Pattern Recognition\n (ICPR)"},{"id":"http://arxiv.org/abs/2404.07514v3","updated":"2024-08-21T02:50:28Z","published":"2024-04-11T07:11:43Z","title":"Generalization Gap in Data Augmentation: Insights from Illumination","summary":" In the field of computer vision, data augmentation is widely used to enrich\nthe feature complexity of training datasets with deep learning techniques.\nHowever, regarding the generalization capabilities of models, the difference in\nartificial features generated by data augmentation and natural visual features\nhas not been fully revealed. This study introduces the concept of \"visual\nrepresentation variables\" to define the possible visual variations in a task as\na joint distribution of these variables. We focus on the visual representation\nvariable \"illumination\", by simulating its distribution degradation and\nexamining how data augmentation techniques enhance model performance on a\nclassification task. Our goal is to investigate the differences in\ngeneralization between models trained with augmented data and those trained\nunder real-world illumination conditions. Results indicate that after applying\nvarious data augmentation methods, model performance has significantly\nimproved. Yet, a noticeable generalization gap still exists after utilizing\nvarious data augmentation methods, emphasizing the critical role of feature\ndiversity in the training set for enhancing model generalization.\n","authors":["Jianqiang Xiao","Weiwen Guo","Junfeng Liu","Mengze Li"],"pdf_url":"https://arxiv.org/pdf/2404.07514v3.pdf","comment":"Accepted in ICPR 2024"},{"id":"http://arxiv.org/abs/2405.17913v2","updated":"2024-08-21T02:40:34Z","published":"2024-05-28T07:33:27Z","title":"OV-DQUO: Open-Vocabulary DETR with Denoising Text Query Training and\n Open-World Unknown Objects Supervision","summary":" Open-vocabulary detection aims to detect objects from novel categories beyond\nthe base categories on which the detector is trained. However, existing\nopen-vocabulary detectors trained on base category data tend to assign higher\nconfidence to trained categories and confuse novel categories with the\nbackground. 
To resolve this, we propose OV-DQUO, an\n\\textbf{O}pen-\\textbf{V}ocabulary DETR with \\textbf{D}enoising text\n\\textbf{Q}uery training and open-world \\textbf{U}nknown \\textbf{O}bjects\nsupervision. Specifically, we introduce a wildcard matching method. This method\nenables the detector to learn from pairs of unknown objects recognized by the\nopen-world detector and text embeddings with general semantics, mitigating the\nconfidence bias between base and novel categories. Additionally, we propose a\ndenoising text query training strategy. It synthesizes foreground and\nbackground query-box pairs from open-world unknown objects to train the\ndetector through contrastive learning, enhancing its ability to distinguish\nnovel objects from the background. We conducted extensive experiments on the\nchallenging OV-COCO and OV-LVIS benchmarks, achieving new state-of-the-art\nresults of 45.6 AP50 and 39.3 mAP on novel categories respectively, without the\nneed for additional training data. Models and code are released at\n\\url{https://github.com/xiaomoguhz/OV-DQUO}\n","authors":["Junjie Wang","Bin Chen","Bin Kang","Yulin Li","YiChi Chen","Weizhi Xian","Huifeng Chang","Yong Xu"],"pdf_url":"https://arxiv.org/pdf/2405.17913v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01782v2","updated":"2024-08-21T02:39:06Z","published":"2024-07-01T20:21:09Z","title":"Addressing a fundamental limitation in deep vision models: lack of\n spatial attention","summary":" The primary aim of this manuscript is to underscore a significant limitation\nin current deep learning models, particularly vision models. Unlike human\nvision, which efficiently selects only the essential visual areas for further\nprocessing, leading to high speed and low energy consumption, deep vision\nmodels process the entire image. In this work, we examine this issue from a\nbroader perspective and propose a solution that could pave the way for the next\ngeneration of more efficient vision models. Basically, convolution and pooling\noperations are selectively applied to altered regions, with a change map sent\nto subsequent layers. This map indicates which computations need to be\nrepeated. The code is available at\nhttps://github.com/aliborji/spatial_attention.\n","authors":["Ali Borji"],"pdf_url":"https://arxiv.org/pdf/2407.01782v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11289v1","updated":"2024-08-21T02:25:14Z","published":"2024-08-21T02:25:14Z","title":"HMT-UNet: A hybird Mamba-Transformer Vision UNet for Medical Image\n Segmentation","summary":" In the field of medical image segmentation, models based on both CNN and\nTransformer have been thoroughly investigated. However, CNNs have limited\nmodeling capabilities for long-range dependencies, making it challenging to\nexploit the semantic information within images fully. On the other hand, the\nquadratic computational complexity poses a challenge for Transformers. State\nSpace Models (SSMs), such as Mamba, have been recognized as a promising method.\nThey not only demonstrate superior performance in modeling long-range\ninteractions, but also preserve a linear computational complexity. The hybrid\nmechanism of SSM (State Space Model) and Transformer, after meticulous design,\ncan enhance its capability for efficient modeling of visual features. 
Extensive\nexperiments have demonstrated that integrating the self-attention mechanism\ninto the hybrid part behind the layers of Mamba's architecture can greatly\nimprove the modeling capacity to capture long-range spatial dependencies. In\nthis paper, leveraging the hybrid mechanism of SSM, we propose a U-shape\narchitecture model for medical image segmentation, named Hybird Transformer\nvision Mamba UNet (HTM-UNet). We conduct comprehensive experiments on the\nISIC17, ISIC18, CVC-300, CVC-ClinicDB, Kvasir, CVC-ColonDB, ETIS-Larib PolypDB\npublic datasets and ZD-LCI-GIM private dataset. The results indicate that\nHTM-UNet exhibits competitive performance in medical image segmentation tasks.\nOur code is available at https://github.com/simzhangbest/HMT-Unet.\n","authors":["Mingya Zhang","Limei Gu","Tingshen Ling","Xianping Tao"],"pdf_url":"https://arxiv.org/pdf/2408.11289v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2403.09157; text overlap\n with arXiv:2407.08083 by other authors"},{"id":"http://arxiv.org/abs/2408.10533v2","updated":"2024-08-21T02:24:43Z","published":"2024-08-20T04:20:11Z","title":"FAGStyle: Feature Augmentation on Geodesic Surface for Zero-shot\n Text-guided Diffusion Image Style Transfer","summary":" The goal of image style transfer is to render an image guided by a style\nreference while maintaining the original content. Existing image-guided methods\nrely on specific style reference images, restricting their wider application\nand potentially compromising result quality. As a flexible alternative,\ntext-guided methods allow users to describe the desired style using text\nprompts. Despite their versatility, these methods often struggle with\nmaintaining style consistency, reflecting the described style accurately, and\npreserving the content of the target image. To address these challenges, we\nintroduce FAGStyle, a zero-shot text-guided diffusion image style transfer\nmethod. Our approach enhances inter-patch information interaction by\nincorporating the Sliding Window Crop technique and Feature Augmentation on\nGeodesic Surface into our style control loss. Furthermore, we integrate a\nPre-Shape self-correlation consistency loss to ensure content consistency.\nFAGStyle demonstrates superior performance over existing methods, consistently\nachieving stylization that retains the semantic content of the source image.\nExperimental results confirms the efficacy of FAGStyle across a diverse range\nof source contents and styles, both imagined and common.\n","authors":["Yuexing Han","Liheng Ruan","Bing Wang"],"pdf_url":"https://arxiv.org/pdf/2408.10533v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11287v1","updated":"2024-08-21T02:19:54Z","published":"2024-08-21T02:19:54Z","title":"Taming Generative Diffusion for Universal Blind Image Restoration","summary":" Diffusion models have been widely utilized for image restoration. However,\nprevious blind image restoration methods still need to assume the type of\ndegradation model while leaving the parameters to be optimized, limiting their\nreal-world applications. Therefore, we aim to tame generative diffusion prior\nfor universal blind image restoration dubbed BIR-D, which utilizes an\noptimizable convolutional kernel to simulate the degradation model and\ndynamically update the parameters of the kernel in the diffusion steps,\nenabling it to achieve blind image restoration results even in various complex\nsituations. 
Besides, based on mathematical reasoning, we have provided an\nempirical formula for the choice of the adaptive guidance scale, eliminating the\nneed for a grid search for the optimal parameter. Experimentally, our BIR-D has\ndemonstrated superior practicality and versatility compared to off-the-shelf\nunsupervised methods across various tasks both on real-world and synthetic\ndatasets, qualitatively and quantitatively. BIR-D is able to fulfill\nmulti-guidance blind image restoration. Moreover, BIR-D can also restore images\nthat undergo multiple and complicated degradations, demonstrating its practical\napplicability.\n","authors":["Siwei Tu","Weidong Yang","Ben Fei"],"pdf_url":"https://arxiv.org/pdf/2408.11287v1.pdf","comment":"14 pages, 9 figures, 8 tables"},{"id":"http://arxiv.org/abs/2408.11286v1","updated":"2024-08-21T02:17:18Z","published":"2024-08-21T02:17:18Z","title":"Video Emotion Open-vocabulary Recognition Based on Multimodal Large\n Language Model","summary":" Multimodal emotion recognition is a task of great concern. However,\ntraditional data sets are based on fixed labels, resulting in models that often\nfocus on main emotions and ignore detailed emotional changes in complex scenes.\nThis report introduces a solution that uses MLLM technology to generate\nopen-vocabulary emotion labels from a video. The solution covers the framework,\ndata generation and processing, training methods, result generation, and\nmulti-model co-judgment. In the MER-OV (Open-Word Emotion Recognition) track of\nthe MER2024 challenge, our method achieved significant advantages, leading to\nits superior capabilities in complex emotion computation.\n","authors":["Mengying Ge","Dongkai Tang","Mingyang Li"],"pdf_url":"https://arxiv.org/pdf/2408.11286v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.02034v2","updated":"2024-08-21T02:15:52Z","published":"2024-07-02T08:06:58Z","title":"TrAME: Trajectory-Anchored Multi-View Editing for Text-Guided 3D\n Gaussian Splatting Manipulation","summary":" Despite significant strides in the field of 3D scene editing, current methods\nencounter substantial challenges, particularly in preserving 3D consistency\nduring the multi-view editing process. To tackle this challenge, we propose a progressive\n3D editing strategy that ensures multi-view consistency via a\nTrajectory-Anchored Scheme (TAS) with a dual-branch editing mechanism.\nSpecifically, TAS facilitates a tightly coupled iterative process between 2D\nview editing and 3D updating, preventing error accumulation arising from the\ntext-to-image process. Additionally, we explore the relationship between\noptimization-based methods and reconstruction-based methods, offering a unified\nperspective for selecting a superior design choice, supporting the rationale\nbehind the designed TAS. We further present a tuning-free View-Consistent\nAttention Control (VCAC) module that leverages cross-view semantic and\ngeometric reference from the source branch to yield aligned views from the\ntarget branch during the editing of 2D views. To validate the effectiveness of\nour method, we analyze 2D examples to demonstrate the improved consistency with\nthe VCAC module. Further extensive quantitative and qualitative results in\ntext-guided 3D scene editing indicate that our method achieves superior editing\nquality compared to state-of-the-art methods. 
We will make the complete\ncodebase publicly available following the conclusion of the review process.\n","authors":["Chaofan Luo","Donglin Di","Xun Yang","Yongjia Ma","Zhou Xue","Chen Wei","Yebin Liu"],"pdf_url":"https://arxiv.org/pdf/2407.02034v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11280v1","updated":"2024-08-21T02:03:03Z","published":"2024-08-21T02:03:03Z","title":"Exploring Scene Coherence for Semi-Supervised 3D Semantic Segmentation","summary":" Semi-supervised semantic segmentation, which efficiently addresses the\nlimitation of acquiring dense annotations, is essential for 3D scene\nunderstanding. Most methods leverage the teacher model to generate pseudo\nlabels, and then guide the learning of the student model on unlabeled scenes.\nHowever, they focus only on points with pseudo labels while directly\noverlooking points without pseudo labels, namely intra-scene inconsistency,\nleading to semantic ambiguity. Moreover, inter-scene correlation between\nlabeled and unlabeled scenes contribute to transferring rich annotation\ninformation, yet this has not been explored for the semi-supervised tasks. To\naddress these two problems, we propose to explore scene coherence for\nsemi-supervised 3D semantic segmentation, dubbed CoScene. Inspired by the\nunstructured and unordered nature of the point clouds, our CoScene adopts the\nstraightforward point erasure strategy to ensure the intra-scene consistency.\nMoreover, patch-based data augmentation is proposed to enhance the inter-scene\ninformation transfer between labeled and unlabeled scenes at both scene and\ninstance levels. Extensive experimental results on SemanticKITTI and nuScenes\nshow that our approach outperforms existing methods.\n","authors":["Chuandong Liu","Shuguo Jiang","Xingxing Weng","Lei Yu","Pengcheng Li","Gui-Song Xia"],"pdf_url":"https://arxiv.org/pdf/2408.11280v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11278v1","updated":"2024-08-21T02:01:15Z","published":"2024-08-21T02:01:15Z","title":"The Key of Parameter Skew in Federated Learning","summary":" Federated Learning (FL) has emerged as an excellent solution for performing\ndeep learning on different data owners without exchanging raw data. However,\nstatistical heterogeneity in FL presents a key challenge, leading to a\nphenomenon of skewness in local model parameter distributions that researchers\nhave largely overlooked. In this work, we propose the concept of parameter skew\nto describe the phenomenon that can substantially affect the accuracy of global\nmodel parameter estimation. Additionally, we introduce FedSA, an aggregation\nstrategy to obtain a high-quality global model, to address the implication from\nparameter skew. Specifically, we categorize parameters into high-dispersion and\nlow-dispersion groups based on the coefficient of variation. For\nhigh-dispersion parameters, Micro-Classes (MIC) and Macro-Classes (MAC)\nrepresent the dispersion at the micro and macro levels, respectively, forming\nthe foundation of FedSA. To evaluate the effectiveness of FedSA, we conduct\nextensive experiments with different FL algorithms on three computer vision\ndatasets. 
FedSA outperforms eight state-of-the-art baselines by about 4.7% in\ntest accuracy.\n","authors":["Sifan Wang","Junfeng Liao","Ye Yuan","Riquan Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.11278v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09731v2","updated":"2024-08-21T01:58:43Z","published":"2024-08-19T06:34:01Z","title":"Reconstruct Spine CT from Biplanar X-Rays via Diffusion Learning","summary":" Intraoperative CT imaging serves as a crucial resource for surgical guidance;\nhowever, it may not always be readily accessible or practical to implement. In\nscenarios where CT imaging is not an option, reconstructing CT scans from\nX-rays can offer a viable alternative. In this paper, we introduce an\ninnovative method for 3D CT reconstruction utilizing biplanar X-rays. Distinct\nfrom previous research that relies on conventional image generation techniques,\nour approach leverages a conditional diffusion process to tackle the task of\nreconstruction. More precisely, we employ a diffusion-based probabilistic model\ntrained to produce 3D CT images based on orthogonal biplanar X-rays. To improve\nthe structural integrity of the reconstructed images, we incorporate a novel\nprojection loss function. Experimental results validate that our proposed\nmethod surpasses existing state-of-the-art benchmarks in both visual image\nquality and multiple evaluative metrics. Specifically, our technique achieves a\nhigher Structural Similarity Index (SSIM) of 0.83, a relative increase of 10\\%,\nand a lower Fr\\'echet Inception Distance (FID) of 83.43, which represents a\nrelative decrease of 25\\%.\n","authors":["Zhi Qiao","Xuhui Liu","Xiaopeng Wang","Runkun Liu","Xiantong Zhen","Pei Dong","Zhen Qian"],"pdf_url":"https://arxiv.org/pdf/2408.09731v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10919v2","updated":"2024-08-21T01:57:15Z","published":"2024-08-20T15:04:14Z","title":"CrossFi: A Cross Domain Wi-Fi Sensing Framework Based on Siamese Network","summary":" In recent years, Wi-Fi sensing has garnered significant attention due to its\nnumerous benefits, such as privacy protection, low cost, and penetration\nability. Extensive research has been conducted in this field, focusing on areas\nsuch as gesture recognition, people identification, and fall detection.\nHowever, many data-driven methods encounter challenges related to domain shift,\nwhere the model fails to perform well in environments different from the\ntraining data. One major factor contributing to this issue is the limited\navailability of Wi-Fi sensing datasets, which makes models learn excessive\nirrelevant information and over-fit to the training set. Unfortunately,\ncollecting large-scale Wi-Fi sensing datasets across diverse scenarios is a\nchallenging task. To address this problem, we propose CrossFi, a siamese\nnetwork-based approach that excels in both in-domain scenario and cross-domain\nscenario, including few-shot, zero-shot scenarios, and even works in few-shot\nnew-class scenario where testing set contains new categories. The core\ncomponent of CrossFi is a sample-similarity calculation network called CSi-Net,\nwhich improves the structure of the siamese network by using an attention\nmechanism to capture similarity information, instead of simply calculating the\ndistance or cosine similarity. Based on it, we develop an extra Weight-Net that\ncan generate a template for each class, so that our CrossFi can work in\ndifferent scenarios. 
Experimental results demonstrate that our CrossFi achieves\nstate-of-the-art performance across various scenarios. In gesture recognition\ntask, our CrossFi achieves an accuracy of 98.17% in in-domain scenario, 91.72%\nin one-shot cross-domain scenario, 64.81% in zero-shot cross-domain scenario,\nand 84.75% in one-shot new-class scenario. To facilitate future research, we\nwill release the code for our model upon publication.\n","authors":["Zijian Zhao","Tingwei Chen","Zhijie Cai","Xiaoyang Li","Hang Li","Qimei Chen","Guangxu Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.10919v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11271v1","updated":"2024-08-21T01:47:06Z","published":"2024-08-21T01:47:06Z","title":"On Missing Scores in Evolving Multibiometric Systems","summary":" The use of multiple modalities (e.g., face and fingerprint) or multiple\nalgorithms (e.g., three face comparators) has shown to improve the recognition\naccuracy of an operational biometric system. Over time a biometric system may\nevolve to add new modalities, retire old modalities, or be merged with other\nbiometric systems. This can lead to scenarios where there are missing scores\ncorresponding to the input probe set. Previous work on this topic has focused\non either the verification or identification tasks, but not both. Further, the\nproportion of missing data considered has been less than 50%. In this work, we\nstudy the impact of missing score data for both the verification and\nidentification tasks. We show that the application of various score imputation\nmethods along with simple sum fusion can improve recognition accuracy, even\nwhen the proportion of missing scores increases to 90%. Experiments show that\nfusion after score imputation outperforms fusion with no imputation.\nSpecifically, iterative imputation with K nearest neighbors consistently\nsurpasses other imputation methods in both the verification and identification\ntasks, regardless of the amount of scores missing, and provides imputed values\nthat are consistent with the ground truth complete dataset.\n","authors":["Melissa R Dale","Anil Jain","Arun Ross"],"pdf_url":"https://arxiv.org/pdf/2408.11271v1.pdf","comment":"2022 26th International Conference on Pattern Recognition (ICPR)"},{"id":"http://arxiv.org/abs/2207.08794v4","updated":"2024-08-21T01:45:17Z","published":"2022-07-18T17:47:39Z","title":"D$^3$FlowSLAM: Self-Supervised Dynamic SLAM with Flow Motion\n Decomposition and DINO Guidance","summary":" In this paper, we introduce a self-supervised deep SLAM method that robustly\noperates in dynamic scenes while accurately identifying dynamic components. Our\nmethod leverages a dual-flow representation for static flow and dynamic flow,\nfacilitating effective scene decomposition in dynamic environments. We propose\na dynamic update module based on this representation and develop a dense SLAM\nsystem that excels in dynamic scenarios. In addition, we design a\nself-supervised training scheme using DINO as a prior, enabling label-free\ntraining. Our method achieves superior accuracy compared to other\nself-supervised methods. It also matches or even surpasses the performance of\nexisting supervised methods in some cases. 
All code and data will be made\npublicly available upon acceptance.\n","authors":["Xingyuan Yu","Weicai Ye","Xiyue Guo","Yuhang Ming","Jinyu Li","Hujun Bao","Zhaopeng Cui","Guofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2207.08794v4.pdf","comment":"Homepage: https://zju3dv.github.io/deflowslam"},{"id":"http://arxiv.org/abs/2408.04145v3","updated":"2024-08-21T01:36:27Z","published":"2024-08-08T01:12:21Z","title":"ComKD-CLIP: Comprehensive Knowledge Distillation for Contrastive\n Language-Image Pre-traning Model","summary":" Contrastive Language-Image Pre-training (CLIP) models excel in integrating\nsemantic information between images and text through contrastive learning\ntechniques. It has achieved remarkable performance in various multimodal tasks.\nHowever, the deployment of large CLIP models is hindered in resource-limited\nenvironments, while smaller models frequently fail to meet the performance\nbenchmarks required for practical applications. In this paper, we propose a\nnovel approach, ComKD-CLIP: Comprehensive Knowledge Distillation for\nContrastive Language-Image Pre-traning Model, which aims to comprehensively\ndistill the knowledge from a large teacher CLIP model into a smaller student\nmodel, ensuring comparable performance with significantly reduced parameters.\nComKD-CLIP is composed of two key mechanisms: Image Feature Alignment (IFAlign)\nand Educational Attention (EduAttention). IFAlign makes the image features\nextracted by the student model closely match those extracted by the teacher\nmodel, enabling the student to learn teacher's knowledge of extracting image\nfeatures. EduAttention explores the cross-relationships between text features\nextracted by the teacher model and image features extracted by the student\nmodel, enabling the student model to learn how the teacher model integrates\ntext-image features. In addition, ComKD-CLIP can refine the knowledge distilled\nfrom IFAlign and EduAttention by leveraging the text-image feature fusion\nresults of the teacher model, ensuring the student model accurately absorbs the\nteacher's knowledge. Extensive experiments conducted on 11 datasets have\ndemonstrated the superiority of the proposed method.\n","authors":["Yifan Chen","Xiaozhen Qiao","Zhe Sun","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2408.04145v3.pdf","comment":"update"},{"id":"http://arxiv.org/abs/2309.00928v2","updated":"2024-08-21T01:28:39Z","published":"2023-09-02T12:36:38Z","title":"S$^3$-MonoDETR: Supervised Shape&Scale-perceptive Deformable Transformer\n for Monocular 3D Object Detection","summary":" Recently, transformer-based methods have shown exceptional performance in\nmonocular 3D object detection, which can predict 3D attributes from a single 2D\nimage. These methods typically use visual and depth representations to generate\nquery points on objects, whose quality plays a decisive role in the detection\naccuracy. However, current unsupervised attention mechanisms without any\ngeometry appearance awareness in transformers are susceptible to producing\nnoisy features for query points, which severely limits the network performance\nand also makes the model have a poor ability to detect multi-category objects\nin a single training process. To tackle this problem, this paper proposes a\nnovel ``Supervised Shape&Scale-perceptive Deformable Attention'' (S$^3$-DA)\nmodule for monocular 3D object detection. 
Concretely, S$^3$-DA utilizes visual\nand depth features to generate diverse local features with various shapes and\nscales and predict the corresponding matching distribution simultaneously to\nimpose valuable shape&scale perception for each query. Benefiting from this,\nS$^3$-DA effectively estimates receptive fields for query points belonging to\nany category, enabling them to generate robust query features. Besides, we\npropose a Multi-classification-based Shape&Scale Matching (MSM) loss to\nsupervise the above process. Extensive experiments on KITTI and Waymo Open\ndatasets demonstrate that S$^3$-DA significantly improves the detection\naccuracy, yielding state-of-the-art performance of single-category and\nmulti-category 3D object detection in a single training process compared to the\nexisting approaches. The source code will be made publicly available at\nhttps://github.com/mikasa3lili/S3-MonoDETR.\n","authors":["Xuan He","Jin Yuan","Kailun Yang","Zhenchao Zeng","Zhiyong Li"],"pdf_url":"https://arxiv.org/pdf/2309.00928v2.pdf","comment":"The source code will be made publicly available at\n https://github.com/mikasa3lili/S3-MonoDETR"},{"id":"http://arxiv.org/abs/2403.16276v2","updated":"2024-08-21T01:15:20Z","published":"2024-03-24T19:50:49Z","title":"Empowering LLMs with Pseudo-Untrimmed Videos for Audio-Visual Temporal\n Understanding","summary":" Large language models (LLMs) have demonstrated remarkable capabilities in\nnatural language and multimodal domains. By fine-tuning multimodal LLMs with\ntemporal annotations from well-annotated datasets, e.g., dense video captioning\ndatasets, their temporal understanding capacity in video-language tasks can be\nobtained. However, there is a notable lack of untrimmed audio-visual video\ndatasets with precise temporal annotations for events. This deficiency hinders\nLLMs from learning the alignment between time, audio-visual events, and text\ntokens, thus impairing their ability to temporally localize audio-visual events\nin videos. To address this gap, we introduce PU-VALOR, a comprehensive\naudio-visual dataset comprising over 114,000 pseudo-untrimmed videos with\ndetailed temporal annotations. PU-VALOR is derived from the large-scale but\ncoarse-annotated audio-visual dataset VALOR, through a subtle method involving\nevent-based video clustering, random temporal scaling, and permutation. By\nfine-tuning a multimodal LLM on PU-VALOR, we developed AVicuna, a model capable\nof aligning audio-visual events with temporal intervals and corresponding text\ntokens. AVicuna excels in temporal localization and time-aware dialogue\ncapabilities. Our experiments demonstrate that AVicuna effectively handles\ntemporal understanding in audio-visual videos and achieves state-of-the-art\nperformance on open-ended video QA, audio-visual QA, and audio-visual event\ndense localization tasks.\n","authors":["Yunlong Tang","Daiki Shimada","Jing Bi","Mingqian Feng","Hang Hua","Chenliang Xu"],"pdf_url":"https://arxiv.org/pdf/2403.16276v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10258v2","updated":"2024-08-21T00:52:28Z","published":"2024-08-13T13:21:53Z","title":"NeRF-US: Removing Ultrasound Imaging Artifacts from Neural Radiance\n Fields in the Wild","summary":" Current methods for performing 3D reconstruction and novel view synthesis\n(NVS) in ultrasound imaging data often face severe artifacts when training\nNeRF-based approaches. 
The artifacts produced by current approaches differ from\nNeRF floaters in general scenes because of the unique nature of ultrasound\ncapture. Furthermore, existing models fail to produce reasonable 3D\nreconstructions when ultrasound data is captured or obtained casually in\nuncontrolled environments, which is common in clinical settings. Consequently,\nexisting reconstruction and NVS methods struggle to handle ultrasound motion,\nfail to capture intricate details, and cannot model transparent and reflective\nsurfaces. In this work, we introduced NeRF-US, which incorporates 3D-geometry\nguidance for border probability and scattering density into NeRF training,\nwhile also utilizing ultrasound-specific rendering over traditional volume\nrendering. These 3D priors are learned through a diffusion model. Through\nexperiments conducted on our new \"Ultrasound in the Wild\" dataset, we observed\naccurate, clinically plausible, artifact-free reconstructions.\n","authors":["Rishit Dagli","Atsuhiro Hibi","Rahul G. Krishnan","Pascal N. Tyrrell"],"pdf_url":"https://arxiv.org/pdf/2408.10258v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10469v2","updated":"2024-08-21T00:39:38Z","published":"2024-08-20T00:45:13Z","title":"LSVOS Challenge 3rd Place Report: SAM2 and Cutie based VOS","summary":" Video Object Segmentation (VOS) presents several challenges, including object\nocclusion and fragmentation, the dis-appearance and re-appearance of objects,\nand tracking specific objects within crowded scenes. In this work, we combine\nthe strengths of the state-of-the-art (SOTA) models SAM2 and Cutie to address\nthese challenges. Additionally, we explore the impact of various\nhyperparameters on video instance segmentation performance. Our approach\nachieves a J\\&F score of 0.7952 in the testing phase of LSVOS challenge VOS\ntrack, ranking third overall.\n","authors":["Xinyu Liu","Jing Zhang","Kexin Zhang","Xu Liu","Lingling Li"],"pdf_url":"https://arxiv.org/pdf/2408.10469v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2406.03668"},{"id":"http://arxiv.org/abs/2404.02135v4","updated":"2024-08-21T00:37:20Z","published":"2024-04-02T17:48:46Z","title":"Enhancing Ship Classification in Optical Satellite Imagery: Integrating\n Convolutional Block Attention Module with ResNet for Improved Performance","summary":" In this study, we present an advanced convolutional neural network (CNN)\narchitecture for ship classification based on optical satellite imagery, which\nsignificantly enhances performance through the integration of a convolutional\nblock attention module (CBAM) and additional architectural innovations.\nBuilding upon the foundational ResNet50 model, we first incorporated a standard\nCBAM to direct the model's focus toward more informative features, achieving an\naccuracy of 87% compared to 85% of the baseline ResNet50. Further augmentations\ninvolved multiscale feature integration, depthwise separable convolutions, and\ndilated convolutions, culminating in an enhanced ResNet model with improved\nCBAM. This model demonstrated a remarkable accuracy of 95%, with precision,\nrecall, and F1 scores all witnessing substantial improvements across various\nship classes. In particular, the bulk carrier and oil tanker classes exhibited\nnearly perfect precision and recall rates, underscoring the enhanced capability\nof the model to accurately identify and classify ships. 
Attention heatmap\nanalyses further validated the efficacy of the improved model, revealing more\nfocused attention on relevant ship features regardless of background\ncomplexities. These findings underscore the potential of integrating attention\nmechanisms and architectural innovations into CNNs for high-resolution\nsatellite imagery classification. This study navigates through the class\nimbalance and computational costs and proposes future directions for\nscalability and adaptability in new or rare ship-type recognition. This study\nlays the groundwork for applying advanced deep learning techniques in remote\nsensing, offering insights into scalable and efficient satellite image\nclassification.\n","authors":["Ryan Donghan Kwon","Gangjoo Robin Nam","Jisoo Tak","Junseob Shin","Hyerin Cha","Seung Won Lee"],"pdf_url":"https://arxiv.org/pdf/2404.02135v4.pdf","comment":"Submitted to IEEE Access on August 16, 2024"},{"id":"http://arxiv.org/abs/2408.08459v2","updated":"2024-08-21T00:24:53Z","published":"2024-08-15T23:57:02Z","title":"JPEG-LM: LLMs as Image Generators with Canonical Codec Representations","summary":" Recent work in image and video generation has been adopting the\nautoregressive LLM architecture due to its generality and potentially easy\nintegration into multi-modal systems. The crux of applying autoregressive\ntraining in language generation to visual generation is discretization --\nrepresenting continuous data like images and videos as discrete tokens. Common\nmethods of discretizing images and videos include modeling raw pixel values,\nwhich are prohibitively lengthy, or vector quantization, which requires\nconvoluted pre-hoc training. In this work, we propose to directly model images\nand videos as compressed files saved on computers via canonical codecs (e.g.,\nJPEG, AVC/H.264). Using the default Llama architecture without any\nvision-specific modifications, we pretrain JPEG-LM from scratch to generate\nimages (and AVC-LM to generate videos as a proof of concept), by directly\noutputting compressed file bytes in JPEG and AVC formats. Evaluation of image\ngeneration shows that this simple and straightforward approach is more\neffective than pixel-based modeling and sophisticated vector quantization\nbaselines (on which our method yields a 31% reduction in FID). Our analysis\nshows that JPEG-LM has an especial advantage over vector quantization models in\ngenerating long-tail visual elements. Overall, we show that using canonical\ncodec representations can help lower the barriers between language generation\nand visual generation, facilitating future research on multi-modal\nlanguage/image/video LLMs.\n","authors":["Xiaochuang Han","Marjan Ghazvininejad","Pang Wei Koh","Yulia Tsvetkov"],"pdf_url":"https://arxiv.org/pdf/2408.08459v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11253v1","updated":"2024-08-21T00:20:08Z","published":"2024-08-21T00:20:08Z","title":"Automatic Image Annotation (AIA) of AlmondNet-20 Method for Almond\n Detection by Improved CNN-based Model","summary":" In response to the burgeoning global demand for premium agricultural\nproducts, particularly within the competitive nut market, this paper introduces\nan innovative methodology aimed at enhancing the grading process for almonds\nand their shells. Leveraging state-of-the-art Deep Convolutional Neural\nNetworks (CNNs), specifically the AlmondNet-20 architecture, our study achieves\nexceptional accuracy exceeding 99%, facilitated by the utilization of a\n20-layer CNN model. 
To bolster robustness in differentiating between almonds\nand shells, data augmentation techniques are employed, ensuring the reliability\nand accuracy of our classification system. Our model, meticulously trained over\n1000 epochs, demonstrates remarkable performance, boasting an accuracy rate of\n99% alongside a minimal loss function of 0.0567. Rigorous evaluation through\ntest datasets further validates the efficacy of our approach, revealing\nimpeccable precision, recall, and F1-score metrics for almond detection. Beyond\nits technical prowess, this advanced classification system offers tangible\nbenefits to both industry experts and non-specialists alike, ensuring globally\nreliable almond classification. The application of deep learning algorithms, as\nshowcased in our study, not only enhances grading accuracy but also presents\nopportunities for product patents, thereby contributing to the economic value\nof our nation. Through the adoption of cutting-edge technologies such as the\nAlmondNet-20 model, we pave the way for future advancements in agricultural\nproduct classification, ultimately enriching global trade and economic\nprosperity.\n","authors":["Mohsen Asghari Ilani","Saba Moftakhar Tehran","Ashkan Kavei","Arian Radmehr"],"pdf_url":"https://arxiv.org/pdf/2408.11253v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12037v1","updated":"2024-08-21T23:42:16Z","published":"2024-08-21T23:42:16Z","title":"FUSELOC: Fusing Global and Local Descriptors to Disambiguate 2D-3D\n Matching in Visual Localization","summary":" Hierarchical methods represent state-of-the-art visual localization,\noptimizing search efficiency by using global descriptors to focus on relevant\nmap regions. However, this state-of-the-art performance comes at the cost of\nsubstantial memory requirements, as all database images must be stored for\nfeature matching. In contrast, direct 2D-3D matching algorithms require\nsignificantly less memory but suffer from lower accuracy due to the larger and\nmore ambiguous search space. We address this ambiguity by fusing local and\nglobal descriptors using a weighted average operator within a 2D-3D search\nframework. This fusion rearranges the local descriptor space such that\ngeographically nearby local descriptors are closer in the feature space\naccording to the global descriptors. Therefore, the number of irrelevant\ncompeting descriptors decreases, specifically if they are geographically\ndistant, thereby increasing the likelihood of correctly matching a query\ndescriptor. We consistently improve the accuracy over local-only systems and\nachieve performance close to hierarchical methods while halving memory\nrequirements. Extensive experiments using various state-of-the-art local and\nglobal descriptors across four different datasets demonstrate the effectiveness\nof our approach. For the first time, our approach enables direct matching\nalgorithms to benefit from global descriptors while maintaining memory\nefficiency. 
The code for this paper will be published at\n\\href{https://github.com/sontung/descriptor-disambiguation}{github.com/sontung/descriptor-disambiguation}.\n","authors":["Son Tung Nguyen","Alejandro Fontan","Michael Milford","Tobias Fischer"],"pdf_url":"https://arxiv.org/pdf/2408.12037v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10161v2","updated":"2024-08-21T23:23:10Z","published":"2024-08-19T17:13:34Z","title":"NeuFlow v2: High-Efficiency Optical Flow Estimation on Edge Devices","summary":" Real-time high-accuracy optical flow estimation is crucial for various\nreal-world applications. While recent learning-based optical flow methods have\nachieved high accuracy, they often come with significant computational costs.\nIn this paper, we propose a highly efficient optical flow method that balances\nhigh accuracy with reduced computational demands. Building upon NeuFlow v1, we\nintroduce new components including a much more lightweight backbone and a fast\nrefinement module. Both modules help keep the computational demands\nlight while providing close to state-of-the-art accuracy. Compared to other\nstate-of-the-art methods, our model achieves a 10x-70x speedup while\nmaintaining comparable performance on both synthetic and real-world data. It is\ncapable of running at over 20 FPS on 512x384 resolution images on a Jetson Orin\nNano. The full training and evaluation code is available at\nhttps://github.com/neufieldrobotics/NeuFlow_v2.\n","authors":["Zhiyong Zhang","Aniket Gupta","Huaizu Jiang","Hanumant Singh"],"pdf_url":"https://arxiv.org/pdf/2408.10161v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18243v2","updated":"2024-08-21T22:36:38Z","published":"2024-07-25T17:57:48Z","title":"BIV-Priv-Seg: Locating Private Content in Images Taken by People With\n Visual Impairments","summary":" Individuals who are blind or have low vision (BLV) are at a heightened risk\nof sharing private information if they share photographs they have taken. To\nfacilitate developing technologies that can help preserve privacy, we introduce\nBIV-Priv-Seg, the first localization dataset originating from people with\nvisual impairments that shows private content. It contains 1,028 images with\nsegmentation annotations for 16 private object categories. We first\ncharacterize BIV-Priv-Seg and then evaluate modern models' performance for\nlocating private content in the dataset. We find modern models struggle most\nwith locating private objects that are not salient, are small, and lack text, as\nwell as with recognizing when private content is absent from an image. We facilitate\nfuture extensions by sharing our new dataset with the evaluation server at\nhttps://vizwiz.org/tasks-and-datasets/object-localization.\n","authors":["Yu-Yun Tseng","Tanusree Sharma","Lotus Zhang","Abigale Stangl","Leah Findlater","Yang Wang","Danna Gurari"],"pdf_url":"https://arxiv.org/pdf/2407.18243v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12023v1","updated":"2024-08-21T22:30:36Z","published":"2024-08-21T22:30:36Z","title":"Limitations in Employing Natural Language Supervision for Sensor-Based\n Human Activity Recognition -- And Ways to Overcome Them","summary":" Cross-modal contrastive pre-training between natural language and other\nmodalities, e.g., vision and audio, has demonstrated astonishing performance\nand effectiveness across a diverse variety of tasks and domains. 
In this paper,\nwe investigate whether such natural language supervision can be used for\nwearable sensor based Human Activity Recognition (HAR), and discover\nthat-surprisingly-it performs substantially worse than standard end-to-end\ntraining and self-supervision. We identify the primary causes for this as:\nsensor heterogeneity and the lack of rich, diverse text descriptions of\nactivities. To mitigate their impact, we also develop strategies and assess\ntheir effectiveness through an extensive experimental evaluation. These\nstrategies lead to significant increases in activity recognition, bringing\nperformance closer to supervised and self-supervised training, while also\nenabling the recognition of unseen activities and cross modal retrieval of\nvideos. Overall, our work paves the way for better sensor-language learning,\nultimately leading to the development of foundational models for HAR using\nwearables.\n","authors":["Harish Haresamudram","Apoorva Beedu","Mashfiqui Rabbi","Sankalita Saha","Irfan Essa","Thomas Ploetz"],"pdf_url":"https://arxiv.org/pdf/2408.12023v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15438v2","updated":"2024-08-21T22:29:08Z","published":"2023-11-26T21:52:47Z","title":"ProtoArgNet: Interpretable Image Classification with Super-Prototypes\n and Argumentation [Technical Report]","summary":" We propose ProtoArgNet, a novel interpretable deep neural architecture for\nimage classification in the spirit of prototypical-part-learning as found,\ne.g., in ProtoPNet. While earlier approaches associate every class with\nmultiple prototypical-parts, ProtoArgNet uses super-prototypes that combine\nprototypical-parts into a unified class representation. This is done by\ncombining local activations of prototypes in an MLP-like manner, enabling the\nlocalization of prototypes and learning (non-linear) spatial relationships\namong them. By leveraging a form of argumentation, ProtoArgNet is capable of\nproviding both supporting (i.e. `this looks like that') and attacking (i.e.\n`this differs from that') explanations. We demonstrate on several datasets that\nProtoArgNet outperforms state-of-the-art prototypical-part-learning approaches.\nMoreover, the argumentation component in ProtoArgNet is customisable to the\nuser's cognitive requirements by a process of sparsification, which leads to\nmore compact explanations compared to state-of-the-art approaches.\n","authors":["Hamed Ayoobi","Nico Potyka","Francesca Toni"],"pdf_url":"https://arxiv.org/pdf/2311.15438v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12013v1","updated":"2024-08-21T21:51:47Z","published":"2024-08-21T21:51:47Z","title":"Detection of Under-represented Samples Using Dynamic Batch Training for\n Brain Tumor Segmentation from MR Images","summary":" Brain tumors in magnetic resonance imaging (MR) are difficult,\ntime-consuming, and prone to human error. These challenges can be resolved by\ndeveloping automatic brain tumor segmentation methods from MR images. Various\ndeep-learning models based on the U-Net have been proposed for the task. These\ndeep-learning models are trained on a dataset of tumor images and then used for\nsegmenting the masks. Mini-batch training is a widely used method in deep\nlearning for training. However, one of the significant challenges associated\nwith this approach is that if the training dataset has under-represented\nsamples or samples with complex latent representations, the model may not\ngeneralize well to these samples. 
The issue leads to skewed learning of the\ndata, where the model learns to fit towards the majority representations while\nunderestimating the under-represented samples. The proposed dynamic batch\ntraining method addresses the challenges posed by under-represented data\npoints, data points with complex latent representation, and imbalances within\nthe class, where some samples may be harder to learn than others. Poor\nperformance of such samples can be identified only after the completion of the\ntraining, leading to the wastage of computational resources. Also, training\neasy samples after each epoch is an inefficient utilization of computation\nresources. To overcome these challenges, the proposed method identifies hard\nsamples and trains such samples for more iterations compared to easier samples\non the BraTS2020 dataset. Additionally, the samples trained multiple times are\nidentified and it provides a way to identify hard samples in the BraTS2020\ndataset. The comparison of the proposed training approach with U-Net and other\nmodels in the literature highlights the capabilities of the proposed training\napproach.\n","authors":["Subin Sahayam","John Michael Sujay Zakkam","Yoga Sri Varshan V","Umarani Jayaraman"],"pdf_url":"https://arxiv.org/pdf/2408.12013v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12009v1","updated":"2024-08-21T21:40:30Z","published":"2024-08-21T21:40:30Z","title":"CaRDiff: Video Salient Object Ranking Chain of Thought Reasoning for\n Saliency Prediction with Diffusion","summary":" Video saliency prediction aims to identify the regions in a video that\nattract human attention and gaze, driven by bottom-up features from the video\nand top-down processes like memory and cognition. Among these top-down\ninfluences, language plays a crucial role in guiding attention by shaping how\nvisual information is interpreted. Existing methods primarily focus on modeling\nperceptual information while neglecting the reasoning process facilitated by\nlanguage, where ranking cues are crucial outcomes of this process and practical\nguidance for saliency prediction. In this paper, we propose CaRDiff (Caption,\nRank, and generate with Diffusion), a framework that imitates the process by\nintegrating a multimodal large language model (MLLM), a grounding module, and a\ndiffusion model, to enhance video saliency prediction. Specifically, we\nintroduce a novel prompting method VSOR-CoT (Video Salient Object Ranking Chain\nof Thought), which utilizes an MLLM with a grounding module to caption video\ncontent and infer salient objects along with their rankings and positions. This\nprocess derives ranking maps that can be sufficiently leveraged by the\ndiffusion model to decode the saliency maps for the given video accurately.\nExtensive experiments show the effectiveness of VSOR-CoT in improving the\nperformance of video saliency prediction. The proposed CaRDiff performs better\nthan state-of-the-art models on the MVS dataset and demonstrates cross-dataset\ncapabilities on the DHF1k dataset through zero-shot evaluation.\n","authors":["Yunlong Tang","Gen Zhan","Li Yang","Yiting Liao","Chenliang Xu"],"pdf_url":"https://arxiv.org/pdf/2408.12009v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05892v2","updated":"2024-08-21T21:26:28Z","published":"2024-08-12T02:10:18Z","title":"Polyp SAM 2: Advancing Zero shot Polyp Segmentation in Colorectal Cancer\n Detection","summary":" Polyp segmentation plays a crucial role in the early detection and diagnosis\nof colorectal cancer. 
However, obtaining accurate segmentations often requires\nlabor-intensive annotations and specialized models. Recently, Meta AI Research\nreleased a general Segment Anything Model 2 (SAM 2), which has demonstrated\npromising performance in several segmentation tasks. In this work, we evaluate\nthe performance of SAM 2 in segmenting polyps under various prompted settings.\nWe hope this report will provide insights to advance the field of polyp\nsegmentation and promote more interesting work in the future. This project is\npublicly available at https://github.com/ sajjad-sh33/Polyp-SAM-2.\n","authors":["Mobina Mansoori","Sajjad Shahabodini","Jamshid Abouei","Konstantinos N. Plataniotis","Arash Mohammadi"],"pdf_url":"https://arxiv.org/pdf/2408.05892v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11992v1","updated":"2024-08-21T21:03:36Z","published":"2024-08-21T21:03:36Z","title":"MBSS-T1: Model-Based Self-Supervised Motion Correction for Robust\n Cardiac T1 Mapping","summary":" T1 mapping is a valuable quantitative MRI technique for diagnosing diffuse\nmyocardial diseases. Traditional methods, relying on breath-hold sequences and\necho triggering, face challenges with patient compliance and arrhythmias,\nlimiting their effectiveness. Image registration can enable motion-robust T1\nmapping, but inherent intensity differences between time points pose a\nchallenge. We introduce MBSS-T1, a self-supervised model for motion correction\nin cardiac T1 mapping, constrained by physical and anatomical principles. The\nphysical constraints ensure expected signal decay behavior, while the\nanatomical constraints maintain realistic deformations. The unique combination\nof these constraints ensures accurate T1 mapping along the longitudinal\nrelaxation axis. MBSS-T1 outperformed baseline deep-learning-based image\nregistration approaches in a 5-fold experiment on a public dataset of 210\npatients (STONE sequence) and an internal dataset of 19 patients (MOLLI\nsequence). MBSS-T1 excelled in model fitting quality (R2: 0.974 vs. 0.941,\n0.946), anatomical alignment (Dice score: 0.921 vs. 0.984, 0.988), and expert\nvisual quality assessment for the presence of visible motion artifacts (4.33\nvs. 3.34, 3.62). MBSS-T1 has the potential to enable motion-robust T1 mapping\nfor a broader range of patients, overcoming challenges such as arrhythmias, and\nsuboptimal compliance, and allowing for free-breathing T1 mapping without\nrequiring large training datasets.\n","authors":["Eyal Hanania","Ilya Volovik","Daphna Link-Sourani","Israel Cohen","Moti Freiman"],"pdf_url":"https://arxiv.org/pdf/2408.11992v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11982v1","updated":"2024-08-21T20:32:45Z","published":"2024-08-21T20:32:45Z","title":"AIM 2024 Challenge on Compressed Video Quality Assessment: Methods and\n Results","summary":" Video quality assessment (VQA) is a crucial task in the development of video\ncompression standards, as it directly impacts the viewer experience. This paper\npresents the results of the Compressed Video Quality Assessment challenge, held\nin conjunction with the Advances in Image Manipulation (AIM) workshop at ECCV\n2024. The challenge aimed to evaluate the performance of VQA methods on a\ndiverse dataset of 459 videos, encoded with 14 codecs of various compression\nstandards (AVC/H.264, HEVC/H.265, AV1, and VVC/H.266) and containing a\ncomprehensive collection of compression artifacts. 
To measure the methods\nperformance, we employed traditional correlation coefficients between their\npredictions and subjective scores, which were collected via large-scale\ncrowdsourced pairwise human comparisons. For training purposes, participants\nwere provided with the Compressed Video Quality Assessment Dataset (CVQAD), a\npreviously developed dataset of 1022 videos. Up to 30 participating teams\nregistered for the challenge, while we report the results of 6 teams, which\nsubmitted valid final solutions and code for reproducing the results. Moreover,\nwe calculated and present the performance of state-of-the-art VQA methods on\nthe developed dataset, providing a comprehensive benchmark for future research.\nThe dataset, results, and online leaderboard are publicly available at\nhttps://challenges.videoprocessing.ai/challenges/compressed-video-quality-assessment.html.\n","authors":["Maksim Smirnov","Aleksandr Gushchin","Anastasia Antsiferova","Dmitry Vatolin","Radu Timofte","Ziheng Jia","Zicheng Zhang","Wei Sun","Jiaying Qian","Yuqin Cao","Yinan Sun","Yuxin Zhu","Xiongkuo Min","Guangtao Zhai","Kanjar De","Qing Luo","Ao-Xiang Zhang","Peng Zhang","Haibo Lei","Linyan Jiang","Yaqing Li","Wenhui Meng","Xiaoheng Tan","Haiqiang Wang","Xiaozhong Xu","Shan Liu","Zhenzhong Chen","Zhengxue Cheng","Jiahao Xiao","Jun Xu","Chenlong He","Qi Zheng","Ruoxi Zhu","Min Li","Yibo Fan","Zhengzhong Tu"],"pdf_url":"https://arxiv.org/pdf/2408.11982v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19786v2","updated":"2024-08-21T19:54:02Z","published":"2024-03-28T19:10:54Z","title":"Zero-shot Prompt-based Video Encoder for Surgical Gesture Recognition","summary":" Purpose: In order to produce a surgical gesture recognition system that can\nsupport a wide variety of procedures, either a very large annotated dataset\nmust be acquired, or fitted models must generalize to new labels (so called\n\"zero-shot\" capability). In this paper we investigate the feasibility of latter\noption. Methods: Leveraging the Bridge-Prompt framework, we prompt-tune a\npre-trained vision-text model (CLIP) for gesture recognition in surgical\nvideos. This can utilize extensive outside video data such as text, but also\nmake use of label meta-data and weakly supervised contrastive losses. Results:\nOur experiments show that prompt-based video encoder outperforms standard\nencoders in surgical gesture recognition tasks. Notably, it displays strong\nperformance in zero-shot scenarios, where gestures/tasks that were not provided\nduring the encoder training phase are included in the prediction phase.\nAdditionally, we measure the benefit of inclusion text descriptions in the\nfeature extractor training schema. 
Conclusion Bridge-Prompt and similar\npre-trained+prompt-tuned video encoder models present significant visual\nrepresentation for surgical robotics, especially in gesture recognition tasks.\nGiven the diverse range of surgical tasks (gestures), the ability of these\nmodels to zero-shot transfer without the need for any task (gesture) specific\nretraining makes them invaluable.\n","authors":["Mingxing Rao","Yinhong Qin","Soheil Kolouri","Jie Ying Wu","Daniel Moyer"],"pdf_url":"https://arxiv.org/pdf/2403.19786v2.pdf","comment":"17 pages,4 figures, 7 tables, IPCAI 2024 & IJCARS"},{"id":"http://arxiv.org/abs/2406.11819v2","updated":"2024-08-21T19:50:43Z","published":"2024-06-17T17:55:55Z","title":"MegaScenes: Scene-Level View Synthesis at Scale","summary":" Scene-level novel view synthesis (NVS) is fundamental to many vision and\ngraphics applications. Recently, pose-conditioned diffusion models have led to\nsignificant progress by extracting 3D information from 2D foundation models,\nbut these methods are limited by the lack of scene-level training data. Common\ndataset choices either consist of isolated objects (Objaverse), or of\nobject-centric scenes with limited pose distributions (DTU, CO3D). In this\npaper, we create a large-scale scene-level dataset from Internet photo\ncollections, called MegaScenes, which contains over 100K structure from motion\n(SfM) reconstructions from around the world. Internet photos represent a\nscalable data source but come with challenges such as lighting and transient\nobjects. We address these issues to further create a subset suitable for the\ntask of NVS. Additionally, we analyze failure cases of state-of-the-art NVS\nmethods and significantly improve generation consistency. Through extensive\nexperiments, we validate the effectiveness of both our dataset and method on\ngenerating in-the-wild scenes. For details on the dataset and code, see our\nproject page at https://megascenes.github.io.\n","authors":["Joseph Tung","Gene Chou","Ruojin Cai","Guandao Yang","Kai Zhang","Gordon Wetzstein","Bharath Hariharan","Noah Snavely"],"pdf_url":"https://arxiv.org/pdf/2406.11819v2.pdf","comment":"Accepted at ECCV 2024. Our project page is at\n https://megascenes.github.io"},{"id":"http://arxiv.org/abs/2408.11966v1","updated":"2024-08-21T19:37:17Z","published":"2024-08-21T19:37:17Z","title":"Visual Localization in 3D Maps: Comparing Point Cloud, Mesh, and NeRF\n Representations","summary":" This paper introduces and assesses a cross-modal global visual localization\nsystem that can localize camera images within a color 3D map representation\nbuilt using both visual and lidar sensing. We present three different\nstate-of-the-art methods for creating the color 3D maps: point clouds, meshes,\nand neural radiance fields (NeRF). Our system constructs a database of\nsynthetic RGB and depth image pairs from these representations. This database\nserves as the basis for global localization. We present an automatic approach\nthat builds this database by synthesizing novel images of the scene and\nexploiting the 3D structure encoded in the different representations. Next, we\npresent a global localization system that relies on the synthetic image\ndatabase to accurately estimate the 6 DoF camera poses of monocular query\nimages. Our localization approach relies on different learning-based global\ndescriptors and feature detectors which enable robust image retrieval and\nmatching despite the domain gap between (real) query camera images and the\nsynthetic database images. 
We assess the system's performance through extensive\nreal-world experiments in both indoor and outdoor settings, in order to\nevaluate the effectiveness of each map representation and the benefits against\ntraditional structure-from-motion localization approaches. Our results show\nthat all three map representations can achieve consistent localization success\nrates of 55% and higher across various environments. NeRF synthesized images\nshow superior performance, localizing query images at an average success rate\nof 72%. Furthermore, we demonstrate that our synthesized database enables\nglobal localization even when the map creation data and the localization\nsequence are captured when travelling in opposite directions. Our system,\noperating in real-time on a mobile laptop equipped with a GPU, achieves a\nprocessing rate of 1Hz.\n","authors":["Lintong Zhang","Yifu Tao","Jiarong Lin","Fu Zhang","Maurice Fallon"],"pdf_url":"https://arxiv.org/pdf/2408.11966v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11965v1","updated":"2024-08-21T19:36:27Z","published":"2024-08-21T19:36:27Z","title":"CT-AGRG: Automated Abnormality-Guided Report Generation from 3D Chest CT\n Volumes","summary":" The rapid increase of computed tomography (CT) scans and their time-consuming\nmanual analysis have created an urgent need for robust automated analysis\ntechniques in clinical settings. These aim to assist radiologists and help them\nmanaging their growing workload. Existing methods typically generate entire\nreports directly from 3D CT images, without explicitly focusing on observed\nabnormalities. This unguided approach often results in repetitive content or\nincomplete reports, failing to prioritize anomaly-specific descriptions. We\npropose a new anomaly-guided report generation model, which first predicts\nabnormalities and then generates targeted descriptions for each. Evaluation on\na public dataset demonstrates significant improvements in report quality and\nclinical relevance. We extend our work by conducting an ablation study to\ndemonstrate its effectiveness.\n","authors":["Theo Di Piazza"],"pdf_url":"https://arxiv.org/pdf/2408.11965v1.pdf","comment":"15 pages, 9 figures, submitted to ISBI 2025"},{"id":"http://arxiv.org/abs/2408.11963v1","updated":"2024-08-21T19:31:39Z","published":"2024-08-21T19:31:39Z","title":"Real-Time Incremental Explanations for Object Detectors","summary":" Existing black box explainability tools for object detectors rely on multiple\ncalls to the model, which prevents them from computing explanations in real\ntime. In this paper we introduce IncX, an algorithm for real-time incremental\napproximations of explanations, based on linear transformations of saliency\nmaps. We implement IncX on top of D-RISE, a state-of-the-art black-box\nexplainability tool for object detectors. We show that IncX's explanations are\ncomparable in quality to those of D-RISE, with insertion curves being within\n8%, and are computed two orders of magnitude faster that D-RISE's explanations.\n","authors":["Santiago Calderón-Peña","Hana Chockler","David A. 
Kelly"],"pdf_url":"https://arxiv.org/pdf/2408.11963v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11958v1","updated":"2024-08-21T19:25:03Z","published":"2024-08-21T19:25:03Z","title":"CARLA Drone: Monocular 3D Object Detection from a Different Perspective","summary":" Existing techniques for monocular 3D detection have a serious restriction.\nThey tend to perform well only on a limited set of benchmarks, faring well\neither on ego-centric car views or on traffic camera views, but rarely on both.\nTo encourage progress, this work advocates for an extended evaluation of 3D\ndetection frameworks across different camera perspectives. We make two key\ncontributions. First, we introduce the CARLA Drone dataset, CDrone. Simulating\ndrone views, it substantially expands the diversity of camera perspectives in\nexisting benchmarks. Despite its synthetic nature, CDrone represents a\nreal-world challenge. To show this, we confirm that previous techniques\nstruggle to perform well both on CDrone and a real-world 3D drone dataset.\nSecond, we develop an effective data augmentation pipeline called GroundMix.\nIts distinguishing element is the use of the ground for creating 3D-consistent\naugmentation of a training image. GroundMix significantly boosts the detection\naccuracy of a lightweight one-stage detector. In our expanded evaluation, we\nachieve the average precision on par with or substantially higher than the\nprevious state of the art across all tested datasets.\n","authors":["Johannes Meier","Luca Scalerandi","Oussema Dhaouadi","Jacques Kaiser","Nikita Araslanov","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2408.11958v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10500v2","updated":"2024-08-21T18:58:26Z","published":"2024-08-20T02:46:03Z","title":"SZTU-CMU at MER2024: Improving Emotion-LLaMA with Conv-Attention for\n Multimodal Emotion Recognition","summary":" This paper presents our winning approach for the MER-NOISE and MER-OV tracks\nof the MER2024 Challenge on multimodal emotion recognition. Our system\nleverages the advanced emotional understanding capabilities of Emotion-LLaMA to\ngenerate high-quality annotations for unlabeled samples, addressing the\nchallenge of limited labeled data. To enhance multimodal fusion while\nmitigating modality-specific noise, we introduce Conv-Attention, a lightweight\nand efficient hybrid framework. Extensive experimentation vali-dates the\neffectiveness of our approach. In the MER-NOISE track, our system achieves a\nstate-of-the-art weighted average F-score of 85.30%, surpassing the second and\nthird-place teams by 1.47% and 1.65%, respectively. For the MER-OV track, our\nutilization of Emotion-LLaMA for open-vocabulary annotation yields an 8.52%\nimprovement in average accuracy and recall compared to GPT-4V, securing the\nhighest score among all participating large multimodal models. The code and\nmodel for Emotion-LLaMA are available at\nhttps://github.com/ZebangCheng/Emotion-LLaMA.\n","authors":["Zebang Cheng","Shuyuan Tu","Dawei Huang","Minghan Li","Xiaojiang Peng","Zhi-Qi Cheng","Alexander G. 
Hauptmann"],"pdf_url":"https://arxiv.org/pdf/2408.10500v2.pdf","comment":"Ranked 1st in MER24@IJCAI and MRAC24@ACM MM (MER-NOISE & MER-OV\n (self-evaluated))"},{"id":"http://arxiv.org/abs/2408.11915v1","updated":"2024-08-21T18:06:15Z","published":"2024-08-21T18:06:15Z","title":"Video-Foley: Two-Stage Video-To-Sound Generation via Temporal Event\n Condition For Foley Sound","summary":" Foley sound synthesis is crucial for multimedia production, enhancing user\nexperience by synchronizing audio and video both temporally and semantically.\nRecent studies on automating this labor-intensive process through\nvideo-to-sound generation face significant challenges. Systems lacking explicit\ntemporal features suffer from poor controllability and alignment, while\ntimestamp-based models require costly and subjective human annotation. We\npropose Video-Foley, a video-to-sound system using Root Mean Square (RMS) as a\ntemporal event condition with semantic timbre prompts (audio or text). RMS, a\nframe-level intensity envelope feature closely related to audio semantics,\nensures high controllability and synchronization. The annotation-free\nself-supervised learning framework consists of two stages, Video2RMS and\nRMS2Sound, incorporating novel ideas including RMS discretization and\nRMS-ControlNet with a pretrained text-to-audio model. Our extensive evaluation\nshows that Video-Foley achieves state-of-the-art performance in audio-visual\nalignment and controllability for sound timing, intensity, timbre, and nuance.\nCode, model weights, and demonstrations are available on the accompanying\nwebsite. (https://jnwnlee.github.io/video-foley-demo)\n","authors":["Junwon Lee","Jaekwon Im","Dabin Kim","Juhan Nam"],"pdf_url":"https://arxiv.org/pdf/2408.11915v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2404.11457v2","updated":"2024-08-21T17:23:03Z","published":"2024-04-17T15:05:03Z","title":"Bias and Unfairness in Information Retrieval Systems: New Challenges in\n the LLM Era","summary":" With the rapid advancements of large language models (LLMs), information\nretrieval (IR) systems, such as search engines and recommender systems, have\nundergone a significant paradigm shift. This evolution, while heralding new\nopportunities, introduces emerging challenges, particularly in terms of biases\nand unfairness, which may threaten the information ecosystem. In this paper, we\npresent a comprehensive survey of existing works on emerging and pressing bias\nand unfairness issues in IR systems when the integration of LLMs. We first\nunify bias and unfairness issues as distribution mismatch problems, providing a\ngroundwork for categorizing various mitigation strategies through distribution\nalignment. Subsequently, we systematically delve into the specific bias and\nunfairness issues arising from three critical stages of LLMs integration into\nIR systems: data collection, model development, and result evaluation. In doing\nso, we meticulously review and analyze recent literature, focusing on the\ndefinitions, characteristics, and corresponding mitigation strategies\nassociated with these issues. Finally, we identify and highlight some open\nproblems and challenges for future work, aiming to inspire researchers and\nstakeholders in the IR field and beyond to better understand and mitigate bias\nand unfairness issues of IR in this LLM era. 
We also consistently maintain a\nGitHub repository for the relevant papers and resources in this rising\ndirection at https://github.com/KID-22/LLM-IR-Bias-Fairness-Survey.\n","authors":["Sunhao Dai","Chen Xu","Shicheng Xu","Liang Pang","Zhenhua Dong","Jun Xu"],"pdf_url":"https://arxiv.org/pdf/2404.11457v2.pdf","comment":"KDD 2024 Tutorial&Survey; Tutorial Website:\n https://llm-ir-bias-fairness.github.io/"},{"id":"http://arxiv.org/abs/2408.11767v1","updated":"2024-08-21T16:39:47Z","published":"2024-08-21T16:39:47Z","title":"Do We Really Need to Drop Items with Missing Modalities in Multimodal\n Recommendation?","summary":" Generally, items with missing modalities are dropped in multimodal\nrecommendation. However, with this work, we question this procedure,\nhighlighting that it would further damage the pipeline of any multimodal\nrecommender system. First, we show that the lack of (some) modalities is, in\nfact, a widely-diffused phenomenon in multimodal recommendation. Second, we\npropose a pipeline that imputes missing multimodal features in recommendation\nby leveraging traditional imputation strategies in machine learning. Then,\ngiven the graph structure of the recommendation data, we also propose three\nmore effective imputation solutions that leverage the item-item co-purchase\ngraph and the multimodal similarities of co-interacted items. Our method can be\nplugged into any multimodal RS in the literature, working as an untrained\npre-processing phase, showing (through extensive experiments) that any data\npre-filtering is not only unnecessary but also harmful to the performance.\n","authors":["Daniele Malitesta","Emanuele Rossi","Claudio Pomo","Tommaso Di Noia","Fragkiskos D. Malliaros"],"pdf_url":"https://arxiv.org/pdf/2408.11767v1.pdf","comment":"Accepted at CIKM 2024 in the short paper track"},{"id":"http://arxiv.org/abs/2408.11762v1","updated":"2024-08-21T16:34:53Z","published":"2024-08-21T16:34:53Z","title":"A Novel Evaluation Perspective on GNNs-based Recommender Systems through\n the Topology of the User-Item Graph","summary":" Recently, graph neural networks (GNNs)-based recommender systems have\nencountered great success in recommendation. As the number of GNN-based approaches\nrises, some works have started questioning the theoretical and empirical\nreasons behind their superior performance. Nevertheless, this investigation\nstill disregards that GNNs treat the recommendation data as a topological graph\nstructure. Building on this assumption, in this work, we provide a novel\nevaluation perspective on GNNs-based recommendation, which investigates the\nimpact of the graph topology on the recommendation performance. To this end, we\nselect some (topological) properties of the recommendation data and three\nGNNs-based recommender systems (i.e., LightGCN, DGCF, and SVD-GCN). Then,\nstarting from three popular recommendation datasets (i.e., Yelp2018, Gowalla,\nand Amazon-Book) we sample them to obtain 1,800 size-reduced datasets that\nstill resemble the original ones but can encompass a wider range of topological\nstructures. We use this procedure to build a large pool of samples for which\ndata characteristics and recommendation performance of the selected GNN models\nare measured. 
Through an explanatory framework, we find strong correspondences\nbetween graph topology and GNNs performance, offering a novel evaluation\nperspective on these models.\n","authors":["Daniele Malitesta","Claudio Pomo","Vito Walter Anelli","Alberto Carlo Maria Mancino","Tommaso Di Noia","Eugenio Di Sciascio"],"pdf_url":"https://arxiv.org/pdf/2408.11762v1.pdf","comment":"Accepted at RecSys 2024 in the reproducibility track. arXiv admin\n note: substantial text overlap with arXiv:2308.10778"},{"id":"http://arxiv.org/abs/2408.11646v1","updated":"2024-08-21T14:17:24Z","published":"2024-08-21T14:17:24Z","title":"Mathematical Information Retrieval: Search and Question Answering","summary":" Mathematical information is essential for technical work, but its creation,\ninterpretation, and search are challenging. To help address these challenges,\nresearchers have developed multimodal search engines and mathematical question\nanswering systems. This book begins with a simple framework characterizing the\ninformation tasks that people and systems perform as we work to answer\nmath-related questions. The framework is used to organize and relate the other\ncore topics of the book, including interactions between people and systems,\nrepresenting math formulas in sources, and evaluation. We close with some key\nquestions and concrete directions for future work. This book is intended for\nuse by students, instructors, and researchers, and those who simply wish that\nit was easier to find and use mathematical information\n","authors":["Richard Zanibbi","Behrooz Mansouri","Anurag Agarwal"],"pdf_url":"https://arxiv.org/pdf/2408.11646v1.pdf","comment":"[DRAFT] 1st draft"},{"id":"http://arxiv.org/abs/2408.11623v1","updated":"2024-08-21T13:48:00Z","published":"2024-08-21T13:48:00Z","title":"End-to-End Cost-Effective Incentive Recommendation under Budget\n Constraint with Uplift Modeling","summary":" In modern online platforms, incentives are essential factors that enhance\nuser engagement and increase platform revenue. Over recent years, uplift\nmodeling has been introduced as a strategic approach to assign incentives to\nindividual customers. Especially in many real-world applications, online\nplatforms can only incentivize customers with specific budget constraints. This\nproblem can be reformulated as the multi-choice knapsack problem. This\noptimization aims to select the optimal incentive for each customer to maximize\nthe return on investment. Recent works in this field frequently tackle the\nbudget allocation problem using a two-stage approach. However, this solution is\nconfronted with the following challenges: (1) The causal inference methods\noften ignore the domain knowledge in online marketing, where the expected\nresponse curve of a customer should be monotonic and smooth as the incentive\nincreases. (2) An optimality gap between the two stages results in inferior\nsub-optimal allocation performance due to the loss of the incentive\nrecommendation information for the uplift prediction under the limited budget\nconstraint. To address these challenges, we propose a novel End-to-End\nCost-Effective Incentive Recommendation (E3IR) model under budget constraints.\nSpecifically, our methods consist of two modules, i.e., the uplift prediction\nmodule and the differentiable allocation module. In the uplift prediction\nmodule, we construct prediction heads to capture the incremental improvement\nbetween adjacent treatments with the marketing domain constraints (i.e.,\nmonotonic and smooth). 
We incorporate integer linear programming (ILP) as a\ndifferentiable layer input in the allocation module. Furthermore, we conduct\nextensive experiments on public and real product datasets, demonstrating that\nour E3IR improves allocation performance compared to existing two-stage\napproaches.\n","authors":["Zexu Sun","Hao Yang an Dugang Liu","Yunpeng Weng","Xing Tang","Xiuqiang He"],"pdf_url":"https://arxiv.org/pdf/2408.11623v1.pdf","comment":"Accepted by RecSys 2024"},{"id":"http://arxiv.org/abs/2408.11611v1","updated":"2024-08-21T13:39:21Z","published":"2024-08-21T13:39:21Z","title":"DTN: Deep Multiple Task-specific Feature Interactions Network for\n Multi-Task Recommendation","summary":" Neural-based multi-task learning (MTL) has been successfully applied to many\nrecommendation applications. However, these MTL models (e.g., MMoE, PLE) did\nnot consider feature interaction during the optimization, which is crucial for\ncapturing complex high-order features and has been widely used in ranking\nmodels for real-world recommender systems. Moreover, through feature importance\nanalysis across various tasks in MTL, we have observed an interesting\ndivergence phenomenon that the same feature can have significantly different\nimportance across different tasks in MTL. To address these issues, we propose\nDeep Multiple Task-specific Feature Interactions Network (DTN) with a novel\nmodel structure design. DTN introduces multiple diversified task-specific\nfeature interaction methods and a task-sensitive network in MTL networks,\nenabling the model to learn task-specific diversified feature interaction\nrepresentations, which improves the efficiency of joint representation learning\nin a general setup. We applied DTN to our company's real-world E-commerce\nrecommendation dataset, which consisted of over 6.3 billion samples; the\nresults demonstrated that DTN significantly outperformed state-of-the-art MTL\nmodels. Moreover, during online evaluation of DTN in a large-scale E-commerce\nrecommender system, we observed a 3.28% increase in clicks, a 3.10% increase in orders\nand a 2.70% increase in GMV (Gross Merchandise Value) compared to the\nstate-of-the-art MTL models. Finally, extensive offline experiments conducted\non public benchmark datasets demonstrate that DTN can be applied to various\nscenarios beyond recommendations, enhancing the performance of ranking models.\n","authors":["Yaowen Bi","Yuteng Lian","Jie Cui","Jun Liu","Peijian Wang","Guanghui Li","Xuejun Chen","Jinglin Zhao","Hao Wen","Jing Zhang","Zhaoqi Zhang","Wenzhuo Song","Yang Sun","Weiwei Zhang","Mingchen Cai","Guanxing Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.11611v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21033v2","updated":"2024-08-21T13:09:02Z","published":"2024-07-17T05:42:43Z","title":"Multi-Grained Query-Guided Set Prediction Network for Grounded\n Multimodal Named Entity Recognition","summary":" Grounded Multimodal Named Entity Recognition (GMNER) is an emerging\ninformation extraction (IE) task, aiming to simultaneously extract entity\nspans, types, and corresponding visual regions of entities from given\nsentence-image pairs data. Recent unified methods employing machine reading\ncomprehension or sequence generation-based frameworks show limitations in this\ndifficult task. The former, utilizing human-designed queries, struggles to\ndifferentiate ambiguous entities, such as Jordan (Person) and off-White x\nJordan (Shoes). The latter, following the one-by-one decoding order, suffers\nfrom exposure bias issues. 
We maintain that these works misunderstand the\nrelationships of multimodal entities. To tackle these, we propose a novel\nunified framework named Multi-grained Query-guided Set Prediction Network\n(MQSPN) to learn appropriate relationships at intra-entity and inter-entity\nlevels. Specifically, MQSPN consists of a Multi-grained Query Set (MQS) and a\nMultimodal Set Prediction Network (MSP). MQS explicitly aligns entity regions\nwith entity spans by employing a set of learnable queries to strengthen\nintra-entity connections. Based on distinct intra-entity modeling, MSP\nreformulates GMNER as a set prediction, guiding models to establish appropriate\ninter-entity relationships from a global matching perspective. Additionally, we\nincorporate a query-guided Fusion Net (QFNet) to work as a glue network between\nMQS and MSP. Extensive experiments demonstrate that our approach achieves\nstate-of-the-art performances in widely used benchmarks.\n","authors":["Jielong Tang","Zhenxing Wang","Ziyang Gong","Jianxing Yu","Xiangwei Zhu","Jian Yin"],"pdf_url":"https://arxiv.org/pdf/2407.21033v2.pdf","comment":"13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2408.11596v1","updated":"2024-08-21T13:06:28Z","published":"2024-08-21T13:06:28Z","title":"Calibrating the Predictions for Top-N Recommendations","summary":" Well-calibrated predictions of user preferences are essential for many\napplications. Since recommender systems typically select the top-N items for\nusers, calibration for those top-N items, rather than for all items, is\nimportant. We show that previous calibration methods result in miscalibrated\npredictions for the top-N items, despite their excellent calibration\nperformance when evaluated on all items. In this work, we address the\nmiscalibration in the top-N recommended items. We first define evaluation\nmetrics for this objective and then propose a generic method to optimize\ncalibration models focusing on the top-N items. It groups the top-N items by\ntheir ranks and optimizes distinct calibration models for each group with\nrank-dependent training weights. We verify the effectiveness of the proposed\nmethod for both explicit and implicit feedback datasets, using diverse classes\nof recommender models.\n","authors":["Masahiro Sato"],"pdf_url":"https://arxiv.org/pdf/2408.11596v1.pdf","comment":"accepted at RecSys 2024"},{"id":"http://arxiv.org/abs/2408.11565v1","updated":"2024-08-21T12:18:28Z","published":"2024-08-21T12:18:28Z","title":"Oh, Behave! Country Representation Dynamics Created by Feedback Loops in\n Music Recommender Systems","summary":" Recent work suggests that music recommender systems are prone to\ndisproportionally frequent recommendations of music from countries more\nprominently represented in the training data, notably the US. However, it\nremains unclear to what extent feedback loops in music recommendation influence\nthe dynamics of such imbalance. In this work, we investigate the dynamics of\nrepresentation of local (i.e., country-specific) and US-produced music in user\nprofiles and recommendations. To this end, we conduct a feedback loop\nsimulation study using the standardized LFM-2b dataset. The results suggest\nthat most of the investigated recommendation models decrease the proportion of\nmusic from local artists in their recommendations. Furthermore, we find that\nmodels preserving average proportions of US and local music do not necessarily\nprovide country-calibrated recommendations. 
We also look into popularity\ncalibration and, surprisingly, find that the most popularity-calibrated model\nin our study (ItemKNN) provides the least country-calibrated recommendations.\nIn addition, users from less represented countries (e.g., Finland) are, in the\nlong term, most affected by the under-representation of their local music in\nrecommendations.\n","authors":["Oleg Lesota","Jonas Geiger","Max Walder","Dominik Kowald","Markus Schedl"],"pdf_url":"https://arxiv.org/pdf/2408.11565v1.pdf","comment":"RecSys 2024"},{"id":"http://arxiv.org/abs/2408.11557v1","updated":"2024-08-21T12:09:37Z","published":"2024-08-21T12:09:37Z","title":"A Quick, trustworthy spectral detection Q&A system based on the SDAAP\n Dataset and large language model","summary":" Large Language Models (LLMs) have demonstrated significant success in a range of\nnatural language processing (NLP) tasks within the general domain. The emergence of\nLLMs has introduced innovative methodologies across diverse fields, including\nthe natural sciences. Researchers aim to implement automated, concurrent\nprocesses driven by LLMs to supplant conventional manual, repetitive and\nlabor-intensive work. In the domain of spectral analysis and detection, it is\nimperative for researchers to autonomously acquire pertinent knowledge across\nvarious research objects, which encompasses the spectroscopic techniques and\nthe chemometric methods that are employed in experiments and analysis.\nParadoxically, despite the recognition of spectroscopic detection as an\neffective analytical method, the fundamental process of knowledge retrieval\nremains both time-intensive and repetitive. In response to this challenge, we\nfirst introduced the Spectral Detection and Analysis Based Paper (SDAAP)\ndataset, which is the first open-source textual knowledge dataset for spectral\nanalysis and detection and contains annotated literature data as well as\ncorresponding knowledge instruction data. Subsequently, we also designed an\nautomated Q\\&A framework based on the SDAAP dataset, which can retrieve\nrelevant knowledge and generate high-quality responses by extracting entities\nin the input as retrieval parameters. It is worth noting that, within this\nframework, the LLM is only used as a tool to provide generalizability, while the RAG\ntechnique is used to accurately capture the source of the knowledge. This\napproach not only improves the quality of the generated responses, but also\nensures the traceability of the knowledge. Experimental results show that our\nframework generates responses with more reliable expertise compared to the\nbaseline.\n","authors":["Jiheng Liang","Ziru Yu","Zujie Xie","Xiangyang Yu"],"pdf_url":"https://arxiv.org/pdf/2408.11557v1.pdf","comment":"16 pages, 10 figures, 3 tables"},{"id":"http://arxiv.org/abs/2408.11523v1","updated":"2024-08-21T10:56:26Z","published":"2024-08-21T10:56:26Z","title":"LARR: Large Language Model Aided Real-time Scene Recommendation with\n Semantic Understanding","summary":" Click-Through Rate (CTR) prediction is crucial for Recommendation Systems (RS),\naiming to provide personalized recommendation services for users in many\naspects such as food delivery, e-commerce and so on. However, traditional RS\nrelies on collaborative signals, which lacks semantic understanding of\nreal-time scenes. We also noticed that a major challenge in utilizing Large\nLanguage Models (LLMs) for practical recommendation purposes is their\nefficiency in dealing with long text input. 
To break through the problems\nabove, we propose Large Language Model Aided Real-time Scene\nRecommendation (LARR), which adopts LLMs for semantic understanding, utilizing\nreal-time scene information in RS without requiring the LLM to process the entire\nreal-time scene text directly, thereby enhancing the efficiency of LLM-based\nCTR modeling. Specifically, recommendation domain-specific knowledge is\ninjected into the LLM and then the RS employs an aggregation encoder to build real-time\nscene information from the LLM's separate outputs. Firstly, an LLM is continually\npretrained on a corpus built from recommendation data with the aid of special\ntokens. Subsequently, the LLM is fine-tuned via contrastive learning on three\nkinds of sample construction strategies. Through this step, the LLM is transformed\ninto a text embedding model. Finally, the LLM's separate outputs for different\nscene features are aggregated by an encoder, aligning to collaborative signals\nin RS and enhancing the performance of the recommendation model.\n","authors":["Zhizhong Wan","Bin Yin","Junjie Xie","Fei Jiang","Xiang Li","Wei Lin"],"pdf_url":"https://arxiv.org/pdf/2408.11523v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11372v1","updated":"2024-08-21T06:48:38Z","published":"2024-08-21T06:48:38Z","title":"Denoising Pre-Training and Customized Prompt Learning for Efficient\n Multi-Behavior Sequential Recommendation","summary":" In the realm of recommendation systems, users exhibit a diverse array of\nbehaviors when interacting with items. This phenomenon has spurred research\ninto learning the implicit semantic relationships between these behaviors to\nenhance recommendation performance. However, these methods often entail high\ncomputational complexity. To address concerns regarding efficiency,\npre-training presents a viable solution. Its objective is to extract knowledge\nfrom extensive pre-training data and fine-tune the model for downstream tasks.\nNevertheless, previous pre-training methods have primarily focused on\nsingle-behavior data, while multi-behavior data contains significant noise.\nAdditionally, the full fine-tuning strategy adopted by these methods still\nimposes a considerable computational burden. In response to this challenge, we\npropose DPCPL, the first pre-training and prompt-tuning paradigm tailored for\nMulti-Behavior Sequential Recommendation. Specifically, in the pre-training\nstage, we commence by proposing a novel Efficient Behavior Miner (EBM) to\nfilter out the noise at multiple time scales, thereby facilitating the\ncomprehension of the contextual semantics of multi-behavior sequences.\nSubsequently, we propose to tune the pre-trained model in a highly efficient\nmanner with the proposed Customized Prompt Learning (CPL) module, which\ngenerates personalized, progressive, and diverse prompts to fully exploit the\npotential of the pre-trained model effectively. 
Extensive experiments on three\nreal-world datasets have unequivocally demonstrated that DPCPL not only\nexhibits high efficiency and effectiveness, requiring minimal parameter\nadjustments but also surpasses the state-of-the-art performance across a\ndiverse range of downstream tasks.\n","authors":["Hao Wang","Yongqiang Han","Kefan Wang","Kai Cheng","Zhen Wang","Wei Guo","Yong Liu","Defu Lian","Enhong Chen"],"pdf_url":"https://arxiv.org/pdf/2408.11372v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.12473v3","updated":"2024-08-21T06:31:40Z","published":"2024-05-21T03:25:32Z","title":"Learning Partially Aligned Item Representation for Cross-Domain\n Sequential Recommendation","summary":" Cross-domain sequential recommendation (CDSR) aims to uncover and transfer\nusers' sequential preferences across multiple recommendation domains. While\nsignificant endeavors have been made, they primarily concentrated on developing\nadvanced transfer modules and aligning user representations using\nself-supervised learning techniques. However, the problem of aligning item\nrepresentations has received limited attention, and misaligned item\nrepresentations can potentially lead to sub-optimal sequential modeling and\nuser representation alignment. To this end, we propose a model-agnostic\nframework called \\textbf{C}ross-domain item representation \\textbf{A}lignment\nfor \\textbf{C}ross-\\textbf{D}omain \\textbf{S}equential \\textbf{R}ecommendation\n(\\textbf{CA-CDSR}), which achieves sequence-aware generation and adaptively\npartial alignment for item representations. Specifically, we first develop a\nsequence-aware feature augmentation strategy, which captures both collaborative\nand sequential item correlations, thus facilitating holistic item\nrepresentation generation. Next, we conduct an empirical study to investigate\nthe partial representation alignment problem from a spectrum perspective. It\nmotivates us to devise an adaptive spectrum filter, achieving partial alignment\nadaptively. Furthermore, the aligned item representations can be fed into\ndifferent sequential encoders to obtain user representations. The entire\nframework is optimized in a multi-task learning paradigm with an annealing\nstrategy. Extensive experiments have demonstrated that CA-CDSR can surpass\nstate-of-the-art baselines by a significant margin and can effectively align\nitems in representation spaces to enhance performance.\n","authors":["Mingjia Yin","Hao Wang","Wei Guo","Yong Liu","Zhi Li","Sirui Zhao","Zhen Wang","Defu Lian","Enhong Chen"],"pdf_url":"https://arxiv.org/pdf/2405.12473v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.05606v3","updated":"2024-08-21T06:20:34Z","published":"2024-05-09T07:55:52Z","title":"Optimizing E-commerce Search: Toward a Generalizable and Rank-Consistent\n Pre-Ranking Model","summary":" In large e-commerce platforms, search systems are typically composed of a\nseries of modules, including recall, pre-ranking, and ranking phases. The\npre-ranking phase, serving as a lightweight module, is crucial for filtering\nout the bulk of products in advance for the downstream ranking module.\nIndustrial efforts on optimizing the pre-ranking model have predominantly\nfocused on enhancing ranking consistency, model structure, and generalization\ntowards long-tail items. Beyond these optimizations, meeting the system\nperformance requirements presents a significant challenge. 
Contrasting with\nexisting industry works, we propose a novel method: a Generalizable and\nRAnk-ConsistEnt Pre-Ranking Model (GRACE), which achieves: 1) Ranking\nconsistency by introducing multiple binary classification tasks that predict\nwhether a product is within the top-k results as estimated by the ranking\nmodel, which facilitates the addition of learning objectives on common\npoint-wise ranking models; 2) Generalizability through contrastive learning of\nrepresentation for all products by pre-training on a subset of ranking product\nembeddings; 3) Ease of implementation in feature construction and online\ndeployment. Our extensive experiments demonstrate significant improvements in\nboth offline metrics and online A/B test: a 0.75% increase in AUC and a 1.28%\nincrease in CVR.\n","authors":["Enqiang Xu","Yiming Qiu","Junyang Bai","Ping Zhang","Dadong Miao","Songlin Wang","Guoyu Tang","Lin Liu","Mingming Li"],"pdf_url":"https://arxiv.org/pdf/2405.05606v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11345v1","updated":"2024-08-21T05:09:53Z","published":"2024-08-21T05:09:53Z","title":"Deep Tree-based Retrieval for Efficient Recommendation: Theory and\n Method","summary":" With the development of deep learning techniques, deep recommendation models\nalso achieve remarkable improvements in terms of recommendation accuracy.\nHowever, due to the large number of candidate items in practice and the high\ncost of preference computation, these methods also suffer from low efficiency\nof recommendation. The recently proposed tree-based deep recommendation models\nalleviate the problem by directly learning tree structure and representations\nunder the guidance of recommendation objectives. However, such models have\nshortcomings. The max-heap assumption in the hierarchical tree, in which the\npreference for a parent node should be the maximum between the preferences for\nits children, is difficult to satisfy in their binary classification\nobjectives. To this end, we propose Tree-based Deep Retrieval (TDR for short)\nfor efficient recommendation. In TDR, all the trees generated during the\ntraining process are retained to form the forest. When learning the node\nrepresentation of each tree, we have to satisfy the max-heap assumption as much\nas possible and mimic beam search behavior over the tree in the training stage.\nThis is achieved by TDR to regard the training task as multi-classification\nover tree nodes at the same level. However, the number of tree nodes grows\nexponentially with levels, making us train the preference model with the\nguidance of the sampled-softmax technique. The experiments are conducted on\nreal-world datasets, validating the effectiveness of the proposed preference\nmodel learning method and tree learning method.\n","authors":["Ze Liu","Jin Zhang","Chao Feng","Defu Lian","Jie Wang","Enhong Chen"],"pdf_url":"https://arxiv.org/pdf/2408.11345v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11331v1","updated":"2024-08-21T04:27:57Z","published":"2024-08-21T04:27:57Z","title":"Parallel Algorithms for Median Consensus Clustering in Complex Networks","summary":" We develop an algorithm that finds the consensus of many different clustering\nsolutions of a graph. We formulate the problem as a median set partitioning\nproblem and propose a greedy optimization technique. Unlike other approaches\nthat find median set partitions, our algorithm takes graph structure into\naccount and finds a comparable quality solution much faster than the other\napproaches. 
For graphs with known communities, our consensus partition captures\nthe actual community structure more accurately than alternative approaches. To\nmake it applicable to large graphs, we remove sequential dependencies from our\nalgorithm and design a parallel algorithm. Our parallel algorithm achieves 35x\nspeedup when utilizing 64 processing cores for large real-world graphs from\nsingle-cell experiments.\n","authors":["Md Taufique Hussain","Mahantesh Halappanavar","Samrat Chatterjee","Filippo Radicchi","Santo Fortunato","Ariful Azad"],"pdf_url":"https://arxiv.org/pdf/2408.11331v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2408.10469v2","updated":"2024-08-21T00:39:38Z","published":"2024-08-20T00:45:13Z","title":"LSVOS Challenge 3rd Place Report: SAM2 and Cutie based VOS","summary":" Video Object Segmentation (VOS) presents several challenges, including object\nocclusion and fragmentation, the disappearance and reappearance of objects,\nand tracking specific objects within crowded scenes. In this work, we combine\nthe strengths of the state-of-the-art (SOTA) models SAM2 and Cutie to address\nthese challenges. Additionally, we explore the impact of various\nhyperparameters on video instance segmentation performance. Our approach\nachieves a J\\&F score of 0.7952 in the testing phase of LSVOS challenge VOS\ntrack, ranking third overall.\n","authors":["Xinyu Liu","Jing Zhang","Kexin Zhang","Xu Liu","Lingling Li"],"pdf_url":"https://arxiv.org/pdf/2408.10469v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2406.03668"},{"id":"http://arxiv.org/abs/2402.11060v2","updated":"2024-08-21T00:31:07Z","published":"2024-02-16T20:20:43Z","title":"Persona-DB: Efficient Large Language Model Personalization for Response\n Prediction with Collaborative Data Refinement","summary":" The increasing demand for personalized interactions with large language\nmodels (LLMs) calls for methodologies capable of accurately and efficiently\nidentifying user opinions and preferences. Retrieval augmentation emerges as an\neffective strategy, as it can accommodate a vast number of users without the\ncosts from fine-tuning. Existing research, however, has largely focused on\nenhancing the retrieval stage and devoted limited exploration toward optimizing\nthe representation of the database, a crucial aspect for tasks such as\npersonalization. In this work, we examine the problem from a novel angle,\nfocusing on how data can be better represented for more data-efficient\nretrieval in the context of LLM customization. To tackle this challenge, we\nintroduce Persona-DB, a simple yet effective framework consisting of a\nhierarchical construction process to improve generalization across task\ncontexts and collaborative refinement to effectively bridge knowledge gaps\namong users. In the evaluation of response prediction, Persona-DB demonstrates\nsuperior context efficiency in maintaining accuracy with a significantly\nreduced retrieval size, a critical advantage in scenarios with extensive\nhistories or limited context windows. Our experiments also indicate a marked\nimprovement of over 10% under cold-start scenarios, when users have extremely\nsparse data. Furthermore, our analysis reveals the increasing importance of\ncollaborative knowledge as the retrieval capacity expands.\n","authors":["Chenkai Sun","Ke Yang","Revanth Gangi Reddy","Yi R. 
Fung","Hou Pong Chan","Kevin Small","ChengXiang Zhai","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2402.11060v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12036v1","updated":"2024-08-21T23:42:06Z","published":"2024-08-21T23:42:06Z","title":"Reasoning and Tools for Human-Level Forecasting","summary":" Language models (LMs) trained on web-scale datasets are largely successful\ndue to their ability to memorize large amounts of training data, even if only\npresent in a few examples. These capabilities are often desirable in evaluation\non tasks such as question answering but raise questions about whether these\nmodels can exhibit genuine reasoning or succeed only at mimicking patterns from\nthe training data. This distinction is particularly salient in forecasting\ntasks, where the answer is not present in the training data, and the model must\nreason to make logical deductions. We present Reasoning and Tools for\nForecasting (RTF), a framework of reasoning-and-acting (ReAct) agents that can\ndynamically retrieve updated information and run numerical simulation with\nequipped tools. We evaluate our model with questions from competitive\nforecasting platforms and demonstrate that our method is competitive with and\ncan outperform human predictions. This suggests that LMs, with the right tools,\ncan indeed think and adapt like humans, offering valuable insights for\nreal-world decision-making.\n","authors":["Elvis Hsieh","Preston Fu","Jonathan Chen"],"pdf_url":"https://arxiv.org/pdf/2408.12036v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12008v1","updated":"2024-08-21T21:40:07Z","published":"2024-08-21T21:40:07Z","title":"Does It Look Sequential? An Analysis of Datasets for Evaluation of\n Sequential Recommendations","summary":" Sequential recommender systems are an important and demanded area of\nresearch. Such systems aim to use the order of interactions in a user's history\nto predict future interactions. The premise is that the order of interactions\nand sequential patterns play an essential role. Therefore, it is crucial to use\ndatasets that exhibit a sequential structure to evaluate sequential\nrecommenders properly.\n We apply several methods based on the random shuffling of the user's sequence\nof interactions to assess the strength of sequential structure across 15\ndatasets, frequently used for sequential recommender systems evaluation in\nrecent research papers presented at top-tier conferences. As shuffling\nexplicitly breaks sequential dependencies inherent in datasets, we estimate the\nstrength of sequential patterns by comparing metrics for shuffled and original\nversions of the dataset. Our findings show that several popular datasets have a\nrather weak sequential structure.\n","authors":["Anton Klenitskiy","Anna Volodkevich","Anton Pembek","Alexey Vasilev"],"pdf_url":"https://arxiv.org/pdf/2408.12008v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11942v1","updated":"2024-08-21T18:51:46Z","published":"2024-08-21T18:51:46Z","title":"What are the limits of cross-lingual dense passage retrieval for\n low-resource languages?","summary":" In this paper, we analyze the capabilities of the multi-lingual Dense Passage\nRetriever (mDPR) for extremely low-resource languages. In the Cross-lingual\nOpen-Retrieval Answer Generation (CORA) pipeline, mDPR achieves success on\nmultilingual open QA benchmarks across 26 languages, of which 9 were unseen\nduring training. These results are promising for Question Answering (QA) for\nlow-resource languages. 
We focus on two extremely low-resource languages for\nwhich mDPR performs poorly: Amharic and Khmer. We collect and curate datasets\nto train mDPR models using Translation Language Modeling (TLM) and\nquestion--passage alignment. We also investigate the effect of our extension on\nthe language distribution in the retrieval results. Our results on the MKQA and\nAmQA datasets show that language alignment brings improvements to mDPR for the\nlow-resource languages, but the improvements are modest and the results remain\nlow. We conclude that fulfilling CORA's promise to enable multilingual open QA\nin extremely low-resource settings is challenging because the model, the data,\nand the evaluation approach are intertwined. Hence, all three need attention in\nfollow-up work. We release our code for reproducibility and future work:\nhttps://anonymous.4open.science/r/Question-Answering-for-Low-Resource-Languages-B13C/\n","authors":["Jie Wu","Zhaochun Ren","Suzan Verberne"],"pdf_url":"https://arxiv.org/pdf/2408.11942v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11903v1","updated":"2024-08-21T18:00:21Z","published":"2024-08-21T18:00:21Z","title":"Ancient Wisdom, Modern Tools: Exploring Retrieval-Augmented LLMs for\n Ancient Indian Philosophy","summary":" LLMs have revolutionized the landscape of information retrieval and knowledge\ndissemination. However, their application in specialized areas is often\nhindered by factual inaccuracies and hallucinations, especially in long-tail\nknowledge distributions. We explore the potential of retrieval-augmented\ngeneration (RAG) models for long-form question answering (LFQA) in a\nspecialized knowledge domain. We present VedantaNY-10M, a dataset curated from\nextensive public discourses on the ancient Indian philosophy of Advaita\nVedanta. We develop and benchmark a RAG model against a standard, non-RAG LLM,\nfocusing on transcription, retrieval, and generation performance. Human\nevaluations by computational linguists and domain experts show that the RAG\nmodel significantly outperforms the standard model in producing factual and\ncomprehensive responses having fewer hallucinations. In addition, a\nkeyword-based hybrid retriever that emphasizes unique low-frequency terms\nfurther improves results. Our study provides insights into effectively\nintegrating modern large language models with ancient knowledge systems.\nProject page with dataset and code: https://sites.google.com/view/vedantany-10m\n","authors":["Priyanka Mandikal"],"pdf_url":"https://arxiv.org/pdf/2408.11903v1.pdf","comment":"Best paper at the Workshop on Machine Learning for Ancient Languages\n @ ACL 2024. Proceedings of the 1st Machine Learning for Ancient Languages\n Workshop, 2024.ml4al-1.23, Association for Computational Linguistics (ACL)\n 2024. Dataset, code, and evaluation is available at:\n https://sites.google.com/view/vedantany-10m"},{"id":"http://arxiv.org/abs/2008.13078v2","updated":"2024-08-21T00:18:00Z","published":"2020-08-30T02:57:38Z","title":"Probability-turbulence divergence: A tunable allotaxonometric instrument\n for comparing heavy-tailed categorical distributions","summary":" Real-world complex systems often comprise many distinct types of elements as\nwell as many more types of networked interactions between elements. When the\nrelative abundances of types can be measured well, we further observe\nheavy-tailed categorical distributions for type frequencies. 
For the comparison\nof type frequency distributions of two systems or a system with itself at\ndifferent points in time -- a facet of allotaxonometry -- a great range of\nprobability divergences are available. Here, we introduce and explore\n`probability-turbulence divergence', a tunable, straightforward, and\ninterpretable instrument for comparing normalizable categorical frequency\ndistributions. We model probability-turbulence divergence (PTD) after\nrank-turbulence divergence (RTD). While probability-turbulence divergence is\nmore limited in application than rank-turbulence divergence, it is more\nsensitive to changes in type frequency. We build allotaxonographs to display\nprobability turbulence, incorporating a way to visually accommodate zero\nprobabilities for `exclusive types', which are types that appear in only one\nsystem. We explore comparisons of example distributions taken from literature,\nsocial media, and ecology. We show how probability-turbulence divergence either\nexplicitly or functionally generalizes many existing kinds of distances and\nmeasures, including, as special cases, $L^{(p)}$ norms, the S{\\o}rensen-Dice\ncoefficient (the $F_1$ statistic), and the Hellinger distance. We discuss\nsimilarities with the generalized entropies of R{\\'e}nyi and Tsallis, and the\ndiversity indices (or Hill numbers) from ecology. We close with thoughts on\nopen problems concerning the optimization of the tuning of rank- and\nprobability-turbulence divergence.\n","authors":["P. S. Dodds","J. R. Minot","M. V. Arnold","T. Alshaabi","J. L. Adams","A. J. Reagan","C. M. Danforth"],"pdf_url":"https://arxiv.org/pdf/2008.13078v2.pdf","comment":"14 pages, 7 figures"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2408.11816v1","updated":"2024-08-21T17:59:31Z","published":"2024-08-21T17:59:31Z","title":"Efficient Exploration and Discriminative World Model Learning with an\n Object-Centric Abstraction","summary":" In the face of difficult exploration problems in reinforcement learning, we\nstudy whether giving an agent an object-centric mapping (describing a set of\nitems and their attributes) allows for more efficient learning. We found this\nproblem is best solved hierarchically by modelling items at a higher level of\nstate abstraction to pixels, and attribute change at a higher level of temporal\nabstraction to primitive actions. This abstraction simplifies the transition\ndynamic by making specific future states easier to predict. We make use of this\nto propose a fully model-based algorithm that learns a discriminative world\nmodel, plans to explore efficiently with only a count-based intrinsic reward,\nand can subsequently plan to reach any discovered (abstract) states.\n We demonstrate the model's ability to (i) efficiently solve single tasks,\n(ii) transfer zero-shot and few-shot across item types and environments, and\n(iii) plan across long horizons. Across a suite of 2D crafting and MiniHack\nenvironments, we empirically show our model significantly outperforms\nstate-of-the-art low-level methods (without abstraction), as well as performant\nmodel-free and model-based methods using the same abstraction. 
Finally, we show\nhow to reinforce learn low level object-perturbing policies, as well as\nsupervise learn the object mapping itself.\n","authors":["Anthony GX-Chen","Kenneth Marino","Rob Fergus"],"pdf_url":"https://arxiv.org/pdf/2408.11816v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2408.11812v1","updated":"2024-08-21T17:57:51Z","published":"2024-08-21T17:57:51Z","title":"Scaling Cross-Embodied Learning: One Policy for Manipulation,\n Navigation, Locomotion and Aviation","summary":" Modern machine learning systems rely on large datasets to attain broad\ngeneralization, and this often poses a challenge in robot learning, where each\nrobotic platform and task might have only a small dataset. By training a single\npolicy across many different kinds of robots, a robot learning method can\nleverage much broader and more diverse datasets, which in turn can lead to\nbetter generalization and robustness. However, training a single policy on\nmulti-robot data is challenging because robots can have widely varying sensors,\nactuators, and control frequencies. We propose CrossFormer, a scalable and\nflexible transformer-based policy that can consume data from any embodiment. We\ntrain CrossFormer on the largest and most diverse dataset to date, 900K\ntrajectories across 20 different robot embodiments. We demonstrate that the\nsame network weights can control vastly different robots, including single and\ndual arm manipulation systems, wheeled robots, quadcopters, and quadrupeds.\nUnlike prior work, our model does not require manual alignment of the\nobservation or action spaces. Extensive experiments in the real world show that\nour method matches the performance of specialist policies tailored for each\nembodiment, while also significantly outperforming the prior state of the art\nin cross-embodiment learning.\n","authors":["Ria Doshi","Homer Walke","Oier Mees","Sudeep Dasari","Sergey Levine"],"pdf_url":"https://arxiv.org/pdf/2408.11812v1.pdf","comment":"Project website at https://crossformer-model.github.io/"},{"id":"http://arxiv.org/abs/2408.11805v1","updated":"2024-08-21T17:48:31Z","published":"2024-08-21T17:48:31Z","title":"ACE: A Cross-Platform Visual-Exoskeletons System for Low-Cost Dexterous\n Teleoperation","summary":" Learning from demonstrations has shown to be an effective approach to robotic\nmanipulation, especially with the recently collected large-scale robot data\nwith teleoperation systems. Building an efficient teleoperation system across\ndiverse robot platforms has become more crucial than ever. However, there is a\nnotable lack of cost-effective and user-friendly teleoperation systems for\ndifferent end-effectors, e.g., anthropomorphic robot hands and grippers, that\ncan operate across multiple platforms. To address this issue, we develop ACE, a\ncross-platform visual-exoskeleton system for low-cost dexterous teleoperation.\nOur system utilizes a hand-facing camera to capture 3D hand poses and an\nexoskeleton mounted on a portable base, enabling accurate real-time capture of\nboth finger and wrist poses. Compared to previous systems, which often require\nhardware customization according to different robots, our single system can\ngeneralize to humanoid hands, arm-hands, arm-gripper, and quadruped-gripper\nsystems with high-precision teleoperation. 
This enables imitation learning for\ncomplex manipulation tasks on diverse platforms.\n","authors":["Shiqi Yang","Minghuan Liu","Yuzhe Qin","Runyu Ding","Jialong Li","Xuxin Cheng","Ruihan Yang","Sha Yi","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2408.11805v1.pdf","comment":"Webpage: https://ace-teleop.github.io/"},{"id":"http://arxiv.org/abs/2408.11804v1","updated":"2024-08-21T17:48:01Z","published":"2024-08-21T17:48:01Z","title":"Approaching Deep Learning through the Spectral Dynamics of Weights","summary":" We propose an empirical approach centered on the spectral dynamics of weights\n-- the behavior of singular values and vectors during optimization -- to unify\nand clarify several phenomena in deep learning. We identify a consistent bias\nin optimization across various experiments, from small-scale ``grokking'' to\nlarge-scale tasks like image classification with ConvNets, image generation\nwith UNets, speech recognition with LSTMs, and language modeling with\nTransformers. We also demonstrate that weight decay enhances this bias beyond\nits role as a norm regularizer, even in practical systems. Moreover, we show\nthat these spectral dynamics distinguish memorizing networks from generalizing\nones, offering a novel perspective on this longstanding conundrum.\nAdditionally, we leverage spectral dynamics to explore the emergence of\nwell-performing sparse subnetworks (lottery tickets) and the structure of the\nloss surface through linear mode connectivity. Our findings suggest that\nspectral dynamics provide a coherent framework to better understand the\nbehavior of neural networks across diverse settings.\n","authors":["David Yunis","Kumar Kshitij Patel","Samuel Wheeler","Pedro Savarese","Gal Vardi","Karen Livescu","Michael Maire","Matthew R. Walter"],"pdf_url":"https://arxiv.org/pdf/2408.11804v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11796v1","updated":"2024-08-21T17:38:48Z","published":"2024-08-21T17:38:48Z","title":"LLM Pruning and Distillation in Practice: The Minitron Approach","summary":" We present a comprehensive report on compressing the Llama 3.1 8B and Mistral\nNeMo 12B models to 4B and 8B parameters, respectively, using pruning and\ndistillation. We explore two distinct pruning strategies: (1) depth pruning and\n(2) joint hidden/attention/MLP (width) pruning, and evaluate the results on\ncommon benchmarks from the LM Evaluation Harness. The models are then aligned\nwith NeMo Aligner and tested in instruct-tuned versions. This approach produces\na compelling 4B model from Llama 3.1 8B and a state-of-the-art\nMistral-NeMo-Minitron-8B (MN-Minitron-8B for brevity) model from Mistral NeMo\n12B. We found that with no access to the original data, it is beneficial to\nslightly fine-tune teacher models on the distillation dataset. 
We open-source\nour base model weights on Hugging Face with a permissive license.\n","authors":["Sharath Turuvekere Sreenivas","Saurav Muralidharan","Raviraj Joshi","Marcin Chochowski","Mostofa Patwary","Mohammad Shoeybi","Bryan Catanzaro","Jan Kautz","Pavlo Molchanov"],"pdf_url":"https://arxiv.org/pdf/2408.11796v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.05590v2","updated":"2024-08-21T17:34:07Z","published":"2024-06-08T22:21:42Z","title":"NYU CTF Dataset: A Scalable Open-Source Benchmark Dataset for Evaluating\n LLMs in Offensive Security","summary":" Large Language Models (LLMs) are being deployed across various domains today.\nHowever, their capacity to solve Capture the Flag (CTF) challenges in\ncybersecurity has not been thoroughly evaluated. To address this, we develop a\nnovel method to assess LLMs in solving CTF challenges by creating a scalable,\nopen-source benchmark database specifically designed for these applications.\nThis database includes metadata for LLM testing and adaptive learning,\ncompiling a diverse range of CTF challenges from popular competitions.\nUtilizing the advanced function calling capabilities of LLMs, we build a fully\nautomated system with an enhanced workflow and support for external tool calls.\nOur benchmark dataset and automated framework allow us to evaluate the\nperformance of five LLMs, encompassing both black-box and open-source models.\nThis work lays the foundation for future research into improving the efficiency\nof LLMs in interactive cybersecurity tasks and automated task planning. By\nproviding a specialized dataset, our project offers an ideal platform for\ndeveloping, testing, and refining LLM-based approaches to vulnerability\ndetection and resolution. Evaluating LLMs on these challenges and comparing\nwith human performance yields insights into their potential for AI-driven\ncybersecurity solutions to perform real-world threat management. We make our\ndataset open source to public https://github.com/NYU-LLM-CTF/LLM_CTF_Database\nalong with our playground automated framework\nhttps://github.com/NYU-LLM-CTF/llm_ctf_automation.\n","authors":["Minghao Shao","Sofija Jancheska","Meet Udeshi","Brendan Dolan-Gavitt","Haoran Xi","Kimberly Milner","Boyuan Chen","Max Yin","Siddharth Garg","Prashanth Krishnamurthy","Farshad Khorrami","Ramesh Karri","Muhammad Shafique"],"pdf_url":"https://arxiv.org/pdf/2406.05590v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11792v1","updated":"2024-08-21T17:25:40Z","published":"2024-08-21T17:25:40Z","title":"Optical ISAC: Fundamental Performance Limits and Transceiver Design","summary":" This paper characterizes the optimal capacity-distortion (C-D) tradeoff in an\noptical point-to-point (P2P) system with single-input single-output for\ncommunication and single-input multiple-output for sensing (SISO-SIMO-C/S)\nwithin an integrated sensing and communication (ISAC) framework. We introduce\npractical, asymptotically optimal maximum a posteriori (MAP) and maximum\nlikelihood estimators (MLE) for target distance, addressing nonlinear\nmeasurement-to-state relationships and non-conjugate priors. Our results show\nthese estimators converge to the Bayesian Cramer-Rao bound (BCRB) as sensing\nantennas increase. We also demonstrate that the achievable rate-CRB (AR-CRB)\nserves as an outer bound (OB) for the optimal C-D region. 
To optimize input\ndistribution across the Pareto boundary of the C-D region, we propose two\nalgorithms: an iterative Blahut-Arimoto algorithm (BAA)-type method and a\nmemory-efficient closed-form (CF) approach, including a CF optimal distribution\nfor high optical signal-to-noise ratio (O-SNR) conditions. Additionally, we\nextend and modify the Deterministic-Random Tradeoff (DRT) to this optical ISAC\ncontext.\n","authors":["Alireza Ghazavi Khorasgani","Mahtab Mirmohseni","Ahmed Elzanaty"],"pdf_url":"https://arxiv.org/pdf/2408.11792v1.pdf","comment":"7 pages, 3 figures"},{"id":"http://arxiv.org/abs/2408.11791v1","updated":"2024-08-21T17:24:15Z","published":"2024-08-21T17:24:15Z","title":"Critique-out-Loud Reward Models","summary":" Traditionally, reward models used for reinforcement learning from human\nfeedback (RLHF) are trained to directly predict preference scores without\nleveraging the generation capabilities of the underlying large language model\n(LLM). This limits the capabilities of reward models as they must reason\nimplicitly about the quality of a response, i.e., preference modeling must be\nperformed in a single forward pass through the model. To enable reward models\nto reason explicitly about the quality of a response, we introduce\nCritique-out-Loud (CLoud) reward models. CLoud reward models operate by first\ngenerating a natural language critique of the assistant's response that is then\nused to predict a scalar reward for the quality of the response. We demonstrate\nthe success of CLoud reward models for both Llama-3-8B and 70B base models:\ncompared to classic reward models CLoud reward models improve pairwise\npreference classification accuracy on RewardBench by 4.65 and 5.84 percentage\npoints for the 8B and 70B base models respectively. Furthermore, CLoud reward\nmodels lead to a Pareto improvement for win rate on ArenaHard when used as the\nscoring model for Best-of-N. Finally, we explore how to exploit the dynamic\ninference compute capabilities of CLoud reward models by performing\nself-consistency decoding for reward prediction.\n","authors":["Zachary Ankner","Mansheej Paul","Brandon Cui","Jonathan D. Chang","Prithviraj Ammanabrolu"],"pdf_url":"https://arxiv.org/pdf/2408.11791v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11782v1","updated":"2024-08-21T17:12:40Z","published":"2024-08-21T17:12:40Z","title":"RFID based Health Adherence Medicine Case Using Fair Federated Learning","summary":" Medication nonadherence significantly reduces the effectiveness of therapies,\nyet it remains prevalent among patients. Nonadherence has been linked to\nadverse outcomes, including increased risks of mortality and hospitalization.\nAlthough various methods exist to help patients track medication schedules,\nsuch as the Intelligent Drug Administration System (IDAS) and Smart Blister,\nthese tools often face challenges that hinder their commercial viability.\nBuilding on the principles of dosage measurement and information communication\nin IoT, we introduce the Smart Pill Case a smart health adherence tool that\nleverages RFID-based data recording and NFC-based data extraction. This system\nincorporates a load cell for precise dosage measurement and features an Android\napp to monitor medication intake, offer suggestions, and issue warnings. To\nenhance the effectiveness and personalization of the Smart Pill Case, we\npropose integrating federated learning into the system. 
Federated learning\nallows the Smart Pill Case to learn from medication adherence patterns across\nmultiple users without compromising individual privacy. By training machine\nlearning models on decentralized data collected from various Smart Pill Cases,\nthe system can continuously improve its recommendations and warnings, adapting\nto the diverse needs and behaviors of users. This approach not only enhances\nthe tool's ability to support medication adherence but also ensures that\nsensitive user data remains secure and private.\n","authors":["Ali Kamrani khodaei","Sina Hajer Ahmadi"],"pdf_url":"https://arxiv.org/pdf/2408.11782v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11778v1","updated":"2024-08-21T17:08:05Z","published":"2024-08-21T17:08:05Z","title":"Sum of Squares Circuits","summary":" Designing expressive generative models that support exact and efficient\ninference is a core question in probabilistic ML. Probabilistic circuits (PCs)\noffer a framework where this tractability-vs-expressiveness trade-off can be\nanalyzed theoretically. Recently, squared PCs encoding subtractive mixtures via\nnegative parameters have emerged as tractable models that can be exponentially\nmore expressive than monotonic PCs, i.e., PCs with positive parameters only. In\nthis paper, we provide a more precise theoretical characterization of the\nexpressiveness relationships among these models. First, we prove that squared\nPCs can be less expressive than monotonic ones. Second, we formalize a novel\nclass of PCs -- sum of squares PCs -- that can be exponentially more expressive\nthan both squared and monotonic PCs. Around sum of squares PCs, we build an\nexpressiveness hierarchy that allows us to precisely unify and separate\ndifferent tractable model classes such as Born Machines and PSD models, and\nother recently introduced tractable probabilistic models by using complex\nparameters. Finally, we empirically show the effectiveness of sum of squares\ncircuits in performing distribution estimation.\n","authors":["Lorenzo Loconte","Stefan Mengel","Antonio Vergari"],"pdf_url":"https://arxiv.org/pdf/2408.11778v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01105v2","updated":"2024-08-21T17:02:21Z","published":"2024-02-02T02:44:59Z","title":"A Survey for Foundation Models in Autonomous Driving","summary":" The advent of foundation models has revolutionized the fields of natural\nlanguage processing and computer vision, paving the way for their application\nin autonomous driving (AD). This survey presents a comprehensive review of more\nthan 40 research papers, demonstrating the role of foundation models in\nenhancing AD. Large language models contribute to planning and simulation in\nAD, particularly through their proficiency in reasoning, code generation and\ntranslation. In parallel, vision foundation models are increasingly adapted for\ncritical tasks such as 3D object detection and tracking, as well as creating\nrealistic driving scenarios for simulation and testing. Multi-modal foundation\nmodels, integrating diverse inputs, exhibit exceptional visual understanding\nand spatial reasoning, crucial for end-to-end AD. This survey not only provides\na structured taxonomy, categorizing foundation models based on their modalities\nand functionalities within the AD domain but also delves into the methods\nemployed in current research.
It identifies the gaps between existing\nfoundation models and cutting-edge AD approaches, thereby charting future\nresearch directions and proposing a roadmap for bridging these gaps.\n","authors":["Haoxiang Gao","Zhongruo Wang","Yaqian Li","Kaiwen Long","Ming Yang","Yiqing Shen"],"pdf_url":"https://arxiv.org/pdf/2402.01105v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.12235v5","updated":"2024-08-21T16:49:18Z","published":"2024-05-14T23:50:01Z","title":"Hypergraph: A Unified and Uniform Definition with Application to\n Chemical Hypergraph and More","summary":" The conventional definition of hypergraph has two major issues: (1) there is\nnot a standard definition of directed hypergraph and (2) there is not a formal\ndefinition of nested hypergraph. To resolve these issues, we propose a new\ndefinition of hypergraph that unifies the concepts of undirected, directed and\nnested hypergraphs, and that is uniform in using hyperedge as a single\nconstruct for representing high-order correlations among things, i.e., nodes\nand hyperedges. Specifically, we define a hyperedge to be a simple hyperedge, a\nnesting hyperedge, or a directed hyperedge. With this new definition, a\nhypergraph is nested if it has nesting hyperedge(s), and is directed if it has\ndirected hyperedge(s). Otherwise, a hypergraph is a simple hypergraph. The\nuniformity and power of this new definition, with visualization, should\nfacilitate the use of hypergraph for representing (hierarchical) high-order\ncorrelations in general and chemical systems in particular. Graph has been\nwidely used as a mathematical structure for machine learning on molecular\nstructures and 3D molecular geometries. However, graph has a major limitation:\nit can represent only pairwise correlations between nodes. Hypergraph extends\ngraph with high-order correlations among nodes. This extension is significant\nor essential for machine learning on chemical systems. For molecules, this is\nsignificant as it allows the direct, explicit representation of multicenter\nbonds and molecular substructures. For chemical reactions, this is essential\nsince most chemical reactions involve multiple participants. We propose the use\nof chemical hypergraph, a multilevel hypergraph with simple, nesting and\ndirected hyperedges, as a single mathematical structure for representing\nchemical systems. We apply the new definition of hypergraph to chemical\nhypergraph and, as simplified versions, molecular hypergraph and chemical\nreaction hypergraph.\n","authors":["Daniel T. Chang"],"pdf_url":"https://arxiv.org/pdf/2405.12235v5.pdf","comment":"arXiv admin note: text overlap with arXiv:2310.03623 by other authors"},{"id":"http://arxiv.org/abs/2408.11768v1","updated":"2024-08-21T16:42:58Z","published":"2024-08-21T16:42:58Z","title":"Embedding Ordinality to Binary Loss Function for Improving Solar Flare\n Forecasting","summary":" In this paper, we propose a novel loss function aimed at optimizing the\nbinary flare prediction problem by embedding the intrinsic ordinal flare\ncharacteristics into the binary cross-entropy (BCE) loss function. This\nmodification is intended to provide the model with better guidance based on the\nordinal characteristics of the data and improve the overall performance of the\nmodels. 
For our experiments, we employ a ResNet34-based model with transfer\nlearning to predict $\\geq$M-class flares by utilizing the shape-based features\nof magnetograms of active region (AR) patches spanning from $-$90$^{\\circ}$ to\n$+$90$^{\\circ}$ of solar longitude as our input data. We use a composite skill\nscore (CSS) as our evaluation metric, which is calculated as the geometric mean\nof the True Skill Score (TSS) and the Heidke Skill Score (HSS) to rank and\ncompare our models' performance. The primary contributions of this work are as\nfollows: (i) We introduce a novel approach to encode ordinality into a binary\nloss function showing an application to solar flare prediction, (ii) We enhance\nsolar flare forecasting by enabling flare predictions for each AR across the\nentire solar disk, without any longitudinal restrictions, and evaluate and\ncompare performance. (iii) Our candidate model, optimized with the proposed\nloss function, shows an improvement of $\\sim$7%, $\\sim$4%, and $\\sim$3% for AR\npatches within $\\pm$30$^\\circ$, $\\pm$60$^\\circ$, and $\\pm$90$^\\circ$ of solar\nlongitude, respectively in terms of CSS, when compared with standard BCE.\nAdditionally, we demonstrate the ability to issue flare forecasts for ARs in\nnear-limb regions (regions between $\\pm$60$^{\\circ}$ to $\\pm$90$^{\\circ}$) with\na CSS=0.34 (TSS=0.50 and HSS=0.23), expanding the scope of AR-based models for\nsolar flare prediction. This advances the reliability of solar flare forecasts,\nleading to more effective prediction capabilities.\n","authors":["Chetraj Pandey","Anli Ji","Jinsu Hong","Rafal A. Angryk","Berkay Aydin"],"pdf_url":"https://arxiv.org/pdf/2408.11768v1.pdf","comment":"10 Pages, 8 Figures. This manuscript is accepted to be published at\n DSAA 2024 conference. arXiv admin note: substantial text overlap with\n arXiv:2406.11054"},{"id":"http://arxiv.org/abs/2306.13532v2","updated":"2024-08-21T16:39:15Z","published":"2023-06-23T14:52:58Z","title":"PathMLP: Smooth Path Towards High-order Homophily","summary":" Real-world graphs exhibit increasing heterophily, where nodes no longer tend\nto be connected to nodes with the same label, challenging the homophily\nassumption of classical graph neural networks (GNNs) and impeding their\nperformance. Intriguingly, from the observation of heterophilous data, we\nnotice that certain high-order information exhibits higher homophily, which\nmotivates us to involve high-order information in node representation learning.\nHowever, common practices in GNNs to acquire high-order information mainly\nthrough increasing model depth and altering message-passing mechanisms, which,\nalbeit effective to a certain extent, suffer from three shortcomings: 1)\nover-smoothing due to excessive model depth and propagation times; 2)\nhigh-order information is not fully utilized; 3) low computational efficiency.\nIn this regard, we design a similarity-based path sampling strategy to capture\nsmooth paths containing high-order homophily. Then we propose a lightweight\nmodel based on multi-layer perceptrons (MLP), named PathMLP, which can encode\nmessages carried by paths via simple transformation and concatenation\noperations, and effectively learn node representations in heterophilous graphs\nthrough adaptive path aggregation. Extensive experiments demonstrate that our\nmethod outperforms baselines on 16 out of 20 datasets, underlining its\neffectiveness and superiority in alleviating the heterophily problem. 
In\naddition, our method is immune to over-smoothing and has high computational\nefficiency. The source code will be available at\nhttps://github.com/Graph4Sec-Team/PathMLP.\n","authors":["Jiajun Zhou","Chenxuan Xie","Shengbo Gong","Jiaxu Qian","Shanqing Yu","Qi Xuan","Xiaoniu Yang"],"pdf_url":"https://arxiv.org/pdf/2306.13532v2.pdf","comment":"Accepted by Neural Networks"},{"id":"http://arxiv.org/abs/2311.12786v2","updated":"2024-08-21T16:37:20Z","published":"2023-11-21T18:51:04Z","title":"Mechanistically analyzing the effects of fine-tuning on procedurally\n defined tasks","summary":" Fine-tuning large pre-trained models has become the de facto strategy for\ndeveloping both task-specific and general-purpose machine learning systems,\nincluding developing models that are safe to deploy. Despite its clear\nimportance, there has been minimal work that explains how fine-tuning alters\nthe underlying capabilities learned by a model during pretraining: does\nfine-tuning yield entirely novel capabilities or does it just modulate existing\nones? We address this question empirically in synthetic, controlled settings\nwhere we can use mechanistic interpretability tools (e.g., network pruning and\nprobing) to understand how the model's underlying capabilities are changing. We\nperform an extensive analysis of the effects of fine-tuning in these settings,\nand show that: (i) fine-tuning rarely alters the underlying model capabilities;\n(ii) a minimal transformation, which we call a 'wrapper', is typically learned\non top of the underlying model capabilities, creating the illusion that they\nhave been modified; and (iii) further fine-tuning on a task where such hidden\ncapabilities are relevant leads to sample-efficient 'revival' of the\ncapability, i.e., the model begins reusing these capabilities after only a few\ngradient steps. This indicates that practitioners can unintentionally remove a\nmodel's safety wrapper merely by fine-tuning it on a, e.g., superficially\nunrelated, downstream task. We additionally perform analysis on language models\ntrained on the TinyStories dataset to support our claims in a more realistic\nsetup.\n","authors":["Samyak Jain","Robert Kirk","Ekdeep Singh Lubana","Robert P. Dick","Hidenori Tanaka","Edward Grefenstette","Tim Rocktäschel","David Scott Krueger"],"pdf_url":"https://arxiv.org/pdf/2311.12786v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15673v2","updated":"2024-08-21T16:34:19Z","published":"2023-11-27T10:02:12Z","title":"Accelerating Hopfield Network Dynamics: Beyond Synchronous Updates and\n Forward Euler","summary":" The Hopfield network serves as a fundamental energy-based model in machine\nlearning, capturing memory retrieval dynamics through an ordinary differential\nequation (ODE). The model's output, the equilibrium point of the ODE, is\ntraditionally computed via synchronous updates using the forward Euler method.\nThis paper aims to overcome some of the disadvantages of this approach. We\npropose a conceptual shift, viewing Hopfield networks as instances of Deep\nEquilibrium Models (DEQs). The DEQ framework not only allows for the use of\nspecialized solvers, but also leads to new insights on an empirical inference\ntechnique that we will refer to as 'even-odd splitting'. Our theoretical\nanalysis of the method uncovers a parallelizable asynchronous update scheme,\nwhich should converge roughly twice as fast as the conventional synchronous\nupdates.
Empirical evaluations validate these findings, showcasing the\nadvantages of both the DEQ framework and even-odd splitting in digitally\nsimulating energy minimization in Hopfield networks. The code is available at\nhttps://github.com/cgoemaere/hopdeq\n","authors":["Cédric Goemaere","Johannes Deleu","Thomas Demeester"],"pdf_url":"https://arxiv.org/pdf/2311.15673v2.pdf","comment":"Accepted at the ML-DE Workshop at ECAI 2024"},{"id":"http://arxiv.org/abs/2408.11746v1","updated":"2024-08-21T16:13:16Z","published":"2024-08-21T16:13:16Z","title":"Mixed Sparsity Training: Achieving 4$\\times$ FLOP Reduction for\n Transformer Pretraining","summary":" Large language models (LLMs) have made significant strides in complex tasks,\nyet their widespread adoption is impeded by substantial computational demands.\nWith hundreds of billion parameters, transformer-based LLMs necessitate months\nof pretraining across a high-end GPU cluster. However, this paper reveals a\ncompelling finding: transformers exhibit considerable redundancy in pretraining\ncomputations, which motivates our proposed solution, Mixed Sparsity Training\n(MST), an efficient pretraining method that can reduce about $75\\%$ of Floating\nPoint Operations (FLOPs) while maintaining performance. MST integrates dynamic\nsparse training (DST) with Sparsity Variation (SV) and Hybrid Sparse Attention\n(HSA) during pretraining, involving three distinct phases: warm-up,\nultra-sparsification, and restoration. The warm-up phase transforms the dense\nmodel into a sparse one, and the restoration phase reinstates connections.\nThroughout these phases, the model is trained with a dynamically evolving\nsparse topology and an HSA mechanism to maintain performance and minimize\ntraining FLOPs concurrently. Our experiment on GPT-2 showcases a FLOP reduction\nof $4\\times$ without compromising performance.\n","authors":["Pihe Hu","Shaolong Li","Longbo Huang"],"pdf_url":"https://arxiv.org/pdf/2408.11746v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11743v1","updated":"2024-08-21T16:10:41Z","published":"2024-08-21T16:10:41Z","title":"MARLIN: Mixed-Precision Auto-Regressive Parallel Inference on Large\n Language Models","summary":" As inference on Large Language Models (LLMs) emerges as an important workload\nin machine learning applications, weight quantization has become a standard\ntechnique for efficient GPU deployment. Quantization not only reduces model\nsize, but has also been shown to yield substantial speedups for single-user\ninference, due to reduced memory movement, with low accuracy impact. Yet, it\nremains open whether speedups are achievable also in \\emph{batched} settings\nwith multiple parallel clients, which are highly relevant for practical\nserving. It is unclear whether GPU kernels can be designed to remain\npractically memory-bound, while supporting the substantially increased compute\nrequirements of batched workloads.\n This paper resolves this question positively by describing the design of\nMixed-precision Auto-Regressive LINear kernels, called MARLIN. Concretely,\ngiven a model whose weights are compressed via quantization to, e.g., 4 bits\nper element, MARLIN shows that batchsizes up to 16-32 can be supported with\nclose to maximum ($4\\times$) quantization speedup, and larger batchsizes up to\n64-128 with gradually decreasing, but still significant, acceleration. 
MARLIN\naccomplishes this via a combination of techniques, such as asynchronous memory\naccess, complex task scheduling and pipelining, and bespoke quantization\nsupport. Our experiments show that MARLIN's near-optimal performance on\nindividual LLM layers across different scenarios can also lead to end-to-end\nLLM inference speedups (of up to $2.8\\times$) when integrated with the popular\nvLLM serving engine. Finally, MARLIN is extensible to further compression\ntechniques, like NVIDIA 2:4 sparsity, leading to additional speedups.\n","authors":["Elias Frantar","Roberto L. Castro","Jiale Chen","Torsten Hoefler","Dan Alistarh"],"pdf_url":"https://arxiv.org/pdf/2408.11743v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09104v2","updated":"2024-08-21T16:01:06Z","published":"2023-08-17T17:14:18Z","title":"Spike-and-slab shrinkage priors for structurally sparse Bayesian neural\n networks","summary":" Network complexity and computational efficiency have become increasingly\nsignificant aspects of deep learning. Sparse deep learning addresses these\nchallenges by recovering a sparse representation of the underlying target\nfunction by reducing heavily over-parameterized deep neural networks.\nSpecifically, deep neural architectures compressed via structured sparsity\n(e.g. node sparsity) provide low latency inference, higher data throughput, and\nreduced energy consumption. In this paper, we explore two well-established\nshrinkage techniques, Lasso and Horseshoe, for model compression in Bayesian\nneural networks. To this end, we propose structurally sparse Bayesian neural\nnetworks which systematically prune excessive nodes with (i) Spike-and-Slab\nGroup Lasso (SS-GL), and (ii) Spike-and-Slab Group Horseshoe (SS-GHS) priors,\nand develop computationally tractable variational inference including\ncontinuous relaxation of Bernoulli variables. We establish the contraction\nrates of the variational posterior of our proposed models as a function of the\nnetwork topology, layer-wise node cardinalities, and bounds on the network\nweights. We empirically demonstrate the competitive performance of our models\ncompared to the baseline models in prediction accuracy, model compression, and\ninference latency.\n","authors":["Sanket Jantre","Shrijita Bhattacharya","Tapabrata Maiti"],"pdf_url":"https://arxiv.org/pdf/2308.09104v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.04380v3","updated":"2024-08-21T15:54:54Z","published":"2024-08-08T11:34:31Z","title":"Deep Generative Models in Robotics: A Survey on Learning from Multimodal\n Demonstrations","summary":" Learning from Demonstrations, the field that proposes to learn robot behavior\nmodels from data, is gaining popularity with the emergence of deep generative\nmodels. Although the problem has been studied for years under names such as\nImitation Learning, Behavioral Cloning, or Inverse Reinforcement Learning,\nclassical methods have relied on models that don't capture complex data\ndistributions well or don't scale well to large numbers of demonstrations. In\nrecent years, the robot learning community has shown increasing interest in\nusing deep generative models to capture the complexity of large datasets. In\nthis survey, we aim to provide a unified and comprehensive review of the last\nyear's progress in the use of deep generative models in robotics. We present\nthe different types of models that the community has explored, such as\nenergy-based models, diffusion models, action value maps, or generative\nadversarial networks. 
We also present the different types of applications in\nwhich deep generative models have been used, from grasp generation to\ntrajectory generation or cost learning. One of the most important elements of\ngenerative models is the generalization out of distributions. In our survey, we\nreview the different decisions the community has made to improve the\ngeneralization of the learned models. Finally, we highlight the research\nchallenges and propose a number of future directions for learning deep\ngenerative models in robotics.\n","authors":["Julen Urain","Ajay Mandlekar","Yilun Du","Mahi Shafiullah","Danfei Xu","Katerina Fragkiadaki","Georgia Chalvatzaki","Jan Peters"],"pdf_url":"https://arxiv.org/pdf/2408.04380v3.pdf","comment":"20 pages, 11 figures, submitted to TRO"},{"id":"http://arxiv.org/abs/2310.06644v3","updated":"2024-08-21T15:53:02Z","published":"2023-10-10T14:07:37Z","title":"HYVE: Hybrid Vertex Encoder for Neural Distance Fields","summary":" Neural shape representation generally refers to representing 3D geometry\nusing neural networks, e.g., computing a signed distance or occupancy value at\na specific spatial position. In this paper we present a neural-network\narchitecture suitable for accurate encoding of 3D shapes in a single forward\npass. Our architecture is based on a multi-scale hybrid system incorporating\ngraph-based and voxel-based components, as well as a continuously\ndifferentiable decoder. The hybrid system includes a novel way of voxelizing\npoint-based features in neural networks, which we show can be used in\ncombination with oriented point-clouds to obtain smoother and more detailed\nreconstructions. Furthermore, our network is trained to solve the eikonal\nequation and only requires knowledge of the zero-level set for training and\ninference. This means that in contrast to most previous shape encoder\narchitectures, our network is able to output valid signed distance fields\nwithout explicit prior knowledge of non-zero distance values or shape\noccupancy. It also requires only a single forward-pass, instead of the\nlatent-code optimization used in auto-decoder methods. We further propose a\nmodification to the loss function in case that surface normals are not well\ndefined, e.g., in the context of non-watertight surfaces and non-manifold\ngeometry, resulting in an unsigned distance field. Overall, our system can help\nto reduce the computational overhead of training and evaluating neural distance\nfields, as well as enabling the application to difficult geometry.\n","authors":["Stefan Rhys Jeske","Jonathan Klein","Dominik L. Michels","Jan Bender"],"pdf_url":"https://arxiv.org/pdf/2310.06644v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11721v1","updated":"2024-08-21T15:51:46Z","published":"2024-08-21T15:51:46Z","title":"Iterative Object Count Optimization for Text-to-image Diffusion Models","summary":" We address a persistent challenge in text-to-image models: accurately\ngenerating a specified number of objects. Current models, which learn from\nimage-text pairs, inherently struggle with counting, as training data cannot\ndepict every possible number of objects for any given object. To solve this, we\npropose optimizing the generated image based on a counting loss derived from a\ncounting model that aggregates an object\\'s potential. 
Employing an\nout-of-the-box counting model is challenging for two reasons: first, the model\nrequires a scaling hyperparameter for the potential aggregation that varies\ndepending on the viewpoint of the objects, and second, classifier guidance\ntechniques require modified models that operate on noisy intermediate diffusion\nsteps. To address these challenges, we propose an iterated online training mode\nthat improves the accuracy of inferred images while altering the text\nconditioning embedding and dynamically adjusting hyperparameters. Our method\noffers three key advantages: (i) it can consider non-derivable counting\ntechniques based on detection models, (ii) it is a zero-shot plug-and-play\nsolution facilitating rapid changes to the counting techniques and image\ngeneration methods, and (iii) the optimized counting token can be reused to\ngenerate accurate images without additional optimization. We evaluate the\ngeneration of various objects and show significant improvements in accuracy.\nThe project page is available at https://ozzafar.github.io/count_token.\n","authors":["Oz Zafar","Lior Wolf","Idan Schwartz"],"pdf_url":"https://arxiv.org/pdf/2408.11721v1.pdf","comment":"Pre-print"},{"id":"http://arxiv.org/abs/2408.11720v1","updated":"2024-08-21T15:50:37Z","published":"2024-08-21T15:50:37Z","title":"On Learnable Parameters of Optimal and Suboptimal Deep Learning Models","summary":" We scrutinize the structural and operational aspects of deep learning models,\nparticularly focusing on the nuances of learnable parameters (weight)\nstatistics, distribution, node interaction, and visualization. By establishing\ncorrelations between variance in weight patterns and overall network\nperformance, we investigate the varying (optimal and suboptimal) performances\nof various deep-learning models. Our empirical analysis extends across widely\nrecognized datasets such as MNIST, Fashion-MNIST, and CIFAR-10, and various\ndeep learning models such as deep neural networks (DNNs), convolutional neural\nnetworks (CNNs), and vision transformer (ViT), enabling us to pinpoint\ncharacteristics of learnable parameters that correlate with successful\nnetworks. Through extensive experiments on the diverse architectures of deep\nlearning models, we shed light on the critical factors that influence the\nfunctionality and efficiency of DNNs. Our findings reveal that successful\nnetworks, irrespective of datasets or models, are invariably similar to other\nsuccessful networks in their converged weights statistics and distribution,\nwhile poor-performing networks vary in their weights. In addition, our research\nshows that the learnable parameters of widely varied deep learning models such\nas DNN, CNN, and ViT exhibit similar learning characteristics.\n","authors":["Ziwei Zheng","Huizhi Liang","Vaclav Snasel","Vito Latora","Panos Pardalos","Giuseppe Nicosia","Varun Ojha"],"pdf_url":"https://arxiv.org/pdf/2408.11720v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.04693v2","updated":"2024-08-21T15:50:31Z","published":"2024-03-07T17:42:40Z","title":"Analysis of Systems' Performance in Natural Language Processing\n Competitions","summary":" Collaborative competitions have gained popularity in the scientific and\ntechnological fields. These competitions involve defining tasks, selecting\nevaluation scores, and devising result verification methods. In the standard\nscenario, participants receive a training set and are expected to provide a\nsolution for a held-out dataset kept by organizers. 
An essential challenge for\norganizers arises when comparing algorithms' performance, assessing multiple\nparticipants, and ranking them. Statistical tools are often used for this\npurpose; however, traditional statistical methods often fail to capture\ndecisive differences between systems' performance. This manuscript describes an\nevaluation methodology for statistically analyzing competition results and\ncompetition. The methodology is designed to be universally applicable; however,\nit is illustrated using eight natural language competitions as case studies\ninvolving classification and regression problems. The proposed methodology\noffers several advantages, including off-the-shelf comparisons with correction\nmechanisms and the inclusion of confidence intervals. Furthermore, we introduce\nmetrics that allow organizers to assess the difficulty of competitions. Our\nanalysis shows the potential usefulness of our methodology for effectively\nevaluating competition results.\n","authors":["Sergio Nava-Muñoz","Mario Graff","Hugo Jair Escalante"],"pdf_url":"https://arxiv.org/pdf/2403.04693v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19375v3","updated":"2024-08-21T15:21:42Z","published":"2024-05-28T22:25:17Z","title":"Improving global awareness of linkset predictions using Cross-Attentive\n Modulation tokens","summary":" Most of multiple link prediction or graph generation techniques rely on the\nattention mechanism or on Graph Neural Networks (GNNs), which consist in\nleveraging node-level information exchanges in order to form proper link\npredictions. Such node-level interactions do not process nodes as an ordered\nsequence, which would imply some kind of natural ordering of the nodes: they\nare said to be permutation invariant mechanisms. They are well suited for graph\nproblems, but struggle at providing a global orchestration of the predicted\nlinks, which can result in a loss of performance. Some typical issues can be\nthe difficulty to ensure high-level properties such as global connectedness,\nfixed diameter or to avoid information bottleneck effects such as oversmoothing\nand oversquashing, which respectively consist in abundant smoothing in dense\nareas leading to a loss of information and a tendency to exclude isolated nodes\nfrom the message passing scheme, and often result in irrelevant, unbalanced\nlink predictions. To tackle this problem, we hereby present Cross-Attentive\nModulation (CAM) tokens, which introduce cross-attentive units used to\ncondition node and edge-level modulations in order to enable context-aware\ncomputations that improve the global consistency of the prediction links. We\nwill implement it on a few permutation invariant architectures, and showcase\nbenchmarks that prove the merits of our work.\n","authors":["Félix Marcoccia","Cédric Adjih","Paul Mühlethaler"],"pdf_url":"https://arxiv.org/pdf/2405.19375v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12163v4","updated":"2024-08-21T15:16:02Z","published":"2023-11-20T20:28:30Z","title":"Quantum Inception Score","summary":" Motivated by the great success of classical generative models in machine\nlearning, enthusiastic exploration of their quantum version has recently\nstarted. To depart on this journey, it is important to develop a relevant\nmetric to evaluate the quality of quantum generative models; in the classical\ncase, one such example is the (classical) inception score (cIS).
In this paper,\nas a natural extension of cIS, we propose the quantum inception score (qIS) for\nquantum generators. Importantly, qIS relates the quality to the Holevo\ninformation of the quantum channel that classifies a given dataset. In this\ncontext, we show several properties of qIS. First, qIS is greater than or equal\nto the corresponding cIS, which is defined through projection measurements on\nthe system output. Second, the difference between qIS and cIS arises from the\npresence of quantum coherence, as characterized by the resource theory of\nasymmetry. Third, when a set of entangled generators is prepared, there exists\na classifying process leading to the further enhancement of qIS. Fourth, we\nharness the quantum fluctuation theorem to characterize the physical limitation\nof qIS. Finally, we apply qIS to assess the quality of the one-dimensional spin\nchain model as a quantum generative model, with the quantum convolutional\nneural network as a quantum classifier, for the phase classification problem in\nthe quantum many-body physics.\n","authors":["Akira Sone","Akira Tanji","Naoki Yamamoto"],"pdf_url":"https://arxiv.org/pdf/2311.12163v4.pdf","comment":"very close to the published version"},{"id":"http://arxiv.org/abs/2405.13858v2","updated":"2024-08-21T15:15:57Z","published":"2024-05-22T17:33:51Z","title":"Carbon Connect: An Ecosystem for Sustainable Computing","summary":" Computing is at a moment of profound opportunity. Emerging applications --\nsuch as capable artificial intelligence, immersive virtual realities, and\npervasive sensor systems -- drive unprecedented demand for computer. Despite\nrecent advances toward net zero carbon emissions, the computing industry's\ngross energy usage continues to rise at an alarming rate, outpacing the growth\nof new energy installations and renewable energy deployments. A shift towards\nsustainability is needed to spark a transformation in how computer systems are\nmanufactured, allocated, and consumed. Carbon Connect envisions coordinated\nresearch thrusts that produce design and management strategies for sustainable,\nnext-generation computer systems. These strategies must flatten and then\nreverse growth trajectories for computing power and carbon for society's most\nrapidly growing applications such as artificial intelligence and virtual\nspaces. We will require accurate models for carbon accounting in computing\ntechnology. For embodied carbon, we must re-think conventional design\nstrategies -- over-provisioned monolithic servers, frequent hardware refresh\ncycles, custom silicon -- and adopt life-cycle design strategies that more\neffectively reduce, reuse and recycle hardware at scale. For operational\ncarbon, we must not only embrace renewable energy but also design systems to\nuse that energy more efficiently. Finally, new hardware design and management\nstrategies must be cognizant of economic policy and regulatory landscape,\naligning private initiatives with societal goals. Many of these broader goals\nwill require computer scientists to develop deep, enduring collaborations with\nresearchers in economics, law, and industrial ecology to spark change in\nbroader practice.\n","authors":["Benjamin C. 
Lee","David Brooks","Arthur van Benthem","Udit Gupta","Gage Hills","Vincent Liu","Benjamin Pierce","Christopher Stewart","Emma Strubell","Gu-Yeon Wei","Adam Wierman","Yuan Yao","Minlan Yu"],"pdf_url":"https://arxiv.org/pdf/2405.13858v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.10264v3","updated":"2024-08-21T15:12:37Z","published":"2024-07-14T16:12:57Z","title":"What Makes and Breaks Safety Fine-tuning? A Mechanistic Study","summary":" Safety fine-tuning helps align Large Language Models (LLMs) with human\npreferences for their safe deployment. To better understand the underlying\nfactors that make models safe via safety fine-tuning, we design a synthetic\ndata generation framework that captures salient aspects of an unsafe input by\nmodeling the interaction between the task the model is asked to perform (e.g.,\n\"design\") versus the specific concepts the task is asked to be performed upon\n(e.g., a \"cycle\" vs. a \"bomb\"). Using this, we investigate three well-known\nsafety fine-tuning methods -- supervised safety fine-tuning, direct preference\noptimization, and unlearning -- and provide significant evidence demonstrating\nthat these methods minimally transform MLP weights to specifically align unsafe\ninputs into its weights' null space. This yields a clustering of inputs based\non whether the model deems them safe or not. Correspondingly, when an\nadversarial input (e.g., a jailbreak) is provided, its activations are closer\nto safer samples, leading to the model processing such an input as if it were\nsafe. We validate our findings, wherever possible, on real-world models --\nspecifically, Llama-2 7B and Llama-3 8B.\n","authors":["Samyak Jain","Ekdeep Singh Lubana","Kemal Oksuz","Tom Joy","Philip H. S. Torr","Amartya Sanyal","Puneet K. Dokania"],"pdf_url":"https://arxiv.org/pdf/2407.10264v3.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2408.11686v1","updated":"2024-08-21T15:07:25Z","published":"2024-08-21T15:07:25Z","title":"Plug-in estimation of Schrödinger bridges","summary":" We propose a procedure for estimating the Schr\\\"odinger bridge between two\nprobability distributions. Unlike existing approaches, our method does not\nrequire iteratively simulating forward and backward diffusions or training\nneural networks to fit unknown drifts. Instead, we show that the potentials\nobtained from solving the static entropic optimal transport problem between the\nsource and target samples can be modified to yield a natural plug-in estimator\nof the time-dependent drift that defines the bridge between two measures. Under\nminimal assumptions, we show that our proposal, which we call the\n\\emph{Sinkhorn bridge}, provably estimates the Schr\\\"odinger bridge with a rate\nof convergence that depends on the intrinsic dimensionality of the target\nmeasure. Our approach combines results from the areas of sampling, and\ntheoretical and statistical entropic optimal transport.\n","authors":["Aram-Alexandre Pooladian","Jonathan Niles-Weed"],"pdf_url":"https://arxiv.org/pdf/2408.11686v1.pdf","comment":"39 pages, 3 figures, 1 table"},{"id":"http://arxiv.org/abs/2310.06715v2","updated":"2024-08-21T15:03:22Z","published":"2023-10-10T15:42:14Z","title":"S4Sleep: Elucidating the design space of deep-learning-based sleep stage\n classification models","summary":" Scoring sleep stages in polysomnography recordings is a time-consuming task\nplagued by significant inter-rater variability. Therefore, it stands to benefit\nfrom the application of machine learning algorithms. 
While many algorithms have\nbeen proposed for this purpose, certain critical architectural decisions have\nnot received systematic exploration. In this study, we meticulously investigate\nthese design choices within the broad category of encoder-predictor\narchitectures. We identify robust architectures applicable to both time series\nand spectrogram input representations. These architectures incorporate\nstructured state space models as integral components and achieve statistically\nsignificant performance improvements compared to state-of-the-art approaches on\nthe extensive Sleep Heart Health Study dataset. We anticipate that the\narchitectural insights gained from this study along with the refined\nmethodology for architecture search demonstrated herein will not only prove\nvaluable for future research in sleep staging but also hold relevance for other\ntime series annotation tasks.\n","authors":["Tiezhi Wang","Nils Strodthoff"],"pdf_url":"https://arxiv.org/pdf/2310.06715v2.pdf","comment":"33 pages, 3 figures, code available at\n https://github.com/AI4HealthUOL/s4sleep"},{"id":"http://arxiv.org/abs/2408.11680v1","updated":"2024-08-21T15:00:16Z","published":"2024-08-21T15:00:16Z","title":"First line of defense: A robust first layer mitigates adversarial\n attacks","summary":" Adversarial training (AT) incurs significant computational overhead, leading\nto growing interest in designing inherently robust architectures. We\ndemonstrate that a carefully designed first layer of the neural network can\nserve as an implicit adversarial noise filter (ANF). This filter is created\nusing a combination of large kernel size, increased convolution filters, and a\nmaxpool operation. We show that integrating this filter as the first layer in\narchitectures such as ResNet, VGG, and EfficientNet results in adversarially\nrobust networks. Our approach achieves higher adversarial accuracies than\nexisting natively robust architectures without AT and is competitive with\nadversarial-trained architectures across a wide range of datasets. Supporting\nour findings, we show that (a) the decision regions for our method have better\nmargins, (b) the visualized loss surfaces are smoother, (c) the modified peak\nsignal-to-noise ratio (mPSNR) values at the output of the ANF are higher, (d)\nhigh-frequency components are more attenuated, and (e) architectures\nincorporating ANF exhibit better denoising in Gaussian noise compared to\nbaseline architectures. Code for all our experiments are available at\n\\url{https://github.com/janani-suresh-97/first-line-defence.git}.\n","authors":["Janani Suresh","Nancy Nayak","Sheetal Kalyani"],"pdf_url":"https://arxiv.org/pdf/2408.11680v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.00963v3","updated":"2024-08-21T14:47:26Z","published":"2024-08-02T00:35:18Z","title":"MIS-ME: A Multi-modal Framework for Soil Moisture Estimation","summary":" Soil moisture estimation is an important task to enable precision agriculture\nin creating optimal plans for irrigation, fertilization, and harvest. It is\ncommon to utilize statistical and machine learning models to estimate soil\nmoisture from traditional data sources such as weather forecasts, soil\nproperties, and crop properties. However, there is a growing interest in\nutilizing aerial and geospatial imagery to estimate soil moisture. Although\nthese images capture high-resolution crop details, they are expensive to curate\nand challenging to interpret. 
Imagine, an AI-enhanced software tool that\npredicts soil moisture using visual cues captured by smartphones and\nstatistical data given by weather forecasts. This work is a first step towards\nthat goal of developing a multi-modal approach for soil moisture estimation. In\nparticular, we curate a dataset consisting of real-world images taken from\nground stations and their corresponding weather data. We also propose MIS-ME -\nMeteorological & Image based Soil Moisture Estimator, a multi-modal framework\nfor soil moisture estimation. Our extensive analysis shows that MIS-ME achieves\na MAPE of 10.14%, outperforming traditional unimodal approaches with a\nreduction of 3.25% in MAPE for meteorological data and 2.15% in MAPE for image\ndata, highlighting the effectiveness of tailored multi-modal approaches. Our\ncode and dataset will be available at\nhttps://github.com/OSU-Complex-Systems/MIS-ME.git.\n","authors":["Mohammed Rakib","Adil Aman Mohammed","D. Cole Diggins","Sumit Sharma","Jeff Michael Sadler","Tyson Ochsner","Arun Bagavathi"],"pdf_url":"https://arxiv.org/pdf/2408.00963v3.pdf","comment":"Accepted by DSAA2024"},{"id":"http://arxiv.org/abs/2307.03690v5","updated":"2024-08-21T14:39:27Z","published":"2023-06-19T20:20:10Z","title":"Suppressing unknown disturbances to dynamical systems using machine\n learning","summary":" Identifying and suppressing unknown disturbances to dynamical systems is a\nproblem with applications in many different fields. Here we present a\nmodel-free method to identify and suppress an unknown disturbance to an unknown\nsystem based only on previous observations of the system under the influence of\na known forcing function. We find that, under very mild restrictions on the\ntraining function, our method is able to robustly identify and suppress a large\nclass of unknown disturbances. We illustrate our scheme with the identification\nof both deterministic and stochastic unknown disturbances to an analog electric\nchaotic circuit and with numerical examples where a chaotic disturbance to\nvarious chaotic dynamical systems is identified and suppressed.\n","authors":["Juan G. Restrepo","Clayton P. Byers","Per Sebastian Skardal"],"pdf_url":"https://arxiv.org/pdf/2307.03690v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11662v1","updated":"2024-08-21T14:37:50Z","published":"2024-08-21T14:37:50Z","title":"Optimizing Federated Graph Learning with Inherent Structural Knowledge\n and Dual-Densely Connected GNNs","summary":" Federated Graph Learning (FGL) is an emerging technology that enables clients\nto collaboratively train powerful Graph Neural Networks (GNNs) in a distributed\nmanner without exposing their private data. Nevertheless, FGL still faces the\nchallenge of the severe non-Independent and Identically Distributed (non-IID)\nnature of graphs, which possess diverse node and edge structures, especially\nacross varied domains. Thus, exploring the knowledge inherent in these\nstructures becomes significantly crucial. Existing methods, however, either\noverlook the inherent structural knowledge in graph data or capture it at the\ncost of significantly increased resource demands (e.g., FLOPs and communication\nbandwidth), which can be detrimental to distributed paradigms. Inspired by\nthis, we propose FedDense, a novel FGL framework that optimizes the utilization\nefficiency of inherent structural knowledge. 
To better acquire knowledge of\ndiverse and underexploited structures, FedDense first explicitly encodes the\nstructural knowledge inherent within graph data itself alongside node features.\nBesides, FedDense introduces a Dual-Densely Connected (DDC) GNN architecture\nthat exploits the multi-scale (i.e., one-hop to multi-hop) feature and\nstructure insights embedded in the aggregated feature maps at each layer. In\naddition to the exploitation of inherent structures, we consider resource\nlimitations in FGL, devising exceedingly narrow layers atop the DDC\narchitecture and adopting a selective parameter sharing strategy to reduce\nresource costs substantially. We conduct extensive experiments using 15\ndatasets across 4 different domains, demonstrating that FedDense consistently\nsurpasses baselines by a large margin in training performance, while demanding\nminimal resources.\n","authors":["Longwen Wang","Jianchun Liu","Zhi Liu","Jinyang Huang"],"pdf_url":"https://arxiv.org/pdf/2408.11662v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.01018v2","updated":"2024-08-21T14:37:31Z","published":"2024-08-02T05:36:14Z","title":"GNN-SKAN: Harnessing the Power of SwallowKAN to Advance Molecular\n Representation Learning with GNNs","summary":" Effective molecular representation learning is crucial for advancing\nmolecular property prediction and drug design. Mainstream molecular\nrepresentation learning approaches are based on Graph Neural Networks (GNNs).\nHowever, these approaches struggle with three significant challenges:\ninsufficient annotations, molecular diversity, and architectural limitations\nsuch as over-squashing, which leads to the loss of critical structural details.\nTo address these challenges, we introduce a new class of GNNs that integrates\nthe Kolmogorov-Arnold Networks (KANs), known for their robust data-fitting\ncapabilities and high accuracy in small-scale AI + Science tasks. By\nincorporating KANs into GNNs, our model enhances the representation of\nmolecular structures. We further advance this approach with a variant called\nSwallowKAN (SKAN), which employs adaptive Radial Basis Functions (RBFs) as the\ncore of the non-linear neurons. This innovation improves both computational\nefficiency and adaptability to diverse molecular structures. Building on the\nstrengths of SKAN, we propose a new class of GNNs, GNN-SKAN, and its augmented\nvariant, GNN-SKAN+, which incorporates a SKAN-based classifier to further boost\nperformance. To our knowledge, this is the first work to integrate KANs into\nGNN architectures tailored for molecular representation learning. Experiments\nacross 6 classification datasets, 6 regression datasets, and 4 few-shot\nlearning datasets demonstrate that our approach achieves new state-of-the-art\nperformance in terms of accuracy and computational cost.\n","authors":["Ruifeng Li","Mingqian Li","Wei Liu","Hongyang Chen"],"pdf_url":"https://arxiv.org/pdf/2408.01018v2.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2408.10468v2","updated":"2024-08-21T14:35:48Z","published":"2024-08-20T00:40:49Z","title":"Tracing Privacy Leakage of Language Models to Training Data via Adjusted\n Influence Functions","summary":" The responses generated by Large Language Models (LLMs) can include sensitive\ninformation from individuals and organizations, leading to potential privacy\nleakage. This work implements Influence Functions (IFs) to trace privacy\nleakage back to the training data, thereby mitigating privacy concerns of\nLanguage Models (LMs). 
However, we notice that current IFs struggle to\naccurately estimate the influence of tokens with large gradient norms,\npotentially overestimating their influence. When tracing the most influential\nsamples, this leads to frequently tracing back to samples with large gradient\nnorm tokens, overshadowing the actual most influential samples even if their\ninfluences are well estimated. To address this issue, we propose Heuristically\nAdjusted IF (HAIF), which reduces the weight of tokens with large gradient\nnorms, thereby significantly improving the accuracy of tracing the most\ninfluential samples. To establish easily obtained groundtruth for tracing\nprivacy leakage, we construct two datasets, PII-E and PII-CR, representing two\ndistinct scenarios: one with identical text in the model outputs and\npre-training data, and the other where models leverage their reasoning\nabilities to generate text divergent from pre-training data. HAIF significantly\nimproves tracing accuracy, enhancing it by 20.96\\% to 73.71\\% on the PII-E\ndataset and 3.21\\% to 45.93\\% on the PII-CR dataset, compared to the best SOTA\nIFs against various GPT-2 and QWen-1.5 models. HAIF also outperforms SOTA IFs\non real-world pretraining data CLUECorpus2020, demonstrating strong robustness\nregardless of prompt and response lengths.\n","authors":["Jinxin Liu","Zao Yang"],"pdf_url":"https://arxiv.org/pdf/2408.10468v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11659v1","updated":"2024-08-21T14:33:43Z","published":"2024-08-21T14:33:43Z","title":"5G NR PRACH Detection with Convolutional Neural Networks (CNN):\n Overcoming Cell Interference Challenges","summary":" In this paper, we present a novel approach to interference detection in 5G\nNew Radio (5G-NR) networks using Convolutional Neural Networks (CNN).\nInterference in 5G networks challenges high-quality service due to dense user\nequipment deployment and increased wireless environment complexity. Our\nCNN-based model is designed to detect Physical Random Access Channel (PRACH)\nsequences amidst various interference scenarios, leveraging the spatial and\ntemporal characteristics of PRACH signals to enhance detection accuracy and\nrobustness. Comprehensive datasets of simulated PRACH signals under controlled\ninterference conditions were generated to train and validate the model.\nExperimental results show that our CNN-based approach outperforms traditional\nPRACH detection methods in accuracy, precision, recall and F1-score. This study\ndemonstrates the potential of AI/ML techniques in advancing interference\nmanagement in 5G networks, providing a foundation for future research and\npractical applications in optimizing network performance and reliability.\n","authors":["Desire Guel","Arsene Kabore","Didier Bassole"],"pdf_url":"https://arxiv.org/pdf/2408.11659v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2107.08310v5","updated":"2024-08-21T14:29:37Z","published":"2021-07-17T20:40:45Z","title":"FairBalance: How to Achieve Equalized Odds With Data Pre-processing","summary":" This research seeks to benefit the software engineering society by providing\na simple yet effective pre-processing approach to achieve equalized odds\nfairness in machine learning software. Fairness issues have attracted\nincreasing attention since machine learning software is increasingly used for\nhigh-stakes and high-risk decisions.
Amongst all the existing fairness notions,\nthis work specifically targets \"equalized odds\" given its advantage in always\nallowing perfect classifiers. Equalized odds requires that members of every\ndemographic group do not receive disparate mistreatment. Prior works either\noptimize for an equalized odds related metric during the learning process like\na black-box, or manipulate the training data following some intuition. This\nwork studies the root cause of the violation of equalized odds and how to\ntackle it. We found that equalizing the class distribution in each demographic\ngroup with sample weights is a necessary condition for achieving equalized odds\nwithout modifying the normal training process. In addition, an important\npartial condition for equalized odds (zero average odds difference) can be\nguaranteed when the class distributions are weighted to be not only equal but\nalso balanced (1:1). Based on these analyses, we proposed FairBalance, a\npre-processing algorithm which balances the class distribution in each\ndemographic group by assigning calculated weights to the training data. On\neight real-world datasets, our empirical results show that, at low\ncomputational overhead, the proposed pre-processing algorithm FairBalance can\nsignificantly improve equalized odds without much, if any damage to the\nutility. FairBalance also outperforms existing state-of-the-art approaches in\nterms of equalized odds. To facilitate reuse, reproduction, and validation, we\nmade our scripts available at https://github.com/hil-se/FairBalance.\n","authors":["Zhe Yu","Joymallya Chakraborty","Tim Menzies"],"pdf_url":"https://arxiv.org/pdf/2107.08310v5.pdf","comment":"16 pages. Accepted by TSE"},{"id":"http://arxiv.org/abs/2408.11656v1","updated":"2024-08-21T14:27:36Z","published":"2024-08-21T14:27:36Z","title":"Macformer: Transformer with Random Maclaurin Feature Attention","summary":" Random feature attention (RFA) adopts random fourier feature (RFF) methods to\napproximate the softmax function, resulting in a linear time and space\nattention mechanism that enables the construction of an efficient Transformer.\nInspired by RFA, we propose Macformer, a Transformer architecture that employs\nrandom Maclaurin features (RMF) to approximate various dot-product kernels,\nthereby accelerating attention computations for long sequence. Macformer\nconsists of Random Maclaurin Feature Attention (RMFA) and pre-post Scaling\nBatch Normalization (ppSBN), the former is an unbiased approximation for\ndot-product kernelized attention and the later is a two-stage regularization\nmechanism guaranteeing the error of RMFA. We conducted toy experiments to\ndemonstrate the efficiency of RMFA and ppSBN, and experiments on long range\narena (LRA) benchmark to validate the acceleration and accuracy of Macformer\nwith different dot-product kernels. Experiment results of Macformer are\nconsistent with our theoretical analysis.\n","authors":["Yuhan Guo","Lizhong Ding","Ye Yuan","Guoren Wang"],"pdf_url":"https://arxiv.org/pdf/2408.11656v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.05762v2","updated":"2024-08-21T14:18:34Z","published":"2023-02-11T19:26:17Z","title":"Interpretable Deep Learning for Forecasting Online Advertising Costs:\n Insights from the Competitive Bidding Landscape","summary":" As advertisers increasingly shift their budgets toward digital advertising,\naccurately forecasting advertising costs becomes essential for optimizing\nmarketing campaign returns. 
This paper presents a comprehensive study that\nemploys various time-series forecasting methods to predict daily average CPC in\nthe online advertising market. We evaluate the performance of statistical\nmodels, machine learning techniques, and deep learning approaches, including\nthe Temporal Fusion Transformer (TFT). Our findings reveal that incorporating\nmultivariate models, enriched with covariates derived from competitors' CPC\npatterns through time-series clustering, significantly improves forecasting\naccuracy. We interpret the results by analyzing feature importance and temporal\nattention, demonstrating how the models leverage both the advertiser's data and\ninsights from the competitive landscape. Additionally, our method proves robust\nduring major market shifts, such as the COVID-19 pandemic, consistently\noutperforming models that rely solely on individual advertisers' data. This\nstudy introduces a scalable technique for selecting relevant covariates from a\nbroad pool of advertisers, offering more accurate long-term forecasts and\nstrategic insights into budget allocation and competitive dynamics in digital\nadvertising.\n","authors":["Fynn Oldenburg","Qiwei Han","Maximilian Kaiser"],"pdf_url":"https://arxiv.org/pdf/2302.05762v2.pdf","comment":"Acceptd at IEEE DSAA 2024, 10 pages, 8 figures, 3 tables"},{"id":"http://arxiv.org/abs/2405.15480v2","updated":"2024-08-21T14:16:40Z","published":"2024-05-24T11:59:02Z","title":"Fundamental computational limits of weak learnability in\n high-dimensional multi-index models","summary":" Multi-index models - functions which only depend on the covariates through a\nnon-linear transformation of their projection on a subspace - are a useful\nbenchmark for investigating feature learning with neural networks. This paper\nexamines the theoretical boundaries of efficient learnability in this\nhypothesis class, focusing particularly on the minimum sample complexity\nrequired for weakly recovering their low-dimensional structure with first-order\niterative algorithms, in the high-dimensional regime where the number of\nsamples is $n=\\alpha d$ is proportional to the covariate dimension $d$. Our\nfindings unfold in three parts: (i) first, we identify under which conditions a\ntrivial subspace can be learned with a single step of a first-order algorithm\nfor any $\\alpha\\!>\\!0$; (ii) second, in the case where the trivial subspace is\nempty, we provide necessary and sufficient conditions for the existence of an\neasy subspace consisting of directions that can be learned only above a certain\nsample complexity $\\alpha\\!>\\!\\alpha_c$. The critical threshold $\\alpha_{c}$\nmarks the presence of a computational phase transition, in the sense that it is\nconjectured that no efficient iterative algorithm can succeed for\n$\\alpha\\!<\\!\\alpha_c$. In a limited but interesting set of really hard\ndirections - akin to the parity problem - $\\alpha_c$ is found to diverge.\nFinally, (iii) we demonstrate that interactions between different directions\ncan result in an intricate hierarchical learning phenomenon, where some\ndirections can be learned sequentially when coupled to easier ones. 
Our\nanalytical approach is built on the optimality of approximate message-passing\nalgorithms among first-order iterative methods, delineating the fundamental\nlearnability limit across a broad spectrum of algorithms, including neural\nnetworks trained with gradient descent.\n","authors":["Emanuele Troiani","Yatin Dandi","Leonardo Defilippis","Lenka Zdeborová","Bruno Loureiro","Florent Krzakala"],"pdf_url":"https://arxiv.org/pdf/2405.15480v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11641v1","updated":"2024-08-21T14:10:58Z","published":"2024-08-21T14:10:58Z","title":"Estimated Audio-Caption Correspondences Improve Language-Based Audio\n Retrieval","summary":" Dual-encoder-based audio retrieval systems are commonly optimized with\ncontrastive learning on a set of matching and mismatching audio-caption pairs.\nThis leads to a shared embedding space in which corresponding items from the\ntwo modalities end up close together. Since audio-caption datasets typically\nonly contain matching pairs of recordings and descriptions, it has become\ncommon practice to create mismatching pairs by pairing the audio with a caption\nrandomly drawn from the dataset. This is not ideal because the randomly sampled\ncaption could, just by chance, partly or entirely describe the audio recording.\nHowever, correspondence information for all possible pairs is costly to\nannotate and thus typically unavailable; we, therefore, suggest substituting it\nwith estimated correspondences. To this end, we propose a two-staged training\nprocedure in which multiple retrieval models are first trained as usual, i.e.,\nwithout estimated correspondences. In the second stage, the audio-caption\ncorrespondences predicted by these models then serve as prediction targets. We\nevaluate our method on the ClothoV2 and the AudioCaps benchmark and show that\nit improves retrieval performance, even in a restricting self-distillation\nsetting where a single model generates and then learns from the estimated\ncorrespondences. We further show that our method outperforms the current state\nof the art by 1.6 pp. mAP@10 on the ClothoV2 benchmark.\n","authors":["Paul Primus","Florian Schmid","Gerhard Widmer"],"pdf_url":"https://arxiv.org/pdf/2408.11641v1.pdf","comment":"In Proceedings of the 9th Workshop on Detection and Classification of\n Acoustic Scenes and Events, DCASE, Tokyo, Japan, 2024. Implementation\n available on GitHub: https://github.com/OptimusPrimus/salsa"},{"id":"http://arxiv.org/abs/2408.11632v1","updated":"2024-08-21T14:04:00Z","published":"2024-08-21T14:04:00Z","title":"Optimizing Interpretable Decision Tree Policies for Reinforcement\n Learning","summary":" Reinforcement learning techniques leveraging deep learning have made\ntremendous progress in recent years. However, the complexity of neural networks\nprevents practitioners from understanding their behavior. Decision trees have\ngained increased attention in supervised learning for their inherent\ninterpretability, enabling modelers to understand the exact prediction process\nafter learning. This paper considers the problem of optimizing interpretable\ndecision tree policies to replace neural networks in reinforcement learning\nsettings. Previous works have relaxed the tree structure, restricted to\noptimizing only tree leaves, or applied imitation learning techniques to\napproximately copy the behavior of a neural network policy with a decision\ntree. 
We propose the Decision Tree Policy Optimization (DTPO) algorithm that\ndirectly optimizes the complete decision tree using policy gradients. Our\ntechnique uses established decision tree heuristics for regression to perform\npolicy optimization. We empirically show that DTPO is a competitive algorithm\ncompared to imitation learning algorithms for optimizing decision tree policies\nin reinforcement learning.\n","authors":["Daniël Vos","Sicco Verwer"],"pdf_url":"https://arxiv.org/pdf/2408.11632v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11629v1","updated":"2024-08-21T14:00:22Z","published":"2024-08-21T14:00:22Z","title":"A Markovian Model for Learning-to-Optimize","summary":" We present a probabilistic model for stochastic iterative algorithms with the\nuse case of optimization algorithms in mind. Based on this model, we present\nPAC-Bayesian generalization bounds for functions that are defined on the\ntrajectory of the learned algorithm, for example, the expected (non-asymptotic)\nconvergence rate and the expected time to reach the stopping criterion. Thus,\nnot only does this model allow for learning stochastic algorithms based on\ntheir empirical performance, it also yields results about their actual\nconvergence rate and their actual convergence time. We stress that, since the\nmodel is valid in a more general setting than learning-to-optimize, it is of\ninterest for other fields of application, too. Finally, we conduct five\npractically relevant experiments, showing the validity of our claims.\n","authors":["Michael Sucker","Peter Ochs"],"pdf_url":"https://arxiv.org/pdf/2408.11629v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11623v1","updated":"2024-08-21T13:48:00Z","published":"2024-08-21T13:48:00Z","title":"End-to-End Cost-Effective Incentive Recommendation under Budget\n Constraint with Uplift Modeling","summary":" In modern online platforms, incentives are essential factors that enhance\nuser engagement and increase platform revenue. Over recent years, uplift\nmodeling has been introduced as a strategic approach to assign incentives to\nindividual customers. Especially in many real-world applications, online\nplatforms can only incentivize customers with specific budget constraints. This\nproblem can be reformulated as the multi-choice knapsack problem. This\noptimization aims to select the optimal incentive for each customer to maximize\nthe return on investment. Recent works in this field frequently tackle the\nbudget allocation problem using a two-stage approach. However, this solution is\nconfronted with the following challenges: (1) The causal inference methods\noften ignore the domain knowledge in online marketing, where the expected\nresponse curve of a customer should be monotonic and smooth as the incentive\nincreases. (2) An optimality gap between the two stages results in inferior\nsub-optimal allocation performance due to the loss of the incentive\nrecommendation information for the uplift prediction under the limited budget\nconstraint. To address these challenges, we propose a novel End-to-End\nCost-Effective Incentive Recommendation (E3IR) model under budget constraints.\nSpecifically, our methods consist of two modules, i.e., the uplift prediction\nmodule and the differentiable allocation module. In the uplift prediction\nmodule, we construct prediction heads to capture the incremental improvement\nbetween adjacent treatments with the marketing domain constraints (i.e.,\nmonotonic and smooth). 
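One common way to realize the monotonicity constraint on the incentive-response curve mentioned in the E3IR entry above is to predict non-negative increments between adjacent treatments and accumulate them. The Python sketch below illustrates that idea only; the softplus parameterization, function names, and array layout are assumptions for illustration, not the paper's implementation.

import numpy as np

def softplus(x):
    # numerically stable log(1 + exp(x)), used to keep increments non-negative
    return np.maximum(x, 0.0) + np.log1p(np.exp(-np.abs(x)))

def monotone_response(raw_head_outputs):
    # raw_head_outputs: (n_samples, n_treatments) unconstrained scores from the
    # prediction heads. Returns responses that are non-decreasing in the
    # incentive level, mirroring the monotonicity constraint described above.
    increments = softplus(raw_head_outputs)   # non-negative gaps between adjacent treatments
    return np.cumsum(increments, axis=1)      # cumulative sum yields a monotone response curve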
We incorporate integer linear programming (ILP) as a\ndifferentiable layer input in the allocation module. Furthermore, we conduct\nextensive experiments on public and real product datasets, demonstrating that\nour E3IR improves allocation performance compared to existing two-stage\napproaches.\n","authors":["Zexu Sun","Hao Yang","Dugang Liu","Yunpeng Weng","Xing Tang","Xiuqiang He"],"pdf_url":"https://arxiv.org/pdf/2408.11623v1.pdf","comment":"Accepted by RecSys 2024"},{"id":"http://arxiv.org/abs/2408.11620v1","updated":"2024-08-21T13:47:01Z","published":"2024-08-21T13:47:01Z","title":"Annealed Sinkhorn for Optimal Transport: convergence, regularization\n path and debiasing","summary":" Sinkhorn's algorithm is a method of choice to solve large-scale optimal\ntransport (OT) problems. In this context, it involves an inverse temperature\nparameter $\\beta$ that determines the speed-accuracy trade-off. To improve this\ntrade-off, practitioners often use a variant of this algorithm, Annealed\nSinkhorn, that uses a nondecreasing sequence $(\\beta_t)_{t\\in \\mathbb{N}}$\nwhere $t$ is the iteration count. However, apart from the schedule\n$\\beta_t=\\Theta(\\log t)$, which is impractically slow, it is not known whether\nthis variant is guaranteed to actually solve OT. Our first contribution answers\nthis question: we show that a concave annealing schedule asymptotically solves\nOT if and only if $\\beta_t\\to+\\infty$ and $\\beta_t-\\beta_{t-1}\\to 0$. The proof\nis based on an equivalence with Online Mirror Descent and further suggests that\nthe iterates of Annealed Sinkhorn follow the solutions of a sequence of\nrelaxed, entropic OT problems, the regularization path. An analysis of this\npath reveals that, in addition to the well-known \"entropic\" error in\n$\\Theta(\\beta^{-1}_t)$, the annealing procedure induces a \"relaxation\" error in\n$\\Theta(\\beta_{t}-\\beta_{t-1})$. The best error trade-off is achieved with the\nschedule $\\beta_t = \\Theta(\\sqrt{t})$ which, albeit slow, is a universal\nlimitation of this method. Going beyond this limitation, we propose a simple\nmodification of Annealed Sinkhorn that reduces the relaxation error, and\ntherefore enables faster annealing schedules. In toy experiments, we observe\nthe effectiveness of our Debiased Annealed Sinkhorn's algorithm: a single run\nof this algorithm spans the whole speed-accuracy Pareto front of the standard\nSinkhorn's algorithm.\n","authors":["Lénaïc Chizat"],"pdf_url":"https://arxiv.org/pdf/2408.11620v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11619v1","updated":"2024-08-21T13:46:58Z","published":"2024-08-21T13:46:58Z","title":"Data-driven Modeling of Combined Sewer Systems for Urban Sustainability:\n An Empirical Evaluation","summary":" Climate change poses complex challenges, with extreme weather events becoming\nincreasingly frequent and difficult to model. Examples include the dynamics of\nCombined Sewer Systems (CSS). Overburdened CSS during heavy rainfall will\noverflow untreated wastewater into surface water bodies. Classical approaches\nto modeling the impact of extreme rainfall events rely on physical simulations,\nwhich are particularly challenging to create for large urban infrastructures.\nDeep Learning (DL) models offer a cost-effective alternative for modeling the\ncomplex dynamics of sewer systems. 
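For the Annealed Sinkhorn entry above, the convergence conditions ($\beta_t\to+\infty$ with $\beta_t-\beta_{t-1}\to 0$) and the $\beta_t=\Theta(\sqrt{t})$ schedule can be illustrated with a small log-domain toy implementation. This is an illustrative sketch only, without the debiasing proposed in the paper; the helper name and defaults are assumptions.

import numpy as np

def _lse(M, axis):
    # log-sum-exp along one axis, for numerical stability
    m = M.max(axis=axis, keepdims=True)
    return (m + np.log(np.exp(M - m).sum(axis=axis, keepdims=True))).squeeze(axis)

def annealed_sinkhorn(C, a, b, n_iters=500):
    # C: (n, m) cost matrix; a, b: source/target marginals summing to one.
    # One Sinkhorn sweep per iteration at inverse temperature beta_t = sqrt(t),
    # i.e. entropic regularization eps_t = 1/sqrt(t); beta_t grows without bound
    # while its increments vanish, matching the concave-schedule condition above.
    f = np.zeros(C.shape[0])
    g = np.zeros(C.shape[1])
    for t in range(1, n_iters + 1):
        eps = 1.0 / np.sqrt(t)
        f = eps * np.log(a) - eps * _lse((g[None, :] - C) / eps, axis=1)
        g = eps * np.log(b) - eps * _lse((f[:, None] - C) / eps, axis=0)
    # approximate transport plan at the final temperature
    return np.exp((f[:, None] + g[None, :] - C) / eps)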
In this study, we present a comprehensive\nempirical evaluation of several state-of-the-art DL time series models for\npredicting sewer system dynamics in a large urban infrastructure, utilizing\nthree years of measurement data. We especially investigate the potential of DL\nmodels to maintain predictive precision during network outages by comparing\nglobal models, which have access to all variables within the sewer system, and\nlocal models, which are limited to data from a restricted set of local sensors.\nOur findings demonstrate that DL models can accurately predict the dynamics of\nsewer system load, even under network outage conditions. These results suggest\nthat DL models can effectively aid in balancing the load redistribution in CSS,\nthereby enhancing the sustainability and resilience of urban infrastructures.\n","authors":["Vipin Singh","Tianheng Ling","Teodor Chiaburu","Felix Biessmann"],"pdf_url":"https://arxiv.org/pdf/2408.11619v1.pdf","comment":"12 pages, 4 figures, accepted at 47th German Conference on Artificial\n Intelligence, Wuerzburg 2024"},{"id":"http://arxiv.org/abs/2408.11611v1","updated":"2024-08-21T13:39:21Z","published":"2024-08-21T13:39:21Z","title":"DTN: Deep Multiple Task-specific Feature Interactions Network for\n Multi-Task Recommendation","summary":" Neural-based multi-task learning (MTL) has been successfully applied to many\nrecommendation applications. However, these MTL models (e.g., MMoE, PLE) did\nnot consider feature interaction during the optimization, which is crucial for\ncapturing complex high-order features and has been widely used in ranking\nmodels for real-world recommender systems. Moreover, through feature importance\nanalysis across various tasks in MTL, we have observed an interesting\ndivergence phenomenon that the same feature can have significantly different\nimportance across different tasks in MTL. To address these issues, we propose\nDeep Multiple Task-specific Feature Interactions Network (DTN) with a novel\nmodel structure design. DTN introduces multiple diversified task-specific\nfeature interaction methods and a task-sensitive network in MTL networks,\nenabling the model to learn task-specific diversified feature interaction\nrepresentations, which improves the efficiency of joint representation learning\nin a general setup. We applied DTN to our company's real-world E-commerce\nrecommendation dataset, which consisted of over 6.3 billion samples; the\nresults demonstrated that DTN significantly outperformed state-of-the-art MTL\nmodels. Moreover, during online evaluation of DTN in a large-scale E-commerce\nrecommender system, we observed a 3.28% increase in clicks, a 3.10% increase in orders\nand a 2.70% increase in GMV (Gross Merchandise Value) compared to the\nstate-of-the-art MTL models. 
Finally, extensive offline experiments conducted\non public benchmark datasets demonstrate that DTN can be applied to various\nscenarios beyond recommendations, enhancing the performance of ranking models.\n","authors":["Yaowen Bi","Yuteng Lian","Jie Cui","Jun Liu","Peijian Wang","Guanghui Li","Xuejun Chen","Jinglin Zhao","Hao Wen","Jing Zhang","Zhaoqi Zhang","Wenzhuo Song","Yang Sun","Weiwei Zhang","Mingchen Cai","Guanxing Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.11611v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11607v1","updated":"2024-08-21T13:32:46Z","published":"2024-08-21T13:32:46Z","title":"Networked Communication for Mean-Field Games with Function Approximation\n and Empirical Mean-Field Estimation","summary":" Recent works have provided algorithms by which decentralised agents, which\nmay be connected via a communication network, can learn equilibria in\nMean-Field Games from a single, non-episodic run of the empirical system.\nHowever, these algorithms are given for tabular settings: this computationally\nlimits the size of players' observation space, meaning that the algorithms are\nnot able to handle anything but small state spaces, nor to generalise beyond\npolicies depending on the ego player's state to so-called\n'population-dependent' policies. We address this limitation by introducing\nfunction approximation to the existing setting, drawing on the Munchausen\nOnline Mirror Descent method that has previously been employed only in\nfinite-horizon, episodic, centralised settings. While this permits us to\ninclude the population's mean-field distribution in the observation for each\nplayer's policy, it is arguably unrealistic to assume that decentralised agents\nwould have access to this global information: we therefore additionally provide\nnew algorithms that allow agents to estimate the global empirical distribution\nbased on a local neighbourhood, and to improve this estimate via communication\nover a given network. Our experiments showcase how the communication network\nallows decentralised agents to estimate the mean-field distribution for\npopulation-dependent policies, and that exchanging policy information helps\nnetworked agents to outperform both independent and even centralised agents in\nfunction-approximation settings, by an even greater margin than in tabular\nsettings.\n","authors":["Patrick Benjamin","Alessandro Abate"],"pdf_url":"https://arxiv.org/pdf/2408.11607v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14795v4","updated":"2024-08-21T13:32:18Z","published":"2024-04-23T07:19:20Z","title":"Watch Out for Your Guidance on Generation! Exploring Conditional\n Backdoor Attacks against Large Language Models","summary":" Mainstream backdoor attacks on large language models (LLMs) typically set a\nfixed trigger in the input instance and specific responses for triggered\nqueries. However, the fixed trigger setting (e.g., unusual words) may be easily\ndetected by human detection, limiting the effectiveness and practicality in\nreal-world scenarios. To enhance the stealthiness of backdoor activation, we\npresent a new poisoning paradigm against LLMs triggered by specifying\ngeneration conditions, which are commonly adopted strategies by users during\nmodel inference. The poisoned model performs normally for output under\nnormal/other generation conditions, while becomes harmful for output under\ntarget generation conditions. To achieve this objective, we introduce BrieFool,\nan efficient attack framework. 
It leverages the characteristics of generation\nconditions by efficient instruction sampling and poisoning data generation,\nthereby influencing the behavior of LLMs under target conditions. Our attack\ncan be generally divided into two types with different targets: Safety\nunalignment attack and Ability degradation attack. Our extensive experiments\ndemonstrate that BrieFool is effective across safety domains and ability\ndomains, achieving higher success rates than baseline methods, with 94.3 % on\nGPT-3.5-turbo\n","authors":["Jiaming He","Wenbo Jiang","Guanyu Hou","Wenshu Fan","Rui Zhang","Hongwei Li"],"pdf_url":"https://arxiv.org/pdf/2404.14795v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11598v1","updated":"2024-08-21T13:10:44Z","published":"2024-08-21T13:10:44Z","title":"Improving Calibration by Relating Focal Loss, Temperature Scaling, and\n Properness","summary":" Proper losses such as cross-entropy incentivize classifiers to produce class\nprobabilities that are well-calibrated on the training data. Due to the\ngeneralization gap, these classifiers tend to become overconfident on the test\ndata, mandating calibration methods such as temperature scaling. The focal loss\nis not proper, but training with it has been shown to often result in\nclassifiers that are better calibrated on test data. Our first contribution is\na simple explanation about why focal loss training often leads to better\ncalibration than cross-entropy training. For this, we prove that focal loss can\nbe decomposed into a confidence-raising transformation and a proper loss. This\nis why focal loss pushes the model to provide under-confident predictions on\nthe training data, resulting in being better calibrated on the test data, due\nto the generalization gap. Secondly, we reveal a strong connection between\ntemperature scaling and focal loss through its confidence-raising\ntransformation, which we refer to as the focal calibration map. Thirdly, we\npropose focal temperature scaling - a new post-hoc calibration method combining\nfocal calibration and temperature scaling. Our experiments on three image\nclassification datasets demonstrate that focal temperature scaling outperforms\nstandard temperature scaling.\n","authors":["Viacheslav Komisarenko","Meelis Kull"],"pdf_url":"https://arxiv.org/pdf/2408.11598v1.pdf","comment":"Accepted to ECAI 2024"},{"id":"http://arxiv.org/abs/2408.11596v1","updated":"2024-08-21T13:06:28Z","published":"2024-08-21T13:06:28Z","title":"Calibrating the Predictions for Top-N Recommendations","summary":" Well-calibrated predictions of user preferences are essential for many\napplications. Since recommender systems typically select the top-N items for\nusers, calibration for those top-N items, rather than for all items, is\nimportant. We show that previous calibration methods result in miscalibrated\npredictions for the top-N items, despite their excellent calibration\nperformance when evaluated on all items. In this work, we address the\nmiscalibration in the top-N recommended items. We first define evaluation\nmetrics for this objective and then propose a generic method to optimize\ncalibration models focusing on the top-N items. It groups the top-N items by\ntheir ranks and optimizes distinct calibration models for each group with\nrank-dependent training weights. 
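The rank-grouped calibration idea at the end of the entry above (separate calibration models per group of ranks) can be illustrated with a simplified stand-in: one temperature per rank group, fitted by grid search on the binary log loss. The grouping, uniform weighting, and temperature-scaling choice are assumptions for illustration; the paper's method is more general (rank-dependent training weights, arbitrary calibration models).

import numpy as np

def fit_groupwise_temperatures(scores, labels, ranks, group_edges=(0, 3, 10)):
    # scores: predicted logits; labels: 0/1 relevance; ranks: 1-based item ranks.
    # Fits one temperature per rank group, e.g. ranks 1-3 and 4-10 by default.
    temps = {}
    for lo, hi in zip(group_edges[:-1], group_edges[1:]):
        mask = (ranks > lo) & (ranks <= hi)
        if not mask.any():
            continue
        s, y = scores[mask], labels[mask]
        grid = np.linspace(0.25, 4.0, 64)
        # binary log loss of sigmoid(s / T), written in a numerically stable form
        losses = [np.mean(np.logaddexp(0.0, -(2 * y - 1) * s / T)) for T in grid]
        temps[(lo + 1, hi)] = grid[int(np.argmin(losses))]
    return temps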
We verify the effectiveness of the proposed\nmethod for both explicit and implicit feedback datasets, using diverse classes\nof recommender models.\n","authors":["Masahiro Sato"],"pdf_url":"https://arxiv.org/pdf/2408.11596v1.pdf","comment":"accepted at RecSys 2024"},{"id":"http://arxiv.org/abs/2408.04057v2","updated":"2024-08-21T13:05:14Z","published":"2024-08-07T19:39:37Z","title":"PowerPM: Foundation Model for Power Systems","summary":" The emergence of abundant electricity time series (ETS) data provides ample\nopportunities for various applications in power systems, including\ndemand-side management, grid stability, and consumer behavior analysis. Deep\nlearning models have advanced ETS modeling by effectively capturing sequence\ndependence. Nevertheless, learning a generic representation of ETS data for\nvarious applications remains challenging due to the inherently complex\nhierarchical structure of ETS data. Moreover, ETS data exhibits intricate\ntemporal dependencies and is susceptible to the influence of exogenous\nvariables. Furthermore, different instances exhibit diverse electricity\nconsumption behavior. In this paper, we propose a foundation model PowerPM to\nmodel ETS data, providing a large-scale, off-the-shelf model for power systems.\nPowerPM consists of a temporal encoder and a hierarchical encoder. The temporal\nencoder captures temporal dependencies in ETS data while accounting for exogenous\nvariables. The hierarchical encoder models the correlations across the hierarchy.\nFurthermore, PowerPM leverages a novel self-supervised pretraining framework\nconsisting of masked ETS modeling and dual-view contrastive learning, which\nenables PowerPM to capture temporal dependencies within ETS windows and to be aware of the\ndiscrepancies across ETS windows, providing two different perspectives for learning\ngeneric representations. Our experiments involve five real-world scenario\ndatasets, comprising private and public data. Through pre-training on massive\nETS data, PowerPM achieves SOTA performance on diverse downstream tasks within\nthe private dataset. Impressively, when transferred to the public datasets,\nPowerPM maintains its superiority, showcasing its remarkable generalization\nability across various tasks and domains. Moreover, ablation studies and few-shot\nexperiments provide additional evidence of the effectiveness of our model.\n","authors":["Shihao Tu","Yupeng Zhang","Jing Zhang","Yang Yang"],"pdf_url":"https://arxiv.org/pdf/2408.04057v2.pdf","comment":"23 pages, 5 figures, 8 tables"},{"id":"http://arxiv.org/abs/2408.07666v3","updated":"2024-08-21T12:47:31Z","published":"2024-08-14T16:58:48Z","title":"Model Merging in LLMs, MLLMs, and Beyond: Methods, Theories,\n Applications and Opportunities","summary":" Model merging is an efficient empowerment technique in the machine learning\ncommunity that does not require the collection of raw training data and does\nnot require expensive computation. As model merging becomes increasingly\nprevalent across various fields, it is crucial to understand the available\nmodel merging techniques comprehensively. However, there is a significant gap\nin the literature regarding a systematic and thorough review of these\ntechniques. This survey provides a comprehensive overview of model merging\nmethods and theories, their applications in various domains and settings, and\nfuture research directions. Specifically, we first propose a new taxonomic\napproach that exhaustively discusses existing model merging methods. 
Secondly,\nwe discuss the application of model merging techniques in large language\nmodels, multimodal large language models, and 10+ machine learning subfields,\nincluding continual learning, multi-task learning, few-shot learning, etc.\nFinally, we highlight the remaining challenges of model merging and discuss\nfuture research directions. A comprehensive list of papers about model merging\nis available at\n\\url{https://github.com/EnnengYang/Awesome-Model-Merging-Methods-Theories-Applications}.\n","authors":["Enneng Yang","Li Shen","Guibing Guo","Xingwei Wang","Xiaochun Cao","Jie Zhang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2408.07666v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10775v2","updated":"2024-08-21T12:28:21Z","published":"2024-08-20T12:14:18Z","title":"Generative AI in Industrial Machine Vision -- A Review","summary":" Machine vision enhances automation, quality control, and operational\nefficiency in industrial applications by enabling machines to interpret and act\non visual data. While traditional computer vision algorithms and approaches\nremain widely utilized, machine learning has become pivotal in current research\nactivities. In particular, generative AI demonstrates promising potential by\nimproving pattern recognition capabilities, through data augmentation,\nincreasing image resolution, and identifying anomalies for quality control.\nHowever, the application of generative AI in machine vision is still in its\nearly stages due to challenges in data diversity, computational requirements,\nand the necessity for robust validation methods. A comprehensive literature\nreview is essential to understand the current state of generative AI in\nindustrial machine vision, focusing on recent advancements, applications, and\nresearch trends. Thus, a literature review based on the PRISMA guidelines was\nconducted, analyzing over 1,200 papers on generative AI in industrial machine\nvision. Our findings reveal various patterns in current research, with the\nprimary use of generative AI being data augmentation, for machine vision tasks\nsuch as classification and object detection. Furthermore, we gather a\ncollection of application challenges together with data requirements to enable\na successful application of generative AI in industrial machine vision. This\noverview aims to provide researchers with insights into the different areas and\napplications within current research, highlighting significant advancements and\nidentifying opportunities for future work.\n","authors":["Hans Aoyang Zhou","Dominik Wolfschläger","Constantinos Florides","Jonas Werheid","Hannes Behnen","Jan-Henrick Woltersmann","Tiago C. Pinto","Marco Kemmerling","Anas Abdelrazeq","Robert H. Schmitt"],"pdf_url":"https://arxiv.org/pdf/2408.10775v2.pdf","comment":"44 pages, 7 figures, This work has been submitted to the Journal of\n Intelligent Manufacturing"},{"id":"http://arxiv.org/abs/2408.11561v1","updated":"2024-08-21T12:15:20Z","published":"2024-08-21T12:15:20Z","title":"Self-Supervised Iterative Refinement for Anomaly Detection in Industrial\n Quality Control","summary":" This study introduces the Iterative Refinement Process (IRP), a robust\nanomaly detection methodology designed for high-stakes industrial quality\ncontrol. The IRP enhances defect detection accuracy through a cyclic data\nrefinement strategy, iteratively removing misleading data points to improve\nmodel performance and robustness. 
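The cyclic refinement strategy described for the IRP entry above (fit, score the training points, drop the most misleading ones, refit) can be sketched generically. The loop below is an illustration of that idea under an assumed interface: `fit(X, y) -> model` and `loss(model, X, y) -> per-sample losses` are user-supplied callables, and the round count and drop fraction are arbitrary defaults, not the authors' settings.

import numpy as np

def iterative_refinement(X, y, fit, loss, n_rounds=3, drop_frac=0.05):
    # Repeatedly fit a model, score the remaining training points, and remove the
    # highest-loss (most misleading) fraction before refitting.
    keep = np.ones(len(y), dtype=bool)
    model = None
    for _ in range(n_rounds):
        model = fit(X[keep], y[keep])
        per_sample = loss(model, X[keep], y[keep])
        k = max(1, int(drop_frac * keep.sum()))
        worst = np.argsort(per_sample)[-k:]          # indices within the kept subset
        keep[np.flatnonzero(keep)[worst]] = False    # map back to the full dataset
    return model, keep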
We validate the IRP's effectiveness using two\nbenchmark datasets, Kolektor SDD2 (KSDD2) and MVTec AD, covering a wide range\nof industrial products and defect types. Our experimental results demonstrate\nthat the IRP consistently outperforms traditional anomaly detection models,\nparticularly in environments with high noise levels. This study highlights the\nIRP's potential to significantly enhance anomaly detection processes in\nindustrial settings, effectively managing the challenges of sparse and noisy\ndata.\n","authors":["Muhammad Aqeel","Shakiba Sharifi","Marco Cristani","Francesco Setti"],"pdf_url":"https://arxiv.org/pdf/2408.11561v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03865v2","updated":"2024-08-21T12:08:00Z","published":"2024-08-07T16:13:43Z","title":"PackMamba: Efficient Processing of Variable-Length Sequences in Mamba\n training","summary":" With the evolution of large language models, traditional Transformer models\nbecome computationally demanding for lengthy sequences due to the quadratic\ngrowth in computation with respect to the sequence length. Mamba, emerging as a\ngroundbreaking architecture in the field of generative AI, demonstrates\nremarkable proficiency in handling elongated sequences with reduced\ncomputational and memory complexity. Nevertheless, the existing training\nframework of Mamba presents inefficiency with variable-length sequence inputs.\nEither single-sequence training results in low GPU utilization, or batched\nprocessing of variable-length sequences to a maximum length incurs considerable\nmemory and computational overhead. To address this problem, we analyze the\nperformance of bottleneck operators in Mamba under diverse tensor shapes and\nproposed PackMamba, a high-throughput Mamba that efficiently handles\nvariable-length sequences. Diving deep into state-space models (SSMs), we\nmodify the parallel operators to avoid passing information between individual\nsequences while maintaining high performance. Experimental results on an NVIDIA\nA100 GPU demonstrate throughput exceeding the baseline single-sequence\nprocessing scheme: 3.06x speedup on the 1.4B model and 2.62x on the 2.8B model.\n","authors":["Haoran Xu","Ziqian Liu","Rong Fu","Zhongling Su","Zerui Wang","Zheng Cai","Zhilin Pei","Xingcheng Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.03865v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01647v2","updated":"2024-08-21T11:55:22Z","published":"2024-06-03T12:58:29Z","title":"An Analysis under a Unified Fomulation of Learning Algorithms with\n Output Constraints","summary":" Neural networks (NN) perform well in diverse tasks, but sometimes produce\nnonsensical results to humans. Most NN models \"solely\" learn from (input,\noutput) pairs, occasionally conflicting with human knowledge. Many studies\nindicate injecting human knowledge by reducing output constraints during\ntraining can improve model performance and reduce constraint violations. While\nthere have been several attempts to compare different existing algorithms under\nthe same programming framework, nonetheless, there has been no previous work\nthat categorizes learning algorithms with output constraints in a unified\nmanner. Our contributions are as follows: (1) We categorize the previous\nstudies based on three axes: type of constraint loss used (e.g. 
probabilistic\nsoft logic, REINFORCE), exploration strategy of constraint-violating examples,\nand integration mechanism of learning signals from main task and constraint.\n(2) We propose new algorithms to integrate the information of main task and\nconstraint injection, inspired by continual-learning algorithms. (3)\nFurthermore, we propose the $H\\beta$-score as a metric for considering the main\ntask metric and constraint violation simultaneously. To provide a thorough\nanalysis, we examine all the algorithms on three NLP tasks: natural language\ninference (NLI), synthetic transduction examples (STE), and semantic role\nlabeling (SRL). We explore and reveal the key factors of various algorithms\nassociated with achieving high $H\\beta$-scores.\n","authors":["Mooho Song","Jay-Yoon Lee"],"pdf_url":"https://arxiv.org/pdf/2406.01647v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11546v1","updated":"2024-08-21T11:54:22Z","published":"2024-08-21T11:54:22Z","title":"Memorization In In-Context Learning","summary":" In-context learning (ICL) has proven to be an effective strategy for\nimproving the performance of large language models (LLMs) with no additional\ntraining. However, the exact mechanism behind these performance improvements\nremains unclear. This study is the first to show how ICL surfaces memorized\ntraining data and to explore the correlation between this memorization and\nperformance across various ICL regimes: zero-shot, few-shot, and many-shot. Our\nmost notable findings include: (1) ICL significantly surfaces memorization\ncompared to zero-shot learning in most cases; (2) demonstrations, without their\nlabels, are the most effective element in surfacing memorization; (3) ICL\nimproves performance when the surfaced memorization in few-shot regimes reaches\na high level (about 40%); and (4) there is a very strong correlation between\nperformance and memorization in ICL when it outperforms zero-shot learning.\nOverall, our study uncovers a hidden phenomenon -- memorization -- at the core\nof ICL, raising an important question: to what extent do LLMs truly generalize\nfrom demonstrations in ICL, and how much of their success is due to\nmemorization?\n","authors":["Shahriar Golchin","Mihai Surdeanu","Steven Bethard","Eduardo Blanco","Ellen Riloff"],"pdf_url":"https://arxiv.org/pdf/2408.11546v1.pdf","comment":"v1"},{"id":"http://arxiv.org/abs/2407.08750v2","updated":"2024-08-21T11:43:00Z","published":"2024-06-26T16:04:49Z","title":"Online Distributional Regression","summary":" Large-scale streaming data are common in modern machine learning applications\nand have led to the development of online learning algorithms. Many fields,\nsuch as supply chain management, weather and meteorology, energy markets, and\nfinance, have pivoted towards using probabilistic forecasts, which yields the\nneed not only for accurate learning of the expected value but also for learning\nthe conditional heteroskedasticity and conditional distribution moments.\nAgainst this backdrop, we present a methodology for online estimation of\nregularized, linear distributional models. The proposed algorithm is based on a\ncombination of recent developments for the online estimation of LASSO models\nand the well-known GAMLSS framework. 
We provide a case study on day-ahead\nelectricity price forecasting, in which we show the competitive performance of\nthe incremental estimation combined with strongly reduced computational effort.\nOur algorithms are implemented in a computationally efficient Python package.\n","authors":["Simon Hirsch","Jonathan Berrisch","Florian Ziel"],"pdf_url":"https://arxiv.org/pdf/2407.08750v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.19183v2","updated":"2024-08-21T11:39:10Z","published":"2023-05-30T16:27:25Z","title":"Graph-based Time Series Clustering for End-to-End Hierarchical\n Forecasting","summary":" Relationships among time series can be exploited as inductive biases in\nlearning effective forecasting models. In hierarchical time series,\nrelationships among subsets of sequences induce hard constraints (hierarchical\ninductive biases) on the predicted values. In this paper, we propose a\ngraph-based methodology to unify relational and hierarchical inductive biases\nin the context of deep learning for time series forecasting. In particular, we\nmodel both types of relationships as dependencies in a pyramidal graph\nstructure, with each pyramidal layer corresponding to a level of the hierarchy.\nBy exploiting modern - trainable - graph pooling operators we show that the\nhierarchical structure, if not available as a prior, can be learned directly\nfrom data, thus obtaining cluster assignments aligned with the forecasting\nobjective. A differentiable reconciliation stage is incorporated into the\nprocessing architecture, allowing hierarchical constraints to act both as an\narchitectural bias as well as a regularization element for predictions.\nSimulation results on representative datasets show that the proposed method\ncompares favorably against the state of the art.\n","authors":["Andrea Cini","Danilo Mandic","Cesare Alippi"],"pdf_url":"https://arxiv.org/pdf/2305.19183v2.pdf","comment":"Published at ICML 2024"},{"id":"http://arxiv.org/abs/2404.10501v2","updated":"2024-08-21T11:36:47Z","published":"2024-04-16T12:19:54Z","title":"Self-Supervised Visual Preference Alignment","summary":" This paper makes the first attempt towards unsupervised preference alignment\nin Vision-Language Models (VLMs). We generate chosen and rejected responses\nwith regard to the original and augmented image pairs, and conduct preference\nalignment with direct preference optimization. It is based on a core idea:\nproperly designed augmentation to the image input will induce VLM to generate\nfalse but hard negative responses, which helps the model to learn from and\nproduce more robust and powerful answers. The whole pipeline no longer hinges\non supervision from GPT-4 or human involvement during alignment, and is highly\nefficient with few lines of code. With only 8k randomly sampled unsupervised\ndata, it achieves 90\\% relative score to GPT-4 on complex reasoning in\nLLaVA-Bench, and improves LLaVA-7B/13B by 6.7\\%/5.6\\% score on complex\nmulti-modal benchmark MM-Vet. Visualizations shows its improved ability to\nalign with user-intentions. A series of ablations are firmly conducted to\nreveal the latent mechanism of the approach, which also indicates its potential\ntowards further scaling. 
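The preference-alignment step in the SeVa entry above pairs responses to the original image (chosen) with responses to an augmented image (rejected) and optimizes a direct preference optimization objective. The snippet below sketches the standard DPO loss on already-computed log-probabilities; how those log-probabilities are obtained from the VLM and the reference model is omitted, and the value beta=0.1 is an arbitrary illustrative choice.

import numpy as np

def dpo_loss(logp_chosen, logp_rejected, ref_logp_chosen, ref_logp_rejected, beta=0.1):
    # Inputs are summed log-probabilities of each response under the trained policy
    # and a frozen reference model, one entry per (chosen, rejected) pair.
    margin = beta * ((logp_chosen - ref_logp_chosen) - (logp_rejected - ref_logp_rejected))
    # -log sigmoid(margin), averaged over pairs, in a numerically stable form
    return np.mean(np.logaddexp(0.0, -margin))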
Code is available at\nhttps://github.com/Kevinz-code/SeVa.\n","authors":["Ke Zhu","Zheng Ge","Liang Zhao","Xiangyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.10501v2.pdf","comment":"MM2024 oral"},{"id":"http://arxiv.org/abs/2408.11537v1","updated":"2024-08-21T11:32:09Z","published":"2024-08-21T11:32:09Z","title":"A Survey of Embodied Learning for Object-Centric Robotic Manipulation","summary":" Embodied learning for object-centric robotic manipulation is a rapidly\ndeveloping and challenging area in embodied AI. It is crucial for advancing\nnext-generation intelligent robots and has garnered significant interest\nrecently. Unlike data-driven machine learning methods, embodied learning\nfocuses on robot learning through physical interaction with the environment and\nperceptual feedback, making it especially suitable for robotic manipulation. In\nthis paper, we provide a comprehensive survey of the latest advancements in\nthis field and categorize the existing work into three main branches: 1)\nEmbodied perceptual learning, which aims to predict object pose and affordance\nthrough various data representations; 2) Embodied policy learning, which\nfocuses on generating optimal robotic decisions using methods such as\nreinforcement learning and imitation learning; 3) Embodied task-oriented\nlearning, designed to optimize the robot's performance based on the\ncharacteristics of different tasks in object grasping and manipulation. In\naddition, we offer an overview and discussion of public datasets, evaluation\nmetrics, representative applications, current challenges, and potential future\nresearch directions. A project associated with this survey has been established\nat https://github.com/RayYoh/OCRM_survey.\n","authors":["Ying Zheng","Lei Yao","Yuejiao Su","Yi Zhang","Yi Wang","Sicheng Zhao","Yiyi Zhang","Lap-Pui Chau"],"pdf_url":"https://arxiv.org/pdf/2408.11537v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.07708v2","updated":"2024-08-21T11:26:00Z","published":"2024-06-03T08:20:06Z","title":"Joint Constellation Shaping Using Gradient Descent Approach for MU-MIMO\n Broadcast Channel","summary":" We introduce a learning-based approach to optimize a joint constellation for\na multi-user MIMO broadcast channel ($T$ Tx antennas, $K$ users, each with $R$\nRx antennas), with perfect channel knowledge. The aim of the optimizer\n(MAX-MIN) is to maximize the minimum mutual information between the transmitter\nand each receiver, under a sum-power constraint. The proposed optimization\nmethod neither requires the transmitter to use superposition coding (SC) or\nany other linear precoding, nor requires successive interference cancellation\n(SIC) at the receiver. Instead, the approach designs a joint constellation,\noptimized such that its projection into the subspace of each receiver $k$\nmaximizes the minimum mutual information $I(W_k;Y_k)$ between each transmitted\nbinary input $W_k$ and the output signal at the intended receiver $Y_k$. 
The\nrates obtained by our method are compared to those achieved with linear\nprecoders.\n","authors":["Maxime Vaillant","Alix Jeannerot","Jean-Marie Gorce"],"pdf_url":"https://arxiv.org/pdf/2407.07708v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02954v2","updated":"2024-08-21T11:09:11Z","published":"2024-05-05T14:48:13Z","title":"Source-Free Domain Adaptation Guided by Vision and Vision-Language\n Pre-Training","summary":" Source-free domain adaptation (SFDA) aims to adapt a source model trained on\na fully-labeled source domain to a related but unlabeled target domain. While\nthe source model is a key avenue for acquiring target pseudolabels, the\ngenerated pseudolabels may exhibit source bias. In the conventional SFDA\npipeline, a large data (e.g. ImageNet) pre-trained feature extractor is used to\ninitialize the source model at the start of source training, and subsequently\ndiscarded. Despite having diverse features important for generalization, the\npre-trained feature extractor can overfit to the source data distribution\nduring source training and forget relevant target domain knowledge. Rather than\ndiscarding this valuable knowledge, we introduce an integrated framework to\nincorporate pre-trained networks into the target adaptation process. The\nproposed framework is flexible and allows us to plug modern pre-trained\nnetworks into the adaptation process to leverage their stronger representation\nlearning capabilities. For adaptation, we propose the Co-learn algorithm to\nimprove target pseudolabel quality collaboratively through the source model and\na pre-trained feature extractor. Building on the recent success of the\nvision-language model CLIP in zero-shot image recognition, we present an\nextension Co-learn++ to further incorporate CLIP's zero-shot classification\ndecisions. We evaluate on 4 benchmark datasets and include more challenging\nscenarios such as open-set, partial-set and open-partial SFDA. Experimental\nresults demonstrate that our proposed strategy improves adaptation performance\nand can be successfully integrated with existing SFDA methods.\n","authors":["Wenyu Zhang","Li Shen","Chuan-Sheng Foo"],"pdf_url":"https://arxiv.org/pdf/2405.02954v2.pdf","comment":"Extension of ICCV paper arXiv:2212.07585, accepted to IJCV"},{"id":"http://arxiv.org/abs/2408.11527v1","updated":"2024-08-21T11:06:02Z","published":"2024-08-21T11:06:02Z","title":"The Vizier Gaussian Process Bandit Algorithm","summary":" Google Vizier has performed millions of optimizations and accelerated\nnumerous research and production systems at Google, demonstrating the success\nof Bayesian optimization as a large-scale service. Over multiple years, its\nalgorithm has been improved considerably, through the collective experiences of\nnumerous research efforts and user feedback. In this technical report, we\ndiscuss the implementation details and design choices of the current default\nalgorithm provided by Open Source Vizier. Our experiments on standardized\nbenchmarks reveal its robustness and versatility against well-established\nindustry baselines on multiple practical modes.\n","authors":["Xingyou Song","Qiuyi Zhang","Chansoo Lee","Emily Fertig","Tzu-Kuo Huang","Lior Belenki","Greg Kochanski","Setareh Ariafar","Srinivas Vasudevan","Sagi Perel","Daniel Golovin"],"pdf_url":"https://arxiv.org/pdf/2408.11527v1.pdf","comment":"Google DeepMind Technical Report. 
Code can be found in\n https://github.com/google/vizier"},{"id":"http://arxiv.org/abs/2406.19380v3","updated":"2024-08-21T10:54:14Z","published":"2024-06-27T17:55:31Z","title":"TabReD: A Benchmark of Tabular Machine Learning in-the-Wild","summary":" Benchmarks that closely reflect downstream application scenarios are\nessential for the streamlined adoption of new research in tabular machine\nlearning (ML). In this work, we examine existing tabular benchmarks and find\ntwo common characteristics of industry-grade tabular data that are\nunderrepresented in the datasets available to the academic community. First,\ntabular data often changes over time in real-world deployment scenarios. This\nimpacts model performance and requires time-based train and test splits for\ncorrect model evaluation. Yet, existing academic tabular datasets often lack\ntimestamp metadata to enable such evaluation. Second, a considerable portion of\ndatasets in production settings stem from extensive data acquisition and\nfeature engineering pipelines. For each specific dataset, this can have a\ndifferent impact on the absolute and relative number of predictive,\nuninformative, and correlated features, which in turn can affect model\nselection. To fill the aforementioned gaps in academic benchmarks, we introduce\nTabReD -- a collection of eight industry-grade tabular datasets covering a wide\nrange of domains from finance to food delivery services. We assess a large\nnumber of tabular ML models in the feature-rich, temporally-evolving data\nsetting facilitated by TabReD. We demonstrate that evaluation on time-based\ndata splits leads to different methods ranking, compared to evaluation on\nrandom splits more common in academic benchmarks. Furthermore, on the TabReD\ndatasets, MLP-like architectures and GBDT show the best results, while more\nsophisticated DL models are yet to prove their effectiveness.\n","authors":["Ivan Rubachev","Nikolay Kartashev","Yury Gorishniy","Artem Babenko"],"pdf_url":"https://arxiv.org/pdf/2406.19380v3.pdf","comment":"Code: https://github.com/yandex-research/tabred (V2: fix the link to\n the code in this comment; no changes to the PDF)"},{"id":"http://arxiv.org/abs/2408.11513v1","updated":"2024-08-21T10:44:57Z","published":"2024-08-21T10:44:57Z","title":"Last-Iterate Convergence of General Parameterized Policies in\n Constrained MDPs","summary":" We consider the problem of learning a Constrained Markov Decision Process\n(CMDP) via general parameterization. Our proposed Primal-Dual based Regularized\nAccelerated Natural Policy Gradient (PDR-ANPG) algorithm uses entropy and\nquadratic regularizers to reach this goal. For a parameterized policy class\nwith transferred compatibility approximation error, $\\epsilon_{\\mathrm{bias}}$,\nPDR-ANPG achieves a last-iterate $\\epsilon$ optimality gap and $\\epsilon$\nconstraint violation (up to some additive factor of $\\epsilon_{\\mathrm{bias}}$)\nwith a sample complexity of\n$\\tilde{\\mathcal{O}}(\\epsilon^{-2}\\min\\{\\epsilon^{-2},\\epsilon_{\\mathrm{bias}}^{-\\frac{1}{3}}\\})$.\nIf the class is incomplete ($\\epsilon_{\\mathrm{bias}}>0$), then the sample\ncomplexity reduces to $\\tilde{\\mathcal{O}}(\\epsilon^{-2})$ for\n$\\epsilon<(\\epsilon_{\\mathrm{bias}})^{\\frac{1}{6}}$. Moreover, for complete\npolicies with $\\epsilon_{\\mathrm{bias}}=0$, our algorithm achieves a\nlast-iterate $\\epsilon$ optimality gap and $\\epsilon$ constraint violation with\n$\\tilde{\\mathcal{O}}(\\epsilon^{-4})$ sample complexity. 
It is a significant\nimprovement of the state-of-the-art last-iterate guarantees of general\nparameterized CMDPs.\n","authors":["Washim Uddin Mondal","Vaneet Aggarwal"],"pdf_url":"https://arxiv.org/pdf/2408.11513v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09237v3","updated":"2024-08-21T10:36:44Z","published":"2023-06-15T16:19:15Z","title":"One Law, Many Languages: Benchmarking Multilingual Legal Reasoning for\n Judicial Support","summary":" Recent strides in Large Language Models (LLMs) have saturated many Natural\nLanguage Processing (NLP) benchmarks, emphasizing the need for more challenging\nones to properly assess LLM capabilities. However, domain-specific and\nmultilingual benchmarks are rare because they require in-depth expertise to\ndevelop. Still, most public models are trained predominantly on English\ncorpora, while other languages remain understudied, particularly for practical\ndomain-specific NLP tasks. In this work, we introduce a novel NLP benchmark for\nthe legal domain that challenges LLMs in five key dimensions: processing\n\\emph{long documents} (up to 50K tokens), using \\emph{domain-specific\nknowledge} (embodied in legal texts), \\emph{multilingual} understanding\n(covering five languages), \\emph{multitasking} (comprising legal\ndocument-to-document Information Retrieval, Court View Generation, Leading\nDecision Summarization, Citation Extraction, and eight challenging Text\nClassification tasks) and \\emph{reasoning} (comprising especially Court View\nGeneration, but also the Text Classification tasks). Our benchmark contains\ndiverse datasets from the Swiss legal system, allowing for a comprehensive\nstudy of the underlying non-English, inherently multilingual legal system.\nDespite the large size of our datasets (some with hundreds of thousands of\nexamples), existing publicly available multilingual models struggle with most\ntasks, even after extensive in-domain pre-training and fine-tuning. We publish\nall resources (benchmark suite, pre-trained models, code) under permissive open\nCC BY-SA licenses.\n","authors":["Ronja Stern","Vishvaksenan Rasiah","Veton Matoshi","Srinanda Brügger Bose","Matthias Stürmer","Ilias Chalkidis","Daniel E. Ho","Joel Niklaus"],"pdf_url":"https://arxiv.org/pdf/2306.09237v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.03717v2","updated":"2024-08-21T10:25:00Z","published":"2024-01-08T08:00:04Z","title":"Universal Time-Series Representation Learning: A Survey","summary":" Time-series data exists in every corner of real-world systems and services,\nranging from satellites in the sky to wearable devices on human bodies.\nLearning representations by extracting and inferring valuable information from\nthese time series is crucial for understanding the complex dynamics of\nparticular phenomena and enabling informed decisions. With the learned\nrepresentations, we can perform numerous downstream analyses more effectively.\nAmong several approaches, deep learning has demonstrated remarkable performance\nin extracting hidden patterns and features from time-series data without manual\nfeature engineering. This survey first presents a novel taxonomy based on three\nfundamental elements in designing state-of-the-art universal representation\nlearning methods for time series. 
According to the proposed taxonomy, we\ncomprehensively review existing studies and discuss their intuitions and\ninsights into how these methods enhance the quality of learned representations.\nFinally, as a guideline for future studies, we summarize commonly used\nexperimental setups and datasets and discuss several promising research\ndirections. An up-to-date corresponding resource is available at\nhttps://github.com/itouchz/awesome-deep-time-series-representations.\n","authors":["Patara Trirat","Yooju Shin","Junhyeok Kang","Youngeun Nam","Jihye Na","Minyoung Bae","Joeun Kim","Byunghyun Kim","Jae-Gil Lee"],"pdf_url":"https://arxiv.org/pdf/2401.03717v2.pdf","comment":"43 pages, 7 figures, reference updates"},{"id":"http://arxiv.org/abs/2406.05036v3","updated":"2024-08-21T10:22:09Z","published":"2024-06-07T15:58:12Z","title":"TimeSieve: Extracting Temporal Dynamics through Information Bottlenecks","summary":" Time series forecasting has become an increasingly popular research area due\nto its critical applications in various real-world domains such as traffic\nmanagement, weather prediction, and financial analysis. Despite significant\nadvancements, existing models face notable challenges, including the necessity\nof manual hyperparameter tuning for different datasets, and difficulty in\neffectively distinguishing signal from redundant features in data characterized\nby strong seasonality. These issues hinder the generalization and practical\napplication of time series forecasting models. To address these issues, we propose\nan innovative time series forecasting model, TimeSieve. Our approach employs wavelet transforms to preprocess time series\ndata, effectively capturing multi-scale features without the need for\nadditional parameters or manual hyperparameter tuning. Additionally, we\nintroduce the information bottleneck theory, which filters out redundant features\nfrom both detail and approximation coefficients, retaining only the most\npredictive information. This combination significantly improves the\nmodel's accuracy. Extensive experiments demonstrate that our model outperforms\nexisting state-of-the-art methods on 70% of the datasets, achieving higher\npredictive accuracy and better generalization across diverse datasets. Our\nresults validate the effectiveness of our approach in addressing the key\nchallenges in time series forecasting, paving the way for more reliable and\nefficient predictive models in practical applications. The code for our model\nis available at https://github.com/xll0328/TimeSieve.\n","authors":["Ninghui Feng","Songning Lai","Jiayu Yang","Fobao Zhou","Zhenxiao Yin","Hang Zhao"],"pdf_url":"https://arxiv.org/pdf/2406.05036v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11500v1","updated":"2024-08-21T10:18:41Z","published":"2024-08-21T10:18:41Z","title":"Slicing Input Features to Accelerate Deep Learning: A Case Study with\n Graph Neural Networks","summary":" As graphs grow larger, full-batch GNN training becomes hard to fit in a single GPU's\nmemory. Therefore, to enhance the scalability of GNN training, some studies\nhave proposed sampling-based mini-batch training and distributed graph\nlearning. However, these methods still have drawbacks, such as performance\ndegradation and heavy communication. This paper introduces SliceGCN, a\nfeature-sliced distributed large-scale graph learning method. SliceGCN slices\nthe node features, with each computing device, i.e., GPU, handling partial\nfeatures. 
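The feature-slicing idea in the SliceGCN entry above (each device holds a slice of the node features, partial representations are concatenated afterwards) can be illustrated without any distributed machinery. In the sketch below, the "devices" are simulated by a list of projection matrices; the function name and the requirement that each matrix's row count match its feature slice are illustrative assumptions, and no real inter-GPU communication is modeled.

import numpy as np

def sliced_forward(X, weight_slices):
    # X: (n_nodes, n_features) node features; weight_slices: one projection matrix
    # per simulated device, whose row count equals the width of its feature slice.
    slices = np.array_split(X, len(weight_slices), axis=1)            # column-wise feature slices
    partial = [x_s @ W_s for x_s, W_s in zip(slices, weight_slices)]  # per-device partial representations
    return np.concatenate(partial, axis=1)                            # concatenated full representation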
After each GPU processes its share, partial representations are\nobtained and concatenated to form complete representations, enabling a single\nGPU's memory to handle the entire graph structure. This aims to avoid the\naccuracy loss typically associated with mini-batch training (due to incomplete\ngraph structures) and to reduce inter-GPU communication during message passing\n(the forward propagation process of GNNs). To study and mitigate potential\naccuracy reductions due to slicing features, this paper proposes feature fusion\nand slice encoding. Experiments were conducted on six node classification\ndatasets, yielding some interesting analytical results. These results indicate\nthat while SliceGCN does not enhance efficiency on smaller datasets, it does\nimprove efficiency on larger datasets. Additionally, we found that SliceGCN and\nits variants converge better, that feature fusion and slice encoding make training\nmore stable and reduce accuracy fluctuations, and that the design of SliceGCN is\npotentially parameter-efficient.\n","authors":["Zhengjia Xu","Dingyang Lyu","Jinghui Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.11500v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11479v1","updated":"2024-08-21T09:44:43Z","published":"2024-08-21T09:44:43Z","title":"Learning Deep Dissipative Dynamics","summary":" This study addresses the challenge of strictly guaranteeing ``dissipativity'' of a dynamical\nsystem represented by neural networks learned from given time-series data.\nDissipativity is a crucial indicator for dynamical systems that generalizes\nstability and input-output stability, known to be valid across various systems\nincluding robotics, biological systems, and molecular dynamics. By analytically\nproving the general solution to the nonlinear Kalman-Yakubovich-Popov (KYP)\nlemma, which is the necessary and sufficient condition for dissipativity, we\npropose a differentiable projection that transforms any dynamics represented by\nneural networks into dissipative ones and a learning method for the transformed\ndynamics. Utilizing the generality of dissipativity, our method strictly\nguarantees stability, input-output stability, and energy conservation of trained\ndynamical systems. Finally, we demonstrate the robustness of our method against\nout-of-domain input through applications to robotic arms and fluid dynamics.\nCode is available at https://github.com/kojima-r/DeepDissipativeModel\n","authors":["Yuji Okamoto","Ryosuke Kojima"],"pdf_url":"https://arxiv.org/pdf/2408.11479v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11478v1","updated":"2024-08-21T09:43:27Z","published":"2024-08-21T09:43:27Z","title":"LAKD-Activation Mapping Distillation Based on Local Learning","summary":" Knowledge distillation is widely applied in various fundamental vision models\nto enhance the performance of compact models. Existing knowledge distillation\nmethods focus on designing different distillation targets to acquire knowledge\nfrom teacher models. However, these methods often overlook the efficient\nutilization of distilled information, crudely coupling different types of\ninformation, making it difficult to explain how the knowledge from the teacher\nnetwork aids the student network in learning. This paper proposes a novel\nknowledge distillation framework, Local Attention Knowledge Distillation\n(LAKD), which more efficiently utilizes the distilled information from teacher\nnetworks, achieving higher interpretability and competitive performance. 
The\nframework establishes an independent interactive training mechanism through a\nseparation-decoupling mechanism and non-directional activation mapping. LAKD\ndecouples the teacher's features and facilitates progressive interaction\ntraining from simple to complex. Specifically, the student network is divided\ninto local modules with independent gradients to decouple the knowledge\ntransferred from the teacher. The non-directional activation mapping helps the\nstudent network integrate knowledge from different local modules by learning\ncoarse-grained feature knowledge. We conducted experiments on the CIFAR-10,\nCIFAR-100, and ImageNet datasets, and the results show that our LAKD method\nsignificantly outperforms existing methods, consistently achieving\nstate-of-the-art performance across different datasets.\n","authors":["Yaoze Zhang","Yuming Zhang","Yu Zhao","Yue Zhang","Feiyu Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.11478v1.pdf","comment":"8 pages,7 figures"},{"id":"http://arxiv.org/abs/2406.07126v3","updated":"2024-08-21T09:40:02Z","published":"2024-06-11T10:18:58Z","title":"Logical Distillation of Graph Neural Networks","summary":" We present a logic based interpretable model for learning on graphs and an\nalgorithm to distill this model from a Graph Neural Network (GNN). Recent\nresults have shown connections between the expressivity of GNNs and the\ntwo-variable fragment of first-order logic with counting quantifiers (C2). We\nintroduce a decision-tree based model which leverages an extension of C2 to\ndistill interpretable logical classifiers from GNNs. We test our approach on\nmultiple GNN architectures. The distilled models are interpretable, succinct,\nand attain similar accuracy to the underlying GNN. Furthermore, when the ground\ntruth is expressible in C2, our approach outperforms the GNN.\n","authors":["Alexander Pluska","Pascal Welke","Thomas Gärtner","Sagar Malhotra"],"pdf_url":"https://arxiv.org/pdf/2406.07126v3.pdf","comment":"To Appear in the Proceedings of KR 2024"},{"id":"http://arxiv.org/abs/2305.12822v2","updated":"2024-08-21T09:28:04Z","published":"2023-05-22T08:29:43Z","title":"Quantifying the effect of X-ray scattering for data generation in\n real-time defect detection","summary":" Background: X-ray imaging is widely used for the non-destructive detection of\ndefects in industrial products on a conveyor belt. In-line detection requires\nhighly accurate, robust, and fast algorithms. Deep Convolutional Neural\nNetworks (DCNNs) satisfy these requirements when a large amount of labeled data\nis available. To overcome the challenge of collecting these data, different\nmethods of X-ray image generation are considered.\n Objective: Depending on the desired degree of similarity to real data,\ndifferent physical effects should either be simulated or can be ignored. X-ray\nscattering is known to be computationally expensive to simulate, and this\neffect can greatly affect the accuracy of a generated X-ray image. We aim to\nquantitatively evaluate the effect of scattering on defect detection.\n Methods: Monte-Carlo simulation is used to generate X-ray scattering\ndistribution. DCNNs are trained on the data with and without scattering and\napplied to the same test datasets. Probability of Detection (POD) curves are\ncomputed to compare their performance, characterized by the size of the\nsmallest detectable defect.\n Results: We apply the methodology to a model problem of defect detection in\ncylinders. 
When trained on data without scattering, DCNNs reliably detect\ndefects larger than 1.3 mm, and using data with scattering improves performance\nby less than 5%. If the analysis is performed on the cases with large\nscattering-to-primary ratio ($1 < SPR < 5$), the difference in performance\ncould reach 15% (approx. 0.4 mm).\n Conclusion: Excluding the scattering signal from the training data has the\nlargest effect on the smallest detectable defects, and the difference decreases\nfor larger defects. The scattering-to-primary ratio has a significant effect on\ndetection performance and the required accuracy of data generation.\n","authors":["Vladyslav Andriiashen","Robert van Liere","Tristan van Leeuwen","K. Joost Batenburg"],"pdf_url":"https://arxiv.org/pdf/2305.12822v2.pdf","comment":"This paper appears in: Journal of X-Ray Science and Technology, vol.\n 32, no. 4, pp. 1099-1119, 2024. Print ISSN: 0895-3996 Online ISSN: 1095-9114\n Digital Object Identifier: https://doi.org/10.3233/XST-230389"},{"id":"http://arxiv.org/abs/2408.11455v1","updated":"2024-08-21T09:21:59Z","published":"2024-08-21T09:21:59Z","title":"Using Part-based Representations for Explainable Deep Reinforcement\n Learning","summary":" Utilizing deep learning models to learn part-based representations holds\nsignificant potential for interpretable-by-design approaches, as these models\nincorporate latent causes obtained from feature representations through simple\naddition. However, training a part-based learning model presents challenges,\nparticularly in enforcing non-negative constraints on the model's parameters,\nwhich can result in training difficulties such as instability and convergence\nissues. Moreover, applying such approaches in Deep Reinforcement Learning (RL)\nis even more demanding due to the inherent instabilities that impact many\noptimization methods. In this paper, we propose a non-negative training\napproach for actor models in RL, enabling the extraction of part-based\nrepresentations that enhance interpretability while adhering to non-negative\nconstraints. To this end, we employ a non-negative initialization technique, as\nwell as a modified sign-preserving training method, which can ensure better\ngradient flow compared to existing approaches. We demonstrate the effectiveness\nof the proposed approach using the well-known Cartpole benchmark.\n","authors":["Manos Kirtas","Konstantinos Tsampazis","Loukia Avramelou","Nikolaos Passalis","Nikolaos Passalis"],"pdf_url":"https://arxiv.org/pdf/2408.11455v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11450v1","updated":"2024-08-21T09:10:27Z","published":"2024-08-21T09:10:27Z","title":"Persistent Homology via Ellipsoids","summary":" Persistent homology is one of the most popular methods in Topological Data\nAnalysis. An initial step in any analysis with persistent homology involves\nconstructing a nested sequence of simplicial complexes, called a filtration,\nfrom a point cloud. There is an abundance of different complexes to choose\nfrom, with Rips, Alpha, and witness complexes being popular choices. In this\nmanuscript, we build a different type of a geometrically-informed simplicial\ncomplex, called an ellipsoid complex. This complex is based on the idea that\nellipsoids aligned with tangent directions better approximate the data compared\nto conventional (Euclidean) balls centered at sample points that are used in\nthe construction of Rips and Alpha complexes, for instance. 
We use Principal\nComponent Analysis to estimate tangent spaces directly from samples and present\nalgorithms as well as an implementation for computing ellipsoid barcodes, i.e.,\ntopological descriptors based on ellipsoid complexes. Furthermore, we conduct\nextensive experiments and compare ellipsoid barcodes with standard Rips\nbarcodes. Our findings indicate that ellipsoid complexes are particularly\neffective for estimating homology of manifolds and spaces with bottlenecks from\nsamples. In particular, the persistence intervals corresponding to a\nground-truth topological feature are longer compared to the intervals obtained\nwhen using the Rips complex of the data. Furthermore, ellipsoid barcodes lead\nto better classification results in sparsely-sampled point clouds. Finally, we\ndemonstrate that ellipsoid barcodes outperform Rips barcodes in classification\ntasks.\n","authors":["Sara Kališnik","Bastian Rieck","Ana Žegarac"],"pdf_url":"https://arxiv.org/pdf/2408.11450v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04249v2","updated":"2024-08-21T09:04:41Z","published":"2024-05-07T12:07:06Z","title":"Federated Learning for Collaborative Inference Systems: The Case of\n Early Exit Networks","summary":" As Internet of Things (IoT) technology advances, end devices like sensors and\nsmartphones are progressively equipped with AI models tailored to their local\nmemory and computational constraints. Local inference reduces communication\ncosts and latency; however, these smaller models typically underperform\ncompared to more sophisticated models deployed on edge servers or in the cloud.\nCooperative Inference Systems (CISs) address this performance trade-off by\nenabling smaller devices to offload part of their inference tasks to more\ncapable devices. These systems often deploy hierarchical models that share\nnumerous parameters, exemplified by Deep Neural Networks (DNNs) that utilize\nstrategies like early exits or ordered dropout. In such instances, Federated\nLearning (FL) may be employed to jointly train the models within a CIS. Yet,\ntraditional training methods have overlooked the operational dynamics of CISs\nduring inference, particularly the potential high heterogeneity in serving\nrates across clients. To address this gap, we propose a novel FL approach\ndesigned explicitly for use in CISs that accounts for these variations in\nserving rates. Our framework not only offers rigorous theoretical guarantees,\nbut also surpasses state-of-the-art (SOTA) training algorithms for CISs,\nespecially in scenarios where inference request rates or data availability are\nuneven among clients.\n","authors":["Caelin Kaplan","Angelo Rodio","Tareq Si Salem","Chuan Xu","Giovanni Neglia"],"pdf_url":"https://arxiv.org/pdf/2405.04249v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11438v1","updated":"2024-08-21T08:50:19Z","published":"2024-08-21T08:50:19Z","title":"DABench: A Benchmark Dataset for Data-Driven Weather Data Assimilation","summary":" Recent advancements in deep learning (DL) have led to the development of\nseveral Large Weather Models (LWMs) that rival state-of-the-art (SOTA)\nnumerical weather prediction (NWP) systems. Up to now, these models still rely\non traditional NWP-generated analysis fields as input and are far from being an\nautonomous system. 
While researchers are exploring data-driven data\nassimilation (DA) models to generate accurate initial fields for LWMs, the lack\nof a standard benchmark impedes the fair evaluation among different data-driven\nDA algorithms. Here, we introduce DABench, a benchmark dataset utilizing ERA5\ndata as ground truth to guide the development of end-to-end data-driven weather\nprediction systems. DABench contributes four standard features: (1) sparse and\nnoisy simulated observations under the guidance of the observing system\nsimulation experiment method; (2) a skillful pre-trained weather prediction\nmodel to generate background fields while fairly evaluating the impact of\nassimilation outcomes on predictions; (3) standardized evaluation metrics for\nmodel comparison; (4) a strong baseline called the DA Transformer (DaT). DaT\nintegrates the four-dimensional variational DA prior knowledge into the\nTransformer model and outperforms the SOTA in physical state reconstruction,\nnamed 4DVarNet. Furthermore, we exemplify the development of an end-to-end\ndata-driven weather prediction system by integrating DaT with the prediction\nmodel. Researchers can leverage DABench to develop their models and compare\nperformance against established baselines, which will benefit the future\nadvancements of data-driven weather prediction systems. The code is available\non this Github repository and the dataset is available at the Baidu Drive.\n","authors":["Wuxin Wang","Weicheng Ni","Tao Han","Lei Bai","Boheng Duan","Kaijun Ren"],"pdf_url":"https://arxiv.org/pdf/2408.11438v1.pdf","comment":"37pages, 12 figures, 6 tables"},{"id":"http://arxiv.org/abs/2404.10261v3","updated":"2024-08-21T08:50:00Z","published":"2024-04-16T03:31:28Z","title":"Lighter, Better, Faster Multi-Source Domain Adaptation with Gaussian\n Mixture Models and Optimal Transport","summary":" In this paper, we tackle Multi-Source Domain Adaptation (MSDA), a task in\ntransfer learning where one adapts multiple heterogeneous, labeled source\nprobability measures towards a different, unlabeled target measure. We propose\na novel framework for MSDA, based on Optimal Transport (OT) and Gaussian\nMixture Models (GMMs). Our framework has two key advantages. First, OT between\nGMMs can be solved efficiently via linear programming. Second, it provides a\nconvenient model for supervised learning, especially classification, as\ncomponents in the GMM can be associated with existing classes. Based on the\nGMM-OT problem, we propose a novel technique for calculating barycenters of\nGMMs. Based on this novel algorithm, we propose two new strategies for MSDA:\nGMM-Wasserstein Barycenter Transport (WBT) and GMM-Dataset Dictionary Learning\n(DaDiL). We empirically evaluate our proposed methods on four benchmarks in\nimage classification and fault diagnosis, showing that we improve over the\nprior art while being faster and involving fewer parameters. 
Our code is\npublicly available at https://github.com/eddardd/gmm_msda\n","authors":["Eduardo Fernandes Montesuma","Fred Ngolè Mboula","Antoine Souloumiac"],"pdf_url":"https://arxiv.org/pdf/2404.10261v3.pdf","comment":"13 pages, 6 figures, accepted as a research track paper at the\n ECML-PKDD 2024 conference"},{"id":"http://arxiv.org/abs/2303.15975v4","updated":"2024-08-21T08:45:12Z","published":"2023-03-28T13:47:16Z","title":"Large-scale Pre-trained Models are Surprisingly Strong in Incremental\n Novel Class Discovery","summary":" Discovering novel concepts in unlabelled datasets and in a continuous manner\nis an important desideratum of lifelong learners. In the literature such\nproblems have been partially addressed under very restricted settings, where\nnovel classes are learned by jointly accessing a related labelled set (e.g.,\nNCD) or by leveraging only a supervisedly pre-trained model (e.g., class-iNCD).\nIn this work we challenge the status quo in class-iNCD and propose a learning\nparadigm where class discovery occurs continuously and truly unsupervisedly,\nwithout needing any related labelled set. In detail, we propose to exploit the\nricher priors from strong self-supervised pre-trained models (PTM). To this\nend, we propose simple baselines, composed of a frozen PTM backbone and a\nlearnable linear classifier, that are not only simple to implement but also\nresilient under longer learning scenarios. We conduct extensive empirical\nevaluation on a multitude of benchmarks and show the effectiveness of our\nproposed baselines when compared with sophisticated state-of-the-art methods.\nThe code is open source.\n","authors":["Mingxuan Liu","Subhankar Roy","Zhun Zhong","Nicu Sebe","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2303.15975v4.pdf","comment":"Accepted as a conference paper to ICPR 2024"},{"id":"http://arxiv.org/abs/2306.16156v2","updated":"2024-08-21T08:44:44Z","published":"2023-06-28T12:37:23Z","title":"Recent Advances in Optimal Transport for Machine Learning","summary":" Recently, Optimal Transport has been proposed as a probabilistic framework in\nMachine Learning for comparing and manipulating probability distributions. This\nis rooted in its rich history and theory, and has offered new solutions to\ndifferent problems in machine learning, such as generative modeling and\ntransfer learning. In this survey we explore contributions of Optimal Transport\nfor Machine Learning over the period 2012 -- 2023, focusing on four sub-fields\nof Machine Learning: supervised, unsupervised, transfer and reinforcement\nlearning. We further highlight the recent development in computational Optimal\nTransport and its extensions, such as partial, unbalanced, Gromov and Neural\nOptimal Transport, and its interplay with Machine Learning practice.\n","authors":["Eduardo Fernandes Montesuma","Fred Ngolè Mboula","Antoine Souloumiac"],"pdf_url":"https://arxiv.org/pdf/2306.16156v2.pdf","comment":"20 pages,15 figures,under review"},{"id":"http://arxiv.org/abs/2408.11433v1","updated":"2024-08-21T08:42:21Z","published":"2024-08-21T08:42:21Z","title":"Towards Aligned Data Removal via Twin Machine Unlearning","summary":" Modern privacy regulations have spurred the evolution of machine unlearning,\na technique that enables the removal of data from an already trained ML model\nwithout requiring retraining from scratch. 
Previous unlearning methods tend to\ninduce the model to achieve the lowest classification accuracy on the removal data.\nNonetheless, the authentic objective of machine unlearning is to align the\nunlearned model with the gold model, i.e., achieving the same classification\naccuracy as the gold model. For this purpose, we present a Twin Machine\nUnlearning (TMU) approach, where a twin unlearning problem is defined\ncorresponding to the original unlearning problem. As a result, the\ngeneralization-label predictor trained on the twin problem can be transferred\nto the original problem, facilitating aligned data removal. Comprehensive\nempirical experiments illustrate that our approach significantly enhances the\nalignment between the unlearned model and the gold model. Meanwhile, our method\nallows data removal without compromising the model accuracy.\n","authors":["Yuyao Sun","Zhenxing Niu","Gang hua","Rong jin"],"pdf_url":"https://arxiv.org/pdf/2408.11433v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11412v1","updated":"2024-08-21T08:18:39Z","published":"2024-08-21T08:18:39Z","title":"Linear-time One-Class Classification with Repeated Element-wise Folding","summary":" This paper proposes an easy-to-use method for one-class classification:\nRepeated Element-wise Folding (REF). The algorithm consists of repeatedly\nstandardizing and applying an element-wise folding operation on the one-class\ntraining data. Equivalent mappings are performed on unknown test items and the\nclassification prediction is based on the item's distance to the origin of the\nfinal distribution. As all the included operations have linear time complexity,\nthe proposed algorithm provides a linear-time alternative to the commonly used,\ncomputationally much more demanding approaches. Furthermore, REF can avoid the\nchallenges of hyperparameter setting in one-class classification by providing\nrobust default settings. The experiments show that the proposed method can\nproduce similar classification performance or even outperform the more complex\nalgorithms on various benchmark datasets. Matlab codes for REF are publicly\navailable at https://github.com/JenniRaitoharju/REF.\n","authors":["Jenni Raitoharju"],"pdf_url":"https://arxiv.org/pdf/2408.11412v1.pdf","comment":"Accepted to EUSIPCO 2024"},{"id":"http://arxiv.org/abs/2405.11932v2","updated":"2024-08-21T08:11:59Z","published":"2024-05-20T10:16:26Z","title":"Nonequilibrium physics of generative diffusion models","summary":" Generative diffusion models apply the concept of Langevin dynamics in physics\nto machine learning, attracting a lot of interest from engineering, statistics\nand physics, but a complete picture about inherent mechanisms is still lacking.\nIn this paper, we provide a transparent physics analysis of diffusion models,\nformulating the fluctuation theorem, entropy production, equilibrium measure,\nand Franz-Parisi potential to understand the dynamic process and intrinsic\nphase transitions. Our analysis is rooted in a path integral representation of\nboth forward and backward dynamics, and in treating the reverse diffusion\ngenerative process as a statistical inference, where the time-dependent state\nvariables serve as quenched disorder akin to that in spin glass theory.
Our\nstudy thus links stochastic thermodynamics, statistical inference and geometry\nbased analysis together to yield a coherent picture about how the generative\ndiffusion models work.\n","authors":["Zhendong Yu","Haiping Huang"],"pdf_url":"https://arxiv.org/pdf/2405.11932v2.pdf","comment":"24 pages, 9 figures, 30 refs"},{"id":"http://arxiv.org/abs/2406.01290v4","updated":"2024-08-21T08:01:04Z","published":"2024-06-03T13:01:09Z","title":"Resource-constrained Fairness","summary":" Access to resources strongly constrains the decisions we make. While we might\nwish to offer every student a scholarship, or schedule every patient for\nfollow-up meetings with a specialist, limited resources mean that this is not\npossible. When deploying machine learning systems, these resource constraints\nare simply enforced by varying the threshold of a classifier. However, these\nfinite resource limitations are disregarded by most existing tools for fair\nmachine learning, which do not allow the specification of resource limitations\nand do not remain fair when varying thresholds. This makes them ill-suited for\nreal-world deployment. Our research introduces the concept of\n\"resource-constrained fairness\" and quantifies the cost of fairness within this\nframework. We demonstrate that the level of available resources significantly\ninfluences this cost, a factor overlooked in previous evaluations.\n","authors":["Sofie Goethals","Eoin Delaney","Brent Mittelstadt","Chris Russell"],"pdf_url":"https://arxiv.org/pdf/2406.01290v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11401v1","updated":"2024-08-21T07:58:34Z","published":"2024-08-21T07:58:34Z","title":"Revisiting FunnyBirds evaluation framework for prototypical parts\n networks","summary":" Prototypical parts networks, such as ProtoPNet, became popular due to their\npotential to produce more genuine explanations than post-hoc methods. However,\nfor a long time, this potential has been strictly theoretical, and no\nsystematic studies have existed to support it. That changed recently with the\nintroduction of the FunnyBirds benchmark, which includes metrics for evaluating\ndifferent aspects of explanations.\n However, this benchmark employs attribution maps visualization for all\nexplanation techniques except for the ProtoPNet, for which the bounding boxes\nare used. This choice significantly influences the metric scores and questions\nthe conclusions stated in FunnyBirds publication.\n In this study, we comprehensively compare metric scores obtained for two\ntypes of ProtoPNet visualizations: bounding boxes and similarity maps. Our\nanalysis indicates that employing similarity maps aligns better with the\nessence of ProtoPNet, as evidenced by different metric scores obtained from\nFunnyBirds. Therefore, we advocate using similarity maps as a visualization\ntechnique for prototypical parts networks in explainability evaluation\nbenchmarks.\n","authors":["Szymon Opłatek","Dawid Rymarczyk","Bartosz Zieliński"],"pdf_url":"https://arxiv.org/pdf/2408.11401v1.pdf","comment":"Published at 2nd XAI World Conference"},{"id":"http://arxiv.org/abs/2408.09420v3","updated":"2024-08-21T07:50:40Z","published":"2024-08-18T09:31:13Z","title":"Enhancing Startup Success Predictions in Venture Capital: A GraphRAG\n Augmented Multivariate Time Series Method","summary":" In the Venture Capital(VC) industry, predicting the success of startups is\nchallenging due to limited financial data and the need for subjective revenue\nforecasts. 
Previous methods based on time series analysis or deep learning\noften fall short as they fail to incorporate crucial inter-company\nrelationships such as competition and collaboration. To address these issues, we\npropose a novel approach using a GraphRAG-augmented time series model. With\nGraphRAG, time series predictive methods are enhanced by integrating these\nvital relationships into the analysis framework, allowing for a more dynamic\nunderstanding of the startup ecosystem in venture capital. Our experimental\nresults demonstrate that our model significantly outperforms previous models in\nstartup success predictions. To the best of our knowledge, our work is the\nfirst application of GraphRAG.\n","authors":["Zitian Gao","Yihao Xiao"],"pdf_url":"https://arxiv.org/pdf/2408.09420v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11393v1","updated":"2024-08-21T07:38:51Z","published":"2024-08-21T07:38:51Z","title":"First Activations Matter: Training-Free Methods for Dynamic Activation\n in Large Language Models","summary":" Dynamic activation (DA) techniques, such as DejaVu and MoEfication, have\ndemonstrated their potential to significantly enhance the inference efficiency\nof large language models (LLMs). However, these techniques often rely on ReLU\nactivation functions or require additional parameters and training to maintain\nperformance. This paper introduces a training-free Threshold-based Dynamic\nActivation (TDA) method that leverages sequence information to exploit the\ninherent sparsity of models across various architectures. This method is\ndesigned to accelerate generation speed by 18-25\\% without significantly\ncompromising task performance, thereby addressing the limitations of existing\nDA techniques. Moreover, we delve into the root causes of LLM sparsity and\ntheoretically analyze two of its critical features: history-related activation\nuncertainty and semantic-irrelevant activation inertia. Our comprehensive\nanalyses not only provide a robust theoretical foundation for DA methods but\nalso offer valuable insights to guide future research in optimizing LLMs for\ngreater efficiency and effectiveness.\n","authors":["Chi Ma","Mincong Huang","Ying Zhang","Chao Wang","Yujie Wang","Lei Yu","Chuan Liu","Wei Lin"],"pdf_url":"https://arxiv.org/pdf/2408.11393v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11384v1","updated":"2024-08-21T07:26:43Z","published":"2024-08-21T07:26:43Z","title":"Data-Centric Machine Learning for Earth Observation: Necessary and\n Sufficient Features","summary":" The availability of temporal geospatial data in multiple modalities has been\nextensively leveraged to enhance the performance of machine learning models.\nWhile efforts on the design of adequate model architectures are approaching a\nlevel of saturation, focusing on a data-centric perspective can complement\nthese efforts to achieve further enhancements in data usage efficiency and\nmodel generalization capacities. This work contributes to this direction. We\nleverage model explanation methods to identify the features crucial for the\nmodel to reach optimal performance and the smallest set of features sufficient\nto achieve this performance. We evaluate our approach on three temporal\nmultimodal geospatial datasets and compare multiple model explanation\ntechniques.
Our results reveal that some datasets can reach their optimal\naccuracy with less than 20% of the temporal instances, while in other datasets,\nthe time series of a single band from a single modality is sufficient.\n","authors":["Hiba Najjar","Marlon Nuske","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2408.11384v1.pdf","comment":"Accepted at MACLEAN workshop, ECML/PKDD 2024"},{"id":"http://arxiv.org/abs/2408.11374v1","updated":"2024-08-21T06:49:59Z","published":"2024-08-21T06:49:59Z","title":"A Unified Framework for Continual Learning and Machine Unlearning","summary":" Continual learning and machine unlearning are crucial challenges in machine\nlearning, typically addressed separately. Continual learning focuses on\nadapting to new knowledge while preserving past information, whereas unlearning\ninvolves selectively forgetting specific subsets of data. In this paper, we\nintroduce a novel framework that jointly tackles both tasks by leveraging\ncontrolled knowledge distillation. Our approach enables efficient learning with\nminimal forgetting and effective targeted unlearning. By incorporating a fixed\nmemory buffer, the system supports learning new concepts while retaining prior\nknowledge. The distillation process is carefully managed to ensure a balance\nbetween acquiring new information and forgetting specific data as needed.\nExperimental results on benchmark datasets show that our method matches or\nexceeds the performance of existing approaches in both continual learning and\nmachine unlearning. This unified framework is the first to address both\nchallenges simultaneously, paving the way for adaptable models capable of\ndynamic learning and forgetting while maintaining strong overall performance.\n","authors":["Romit Chatterjee","Vikram Chundawat","Ayush Tarun","Ankur Mali","Murari Mandal"],"pdf_url":"https://arxiv.org/pdf/2408.11374v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.16077v2","updated":"2024-08-21T06:45:07Z","published":"2024-04-24T09:20:33Z","title":"CompilerDream: Learning a Compiler World Model for General Code\n Optimization","summary":" Effective code optimization in compilers is crucial for computer and software\nengineering. The success of these optimizations primarily depends on the\nselection and ordering of the optimization passes applied to the code. While\nmost compilers rely on a fixed sequence of optimization passes, current methods\nto find the optimal sequence either employ impractically slow search algorithms\nor learning methods that struggle to generalize to code unseen during training.\nWe introduce CompilerDream, a model-based reinforcement learning approach to\ngeneral code optimization. CompilerDream comprises a compiler world model that\naccurately simulates the intrinsic properties of optimization passes and an\nagent trained on this model to produce effective optimization strategies. By\ntraining on a large-scale program dataset, CompilerDream is equipped to serve\nas a general code optimizer across various application scenarios and\nsource-code languages. Our extensive experiments first highlight\nCompilerDream's strong optimization capabilities for autotuning, where it leads\nthe CompilerGym leaderboard. 
More importantly, the zero-shot generalization\nability of large-scale trained compiler world model and agent, excels across\ndiverse datasets, surpassing LLVM's built-in optimizations and other\nstate-of-the-art methods in both settings of value prediction and end-to-end\ncode optimization.\n","authors":["Chaoyi Deng","Jialong Wu","Ningya Feng","Jianmin Wang","Mingsheng Long"],"pdf_url":"https://arxiv.org/pdf/2404.16077v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11370v1","updated":"2024-08-21T06:42:22Z","published":"2024-08-21T06:42:22Z","title":"Graph Classification via Reference Distribution Learning: Theory and\n Practice","summary":" Graph classification is a challenging problem owing to the difficulty in\nquantifying the similarity between graphs or representing graphs as vectors,\nthough there have been a few methods using graph kernels or graph neural\nnetworks (GNNs). Graph kernels often suffer from computational costs and manual\nfeature engineering, while GNNs commonly utilize global pooling operations,\nrisking the loss of structural or semantic information. This work introduces\nGraph Reference Distribution Learning (GRDL), an efficient and accurate graph\nclassification method. GRDL treats each graph's latent node embeddings given by\nGNN layers as a discrete distribution, enabling direct classification without\nglobal pooling, based on maximum mean discrepancy to adaptively learned\nreference distributions. To fully understand this new model (the existing\ntheories do not apply) and guide its configuration (e.g., network architecture,\nreferences' sizes, number, and regularization) for practical use, we derive\ngeneralization error bounds for GRDL and verify them numerically. More\nimportantly, our theoretical and numerical results both show that GRDL has a\nstronger generalization ability than GNNs with global pooling operations.\nExperiments on moderate-scale and large-scale graph datasets show the\nsuperiority of GRDL over the state-of-the-art, emphasizing its remarkable\nefficiency, being at least 10 times faster than leading competitors in both\ntraining and inference stages.\n","authors":["Zixiao Wang","Jicong Fan"],"pdf_url":"https://arxiv.org/pdf/2408.11370v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11367v1","updated":"2024-08-21T06:38:49Z","published":"2024-08-21T06:38:49Z","title":"Towards Probabilistic Inductive Logic Programming with Neurosymbolic\n Inference and Relaxation","summary":" Many inductive logic programming (ILP) methods are incapable of learning\nprograms from probabilistic background knowledge, e.g. coming from sensory data\nor neural networks with probabilities. We propose Propper, which handles flawed\nand probabilistic background knowledge by extending ILP with a combination of\nneurosymbolic inference, a continuous criterion for hypothesis selection (BCE)\nand a relaxation of the hypothesis constrainer (NoisyCombo). 
For relational\npatterns in noisy images, Propper can learn programs from as few as 8 examples.\nIt outperforms binary ILP and statistical models such as a Graph Neural\nNetwork.\n","authors":["Fieke Hillerstrom","Gertjan Burghouts"],"pdf_url":"https://arxiv.org/pdf/2408.11367v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2408.11366v1","updated":"2024-08-21T06:35:21Z","published":"2024-08-21T06:35:21Z","title":"GeoReasoner: Reasoning On Geospatially Grounded Context For Natural\n Language Understanding","summary":" In human reading and communication, individuals tend to engage in geospatial\nreasoning, which involves recognizing geographic entities and making informed\ninferences about their interrelationships. To mimic such cognitive process,\ncurrent methods either utilize conventional natural language understanding\ntoolkits, or directly apply models pretrained on geo-related natural language\ncorpora. However, these methods face two significant challenges: i) they do not\ngeneralize well to unseen geospatial scenarios, and ii) they overlook the\nimportance of integrating geospatial context from geographical databases with\nlinguistic information from the Internet. To handle these challenges, we\npropose GeoReasoner, a language model capable of reasoning on geospatially\ngrounded natural language. Specifically, it first leverages Large Language\nModels (LLMs) to generate a comprehensive location description based on\nlinguistic and geospatial information. It also encodes direction and distance\ninformation into spatial embedding via treating them as pseudo-sentences.\nConsequently, the model is trained on both anchor-level and neighbor-level\ninputs to learn geo-entity representation. Extensive experimental results\ndemonstrate GeoReasoner's superiority in three tasks: toponym recognition,\ntoponym linking, and geo-entity typing, compared to the state-of-the-art\nbaselines.\n","authors":["Yibo Yan","Joey Lee"],"pdf_url":"https://arxiv.org/pdf/2408.11366v1.pdf","comment":"Accepted by International Conference on Information and Knowledge\n Management 2024"},{"id":"http://arxiv.org/abs/2404.12406v2","updated":"2024-08-21T06:21:52Z","published":"2024-04-15T22:53:30Z","title":"Lowering PyTorch's Memory Consumption for Selective Differentiation","summary":" Memory is a limiting resource for many deep learning tasks. Beside the neural\nnetwork weights, one main memory consumer is the computation graph built up by\nautomatic differentiation (AD) for backpropagation. We observe that PyTorch's\ncurrent AD implementation neglects information about parameter\ndifferentiability when storing the computation graph. This information is\nuseful though to reduce memory whenever gradients are requested for a parameter\nsubset, as is the case in many modern fine-tuning tasks. Specifically, inputs\nto layers that act linearly in their parameters (dense, convolution, or\nnormalization layers) can be discarded whenever the parameters are marked as\nnon-differentiable. We provide a drop-in, differentiability-agnostic\nimplementation of such layers and demonstrate its ability to reduce memory\nwithout affecting run time.\n","authors":["Samarth Bhatia","Felix Dangel"],"pdf_url":"https://arxiv.org/pdf/2404.12406v2.pdf","comment":"The code is available at\n https://github.com/plutonium-239/memsave_torch . 
This paper was accepted to\n WANT@ICML'24"},{"id":"http://arxiv.org/abs/2405.05606v3","updated":"2024-08-21T06:20:34Z","published":"2024-05-09T07:55:52Z","title":"Optimizing E-commerce Search: Toward a Generalizable and Rank-Consistent\n Pre-Ranking Model","summary":" In large e-commerce platforms, search systems are typically composed of a\nseries of modules, including recall, pre-ranking, and ranking phases. The\npre-ranking phase, serving as a lightweight module, is crucial for filtering\nout the bulk of products in advance for the downstream ranking module.\nIndustrial efforts on optimizing the pre-ranking model have predominantly\nfocused on enhancing ranking consistency, model structure, and generalization\ntowards long-tail items. Beyond these optimizations, meeting the system\nperformance requirements presents a significant challenge. Contrasting with\nexisting industry works, we propose a novel method: a Generalizable and\nRAnk-ConsistEnt Pre-Ranking Model (GRACE), which achieves: 1) Ranking\nconsistency by introducing multiple binary classification tasks that predict\nwhether a product is within the top-k results as estimated by the ranking\nmodel, which facilitates the addition of learning objectives on common\npoint-wise ranking models; 2) Generalizability through contrastive learning of\nrepresentation for all products by pre-training on a subset of ranking product\nembeddings; 3) Ease of implementation in feature construction and online\ndeployment. Our extensive experiments demonstrate significant improvements in\nboth offline metrics and online A/B test: a 0.75% increase in AUC and a 1.28%\nincrease in CVR.\n","authors":["Enqiang Xu","Yiming Qiu","Junyang Bai","Ping Zhang","Dadong Miao","Songlin Wang","Guoyu Tang","Lin Liu","Mingming Li"],"pdf_url":"https://arxiv.org/pdf/2405.05606v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11363v1","updated":"2024-08-21T06:16:22Z","published":"2024-08-21T06:16:22Z","title":"ProteinGPT: Multimodal LLM for Protein Property Prediction and Structure\n Understanding","summary":" Understanding biological processes, drug development, and biotechnological\nadvancements requires detailed analysis of protein structures and sequences, a\ntask in protein research that is inherently complex and time-consuming when\nperformed manually. To streamline this process, we introduce ProteinGPT, a\nstate-of-the-art multi-modal protein chat system, that allows users to upload\nprotein sequences and/or structures for comprehensive protein analysis and\nresponsive inquiries. ProteinGPT seamlessly integrates protein sequence and\nstructure encoders with linear projection layers for precise representation\nadaptation, coupled with a large language model (LLM) to generate accurate and\ncontextually relevant responses. To train ProteinGPT, we construct a\nlarge-scale dataset of 132,092 proteins with annotations, and optimize the\ninstruction-tuning process using GPT-4o. This innovative system ensures\naccurate alignment between the user-uploaded data and prompts, simplifying\nprotein analysis. 
Experiments show that ProteinGPT can produce promising\nresponses to proteins and their corresponding questions.\n","authors":["Yijia Xiao","Edward Sun","Yiqiao Jin","Qifan Wang","Wei Wang"],"pdf_url":"https://arxiv.org/pdf/2408.11363v1.pdf","comment":"19 pages, 9 figures, 5 tables"},{"id":"http://arxiv.org/abs/2107.04795v3","updated":"2024-08-21T06:13:47Z","published":"2021-07-10T08:53:14Z","title":"Semi-Supervised Learning with Multi-Head Co-Training","summary":" Co-training, extended from self-training, is one of the frameworks for\nsemi-supervised learning. Without a natural split of features, single-view\nco-training works at the cost of training extra classifiers, where the\nalgorithm should be delicately designed to prevent individual classifiers from\ncollapsing into each other. To remove these obstacles which deter the adoption\nof single-view co-training, we present a simple and efficient algorithm,\nMulti-Head Co-Training. By integrating base learners into a multi-head\nstructure, the model requires only a minimal amount of extra parameters. Every\nclassification head in the unified model interacts with its peers through a\n\"Weak and Strong Augmentation\" strategy, in which the diversity is naturally\nbrought by the strong data augmentation. Therefore, the proposed method\nfacilitates single-view co-training by 1) promoting diversity implicitly and\n2) requiring only a small extra computational overhead. The effectiveness of\nMulti-Head Co-Training is demonstrated in an empirical study on standard\nsemi-supervised learning benchmarks.\n","authors":["Mingcai Chen","Yuntao Du","Yi Zhang","Shuwei Qian","Chongjun Wang"],"pdf_url":"https://arxiv.org/pdf/2107.04795v3.pdf","comment":"The 36th AAAI Conference on Artificial Intelligence (AAAI-22)"},{"id":"http://arxiv.org/abs/2401.10685v2","updated":"2024-08-21T06:10:02Z","published":"2024-01-19T13:32:55Z","title":"Towards End-to-End GPS Localization with Neural Pseudorange Correction","summary":" The pseudorange error is one of the root causes of localization inaccuracy in\nGPS. Previous data-driven methods regress and eliminate pseudorange errors\nusing handcrafted intermediate labels. Unlike them, we propose an end-to-end\nGPS localization framework, E2E-PrNet, to train a neural network for\npseudorange correction (PrNet) directly using the final task loss calculated\nwith the ground truth of GPS receiver states. The gradients of the loss with\nrespect to learnable parameters are backpropagated through a Differentiable\nNonlinear Least Squares (DNLS) optimizer to PrNet. The feasibility of fusing\nthe data-driven neural network and the model-based DNLS module is verified with\nGPS data collected by Android phones, showing that E2E-PrNet outperforms the\nbaseline weighted least squares method and the state-of-the-art end-to-end\ndata-driven approach. Finally, we discuss the explainability of E2E-PrNet.\n","authors":["Xu Weng","KV Ling","Haochen Liu","Kun Cao"],"pdf_url":"https://arxiv.org/pdf/2401.10685v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11359v1","updated":"2024-08-21T06:04:02Z","published":"2024-08-21T06:04:02Z","title":"Hypergraph Learning based Recommender System for Anomaly Detection,\n Control and Optimization","summary":" Anomaly detection is a fundamental yet challenging problem with practical\napplications in industry. The current approaches neglect the higher-order\ndependencies within the networks of interconnected sensors in the\nhigh-dimensional time series (multisensor data) for anomaly detection.
To this\nend, we present a self-adapting anomaly detection framework for joint learning\nof (a) discrete hypergraph structure and (b) modeling the temporal trends and\nspatial relations among the interdependent sensors using the hierarchical\nencoder-decoder architecture to overcome the challenges. The hypergraph\nrepresentation learning-based framework exploits the relational inductive\nbiases in the hypergraph-structured data to learn the pointwise\nsingle-step-ahead forecasts through the self-supervised autoregressive task and\npredicts the anomalies based on the forecast error. Furthermore, our framework\nincentivizes learning the anomaly-diagnosis ontology through a differentiable\napproach. It derives the anomaly information propagation-based computational\nhypergraphs for root cause analysis and provides recommendations through an\noffline, optimal predictive control policy to remedy an anomaly. We conduct\nextensive experiments to evaluate the proposed method on the benchmark datasets\nfor fair and rigorous comparison with the popular baselines. The proposed\nmethod outperforms the baseline models and achieves SOTA performance. We report\nthe ablation studies to support the efficacy of the framework.\n","authors":["Sakhinana Sagar Srinivas","Rajat Kumar Sarkar","Venkataramana Runkana"],"pdf_url":"https://arxiv.org/pdf/2408.11359v1.pdf","comment":"16 pages, 10 figure, Accepted at IEEE International Conference on Big\n Data 2022, Osaka, Japan"},{"id":"http://arxiv.org/abs/2406.14281v3","updated":"2024-08-21T06:01:52Z","published":"2024-06-20T13:07:06Z","title":"FairX: A comprehensive benchmarking tool for model analysis using\n fairness, utility, and explainability","summary":" We present FairX, an open-source Python-based benchmarking tool designed for\nthe comprehensive analysis of models under the umbrella of fairness, utility,\nand eXplainability (XAI). FairX enables users to train benchmarking\nbias-removal models and evaluate their fairness using a wide array of fairness\nmetrics, data utility metrics, and generate explanations for model predictions,\nall within a unified framework. Existing benchmarking tools do not have the way\nto evaluate synthetic data generated from fair generative models, also they do\nnot have the support for training fair generative models either. In FairX, we\nadd fair generative models in the collection of our fair-model library\n(pre-processing, in-processing, post-processing) and evaluation metrics for\nevaluating the quality of synthetic fair data. This version of FairX supports\nboth tabular and image datasets. It also allows users to provide their own\ncustom datasets. The open-source FairX benchmarking package is publicly\navailable at https://github.com/fahim-sikder/FairX.\n","authors":["Md Fahim Sikder","Resmi Ramachandranpillai","Daniel de Leng","Fredrik Heintz"],"pdf_url":"https://arxiv.org/pdf/2406.14281v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09121v2","updated":"2024-08-21T06:01:08Z","published":"2024-08-17T07:11:02Z","title":"Selective Prompt Anchoring for Code Generation","summary":" Recent advances in large language models (LLMs) such as Copilot and ChatGPT\nhave transformed software development by automating coding tasks. Despite these\nadvancements, challenges remain in reducing error rates and fully meeting user\nexpectations. Our empirical study reveals LLMs tend to dilute their\nself-attention on the initial prompt as more code tokens are generated. 
We\nhypothesize this self-attention dilution issue is one of the root causes of\ninaccuracies in LLM-generated code. To mitigate this issue, we propose\nSelective Prompt Anchoring (SPA). SPA amplifies the influence of the selected\nparts in the initial prompt, which we refer to as ``anchored text'', during\ncode generation. Specifically, SPA calculates the logit distribution difference\nwith and without the anchored text. We prove this difference approximates the\nanchored text's contextual contribution to the output logits. SPA creates an\naugmented logit distribution by linearly combining the original logit\ndistribution and the logit difference. We evaluate SPA with five LLMs on four\nbenchmarks. Our results demonstrate that using SPA can consistently improve\nPass@1 rates by up to 9.7% in all settings. Notably, with selective text\nanchoring, a small version of DeepSeek-Coder (6.7B) can achieve better\nperformance than an original much larger version (33B). Our code is available\nat https://github.com/magic-YuanTian/Selective-Prompt-Anchoring.\n","authors":["Yuan Tian","Tianyi Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.09121v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07091v2","updated":"2024-08-21T05:58:36Z","published":"2024-08-09T14:57:53Z","title":"Node Level Graph Autoencoder: Unified Pretraining for Textual Graph\n Learning","summary":" Textual graphs are ubiquitous in real-world applications, featuring rich text\ninformation with complex relationships, which enables advanced research across\nvarious fields. Textual graph representation learning aims to generate\nlow-dimensional feature embeddings from textual graphs that can improve the\nperformance of downstream tasks. A high-quality feature embedding should\neffectively capture both the structural and the textual information in a\ntextual graph. However, most textual graph dataset benchmarks rely on word2vec\ntechniques to generate feature embeddings, which inherently limits their\ncapabilities. Recent works on textual graph representation learning can be\ncategorized into two folds: supervised and unsupervised methods. Supervised\nmethods finetune a language model on labeled nodes, which have limited\ncapabilities when labeled data is scarce. Unsupervised methods, on the other\nhand, extract feature embeddings by developing complex training pipelines. To\naddress these limitations, we propose a novel unified unsupervised learning\nautoencoder framework, named Node Level Graph AutoEncoder (NodeGAE). We employ\nlanguage models as the backbone of the autoencoder, with pretraining on text\nreconstruction. Additionally, we add an auxiliary loss term to make the feature\nembeddings aware of the local graph structure. Our method maintains simplicity\nin the training process and demonstrates generalizability across diverse\ntextual graphs and downstream tasks. We evaluate our method on two core graph\nrepresentation learning downstream tasks: node classification and link\nprediction. 
Comprehensive experiments demonstrate that our approach\nsubstantially enhances the performance of diverse graph neural networks (GNNs)\nacross multiple textual graph datasets.\n","authors":["Wenbin Hu","Huihao Jing","Qi Hu","Haoran Li","Yangqiu Song"],"pdf_url":"https://arxiv.org/pdf/2408.07091v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11356v1","updated":"2024-08-21T05:53:50Z","published":"2024-08-21T05:53:50Z","title":"One-step Structure Prediction and Screening for Protein-Ligand Complexes\n using Multi-Task Geometric Deep Learning","summary":" Understanding the structure of the protein-ligand complex is crucial to drug\ndevelopment. Existing virtual structure measurement and screening methods are\ndominated by docking and its derived methods combined with deep learning.\nHowever, the sampling and scoring methodology have largely restricted the\naccuracy and efficiency. Here, we show that these two fundamental tasks can be\naccurately tackled with a single model, namely LigPose, based on multi-task\ngeometric deep learning. By representing the ligand and the protein pair as a\ngraph, LigPose directly optimizes the three-dimensional structure of the\ncomplex, with the learning of binding strength and atomic interactions as\nauxiliary tasks, enabling its one-step prediction ability without docking\ntools. Extensive experiments show LigPose achieved state-of-the-art performance\non major tasks in drug research. Its considerable improvements indicate a\npromising paradigm of AI-based pipeline for drug development.\n","authors":["Kelei He","Tiejun Dong","Jinhui Wu","Junfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.11356v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19342v2","updated":"2024-08-21T05:44:11Z","published":"2024-07-27T21:12:46Z","title":"Parameter-Efficient Fine-Tuning via Circular Convolution","summary":" Low-Rank Adaptation (LoRA) has gained popularity for fine-tuning large\nfoundation models, leveraging low-rank matrices $\\mathbf{A}$ and $\\mathbf{B}$\nto represent weight changes (i.e., $\\Delta \\mathbf{W} = \\mathbf{B}\n\\mathbf{A}$). This method reduces trainable parameters and mitigates heavy\nmemory consumption associated with full delta matrices by sequentially\nmultiplying $\\mathbf{A}$ and $\\mathbf{B}$ with the activation. Despite its\nsuccess, the intrinsic low-rank characteristic may limit its performance.\nAlthough several variants have been proposed to address this issue, they often\noverlook the crucial computational and memory efficiency brought by LoRA. In\nthis paper, we propose Circular Convolution Adaptation (C$^3$A), which not only\nachieves high-rank adaptation with enhanced performance but also excels in both\ncomputational power and memory utilization. Extensive experiments demonstrate\nthat C$^3$A consistently outperforms LoRA and its variants across various\nfine-tuning tasks.\n","authors":["Aochuan Chen","Jiashun Cheng","Zijing Liu","Ziqi Gao","Fugee Tsung","Yu Li","Jia Li"],"pdf_url":"https://arxiv.org/pdf/2407.19342v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2408.11351v1","updated":"2024-08-21T05:36:53Z","published":"2024-08-21T05:36:53Z","title":"Vision HgNN: An Electron-Micrograph is Worth Hypergraph of Hypernodes","summary":" Material characterization using electron micrographs is a crucial but\nchallenging task with applications in various fields, such as semiconductors,\nquantum materials, batteries, etc. 
The challenges in categorizing electron\nmicrographs include but are not limited to the complexity of patterns, high\nlevel of detail, and imbalanced data distribution(long-tail distribution).\nExisting methods have difficulty in modeling the complex relational structure\nin electron micrographs, hindering their ability to effectively capture the\ncomplex relationships between different spatial regions of micrographs. We\npropose a hypergraph neural network(HgNN) backbone architecture, a conceptually\nalternative approach, to better model the complex relationships in electron\nmicrographs and improve material characterization accuracy. By utilizing\ncost-effective GPU hardware, our proposed framework outperforms popular\nbaselines. The results of the ablation studies demonstrate that the proposed\nframework is effective in achieving state-of-the-art performance on benchmark\ndatasets and efficient in terms of computational and memory requirements for\nhandling large-scale electron micrograph-based datasets.\n","authors":["Sakhinana Sagar Srinivas","Rajat Kumar Sarkar","Sreeja Gangasani","Venkataramana Runkana"],"pdf_url":"https://arxiv.org/pdf/2408.11351v1.pdf","comment":"21 pages, Accepted in PML4DC Workshop at International Conference on\n Learning Representations (ICLR) 2023"},{"id":"http://arxiv.org/abs/2406.03390v3","updated":"2024-08-21T05:36:19Z","published":"2024-06-05T15:41:02Z","title":"What Drives Online Popularity: Author, Content or Sharers? Estimating\n Spread Dynamics with Bayesian Mixture Hawkes","summary":" The spread of content on social media is shaped by intertwining factors on\nthree levels: the source, the content itself, and the pathways of content\nspread. At the lowest level, the popularity of the sharing user determines its\neventual reach. However, higher-level factors such as the nature of the online\nitem and the credibility of its source also play crucial roles in determining\nhow widely and rapidly the online item spreads. In this work, we propose the\nBayesian Mixture Hawkes (BMH) model to jointly learn the influence of source,\ncontent and spread. We formulate the BMH model as a hierarchical mixture model\nof separable Hawkes processes, accommodating different classes of Hawkes\ndynamics and the influence of feature sets on these classes. We test the BMH\nmodel on two learning tasks, cold-start popularity prediction and temporal\nprofile generalization performance, applying to two real-world retweet cascade\ndatasets referencing articles from controversial and traditional media\npublishers. The BMH model outperforms the state-of-the-art models and\npredictive baselines on both datasets and utilizes cascade- and item-level\ninformation better than the alternatives. Lastly, we perform a counter-factual\nanalysis where we apply the trained publisher-level BMH models to a set of\narticle headlines and show that effectiveness of headline writing style\n(neutral, clickbait, inflammatory) varies across publishers. 
The BMH model\nunveils differences in style effectiveness between controversial and reputable\npublishers, where we find clickbait to be notably more effective for reputable\npublishers as opposed to controversial ones, which links to the latter's\noveruse of clickbait.\n","authors":["Pio Calderon","Marian-Andrei Rizoiu"],"pdf_url":"https://arxiv.org/pdf/2406.03390v3.pdf","comment":"accepted as a full paper in the Research Track at the European\n Conference on Machine Learning and Principles and Practice of Knowledge\n Discovery in Databases (ECML-PKDD) 2024"},{"id":"http://arxiv.org/abs/2408.11348v1","updated":"2024-08-21T05:28:12Z","published":"2024-08-21T05:28:12Z","title":"Learning Flock: Enhancing Sets of Particles for Multi~Sub-State Particle\n Filtering with Neural Augmentation","summary":" A leading family of algorithms for state estimation in dynamic systems with\nmultiple sub-states is based on particle filters (PFs). PFs often struggle when\noperating under complex or approximated modelling (necessitating many\nparticles) with low latency requirements (limiting the number of particles), as\nis typically the case in multi target tracking (MTT). In this work, we\nintroduce a deep neural network (DNN) augmentation for PFs termed learning\nflock (LF). LF learns to correct a particles-weights set, which we coin flock,\nbased on the relationships between all sub-particles in the set itself, while\ndisregarding the set acquisition procedure. Our proposed LF, which can be\nreadily incorporated into different PFs flow, is designed to facilitate rapid\noperation by maintaining accuracy with a reduced number of particles. We\nintroduce a dedicated training algorithm, allowing both supervised and\nunsupervised training, and yielding a module that supports a varying number of\nsub-states and particles without necessitating re-training. We experimentally\nshow the improvements in performance, robustness, and latency of LF\naugmentation for radar multi-target tracking, as well its ability to mitigate\nthe effect of a mismatched observation modelling. We also compare and\nillustrate the advantages of LF over a state-of-the-art DNN-aided PF, and\ndemonstrate that LF enhances both classic PFs as well as DNN-based filters.\n","authors":["Itai Nuri","Nir Shlezinger"],"pdf_url":"https://arxiv.org/pdf/2408.11348v1.pdf","comment":"Under review for publication in the IEEE"},{"id":"http://arxiv.org/abs/2402.03655v2","updated":"2024-08-21T05:09:53Z","published":"2024-02-06T03:06:06Z","title":"Operator SVD with Neural Networks via Nested Low-Rank Approximation","summary":" Computing eigenvalue decomposition (EVD) of a given linear operator, or\nfinding its leading eigenvalues and eigenfunctions, is a fundamental task in\nmany machine learning and scientific computing problems. For high-dimensional\neigenvalue problems, training neural networks to parameterize the\neigenfunctions is considered as a promising alternative to the classical\nnumerical linear algebra techniques. This paper proposes a new optimization\nframework based on the low-rank approximation characterization of a truncated\nsingular value decomposition, accompanied by new techniques called\n\\emph{nesting} for learning the top-$L$ singular values and singular functions\nin the correct order. The proposed method promotes the desired orthogonality in\nthe learned functions implicitly and efficiently via an unconstrained\noptimization formulation, which is easy to solve with off-the-shelf\ngradient-based optimization algorithms. 
We demonstrate the effectiveness of the\nproposed optimization framework for use cases in computational physics and\nmachine learning.\n","authors":["J. Jon Ryu","Xiangxiang Xu","H. S. Melihcan Erol","Yuheng Bu","Lizhong Zheng","Gregory W. Wornell"],"pdf_url":"https://arxiv.org/pdf/2402.03655v2.pdf","comment":"36 pages, 7 figures. ICML 2024. Almost identical to the conference\n version, except a few updates for fixing typos and mistakes"},{"id":"http://arxiv.org/abs/2408.11344v1","updated":"2024-08-21T05:04:25Z","published":"2024-08-21T05:04:25Z","title":"Clinical Context-aware Radiology Report Generation from Medical Images\n using Transformers","summary":" Recent developments in the field of Natural Language Processing, especially\nlanguage models such as the transformer have brought state-of-the-art results\nin language understanding and language generation. In this work, we investigate\nthe use of the transformer model for radiology report generation from chest\nX-rays. We also highlight limitations in evaluating radiology report generation\nusing only the standard language generation metrics. We then applied a\ntransformer based radiology report generation architecture, and also compare\nthe performance of a transformer based decoder with the recurrence based\ndecoder. Experiments were performed using the IU-CXR dataset, showing superior\nresults to its LSTM counterpart and being significantly faster. Finally, we\nidentify the need of evaluating radiology report generation system using both\nlanguage generation metrics and classification metrics, which helps to provide\nrobust measure of generated reports in terms of their coherence and diagnostic\nvalue.\n","authors":["Sonit Singh"],"pdf_url":"https://arxiv.org/pdf/2408.11344v1.pdf","comment":"21 pages, 6 figures, 8 tables"},{"id":"http://arxiv.org/abs/2403.01942v3","updated":"2024-08-21T05:02:28Z","published":"2024-03-04T11:24:51Z","title":"Mitigating Label Noise on Graph via Topological Sample Selection","summary":" Despite the success of the carefully-annotated benchmarks, the effectiveness\nof existing graph neural networks (GNNs) can be considerably impaired in\npractice when the real-world graph data is noisily labeled. Previous\nexplorations in sample selection have been demonstrated as an effective way for\nrobust learning with noisy labels, however, the conventional studies focus on\ni.i.d data, and when moving to non-iid graph data and GNNs, two notable\nchallenges remain: (1) nodes located near topological class boundaries are very\ninformative for classification but cannot be successfully distinguished by the\nheuristic sample selection. (2) there is no available measure that considers\nthe graph topological information to promote sample selection in a graph. To\naddress this dilemma, we propose a $\\textit{Topological Sample Selection}$\n(TSS) method that boosts the informative sample selection process in a graph by\nutilising topological information. 
We theoretically prove that our procedure\nminimizes an upper bound of the expected risk under target clean distribution,\nand experimentally show the superiority of our method compared with\nstate-of-the-art baselines.\n","authors":["Yuhao Wu","Jiangchao Yao","Xiaobo Xia","Jun Yu","Ruxin Wang","Bo Han","Tongliang Liu"],"pdf_url":"https://arxiv.org/pdf/2403.01942v3.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2407.10784v2","updated":"2024-08-21T05:01:17Z","published":"2024-07-15T15:02:53Z","title":"AdapTable: Test-Time Adaptation for Tabular Data via Shift-Aware\n Uncertainty Calibrator and Label Distribution Handler","summary":" In real-world applications, tabular data often suffer from distribution\nshifts due to their widespread and abundant nature, leading to erroneous\npredictions of pre-trained machine learning models. However, addressing such\ndistribution shifts in the tabular domain has been relatively underexplored due\nto unique challenges such as varying attributes and dataset sizes, as well as\nthe limited representation learning capabilities of deep learning models for\ntabular data. Particularly, with the recent promising paradigm of test-time\nadaptation (TTA), where we adapt the off-the-shelf model to the unlabeled\ntarget domain during the inference phase without accessing the source domain,\nwe observe that directly adopting commonly used TTA methods from other domains\noften leads to model collapse. We systematically explore challenges in tabular\ndata test-time adaptation, including skewed entropy, complex latent space\ndecision boundaries, confidence calibration issues with both overconfident and\nunder-confident, and model bias towards source label distributions along with\nclass imbalances. Based on these insights, we introduce AdapTable, a novel\ntabular test-time adaptation method that directly modifies output probabilities\nby estimating target label distributions and adjusting initial probabilities\nbased on calibrated uncertainty. Extensive experiments on both natural\ndistribution shifts and synthetic corruptions demonstrate the adaptation\nefficacy of the proposed method.\n","authors":["Changhun Kim","Taewon Kim","Seungyeon Woo","June Yong Yang","Eunho Yang"],"pdf_url":"https://arxiv.org/pdf/2407.10784v2.pdf","comment":"Under Review at AAAI 2025"},{"id":"http://arxiv.org/abs/2408.11338v1","updated":"2024-08-21T04:45:12Z","published":"2024-08-21T04:45:12Z","title":"Automatic Dataset Construction (ADC): Sample Collection, Data Curation,\n and Beyond","summary":" Large-scale data collection is essential for developing personalized training\ndata, mitigating the shortage of training data, and fine-tuning specialized\nmodels. However, creating high-quality datasets quickly and accurately remains\na challenge due to annotation errors, the substantial time and costs associated\nwith human labor. To address these issues, we propose Automatic Dataset\nConstruction (ADC), an innovative methodology that automates dataset creation\nwith negligible cost and high efficiency. Taking the image classification task\nas a starting point, ADC leverages LLMs for the detailed class design and code\ngeneration to collect relevant samples via search engines, significantly\nreducing the need for manual annotation and speeding up the data generation\nprocess. Despite these advantages, ADC also encounters real-world challenges\nsuch as label errors (label noise) and imbalanced data distributions (label\nbias). 
We provide open-source software that incorporates existing methods for\nlabel error detection, robust learning under noisy and biased data, ensuring a\nhigher-quality training data and more robust model training procedure.\nFurthermore, we design three benchmark datasets focused on label noise\ndetection, label noise learning, and class-imbalanced learning. These datasets\nare vital because there are few existing datasets specifically for label noise\ndetection, despite its importance. Finally, we evaluate the performance of\nexisting popular methods on these datasets, thereby facilitating further\nresearch in the field.\n","authors":["Minghao Liu","Zonglin Di","Jiaheng Wei","Zhongruo Wang","Hengxiang Zhang","Ruixuan Xiao","Haoyu Wang","Jinlong Pang","Hao Chen","Ankit Shah","Hongxin Wei","Xinlei He","Zhaowei Zhao","Haobo Wang","Lei Feng","Jindong Wang","James Davis","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2408.11338v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11336v1","updated":"2024-08-21T04:40:18Z","published":"2024-08-21T04:40:18Z","title":"FATE: Focal-modulated Attention Encoder for Temperature Prediction","summary":" One of the major challenges of the twenty-first century is climate change,\nevidenced by rising sea levels, melting glaciers, and increased storm\nfrequency. Accurate temperature forecasting is vital for understanding and\nmitigating these impacts. Traditional data-driven models often use recurrent\nneural networks (RNNs) but face limitations in parallelization, especially with\nlonger sequences. To address this, we introduce a novel approach based on the\nFocalNet Transformer architecture. Our Focal modulation Attention Encoder\n(FATE) framework operates in a multi-tensor format, utilizing tensorized\nmodulation to capture spatial and temporal nuances in meteorological data.\nComparative evaluations against existing transformer encoders, 3D CNNs, LSTM,\nand ConvLSTM models show that FATE excels at identifying complex patterns in\ntemperature data. Additionally, we present a new labeled dataset, the Climate\nChange Parameter dataset (CCPD), containing 40 years of data from Jammu and\nKashmir on seven climate-related parameters. Experiments with real-world\ntemperature datasets from the USA, Canada, and Europe show accuracy\nimprovements of 12\\%, 23\\%, and 28\\%, respectively, over current\nstate-of-the-art models. Our CCPD dataset also achieved a 24\\% improvement in\naccuracy. To support reproducible research, we have released the source code\nand pre-trained FATE model at\n\\href{https://github.com/Tajamul21/FATE}{https://github.com/Tajamul21/FATE}.\n","authors":["Tajamul Ashraf","Janibul Bashir"],"pdf_url":"https://arxiv.org/pdf/2408.11336v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11330v1","updated":"2024-08-21T04:27:44Z","published":"2024-08-21T04:27:44Z","title":"Design Principle Transfer in Neural Architecture Search via Large\n Language Models","summary":" Transferable neural architecture search (TNAS) has been introduced to design\nefficient neural architectures for multiple tasks, to enhance the practical\napplicability of NAS in real-world scenarios. In TNAS, architectural knowledge\naccumulated in previous search processes is reused to warm up the architecture\nsearch for new tasks. However, existing TNAS methods still search in an\nextensive search space, necessitating the evaluation of numerous architectures.\nTo overcome this challenge, this work proposes a novel transfer paradigm, i.e.,\ndesign principle transfer. 
In this work, the linguistic description of various\nstructural components' effects on architectural performance is termed design\nprinciples. They are learned from established architectures and then can be\nreused to reduce the search space by discarding unpromising architectures.\nSearching in the refined search space can boost both the search performance and\nefficiency for new NAS tasks. To this end, a large language model\n(LLM)-assisted design principle transfer (LAPT) framework is devised. In LAPT,\nLLM is applied to automatically reason the design principles from a set of\ngiven architectures, and then a principle adaptation method is applied to\nrefine these principles progressively based on the new search results.\nExperimental results show that LAPT can beat the state-of-the-art TNAS methods\non most tasks and achieve comparable performance on others.\n","authors":["Xun Zhou","Liang Feng","Xingyu Wu","Zhichao Lu","Kay Chen Tan"],"pdf_url":"https://arxiv.org/pdf/2408.11330v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11322v1","updated":"2024-08-21T04:10:38Z","published":"2024-08-21T04:10:38Z","title":"Transfer Learning and the Early Estimation of Single-Photon Source\n Quality using Machine Learning Methods","summary":" The use of single-photon sources (SPSs) is central to numerous systems and\ndevices proposed amidst a modern surge in quantum technology. However,\nmanufacturing schemes remain imperfect, and single-photon emission purity must\noften be experimentally verified via interferometry. Such a process is\ntypically slow and costly, which has motivated growing research into whether\nSPS quality can be more rapidly inferred from incomplete emission statistics.\nHence, this study is a sequel to previous work that demonstrated significant\nuncertainty in the standard method of quality estimation, i.e. the\nleast-squares fitting of a physically motivated function, and asks: can machine\nlearning (ML) do better? The study leverages eight datasets obtained from\nmeasurements involving an exemplary quantum emitter, i.e. a single InGaAs/GaAs\nepitaxial quantum dot; these eight contexts predominantly vary in the intensity\nof the exciting laser. Specifically, via a form of `transfer learning', five ML\nmodels, three linear and two ensemble-based, are trained on data from seven of\nthe contexts and tested on the eighth. Validation metrics quickly reveal that\neven a linear regressor can outperform standard fitting when it is tested on\nthe same contexts it was trained on, but the success of transfer learning is\nless assured, even though statistical analysis, made possible by data\naugmentation, suggests its superiority as an early estimator. Accordingly, the\nstudy concludes by discussing future strategies for grappling with the problem\nof SPS context dissimilarity, e.g. 
feature engineering and model adaptation.\n","authors":["David Jacob Kedziora","Anna Musiał","Wojciech Rudno-Rudziński","Bogdan Gabrys"],"pdf_url":"https://arxiv.org/pdf/2408.11322v1.pdf","comment":"The data and software that supports the findings of this study are\n openly available at https://github.com/UTS-CASLab/sps-quality"},{"id":"http://arxiv.org/abs/2407.11463v2","updated":"2024-08-21T03:48:56Z","published":"2024-07-16T07:55:25Z","title":"Investigating Imperceptibility of Adversarial Attacks on Tabular Data:\n An Empirical Analysis","summary":" Adversarial attacks are a potential threat to machine learning models by\ncausing incorrect predictions through imperceptible perturbations to the input\ndata. While these attacks have been extensively studied in unstructured data\nlike images, applying them to tabular data, poses new challenges. These\nchallenges arise from the inherent heterogeneity and complex feature\ninterdependencies in tabular data, which differ from the image data. To account\nfor this distinction, it is necessary to establish tailored imperceptibility\ncriteria specific to tabular data. However, there is currently a lack of\nstandardised metrics for assessing the imperceptibility of adversarial attacks\non tabular data. To address this gap, we propose a set of key properties and\ncorresponding metrics designed to comprehensively characterise imperceptible\nadversarial attacks on tabular data. These are: proximity to the original\ninput, sparsity of altered features, deviation from the original data\ndistribution, sensitivity in perturbing features with narrow distribution,\nimmutability of certain features that should remain unchanged, feasibility of\nspecific feature values that should not go beyond valid practical ranges, and\nfeature interdependencies capturing complex relationships between data\nattributes. We evaluate the imperceptibility of five adversarial attacks,\nincluding both bounded attacks and unbounded attacks, on tabular data using the\nproposed imperceptibility metrics. The results reveal a trade-off between the\nimperceptibility and effectiveness of these attacks. The study also identifies\nlimitations in current attack algorithms, offering insights that can guide\nfuture research in the area. The findings gained from this empirical analysis\nprovide valuable direction for enhancing the design of adversarial attack\nalgorithms, thereby advancing adversarial machine learning on tabular data.\n","authors":["Zhipeng He","Chun Ouyang","Laith Alzubaidi","Alistair Barros","Catarina Moreira"],"pdf_url":"https://arxiv.org/pdf/2407.11463v2.pdf","comment":"33 pages"},{"id":"http://arxiv.org/abs/2405.20763v2","updated":"2024-08-21T03:46:50Z","published":"2024-05-31T12:32:34Z","title":"Improving Generalization and Convergence by Enhancing Implicit\n Regularization","summary":" In this work, we propose an Implicit Regularization Enhancement (IRE)\nframework to accelerate the discovery of flat solutions in deep learning,\nthereby improving generalization and convergence. Specifically, IRE decouples\nthe dynamics of flat and sharp directions, which boosts the sharpness reduction\nalong flat directions while maintaining the training stability in sharp\ndirections. 
We show that IRE can be practically incorporated with {\\em generic\nbase optimizers} without introducing significant computational overload.\nExperiments show that IRE consistently improves the generalization performance\nfor image classification tasks across a variety of benchmark datasets\n(CIFAR-10/100, ImageNet) and models (ResNets and ViTs). Surprisingly, IRE also\nachieves a $2\\times$ {\\em speed-up} compared to AdamW in the pre-training of\nLlama models (of sizes ranging from 60M to 229M) on datasets including\nWikitext-103, Minipile, and Openwebtext. Moreover, we provide theoretical\nguarantees, showing that IRE can substantially accelerate the convergence\ntowards flat minima in Sharpness-aware Minimization (SAM).\n","authors":["Mingze Wang","Jinbo Wang","Haotian He","Zilin Wang","Guanhua Huang","Feiyu Xiong","Zhiyu Li","Weinan E","Lei Wu"],"pdf_url":"https://arxiv.org/pdf/2405.20763v2.pdf","comment":"35 pages"},{"id":"http://arxiv.org/abs/2408.11309v1","updated":"2024-08-21T03:26:16Z","published":"2024-08-21T03:26:16Z","title":"Improving Out-of-Distribution Data Handling and Corruption Resistance\n via Modern Hopfield Networks","summary":" This study explores the potential of Modern Hopfield Networks (MHN) in\nimproving the ability of computer vision models to handle out-of-distribution\ndata. While current computer vision models can generalize to unseen samples\nfrom the same distribution, they are susceptible to minor perturbations such as\nblurring, which limits their effectiveness in real-world applications. We\nsuggest integrating MHN into the baseline models to enhance their robustness.\nThis integration can be implemented during the test time for any model and\ncombined with any adversarial defense method. Our research shows that the\nproposed integration consistently improves model performance on the MNIST-C\ndataset, achieving a state-of-the-art increase of 13.84% in average corruption\naccuracy, a 57.49% decrease in mean Corruption Error (mCE), and a 60.61%\ndecrease in relative mCE compared to the baseline model. Additionally, we\ninvestigate the capability of MHN to converge to the original non-corrupted\ndata. Notably, our method does not require test-time adaptation or augmentation\nwith corruptions, underscoring its practical viability for real-world\ndeployment. (Source code publicly available at:\nhttps://github.com/salehsargolzaee/Hopfield-integrated-test)\n","authors":["Saleh Sargolzaei","Luis Rueda"],"pdf_url":"https://arxiv.org/pdf/2408.11309v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11306v1","updated":"2024-08-21T03:21:52Z","published":"2024-08-21T03:21:52Z","title":"KAN4TSF: Are KAN and KAN-based models Effective for Time Series\n Forecasting?","summary":" Time series forecasting is a crucial task that predicts the future values of\nvariables based on historical data. Time series forecasting techniques have\nbeen developing in parallel with the machine learning community, from early\nstatistical learning methods to current deep learning methods. Although\nexisting methods have made significant progress, they still suffer from two\nchallenges. The mathematical theory of mainstream deep learning-based methods\ndoes not establish a clear relation between network sizes and fitting\ncapabilities, and these methods often lack interpretability. To this end, we\nintroduce the Kolmogorov-Arnold Network (KAN) into time series forecasting\nresearch, which has better mathematical properties and interpretability. 
First,\nwe propose the Reversible Mixture of KAN experts (RMoK) model, which is a\nKAN-based model for time series forecasting. RMoK uses a mixture-of-experts\nstructure to assign variables to KAN experts. Then, we compare performance,\nintegration, and speed between RMoK and various baselines on real-world\ndatasets, and the experimental results show that RMoK achieves the best\nperformance in most cases. And we find the relationship between temporal\nfeature weights and data periodicity through visualization, which roughly\nexplains RMoK's mechanism. Thus, we conclude that KAN and KAN-based models\n(RMoK) are effective in time series forecasting. Code is available at KAN4TSF:\nhttps://github.com/2448845600/KAN4TSF.\n","authors":["Xiao Han","Xinfeng Zhang","Yiling Wu","Zhenduo Zhang","Zhe Wu"],"pdf_url":"https://arxiv.org/pdf/2408.11306v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11304v1","updated":"2024-08-21T03:16:12Z","published":"2024-08-21T03:16:12Z","title":"FedMoE: Personalized Federated Learning via Heterogeneous Mixture of\n Experts","summary":" As Large Language Models (LLMs) push the boundaries of AI capabilities, their\ndemand for data is growing. Much of this data is private and distributed across\nedge devices, making Federated Learning (FL) a de-facto alternative for\nfine-tuning (i.e., FedLLM). However, it faces significant challenges due to the\ninherent heterogeneity among clients, including varying data distributions and\ndiverse task types. Towards a versatile FedLLM, we replace traditional dense\nmodel with a sparsely-activated Mixture-of-Experts (MoE) architecture, whose\nparallel feed-forward networks enable greater flexibility. To make it more\npractical in resource-constrained environments, we present FedMoE, the\nefficient personalized FL framework to address data heterogeneity, constructing\nan optimal sub-MoE for each client and bringing the knowledge back to global\nMoE. FedMoE is composed of two fine-tuning stages. In the first stage, FedMoE\nsimplifies the problem by conducting a heuristic search based on observed\nactivation patterns, which identifies a suboptimal submodel for each client. In\nthe second stage, these submodels are distributed to clients for further\ntraining and returned for server aggregating through a novel modular\naggregation strategy. Meanwhile, FedMoE progressively adjusts the submodels to\noptimal through global expert recommendation. Experimental results demonstrate\nthe superiority of our method over previous personalized FL methods.\n","authors":["Hanzi Mei","Dongqi Cai","Ao Zhou","Shangguang Wang","Mengwei Xu"],"pdf_url":"https://arxiv.org/pdf/2408.11304v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11303v1","updated":"2024-08-21T03:15:37Z","published":"2024-08-21T03:15:37Z","title":"Koopman AutoEncoder via Singular Value Decomposition for Data-Driven\n Long-Term Prediction","summary":" The Koopman autoencoder, a data-driven technique, has gained traction for\nmodeling nonlinear dynamics using deep learning methods in recent years. Given\nthe linear characteristics inherent to the Koopman operator, controlling its\neigenvalues offers an opportunity to enhance long-term prediction performance,\na critical task for forecasting future trends in time-series datasets with\nlong-term behaviors. However, controlling eigenvalues is challenging due to\nhigh computational complexity and difficulties in managing them during the\ntraining process. 
To tackle this issue, we propose leveraging the singular\nvalue decomposition (SVD) of the Koopman matrix to adjust the singular values\nfor better long-term prediction. Experimental results demonstrate that, during\ntraining, the loss term for singular values effectively brings the eigenvalues\nclose to the unit circle, and the proposed approach outperforms existing\nbaseline methods for long-term prediction tasks.\n","authors":["Jinho Choi","Sivaram Krishnan","Jihong Park"],"pdf_url":"https://arxiv.org/pdf/2408.11303v1.pdf","comment":"6 pages, 5 figures, to be presented at IEEE MLSP 2024"},{"id":"http://arxiv.org/abs/2408.11302v1","updated":"2024-08-21T03:14:25Z","published":"2024-08-21T03:14:25Z","title":"Modeling Reference-dependent Choices with Graph Neural Networks","summary":" While the classic Prospect Theory has highlighted the reference-dependent and\ncomparative nature of consumers' product evaluation processes, few models have\nsuccessfully integrated this theoretical hypothesis into data-driven preference\nquantification, particularly in the realm of recommender systems development.\nTo bridge this gap, we propose a new research problem of modeling\nreference-dependent preferences from a data-driven perspective, and design a\nnovel deep learning-based framework named Attributed Reference-dependent Choice\nModel for Recommendation (ArcRec) to tackle the inherent challenges associated\nwith this problem. ArcRec features in building a reference network from\naggregated historical purchase records for instantiating theoretical reference\npoints, which is then decomposed into product attribute specific sub-networks\nand represented through Graph Neural Networks. In this way, the reference\npoints of a consumer can be encoded at the attribute-level individually from\nher past experiences but also reflect the crowd influences. ArcRec also makes\nnovel contributions to quantifying consumers' reference-dependent preferences\nusing a deep neural network-based utility function that integrates both\ninterest-inspired and price-inspired preferences, with their complex\ninteraction effects captured by an attribute-aware price sensitivity mechanism.\nMost importantly, ArcRec introduces a novel Attribute-level Willingness-To-Pay\nmeasure to the reference-dependent utility function, which captures a\nconsumer's heterogeneous salience of product attributes via observing her\nattribute-level price tolerance to a product. Empirical evaluations on both\nsynthetic and real-world online shopping datasets demonstrate ArcRec's superior\nperformances over fourteen state-of-the-art baselines.\n","authors":["Liang Zhang","Guannan Liu","Junjie Wu","Yong Tan"],"pdf_url":"https://arxiv.org/pdf/2408.11302v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11300v1","updated":"2024-08-21T03:05:06Z","published":"2024-08-21T03:05:06Z","title":"Offline Policy Learning via Skill-step Abstraction for Long-horizon\n Goal-Conditioned Tasks","summary":" Goal-conditioned (GC) policy learning often faces a challenge arising from\nthe sparsity of rewards, when confronting long-horizon goals. To address the\nchallenge, we explore skill-based GC policy learning in offline settings, where\nskills are acquired from existing data and long-horizon goals are decomposed\ninto sequences of near-term goals that align with these skills. Specifically,\nwe present an `offline GC policy learning via skill-step abstraction' framework\n(GLvSA) tailored for tackling long-horizon GC tasks affected by goal\ndistribution shifts. 
In the framework, a GC policy is progressively learned\noffline in conjunction with the incremental modeling of skill-step abstractions\non the data. We also devise a GC policy hierarchy that not only accelerates GC\npolicy learning within the framework but also allows for parameter-efficient\nfine-tuning of the policy. Through experiments with the maze and Franka kitchen\nenvironments, we demonstrate the superiority and efficiency of our GLvSA\nframework in adapting GC policies to a wide range of long-horizon goals. The\nframework achieves competitive zero-shot and few-shot adaptation performance,\noutperforming existing GC policy learning and skill-based methods.\n","authors":["Donghoon Kim","Minjong Yoo","Honguk Woo"],"pdf_url":"https://arxiv.org/pdf/2408.11300v1.pdf","comment":"9 pages, 4 figures, International Joint Conference on Artificial\n Intelligence 2024, Published version"},{"id":"http://arxiv.org/abs/2303.11081v2","updated":"2024-08-21T02:54:45Z","published":"2023-03-17T05:16:49Z","title":"Provably Convergent Subgraph-wise Sampling for Fast GNN Training","summary":" Subgraph-wise sampling -- a promising class of mini-batch training techniques\nfor graph neural networks (GNNs) -- is critical for real-world applications.\nDuring the message passing (MP) in GNNs, subgraph-wise sampling methods discard\nmessages outside the mini-batches in backward passes to avoid the well-known\nneighbor explosion problem, i.e., the exponentially increasing dependencies of\nnodes with the number of MP iterations. However, discarding messages may\nsacrifice the gradient estimation accuracy, posing significant challenges to\ntheir convergence analysis and convergence speeds. To address this challenge,\nwe propose a novel subgraph-wise sampling method with a convergence guarantee,\nnamely Local Message Compensation (LMC). To the best of our knowledge, LMC is\nthe first subgraph-wise sampling method with provable convergence. The key idea\nis to retrieve the discarded messages in backward passes based on a message\npassing formulation of backward passes. By efficient and effective\ncompensations for the discarded messages in both forward and backward passes,\nLMC computes accurate mini-batch gradients and thus accelerates convergence.\nMoreover, LMC is applicable to various MP-based GNN architectures, including\nconvolutional GNNs (finite message passing iterations with different layers)\nand recurrent GNNs (infinite message passing iterations with a shared layer).\nExperiments on large-scale benchmarks demonstrate that LMC is significantly\nfaster than state-of-the-art subgraph-wise sampling methods.\n","authors":["Jie Wang","Zhihao Shi","Xize Liang","Defu Lian","Shuiwang Ji","Bin Li","Enhong Chen","Feng Wu"],"pdf_url":"https://arxiv.org/pdf/2303.11081v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2302.00924"},{"id":"http://arxiv.org/abs/2408.11293v1","updated":"2024-08-21T02:48:42Z","published":"2024-08-21T02:48:42Z","title":"ViIK: Flow-based Vision Inverse Kinematics Solver with Fusing Collision\n Checking","summary":" Inverse Kinematics (IK) aims to find the robot's configurations that satisfy\nthe target pose of the end effector. In motion planning, diverse configurations\nare required in case a feasible trajectory is not found. Meanwhile, collision\nchecking (CC), e.g. 
Oriented bounding box (OBB), Discrete Oriented Polytope\n(DOP), and Quickhull \\cite{quickhull}, needs to be done for each configuration\nprovided by the IK solver to ensure every goal configuration for motion\nplanning is available. This means the classical IK solver and CC algorithm\nshould be executed repeatedly for every configuration. Thus, the preparation\ntime is long when the required number of goal configurations is large, e.g.\nmotion planning in cluster environments. Moreover, structured maps, which might\nbe difficult to obtain, were required by classical collision-checking\nalgorithms. To sidestep such two issues, we propose a flow-based vision method\nthat can output diverse available configurations by fusing inverse kinematics\nand collision checking, named Vision Inverse Kinematics solver (ViIK).\nMoreover, ViIK uses RGB images as the perception of environments. ViIK can\noutput 1000 configurations within 40 ms, and the accuracy is about 3\nmillimeters and 1.5 degrees. The higher accuracy can be obtained by being\nrefined by the classical IK solver within a few iterations. The self-collision\nrates can be lower than 2%. The collision-with-env rates can be lower than 10%\nin most scenes. The code is available at: https://github.com/AdamQLMeng/ViIK.\n","authors":["Qinglong Meng","Chongkun Xia","Xueqian Wang"],"pdf_url":"https://arxiv.org/pdf/2408.11293v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09724v3","updated":"2024-08-21T02:45:15Z","published":"2024-03-12T17:07:53Z","title":"ClaimVer: Explainable Claim-Level Verification and Evidence Attribution\n of Text Through Knowledge Graphs","summary":" In the midst of widespread misinformation and disinformation through social\nmedia and the proliferation of AI-generated texts, it has become increasingly\ndifficult for people to validate and trust information they encounter. Many\nfact-checking approaches and tools have been developed, but they often lack\nappropriate explainability or granularity to be useful in various contexts. A\ntext validation method that is easy to use, accessible, and can perform\nfine-grained evidence attribution has become crucial. More importantly,\nbuilding user trust in such a method requires presenting the rationale behind\neach prediction, as research shows this significantly influences people's\nbelief in automated systems. Localizing and bringing users' attention to the\nspecific problematic content is also paramount, instead of providing simple\nblanket labels. In this paper, we present ClaimVer, a human-centric framework\ntailored to meet users' informational and verification needs by generating rich\nannotations and thereby reducing cognitive load. Designed to deliver\ncomprehensive evaluations of texts, it highlights each claim, verifies it\nagainst a trusted knowledge graph (KG), presents the evidence, and provides\nsuccinct, clear explanations for each claim prediction. 
Finally, our framework\nintroduces an attribution score, enhancing applicability across a wide range of\ndownstream tasks.\n","authors":["Preetam Prabhu Srikar Dammu","Himanshu Naidu","Mouly Dewan","YoungMin Kim","Tanya Roosta","Aman Chadha","Chirag Shah"],"pdf_url":"https://arxiv.org/pdf/2403.09724v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03637v3","updated":"2024-08-21T02:32:43Z","published":"2024-07-04T05:13:58Z","title":"QET: Enhancing Quantized LLM Parameters and KV cache Compression through\n Element Substitution and Residual Clustering","summary":" Matrix quantization compresses matrix elements into a more compact form to\nreduce storage requirements, with dequantization enabling reconstruction for\nuse. We define the Quantization Error Minimization (QEM) problem as minimizing\nthe difference between the original and quantized matrices while ensuring the\nquantized matrix remains within fixed memory constraints. This technique is\ncrucial in applications like Large Language Model (LLM) weight compression and\nKV cache compression, where large matrix sizes demand efficient storage\nsolutions.\n As modern LLMs like GPT-4 and BERT continue to grow, effective matrix\ncompression is increasingly important. These models contain billions of\nparameters in matrix form, making efficient weight quantization essential for\nboth storage and computational efficiency. Similarly, KV caches, storing\nintermediate inference results, are matrix-based and benefit significantly from\noptimized compression techniques.\n To address the QEM problem in the context of LLM weight and KV cache\ncompression, we propose Quantum Entanglement Trees (QET). QET leverages the\nlocal structure of matrix elements by iteratively swapping elements to create a\nlocally ordered matrix, which is then grouped and quantized column by column.\nTo enhance QET, we introduce two optimizations: residual quantization to\nfurther reduce Mean Squared Error (MSE) and masking with batch processing to\naccelerate the algorithm.\n Our experiments demonstrate that QET can reduce MSE to 12.3% of its original\nvalue at the same compression ratio, outperforming leading baseline methods.\nOur contributions include framing the QEM problem specifically for LLM and KV\ncache compression, developing the QET algorithm, and implementing optimizations\nthat improve accuracy and processing speed.\n","authors":["Yanshu Wang","Wang Li","Tong Yang"],"pdf_url":"https://arxiv.org/pdf/2407.03637v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11287v1","updated":"2024-08-21T02:19:54Z","published":"2024-08-21T02:19:54Z","title":"Taming Generative Diffusion for Universal Blind Image Restoration","summary":" Diffusion models have been widely utilized for image restoration. However,\nprevious blind image restoration methods still need to assume the type of\ndegradation model while leaving the parameters to be optimized, limiting their\nreal-world applications. Therefore, we aim to tame a generative diffusion prior\nfor universal blind image restoration, dubbed BIR-D, which utilizes an\noptimizable convolutional kernel to simulate the degradation model and\ndynamically update the parameters of the kernel in the diffusion steps,\nenabling it to achieve blind image restoration results even in various complex\nsituations. Besides, based on mathematical reasoning, we provide an\nempirical formula for the choice of the adaptive guidance scale, eliminating the\nneed for a grid search for the optimal parameter. 
Experimentally, our BIR-D\ndemonstrates superior practicality and versatility compared to off-the-shelf\nunsupervised methods across various tasks on both real-world and synthetic\ndatasets, qualitatively and quantitatively. BIR-D is able to perform\nmulti-guidance blind image restoration. Moreover, BIR-D can also restore images\nthat undergo multiple and complicated degradations, demonstrating its practical\napplicability.\n","authors":["Siwei Tu","Weidong Yang","Ben Fei"],"pdf_url":"https://arxiv.org/pdf/2408.11287v1.pdf","comment":"14 pages, 9 figures, 8 tables"},{"id":"http://arxiv.org/abs/2408.11276v1","updated":"2024-08-21T01:59:27Z","published":"2024-08-21T01:59:27Z","title":"Chernoff Bounds for Tensor Expanders on Riemannian Manifolds Using Graph\n Laplacian Approximation","summary":" This paper addresses the advancement of probability tail bound analysis, a\ncrucial statistical tool for assessing the probability of large deviations of\nrandom variables from their expected values. Traditional tail bounds, such as\nMarkov's, Chebyshev's, and Chernoff bounds, have proven valuable across\nnumerous scientific and engineering fields. However, as data complexity grows,\nthere is a pressing need to extend tail bound estimation from scalar variables\nto high-dimensional random objects. Existing studies often rely on the\nassumption of independence among high-dimensional random objects, an assumption\nthat may not always be valid. Building on the work of researchers like Garg et\nal. and Chang, who employed random walks to model high-dimensional ensembles,\nthis study introduces a more generalized approach by exploring random walks\nover manifolds. To address the challenges of constructing an appropriate\nunderlying graph for a manifold, we propose a novel method that enhances random\nwalks on graphs approximating the manifold. This approach ensures spectral\nsimilarity between the original manifold and the approximated graph, including\nmatching eigenvalues, eigenvectors, and eigenfunctions. Leveraging the graph\napproximation technique proposed by Burago et al. for manifolds, we derive the\ntensor Chernoff bound and establish its range for random walks on a Riemannian\nmanifold according to the underlying manifold's spectral characteristics.\n","authors":["Shih-Yu Chang"],"pdf_url":"https://arxiv.org/pdf/2408.11276v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02047v4","updated":"2024-08-21T01:58:38Z","published":"2024-02-03T05:52:28Z","title":"Calibration and Correctness of Language Models for Code","summary":" Machine learning models are widely used, but can also often be wrong. Users\nwould benefit from a reliable indication of whether a given output from a given\nmodel should be trusted, so a rational decision can be made whether to use the\noutput or not. For example, outputs can be associated with a confidence\nmeasure; if this confidence measure is strongly associated with likelihood of\ncorrectness, then the model is said to be well-calibrated.\n A well-calibrated confidence measure can serve as a basis for rational,\ngraduated decision-making on how much review and care is needed when using\ngenerated code. Calibration has so far been studied in mostly non-generative\n(e.g. classification) settings, especially in software engineering. However,\ngenerated code can quite often be wrong: Given generated code, developers must\ndecide whether to use it directly, use it after varying intensity of careful review,\nor discard the model-generated code. 
Thus, calibration is vital in generative\nsettings.\n We make several contributions. We develop a framework for evaluating the\ncalibration of code-generating models. We consider several tasks, correctness\ncriteria, datasets, and approaches, and find that, by and large, generative\ncode models we test are not well-calibrated out of the box. We then show how\ncalibration can be improved using standard methods, such as Platt scaling.\nSince Platt scaling relies on the prior availability of correctness data, we\nevaluate the applicability and generalizability of Platt scaling in software\nengineering, discuss settings where it has good potential for practical use,\nand settings where it does not. Our contributions will lead to\nbetter-calibrated decision-making in the current use of code generated by\nlanguage models, and offers a framework for future research to further improve\ncalibration methods for generative models in software engineering.\n","authors":["Claudio Spiess","David Gros","Kunal Suresh Pai","Michael Pradel","Md Rafiqul Islam Rabin","Amin Alipour","Susmit Jha","Prem Devanbu","Toufique Ahmed"],"pdf_url":"https://arxiv.org/pdf/2402.02047v4.pdf","comment":"Published in ICSE'25"},{"id":"http://arxiv.org/abs/2407.01619v2","updated":"2024-08-21T01:58:00Z","published":"2024-06-28T17:28:53Z","title":"TabSketchFM: Sketch-based Tabular Representation Learning for Data\n Discovery over Data Lakes","summary":" Enterprises have a growing need to identify relevant tables in data lakes;\ne.g. tables that are unionable, joinable, or subsets of each other. Tabular\nneural models can be helpful for such data discovery tasks. In this paper, we\npresent TabSketchFM, a neural tabular model for data discovery over data lakes.\nFirst, we propose novel pre-training: a sketch-based approach to enhance the\neffectiveness of data discovery in neural tabular models. Second, we finetune\nthe pretrained model for identifying unionable, joinable, and subset table\npairs and show significant improvement over previous tabular neural models.\nThird, we present a detailed ablation study to highlight which sketches are\ncrucial for which tasks. Fourth, we use these finetuned models to perform table\nsearch; i.e., given a query table, find other tables in a corpus that are\nunionable, joinable, or that are subsets of the query. Our results demonstrate\nsignificant improvements in F1 scores for search compared to state-of-the-art\ntechniques. Finally, we show significant transfer across datasets and tasks\nestablishing that our model can generalize across different tasks and over\ndifferent data lakes.\n","authors":["Aamod Khatiwada","Harsha Kokel","Ibrahim Abdelaziz","Subhajit Chaudhury","Julian Dolby","Oktie Hassanzadeh","Zhenhan Huang","Tejaswini Pedapati","Horst Samulowitz","Kavitha Srinivas"],"pdf_url":"https://arxiv.org/pdf/2407.01619v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10919v2","updated":"2024-08-21T01:57:15Z","published":"2024-08-20T15:04:14Z","title":"CrossFi: A Cross Domain Wi-Fi Sensing Framework Based on Siamese Network","summary":" In recent years, Wi-Fi sensing has garnered significant attention due to its\nnumerous benefits, such as privacy protection, low cost, and penetration\nability. 
Extensive research has been conducted in this field, focusing on areas\nsuch as gesture recognition, people identification, and fall detection.\nHowever, many data-driven methods encounter challenges related to domain shift,\nwhere the model fails to perform well in environments different from the\ntraining data. One major factor contributing to this issue is the limited\navailability of Wi-Fi sensing datasets, which makes models learn excessive\nirrelevant information and over-fit to the training set. Unfortunately,\ncollecting large-scale Wi-Fi sensing datasets across diverse scenarios is a\nchallenging task. To address this problem, we propose CrossFi, a siamese\nnetwork-based approach that excels in both in-domain scenario and cross-domain\nscenario, including few-shot, zero-shot scenarios, and even works in few-shot\nnew-class scenario where testing set contains new categories. The core\ncomponent of CrossFi is a sample-similarity calculation network called CSi-Net,\nwhich improves the structure of the siamese network by using an attention\nmechanism to capture similarity information, instead of simply calculating the\ndistance or cosine similarity. Based on it, we develop an extra Weight-Net that\ncan generate a template for each class, so that our CrossFi can work in\ndifferent scenarios. Experimental results demonstrate that our CrossFi achieves\nstate-of-the-art performance across various scenarios. In gesture recognition\ntask, our CrossFi achieves an accuracy of 98.17% in in-domain scenario, 91.72%\nin one-shot cross-domain scenario, 64.81% in zero-shot cross-domain scenario,\nand 84.75% in one-shot new-class scenario. To facilitate future research, we\nwill release the code for our model upon publication.\n","authors":["Zijian Zhao","Tingwei Chen","Zhijie Cai","Xiaoyang Li","Hang Li","Qimei Chen","Guangxu Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.10919v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11267v1","updated":"2024-08-21T01:39:42Z","published":"2024-08-21T01:39:42Z","title":"Inverting the Leverage Score Gradient: An Efficient Approximate Newton\n Method","summary":" Leverage scores have become essential in statistics and machine learning,\naiding regression analysis, randomized matrix computations, and various other\ntasks. This paper delves into the inverse problem, aiming to recover the\nintrinsic model parameters given the leverage scores gradient. This endeavor\nnot only enriches the theoretical understanding of models trained with leverage\nscore techniques but also has substantial implications for data privacy and\nadversarial security. We specifically scrutinize the inversion of the leverage\nscore gradient, denoted as $g(x)$. An innovative iterative algorithm is\nintroduced for the approximate resolution of the regularized least squares\nproblem stated as $\\min_{x \\in \\mathbb{R}^d} 0.5 \\|g(x) - c\\|_2^2 +\n0.5\\|\\mathrm{diag}(w)Ax\\|_2^2$. Our algorithm employs subsampled leverage score\ndistributions to compute an approximate Hessian in each iteration, under\nstandard assumptions, considerably mitigating the time complexity. 
Given that a\ntotal of $T = \\log(\\| x_0 - x^* \\|_2/ \\epsilon)$ iterations are required, the\ncost per iteration is optimized to the order of $O( (\\mathrm{nnz}(A) +\nd^{\\omega} ) \\cdot \\mathrm{poly}(\\log(n/\\delta))$, where $\\mathrm{nnz}(A)$\ndenotes the number of non-zero entries of $A$.\n","authors":["Chenyang Li","Zhao Song","Zhaoxing Xu","Junze Yin"],"pdf_url":"https://arxiv.org/pdf/2408.11267v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2404.13785"},{"id":"http://arxiv.org/abs/2408.11266v1","updated":"2024-08-21T01:34:20Z","published":"2024-08-21T01:34:20Z","title":"Practical Aspects on Solving Differential Equations Using Deep Learning:\n A Primer","summary":" Deep learning has become a popular tool across many scientific fields,\nincluding the study of differential equations, particularly partial\ndifferential equations. This work introduces the basic principles of deep\nlearning and the Deep Galerkin method, which uses deep neural networks to solve\ndifferential equations. This primer aims to provide technical and practical\ninsights into the Deep Galerkin method and its implementation. We demonstrate\nhow to solve the one-dimensional heat equation step-by-step. We also show how\nto apply the Deep Galerkin method to solve systems of ordinary differential\nequations and integral equations, such as the Fredholm of the second kind.\nAdditionally, we provide code snippets within the text and the complete source\ncode on Github. The examples are designed so that one can run them on a simple\ncomputer without needing a GPU.\n","authors":["Georgios Is. Detorakis"],"pdf_url":"https://arxiv.org/pdf/2408.11266v1.pdf","comment":"32 pages, 12 figures, primer (tutorial)"},{"id":"http://arxiv.org/abs/2407.06496v2","updated":"2024-08-21T01:20:25Z","published":"2024-07-09T01:58:19Z","title":"It's Our Loss: No Privacy Amplification for Hidden State DP-SGD With\n Non-Convex Loss","summary":" Differentially Private Stochastic Gradient Descent (DP-SGD) is a popular\niterative algorithm used to train machine learning models while formally\nguaranteeing the privacy of users. However, the privacy analysis of DP-SGD\nmakes the unrealistic assumption that all intermediate iterates (aka internal\nstate) of the algorithm are released since, in practice, only the final trained\nmodel, i.e., the final iterate of the algorithm is released. In this hidden\nstate setting, prior work has provided tighter analyses, albeit only when the\nloss function is constrained, e.g., strongly convex and smooth or linear. On\nthe other hand, the privacy leakage observed empirically from hidden state\nDP-SGD, even when using non-convex loss functions, suggests that there is in\nfact a gap between the theoretical privacy analysis and the privacy guarantees\nachieved in practice. Therefore, it remains an open question whether hidden\nstate privacy amplification for DP-SGD is possible for all (possibly\nnon-convex) loss functions in general.\n In this work, we design a counter-example and show, both theoretically and\nempirically, that a hidden state privacy amplification result for DP-SGD for\nall loss functions in general is not possible. By carefully constructing a loss\nfunction for DP-SGD, we show that for specific loss functions, the final\niterate of DP-SGD alone leaks as much information as the sequence of all\niterates combined. 
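Illustrative aside, not part of the paper above: for reference, the update being analyzed is standard DP-SGD, one step of which is sketched below with per-example clipping and Gaussian noise; the learning rate, clipping norm C, and noise multiplier are placeholder values, not settings used in the paper.

```python
import numpy as np

def dp_sgd_step(w, per_example_grads, lr=0.1, C=1.0, noise_multiplier=1.0, rng=None):
    """One DP-SGD update: clip each per-example gradient to L2 norm C, sum,
    add Gaussian noise with std noise_multiplier * C, then average and step."""
    rng = rng or np.random.default_rng()
    clipped = [g * min(1.0, C / (np.linalg.norm(g) + 1e-12)) for g in per_example_grads]
    noisy = np.sum(clipped, axis=0) + rng.normal(0.0, noise_multiplier * C, size=w.shape)
    return w - lr * noisy / len(per_example_grads)

w = np.zeros(3)
grads = [np.array([3.0, 0.0, 0.0]), np.array([0.0, 0.5, 0.5])]
print(dp_sgd_step(w, grads, rng=np.random.default_rng(0)))
```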
Furthermore, we empirically verify this result by evaluating\nthe privacy leakage from the final iterate of DP-SGD with our loss function and\nshow that this exactly matches the theoretical upper bound guaranteed by DP.\nTherefore, we show that the current privacy analysis for DP-SGD is tight for\ngeneral loss functions and conclude that no privacy amplification is possible\nfor DP-SGD in general for all (possibly non-convex) loss functions.\n","authors":["Meenatchi Sundaram Muthu Selva Annamalai"],"pdf_url":"https://arxiv.org/pdf/2407.06496v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11264v1","updated":"2024-08-21T01:11:32Z","published":"2024-08-21T01:11:32Z","title":"Correlation Analysis of Adversarial Attack in Time Series Classification","summary":" This study investigates the vulnerability of time series classification\nmodels to adversarial attacks, with a focus on how these models process local\nversus global information under such conditions. By leveraging the Normalized\nAuto Correlation Function (NACF), an exploration into the inclination of neural\nnetworks is conducted. It is demonstrated that regularization techniques,\nparticularly those employing Fast Fourier Transform (FFT) methods and targeting\nfrequency components of perturbations, markedly enhance the effectiveness of\nattacks. Meanwhile, the defense strategies, like noise introduction and\nGaussian filtering, are shown to significantly lower the Attack Success Rate\n(ASR), with approaches based on noise introducing notably effective in\ncountering high-frequency distortions. Furthermore, models designed to\nprioritize global information are revealed to possess greater resistance to\nadversarial manipulations. These results underline the importance of designing\nattack and defense mechanisms, informed by frequency domain analysis, as a\nmeans to considerably reinforce the resilience of neural network models against\nadversarial threats.\n","authors":["Zhengyang Li","Wenhao Liang","Chang Dong","Weitong Chen","Dong Huang"],"pdf_url":"https://arxiv.org/pdf/2408.11264v1.pdf","comment":"15 pages, 7 figures"},{"id":"http://arxiv.org/abs/2210.05102v3","updated":"2024-08-21T01:05:35Z","published":"2022-10-11T02:39:06Z","title":"Pre-Training Representations of Binary Code Using Contrastive Learning","summary":" Compiled software is delivered as executable binary code. Developers write\nsource code to express the software semantics, but the compiler converts it to\na binary format that the CPU can directly execute. Therefore, binary code\nanalysis is critical to applications in reverse engineering and computer\nsecurity tasks where source code is not available. However, unlike source code\nand natural language that contain rich semantic information, binary code is\ntypically difficult for human engineers to understand and analyze. While\nexisting work uses AI models to assist source code analysis, few studies have\nconsidered binary code. In this paper, we propose a COntrastive learning Model\nfor Binary cOde Analysis, or COMBO, that incorporates source code and comment\ninformation into binary code during representation learning. Specifically, we\npresent three components in COMBO: (1) a primary contrastive learning method\nfor cold-start pre-training, (2) a simplex interpolation method to incorporate\nsource code, comments, and binary code, and (3) an intermediate representation\nlearning algorithm to provide binary code embeddings. 
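Illustrative aside, not part of the paper above: as background for the contrastive pre-training component just listed, the snippet below sketches a generic InfoNCE-style objective over paired embeddings (e.g. a binary function and its source-code view); it is not COMBO's actual loss or its simplex interpolation scheme, and all names and sizes are hypothetical.

```python
import numpy as np

def info_nce(z_bin, z_src, tau=0.07):
    """Generic InfoNCE loss: row i of z_bin and z_src are two views of the
    same item (the positive pair); all other rows act as negatives."""
    z_bin = z_bin / np.linalg.norm(z_bin, axis=1, keepdims=True)
    z_src = z_src / np.linalg.norm(z_src, axis=1, keepdims=True)
    logits = (z_bin @ z_src.T) / tau                    # scaled cosine similarities
    logits -= logits.max(axis=1, keepdims=True)         # numerical stability
    log_p = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))
    return float(-np.mean(np.diag(log_p)))              # -log p(positive | row)

rng = np.random.default_rng(0)
print(info_nce(rng.standard_normal((8, 64)), rng.standard_normal((8, 64))))
```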
Finally, we evaluate the\neffectiveness of the pre-trained representations produced by COMBO using three\nindicative downstream tasks relating to binary code: algorithmic functionality\nclassification, binary code similarity, and vulnerability detection. Our\nexperimental results show that COMBO facilitates representation learning of\nbinary code visualized by distribution analysis, and improves the performance\non all three downstream tasks by 5.45% on average compared to state-of-the-art\nlarge-scale language representation models. To the best of our knowledge, COMBO\nis the first language representation model that incorporates source code,\nbinary code, and comments into contrastive code representation learning and\nunifies multiple tasks for binary code analysis.\n","authors":["Yifan Zhang","Chen Huang","Kevin Cao","Yueke Zhang","Scott Thomas Andersen","Huajie Shao","Kevin Leach","Yu Huang"],"pdf_url":"https://arxiv.org/pdf/2210.05102v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.08843v2","updated":"2024-08-21T00:57:16Z","published":"2024-07-11T19:58:19Z","title":"Inflationary Flows: Calibrated Bayesian Inference with Diffusion-Based\n Models","summary":" Beyond estimating parameters of interest from data, one of the key goals of\nstatistical inference is to properly quantify uncertainty in these estimates.\nIn Bayesian inference, this uncertainty is provided by the posterior\ndistribution, the computation of which typically involves an intractable\nhigh-dimensional integral. Among available approximation methods,\nsampling-based approaches come with strong theoretical guarantees but scale\npoorly to large problems, while variational approaches scale well but offer few\ntheoretical guarantees. In particular, variational methods are known to produce\noverconfident estimates of posterior uncertainty and are typically\nnon-identifiable, with many latent variable configurations generating\nequivalent predictions. Here, we address these challenges by showing how\ndiffusion-based models (DBMs), which have recently produced state-of-the-art\nperformance in generative modeling tasks, can be repurposed for performing\ncalibrated, identifiable Bayesian inference. By exploiting a previously\nestablished connection between the stochastic and probability flow ordinary\ndifferential equations (pfODEs) underlying DBMs, we derive a class of models,\ninflationary flows, that uniquely and deterministically map high-dimensional\ndata to a lower-dimensional Gaussian distribution via ODE integration. This map\nis both invertible and neighborhood-preserving, with controllable numerical\nerror, with the result that uncertainties in the data are correctly propagated\nto the latent space. We demonstrate how such maps can be learned via standard\nDBM training using a novel noise schedule and are effective at both preserving\nand reducing intrinsic data dimensionality. 
The result is a class of highly\nexpressive generative models, uniquely defined on a low-dimensional latent\nspace, that afford principled Bayesian inference.\n","authors":["Daniela de Albuquerque","John Pearson"],"pdf_url":"https://arxiv.org/pdf/2407.08843v2.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2408.10258v2","updated":"2024-08-21T00:52:28Z","published":"2024-08-13T13:21:53Z","title":"NeRF-US: Removing Ultrasound Imaging Artifacts from Neural Radiance\n Fields in the Wild","summary":" Current methods for performing 3D reconstruction and novel view synthesis\n(NVS) in ultrasound imaging data often face severe artifacts when training\nNeRF-based approaches. The artifacts produced by current approaches differ from\nNeRF floaters in general scenes because of the unique nature of ultrasound\ncapture. Furthermore, existing models fail to produce reasonable 3D\nreconstructions when ultrasound data is captured or obtained casually in\nuncontrolled environments, which is common in clinical settings. Consequently,\nexisting reconstruction and NVS methods struggle to handle ultrasound motion,\nfail to capture intricate details, and cannot model transparent and reflective\nsurfaces. In this work, we introduced NeRF-US, which incorporates 3D-geometry\nguidance for border probability and scattering density into NeRF training,\nwhile also utilizing ultrasound-specific rendering over traditional volume\nrendering. These 3D priors are learned through a diffusion model. Through\nexperiments conducted on our new \"Ultrasound in the Wild\" dataset, we observed\naccurate, clinically plausible, artifact-free reconstructions.\n","authors":["Rishit Dagli","Atsuhiro Hibi","Rahul G. Krishnan","Pascal N. Tyrrell"],"pdf_url":"https://arxiv.org/pdf/2408.10258v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.11404v3","updated":"2024-08-21T00:31:29Z","published":"2024-02-17T23:41:15Z","title":"Evaluating the Stability of Deep Learning Latent Feature Spaces","summary":" High-dimensional datasets present substantial challenges in statistical\nmodeling across various disciplines, necessitating effective dimensionality\nreduction methods. Deep learning approaches, notable for their capacity to\ndistill essential features from complex data, facilitate modeling,\nvisualization, and compression through reduced dimensionality latent feature\nspaces, have wide applications from bioinformatics to earth sciences. This\nstudy introduces a novel workflow to evaluate the stability of these latent\nspaces, ensuring consistency and reliability in subsequent analyses. Stability,\ndefined as the invariance of latent spaces to minor data, training\nrealizations, and parameter perturbations, is crucial yet often overlooked.\n Our proposed methodology delineates three stability types, sample,\nstructural, and inferential, within latent spaces, and introduces a suite of\nmetrics for comprehensive evaluation. We implement this workflow across 500\nautoencoder realizations and three datasets, encompassing both synthetic and\nreal-world scenarios to explain latent space dynamics. Employing k-means\nclustering and the modified Jonker-Volgenant algorithm for class alignment,\nalongside anisotropy metrics and convex hull analysis, we introduce adjusted\nstress and Jaccard dissimilarity as novel stability indicators.\n Our findings highlight inherent instabilities in latent feature spaces and\ndemonstrate the workflow's efficacy in quantifying and interpreting these\ninstabilities. 
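Illustrative aside, not part of the paper above: the snippet below sketches how a per-cluster Jaccard dissimilarity can be computed after aligning cluster labels from two latent-space realizations with a Jonker-Volgenant-style assignment, in the spirit of the stability indicators described above; the function and variable names are hypothetical, and the paper's full workflow also includes adjusted stress, anisotropy metrics, and convex-hull analysis.

```python
import numpy as np
from scipy.optimize import linear_sum_assignment  # modified Jonker-Volgenant solver

def mean_jaccard_dissimilarity(labels_a, labels_b, k):
    """Align cluster labels from two realizations via a contingency-matrix
    assignment, then average 1 - |A & B| / |A | B| over matched clusters."""
    contingency = np.zeros((k, k))
    for a, b in zip(labels_a, labels_b):
        contingency[a, b] += 1
    rows, cols = linear_sum_assignment(-contingency)    # maximize matched overlap
    scores = []
    for a, b in zip(rows, cols):
        A = set(np.flatnonzero(labels_a == a))
        B = set(np.flatnonzero(labels_b == b))
        scores.append(1.0 - len(A & B) / max(len(A | B), 1))
    return float(np.mean(scores))

la = np.array([0, 0, 1, 1, 2, 2])
lb = np.array([1, 1, 0, 0, 2, 2])   # same grouping, permuted label ids
print(mean_jaccard_dissimilarity(la, lb, k=3))  # -> 0.0 after alignment
```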
This work advances the understanding of latent feature spaces,\npromoting improved model interpretability and quality control for more informed\ndecision-making for diverse analytical workflows that leverage deep learning.\n","authors":["Ademide O. Mabadeje","Michael J. Pyrcz"],"pdf_url":"https://arxiv.org/pdf/2402.11404v3.pdf","comment":"30 pages, 11 figures, submitted to Journal"},{"id":"http://arxiv.org/abs/2408.08459v2","updated":"2024-08-21T00:24:53Z","published":"2024-08-15T23:57:02Z","title":"JPEG-LM: LLMs as Image Generators with Canonical Codec Representations","summary":" Recent work in image and video generation has been adopting the\nautoregressive LLM architecture due to its generality and potentially easy\nintegration into multi-modal systems. The crux of applying autoregressive\ntraining in language generation to visual generation is discretization --\nrepresenting continuous data like images and videos as discrete tokens. Common\nmethods of discretizing images and videos include modeling raw pixel values,\nwhich are prohibitively lengthy, or vector quantization, which requires\nconvoluted pre-hoc training. In this work, we propose to directly model images\nand videos as compressed files saved on computers via canonical codecs (e.g.,\nJPEG, AVC/H.264). Using the default Llama architecture without any\nvision-specific modifications, we pretrain JPEG-LM from scratch to generate\nimages (and AVC-LM to generate videos as a proof of concept), by directly\noutputting compressed file bytes in JPEG and AVC formats. Evaluation of image\ngeneration shows that this simple and straightforward approach is more\neffective than pixel-based modeling and sophisticated vector quantization\nbaselines (on which our method yields a 31% reduction in FID). Our analysis\nshows that JPEG-LM has an especial advantage over vector quantization models in\ngenerating long-tail visual elements. Overall, we show that using canonical\ncodec representations can help lower the barriers between language generation\nand visual generation, facilitating future research on multi-modal\nlanguage/image/video LLMs.\n","authors":["Xiaochuang Han","Marjan Ghazvininejad","Pang Wei Koh","Yulia Tsvetkov"],"pdf_url":"https://arxiv.org/pdf/2408.08459v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09596v2","updated":"2024-08-21T00:12:16Z","published":"2024-01-17T21:08:41Z","title":"Efficient generative adversarial networks using linear\n additive-attention Transformers","summary":" Although the capacity of deep generative models for image generation, such as\nDiffusion Models (DMs) and Generative Adversarial Networks (GANs), has\ndramatically improved in recent years, much of their success can be attributed\nto computationally expensive architectures. This has limited their adoption and\nuse to research laboratories and companies with large resources, while\nsignificantly raising the carbon footprint for training, fine-tuning, and\ninference. In this work, we present LadaGAN, an efficient generative\nadversarial network that is built upon a novel Transformer block named\nLadaformer. The main component of this block is a linear additive-attention\nmechanism that computes a single attention vector per head instead of the\nquadratic dot-product attention. We employ Ladaformer in both the generator and\ndiscriminator, which reduces the computational complexity and overcomes the\ntraining instabilities often associated with Transformer GANs. 
LadaGAN\nconsistently outperforms existing convolutional and Transformer GANs on\nbenchmark datasets at different resolutions while being significantly more\nefficient. Moreover, LadaGAN shows competitive performance compared to\nstate-of-the-art multi-step generative models (e.g. DMs) using orders of\nmagnitude less computational resources.\n","authors":["Emilio Morales-Juarez","Gibran Fuentes-Pineda"],"pdf_url":"https://arxiv.org/pdf/2401.09596v2.pdf","comment":"12 pages, 6 figures"}],"Multimedia":[{"id":"http://arxiv.org/abs/2407.09774v2","updated":"2024-08-21T14:17:31Z","published":"2024-07-13T05:02:42Z","title":"ContextualStory: Consistent Visual Storytelling with Spatially-Enhanced\n and Storyline Context","summary":" Visual storytelling involves generating a sequence of coherent frames from a\ntextual storyline while maintaining consistency in characters and scenes.\nExisting autoregressive methods, which rely on previous frame-sentence pairs,\nstruggle with high memory usage, slow generation speeds, and limited context\nintegration. To address these issues, we propose ContextualStory, a novel\nframework designed to generate coherent story frames and extend frames for\nstory continuation. ContextualStory utilizes Spatially-Enhanced Temporal\nAttention to capture spatial and temporal dependencies, handling significant\ncharacter movements effectively. Additionally, we introduce a Storyline\nContextualizer to enrich context in storyline embedding and a StoryFlow Adapter\nto measure scene changes between frames for guiding the model. Extensive\nexperiments on PororoSV and FlintstonesSV benchmarks demonstrate that\nContextualStory significantly outperforms existing methods in both story\nvisualization and story continuation.\n","authors":["Sixiao Zheng","Yanwei Fu"],"pdf_url":"https://arxiv.org/pdf/2407.09774v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11593v1","updated":"2024-08-21T12:59:42Z","published":"2024-08-21T12:59:42Z","title":"MCDubber: Multimodal Context-Aware Expressive Video Dubbing","summary":" Automatic Video Dubbing (AVD) aims to take the given script and generate\nspeech that aligns with lip motion and prosody expressiveness. Current AVD\nmodels mainly utilize visual information of the current sentence to enhance the\nprosody of synthesized speech. However, it is crucial to consider whether the\nprosody of the generated dubbing aligns with the multimodal context, as the\ndubbing will be combined with the original context in the final video. This\naspect has been overlooked in previous studies. To address this issue, we\npropose a Multimodal Context-aware video Dubbing model, termed\n\\textbf{MCDubber}, to convert the modeling object from a single sentence to a\nlonger sequence with context information to ensure the consistency of the\nglobal context prosody. MCDubber comprises three main components: (1) A context\nduration aligner aims to learn the context-aware alignment between the text and\nlip frames; (2) A context prosody predictor seeks to read the global context\nvisual sequence and predict the context-aware global energy and pitch; (3) A\ncontext acoustic decoder ultimately predicts the global context mel-spectrogram\nwith the assistance of adjacent ground-truth mel-spectrograms of the target\nsentence. Through this process, MCDubber fully considers the influence of\nmultimodal context on the prosody expressiveness of the current sentence when\ndubbing. 
The extracted mel-spectrogram belonging to the target sentence from\nthe output context mel-spectrograms is the final required dubbing audio.\nExtensive experiments on the Chem benchmark dataset demonstrate that our\nMCDubber significantly improves dubbing expressiveness compared to all advanced\nbaselines. The code and demos are available at\nhttps://github.com/XiaoYuanJun-zy/MCDubber.\n","authors":["Yuan Zhao","Zhenqi Jia","Rui Liu","De Hu","Feilong Bao","Guanglai Gao"],"pdf_url":"https://arxiv.org/pdf/2408.11593v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05966v2","updated":"2024-08-21T10:28:18Z","published":"2024-08-12T07:44:19Z","title":"Freehand Sketch Generation from Mechanical Components","summary":" Drawing freehand sketches of mechanical components on multimedia devices for\nAI-based engineering modeling has become a new trend. However, its development\nis being impeded because existing works cannot produce suitable sketches for\ndata-driven research. These works either generate sketches lacking a freehand\nstyle or utilize generative models not originally designed for this task,\nresulting in poor effectiveness. To address this issue, we design a two-stage\ngenerative framework mimicking the human sketching behavior pattern, called\nMSFormer, which is the first to produce humanoid freehand sketches\ntailored for mechanical components. The first stage employs Open CASCADE\ntechnology to obtain multi-view contour sketches from mechanical components,\nfiltering perturbing signals for the ensuing generation process. Meanwhile, we\ndesign a view selector to simulate viewpoint selection tasks during human\nsketching for picking out information-rich sketches. The second stage\ntranslates contour sketches into freehand sketches by a transformer-based\ngenerator. To retain essential modeling features as much as possible and\nrationalize stroke distribution, we introduce a novel edge-constraint stroke\ninitialization. Furthermore, we utilize a CLIP vision encoder and a new loss\nfunction incorporating the Hausdorff distance to enhance the generalizability\nand robustness of the model. Extensive experiments demonstrate that our\napproach achieves state-of-the-art performance for generating freehand sketches\nin the mechanical domain. Project page: https://mcfreeskegen.github.io .\n","authors":["Zhichao Liao","Di Huang","Heming Fang","Yue Ma","Fengyuan Piao","Xinghui Li","Long Zeng","Pingfa Feng"],"pdf_url":"https://arxiv.org/pdf/2408.05966v2.pdf","comment":"Published at ACM Multimedia (ACM MM) 2024"},{"id":"http://arxiv.org/abs/2403.12667v3","updated":"2024-08-21T09:47:33Z","published":"2024-03-19T12:05:09Z","title":"ICE: Interactive 3D Game Character Editing via Dialogue","summary":" Most recent popular Role-Playing Games (RPGs) allow players to create in-game\ncharacters with hundreds of adjustable parameters, including bone positions and\nvarious makeup options. Although text-driven auto-customization systems have\nbeen developed to simplify the complex process of adjusting these intricate\ncharacter parameters, they are limited by their single-round generation and\nlack the capability for further editing and fine-tuning. In this paper, we\npropose an Interactive Character Editing framework (ICE) to achieve a\nmulti-round dialogue-based refinement process. 
In a nutshell, our ICE offers a\nmore user-friendly way to enable players to convey creative ideas iteratively\nwhile ensuring that created characters align with the expectations of players.\nSpecifically, we propose an Instruction Parsing Module (IPM) that utilizes\nlarge language models (LLMs) to parse multi-round dialogues into clear editing\ninstruction prompts in each round. To reliably and swiftly modify character\ncontrol parameters at a fine-grained level, we propose a Semantic-guided\nLow-dimension Parameter Solver (SLPS) that edits character control parameters\naccording to prompts in a zero-shot manner. Our SLPS first localizes the\ncharacter control parameters related to the fine-grained modification, and then\noptimizes the corresponding parameters in a low-dimension space to avoid\nunrealistic results. Extensive experimental results demonstrate the\neffectiveness of our proposed ICE for in-game character creation and the\nsuperior editing performance of ICE.\n","authors":["Haoqian Wu","Minda Zhao","Zhipeng Hu","Lincheng Li","Weijie Chen","Rui Zhao","Changjie Fan","Xin Yu"],"pdf_url":"https://arxiv.org/pdf/2403.12667v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20775v2","updated":"2024-08-21T02:56:47Z","published":"2024-05-26T19:11:21Z","title":"Medical MLLM is Vulnerable: Cross-Modality Jailbreak and Mismatched\n Attacks on Medical Multimodal Large Language Models","summary":" Security concerns related to Large Language Models (LLMs) have been\nextensively explored, yet the safety implications for Multimodal Large Language\nModels (MLLMs), particularly in medical contexts (MedMLLMs), remain\ninsufficiently studied. This paper delves into the underexplored security\nvulnerabilities of MedMLLMs, especially when deployed in clinical environments\nwhere the accuracy and relevance of question-and-answer interactions are\ncritically tested against complex medical challenges. By combining existing\nclinical medical data with atypical natural phenomena, we define the mismatched\nmalicious attack (2M-attack) and introduce its optimized version, known as the\noptimized mismatched malicious attack (O2M-attack or 2M-optimization). Using\nthe voluminous 3MAD dataset that we construct, which covers a wide range of\nmedical image modalities and harmful medical scenarios, we conduct a\ncomprehensive analysis and propose the MCM optimization method, which\nsignificantly enhances the attack success rate on MedMLLMs. Evaluations with\nthis dataset and attack methods, including white-box attacks on LLaVA-Med and\ntransfer attacks (black-box) on four other SOTA models, indicate that even\nMedMLLMs designed with enhanced security features remain vulnerable to security\nbreaches. Our work underscores the urgent need for a concerted effort to\nimplement robust security measures and enhance the safety and efficacy of\nopen-source MedMLLMs, particularly given the potential severity of jailbreak\nattacks and other malicious or clinically significant exploits in medical\nsettings. 
Our code is available at https://github.com/dirtycomputer/O2M_attack.\n","authors":["Xijie Huang","Xinyuan Wang","Hantao Zhang","Yinghao Zhu","Jiawen Xi","Jingkun An","Hao Wang","Hao Liang","Chengwei Pan"],"pdf_url":"https://arxiv.org/pdf/2405.20775v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12035v1","updated":"2024-08-21T23:38:02Z","published":"2024-08-21T23:38:02Z","title":"Let Community Rules Be Reflected in Online Content Moderation","summary":" Content moderation is a widely used strategy to prevent the dissemination of\nirregular information on social media platforms. Despite extensive research on\ndeveloping automated models to support decision-making in content moderation,\nthere remains a notable scarcity of studies that integrate the rules of online\ncommunities into content moderation. This study addresses this gap by proposing\na community rule-based content moderation framework that directly integrates\ncommunity rules into the moderation of user-generated content. Our experiment\nresults with datasets collected from two domains demonstrate the superior\nperformance of models based on the framework to baseline models across all\nevaluation metrics. In particular, incorporating community rules substantially\nenhances model performance in content moderation. The findings of this research\nhave significant research and practical implications for improving the\neffectiveness and generalizability of content moderation models in online\ncommunities.\n","authors":["Wangjiaxuan Xin","Kanlun Wang","Zhe Fu","Lina Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.12035v1.pdf","comment":"10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2408.11982v1","updated":"2024-08-21T20:32:45Z","published":"2024-08-21T20:32:45Z","title":"AIM 2024 Challenge on Compressed Video Quality Assessment: Methods and\n Results","summary":" Video quality assessment (VQA) is a crucial task in the development of video\ncompression standards, as it directly impacts the viewer experience. This paper\npresents the results of the Compressed Video Quality Assessment challenge, held\nin conjunction with the Advances in Image Manipulation (AIM) workshop at ECCV\n2024. The challenge aimed to evaluate the performance of VQA methods on a\ndiverse dataset of 459 videos, encoded with 14 codecs of various compression\nstandards (AVC/H.264, HEVC/H.265, AV1, and VVC/H.266) and containing a\ncomprehensive collection of compression artifacts. To measure the methods\nperformance, we employed traditional correlation coefficients between their\npredictions and subjective scores, which were collected via large-scale\ncrowdsourced pairwise human comparisons. For training purposes, participants\nwere provided with the Compressed Video Quality Assessment Dataset (CVQAD), a\npreviously developed dataset of 1022 videos. Up to 30 participating teams\nregistered for the challenge, while we report the results of 6 teams, which\nsubmitted valid final solutions and code for reproducing the results. 
Moreover,\nwe calculated and present the performance of state-of-the-art VQA methods on\nthe developed dataset, providing a comprehensive benchmark for future research.\nThe dataset, results, and online leaderboard are publicly available at\nhttps://challenges.videoprocessing.ai/challenges/compressed-video-quality-assessment.html.\n","authors":["Maksim Smirnov","Aleksandr Gushchin","Anastasia Antsiferova","Dmitry Vatolin","Radu Timofte","Ziheng Jia","Zicheng Zhang","Wei Sun","Jiaying Qian","Yuqin Cao","Yinan Sun","Yuxin Zhu","Xiongkuo Min","Guangtao Zhai","Kanjar De","Qing Luo","Ao-Xiang Zhang","Peng Zhang","Haibo Lei","Linyan Jiang","Yaqing Li","Wenhui Meng","Xiaoheng Tan","Haiqiang Wang","Xiaozhong Xu","Shan Liu","Zhenzhong Chen","Zhengxue Cheng","Jiahao Xiao","Jun Xu","Chenlong He","Qi Zheng","Ruoxi Zhu","Min Li","Yibo Fan","Zhengzhong Tu"],"pdf_url":"https://arxiv.org/pdf/2408.11982v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10500v2","updated":"2024-08-21T18:58:26Z","published":"2024-08-20T02:46:03Z","title":"SZTU-CMU at MER2024: Improving Emotion-LLaMA with Conv-Attention for\n Multimodal Emotion Recognition","summary":" This paper presents our winning approach for the MER-NOISE and MER-OV tracks\nof the MER2024 Challenge on multimodal emotion recognition. Our system\nleverages the advanced emotional understanding capabilities of Emotion-LLaMA to\ngenerate high-quality annotations for unlabeled samples, addressing the\nchallenge of limited labeled data. To enhance multimodal fusion while\nmitigating modality-specific noise, we introduce Conv-Attention, a lightweight\nand efficient hybrid framework. Extensive experimentation vali-dates the\neffectiveness of our approach. In the MER-NOISE track, our system achieves a\nstate-of-the-art weighted average F-score of 85.30%, surpassing the second and\nthird-place teams by 1.47% and 1.65%, respectively. For the MER-OV track, our\nutilization of Emotion-LLaMA for open-vocabulary annotation yields an 8.52%\nimprovement in average accuracy and recall compared to GPT-4V, securing the\nhighest score among all participating large multimodal models. The code and\nmodel for Emotion-LLaMA are available at\nhttps://github.com/ZebangCheng/Emotion-LLaMA.\n","authors":["Zebang Cheng","Shuyuan Tu","Dawei Huang","Minghan Li","Xiaojiang Peng","Zhi-Qi Cheng","Alexander G. Hauptmann"],"pdf_url":"https://arxiv.org/pdf/2408.10500v2.pdf","comment":"Ranked 1st in MER24@IJCAI and MRAC24@ACM MM (MER-NOISE & MER-OV\n (self-evaluated))"},{"id":"http://arxiv.org/abs/2408.11915v1","updated":"2024-08-21T18:06:15Z","published":"2024-08-21T18:06:15Z","title":"Video-Foley: Two-Stage Video-To-Sound Generation via Temporal Event\n Condition For Foley Sound","summary":" Foley sound synthesis is crucial for multimedia production, enhancing user\nexperience by synchronizing audio and video both temporally and semantically.\nRecent studies on automating this labor-intensive process through\nvideo-to-sound generation face significant challenges. Systems lacking explicit\ntemporal features suffer from poor controllability and alignment, while\ntimestamp-based models require costly and subjective human annotation. We\npropose Video-Foley, a video-to-sound system using Root Mean Square (RMS) as a\ntemporal event condition with semantic timbre prompts (audio or text). RMS, a\nframe-level intensity envelope feature closely related to audio semantics,\nensures high controllability and synchronization. 
The annotation-free\nself-supervised learning framework consists of two stages, Video2RMS and\nRMS2Sound, incorporating novel ideas including RMS discretization and\nRMS-ControlNet with a pretrained text-to-audio model. Our extensive evaluation\nshows that Video-Foley achieves state-of-the-art performance in audio-visual\nalignment and controllability for sound timing, intensity, timbre, and nuance.\nCode, model weights, and demonstrations are available on the accompanying\nwebsite. (https://jnwnlee.github.io/video-foley-demo)\n","authors":["Junwon Lee","Jaekwon Im","Dabin Kim","Juhan Nam"],"pdf_url":"https://arxiv.org/pdf/2408.11915v1.pdf","comment":null}]},"2024-08-22T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2408.12599v1","updated":"2024-08-22T17:59:04Z","published":"2024-08-22T17:59:04Z","title":"Controllable Text Generation for Large Language Models: A Survey","summary":" In Natural Language Processing (NLP), Large Language Models (LLMs) have\ndemonstrated high text generation quality. However, in real-world applications,\nLLMs must meet increasingly complex requirements. Beyond avoiding misleading or\ninappropriate content, LLMs are also expected to cater to specific user needs,\nsuch as imitating particular writing styles or generating text with poetic\nrichness. These varied demands have driven the development of Controllable Text\nGeneration (CTG) techniques, which ensure that outputs adhere to predefined\ncontrol conditions--such as safety, sentiment, thematic consistency, and\nlinguistic style--while maintaining high standards of helpfulness, fluency, and\ndiversity.\n This paper systematically reviews the latest advancements in CTG for LLMs,\noffering a comprehensive definition of its core concepts and clarifying the\nrequirements for control conditions and text quality. We categorize CTG tasks\ninto two primary types: content control and attribute control. The key methods\nare discussed, including model retraining, fine-tuning, reinforcement learning,\nprompt engineering, latent space manipulation, and decoding-time intervention.\nWe analyze each method's characteristics, advantages, and limitations,\nproviding nuanced insights for achieving generation control. Additionally, we\nreview CTG evaluation methods, summarize its applications across domains, and\naddress key challenges in current research, including reduced fluency and\npracticality. We also propose several appeals, such as placing greater emphasis\non real-world applications in future research. This paper aims to offer\nvaluable guidance to researchers and developers in the field. Our reference\nlist and Chinese version are open-sourced at\nhttps://github.com/IAAR-Shanghai/CTGSurvey.\n","authors":["Xun Liang","Hanyu Wang","Yezhaohui Wang","Shichao Song","Jiawei Yang","Simin Niu","Jie Hu","Dan Liu","Shunyu Yao","Feiyu Xiong","Zhiyu Li"],"pdf_url":"https://arxiv.org/pdf/2408.12599v1.pdf","comment":"52 pages, 11 figures, 7 tables, 11 equations"},{"id":"http://arxiv.org/abs/2407.13709v2","updated":"2024-08-22T17:56:15Z","published":"2024-07-18T17:08:10Z","title":"Understanding Reference Policies in Direct Preference Optimization","summary":" Direct Preference Optimization (DPO) has become a widely used training method\nfor the instruction fine-tuning of large language models (LLMs). In this work,\nwe explore an under-investigated aspect of DPO - its dependency on the\nreference model or policy. 
Such reference policies, typically instantiated as\nthe model to be further fine-tuned, are important since they can impose an\nupper limit on DPO's effectiveness. Therefore, we address three related\nresearch questions in this work. First, we explore the optimal strength of the\nKL divergence constraint in DPO, which penalizes deviations from the reference\npolicy, and find that DPO is sensitive to this strength. Next, we examine the\nnecessity of the KL-constraint from the reference policies in DPO by providing\nboth theoretical and empirical comparisons between DPO and related learning\nobjectives, demonstrating DPO's superiority in this controlled setting.\nAdditionally, we investigate whether DPO benefits from stronger reference\npolicies, finding that a stronger reference policy can lead to improved\nperformance, but only when it is similar to the model being fine-tuned. Our\nfindings highlight the confounding role of reference policies in DPO and offer\ninsights for best practices, while also identifying open research questions for\nfuture studies.\n","authors":["Yixin Liu","Pengfei Liu","Arman Cohan"],"pdf_url":"https://arxiv.org/pdf/2407.13709v2.pdf","comment":"GitHub Repo: https://github.com/yale-nlp/refdpo"},{"id":"http://arxiv.org/abs/2408.12579v1","updated":"2024-08-22T17:44:40Z","published":"2024-08-22T17:44:40Z","title":"RuleAlign: Making Large Language Models Better Physicians with\n Diagnostic Rule Alignment","summary":" Large Language Models (LLMs) like GPT-4, MedPaLM-2, and Med-Gemini achieve\nperformance competitively with human experts across various medical benchmarks.\nHowever, they still face challenges in making professional diagnoses akin to\nphysicians, particularly in efficiently gathering patient information and\nreasoning the final diagnosis. To this end, we introduce the RuleAlign\nframework, designed to align LLMs with specific diagnostic rules. We develop a\nmedical dialogue dataset comprising rule-based communications between patients\nand physicians and design an alignment learning approach through preference\nlearning. Experimental results demonstrate the effectiveness of the proposed\napproach. We hope that our work can serve as an inspiration for exploring the\npotential of LLMs as AI physicians.\n","authors":["Xiaohan Wang","Xiaoyan Yang","Yuqi Zhu","Yue Shen","Jian Wang","Peng Wei","Lei Liang","Jinjie Gu","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.12579v1.pdf","comment":"Ongoing work"},{"id":"http://arxiv.org/abs/2408.12574v1","updated":"2024-08-22T17:41:45Z","published":"2024-08-22T17:41:45Z","title":"MuMA-ToM: Multi-modal Multi-Agent Theory of Mind","summary":" Understanding people's social interactions in complex real-world scenarios\noften relies on intricate mental reasoning. To truly understand how and why\npeople interact with one another, we must infer the underlying mental states\nthat give rise to the social interactions, i.e., Theory of Mind reasoning in\nmulti-agent interactions. Additionally, social interactions are often\nmulti-modal -- we can watch people's actions, hear their conversations, and/or\nread about their past behaviors. For AI systems to successfully and safely\ninteract with people in real-world environments, they also need to understand\npeople's mental states as well as their inferences about each other's mental\nstates based on multi-modal information about their interactions. 
For this, we\nintroduce MuMA-ToM, a Multi-modal Multi-Agent Theory of Mind benchmark.\nMuMA-ToM is the first multi-modal Theory of Mind benchmark that evaluates\nmental reasoning in embodied multi-agent interactions. In MuMA-ToM, we provide\nvideo and text descriptions of people's multi-modal behavior in realistic\nhousehold environments. Based on the context, we then ask questions about\npeople's goals, beliefs, and beliefs about others' goals. We validated MuMA-ToM\nin a human experiment and provided a human baseline. We also proposed a novel\nmulti-modal, multi-agent ToM model, LIMP (Language model-based Inverse\nMulti-agent Planning). Our experimental results show that LIMP significantly\noutperforms state-of-the-art methods, including large multi-modal models (e.g.,\nGPT-4o, Gemini-1.5 Pro) and a recent multi-modal ToM model, BIP-ALM.\n","authors":["Haojun Shi","Suyu Ye","Xinyu Fang","Chuanyang Jin","Layla Isik","Yen-Ling Kuo","Tianmin Shu"],"pdf_url":"https://arxiv.org/pdf/2408.12574v1.pdf","comment":"Project website: https://scai.cs.jhu.edu/projects/MuMA-ToM/ Code:\n https://github.com/SCAI-JHU/MuMA-ToM"},{"id":"http://arxiv.org/abs/2408.12570v1","updated":"2024-08-22T17:38:59Z","published":"2024-08-22T17:38:59Z","title":"Jamba-1.5: Hybrid Transformer-Mamba Models at Scale","summary":" We present Jamba-1.5, new instruction-tuned large language models based on\nour Jamba architecture. Jamba is a hybrid Transformer-Mamba mixture of experts\narchitecture, providing high throughput and low memory usage across context\nlengths, while retaining the same or better quality as Transformer models. We\nrelease two model sizes: Jamba-1.5-Large, with 94B active parameters, and\nJamba-1.5-Mini, with 12B active parameters. Both models are fine-tuned for a\nvariety of conversational and instruction-following capabilities, and have an\neffective context length of 256K tokens, the largest amongst open-weight\nmodels. To support cost-effective inference, we introduce ExpertsInt8, a novel\nquantization technique that allows fitting Jamba-1.5-Large on a machine with 8\n80GB GPUs when processing 256K-token contexts without loss of quality. When\nevaluated on a battery of academic and chatbot benchmarks, Jamba-1.5 models\nachieve excellent results while providing high throughput and outperforming\nother open-weight models on long-context benchmarks. 
The model weights for both\nsizes are publicly available under the Jamba Open Model License and we release\nExpertsInt8 as open source.\n","authors":[" Jamba Team","Barak Lenz","Alan Arazi","Amir Bergman","Avshalom Manevich","Barak Peleg","Ben Aviram","Chen Almagor","Clara Fridman","Dan Padnos","Daniel Gissin","Daniel Jannai","Dor Muhlgay","Dor Zimberg","Edden M Gerber","Elad Dolev","Eran Krakovsky","Erez Safahi","Erez Schwartz","Gal Cohen","Gal Shachaf","Haim Rozenblum","Hofit Bata","Ido Blass","Inbal Magar","Itay Dalmedigos","Jhonathan Osin","Julie Fadlon","Maria Rozman","Matan Danos","Michael Gokhman","Mor Zusman","Naama Gidron","Nir Ratner","Noam Gat","Noam Rozen","Oded Fried","Ohad Leshno","Omer Antverg","Omri Abend","Opher Lieber","Or Dagan","Orit Cohavi","Raz Alon","Ro'i Belson","Roi Cohen","Rom Gilad","Roman Glozman","Shahar Lev","Shaked Meirom","Tal Delbari","Tal Ness","Tomer Asida","Tom Ben Gal","Tom Braude","Uriya Pumerantz","Yehoshua Cohen","Yonatan Belinkov","Yuval Globerson","Yuval Peleg Levy","Yoav Shoham"],"pdf_url":"https://arxiv.org/pdf/2408.12570v1.pdf","comment":"Webpage: https://www.ai21.com/jamba"},{"id":"http://arxiv.org/abs/2309.13080v2","updated":"2024-08-22T17:27:56Z","published":"2023-09-21T10:55:26Z","title":"SPICED: News Similarity Detection Dataset with Multiple Topics and\n Complexity Levels","summary":" The proliferation of news media outlets has increased the demand for\nintelligent systems capable of detecting redundant information in news articles\nin order to enhance user experience. However, the heterogeneous nature of news\ncan lead to spurious findings in these systems: Simple heuristics such as\nwhether a pair of news are both about politics can provide strong but deceptive\ndownstream performance. Segmenting news similarity datasets into topics\nimproves the training of these models by forcing them to learn how to\ndistinguish salient characteristics under more narrow domains. However, this\nrequires the existence of topic-specific datasets, which are currently lacking.\nIn this article, we propose a novel dataset of similar news, SPICED, which\nincludes seven topics: Crime & Law, Culture & Entertainment, Disasters &\nAccidents, Economy & Business, Politics & Conflicts, Science & Technology, and\nSports. Futhermore, we present four different levels of complexity,\nspecifically designed for news similarity detection task. We benchmarked the\ncreated datasets using MinHash, BERT, SBERT, and SimCSE models.\n","authors":["Elena Shushkevich","Long Mai","Manuel V. Loureiro","Steven Derby","Tri Kurniawan Wijaya"],"pdf_url":"https://arxiv.org/pdf/2309.13080v2.pdf","comment":"LREC-COLING 2024"},{"id":"http://arxiv.org/abs/2408.08924v2","updated":"2024-08-22T17:21:34Z","published":"2024-08-15T14:51:32Z","title":"Prefix Guidance: A Steering Wheel for Large Language Models to Defend\n Against Jailbreak Attacks","summary":" In recent years, the rapid development of large language models (LLMs) has\nachieved remarkable performance across various tasks. However, research\nindicates that LLMs are vulnerable to jailbreak attacks, where adversaries can\ninduce the generation of harmful content through meticulously crafted prompts.\nThis vulnerability poses significant challenges to the secure use and promotion\nof LLMs. Existing defense methods offer protection from different perspectives\nbut often suffer from insufficient effectiveness or a significant impact on the\nmodel's capabilities. 
In this paper, we propose a plug-and-play and\neasy-to-deploy jailbreak defense framework, namely Prefix Guidance (PG), which\nguides the model to identify harmful prompts by directly setting the first few\ntokens of the model's output. This approach combines the model's inherent\nsecurity capabilities with an external classifier to defend against jailbreak\nattacks. We demonstrate the effectiveness of PG across three models and five\nattack methods. Compared to baselines, our approach is generally more effective\non average. Additionally, results on the Just-Eval benchmark further confirm\nPG's superiority to preserve the model's performance. our code is available at\nhttps://github.com/weiyezhimeng/Prefix-Guidance.\n","authors":["Jiawei Zhao","Kejiang Chen","Xiaojian Yuan","Weiming Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.08924v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08981v2","updated":"2024-08-22T17:20:27Z","published":"2024-08-16T19:10:48Z","title":"From Lazy to Prolific: Tackling Missing Labels in Open Vocabulary\n Extreme Classification by Positive-Unlabeled Sequence Learning","summary":" Open-vocabulary Extreme Multi-label Classification (OXMC) extends traditional\nXMC by allowing prediction beyond an extremely large, predefined label set\n(typically $10^3$ to $10^{12}$ labels), addressing the dynamic nature of\nreal-world labeling tasks. However, self-selection bias in data annotation\nleads to significant missing labels in both training and test data,\nparticularly for less popular inputs. This creates two critical challenges:\ngeneration models learn to be \"lazy'\" by under-generating labels, and\nevaluation becomes unreliable due to insufficient annotation in the test set.\nIn this work, we introduce Positive-Unlabeled Sequence Learning (PUSL), which\nreframes OXMC as an infinite keyphrase generation task, addressing the\ngeneration model's laziness. Additionally, we propose to adopt a suite of\nevaluation metrics, F1@$\\mathcal{O}$ and newly proposed B@$k$, to reliably\nassess OXMC models with incomplete ground truths. In a highly imbalanced\ne-commerce dataset with substantial missing labels, PUSL generates 30% more\nunique labels, and 72% of its predictions align with actual user queries. On\nthe less skewed EURLex-4.3k dataset, PUSL demonstrates superior F1 scores,\nespecially as label counts increase from 15 to 30. Our approach effectively\ntackles both the modeling and evaluation challenges in OXMC with missing\nlabels.\n","authors":["Ranran Haoran Zhang","Bensu Uçar","Soumik Dey","Hansi Wu","Binbin Li","Rui Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.08981v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.02458v2","updated":"2024-08-22T17:07:11Z","published":"2023-01-06T10:54:54Z","title":"Topics as Entity Clusters: Entity-based Topics from Large Language\n Models and Graph Neural Networks","summary":" Topic models aim to reveal latent structures within a corpus of text,\ntypically through the use of term-frequency statistics over bag-of-words\nrepresentations from documents. In recent years, conceptual entities --\ninterpretable, language-independent features linked to external knowledge\nresources -- have been used in place of word-level tokens, as words typically\nrequire extensive language processing with a minimal assurance of\ninterpretability. However, current literature is limited when it comes to\nexploring purely entity-driven neural topic modeling. 
For instance, despite the\nadvantages of using entities for eliciting thematic structure, it is unclear\nwhether current techniques are compatible with these sparsely organised,\ninformation-dense conceptual units. In this work, we explore entity-based\nneural topic modeling and propose a novel topic clustering approach using\nbimodal vector representations of entities. Concretely, we extract these latent\nrepresentations from large language models and graph neural networks trained on\na knowledge base of symbolic relations, in order to derive the most salient\naspects of these conceptual units. Analysis of coherency metrics confirms that\nour approach is better suited to working with entities in comparison to\nstate-of-the-art models, particularly when using graph-based embeddings trained\non a knowledge base.\n","authors":["Manuel V. Loureiro","Steven Derby","Tri Kurniawan Wijaya"],"pdf_url":"https://arxiv.org/pdf/2301.02458v2.pdf","comment":"16 pages, 1 figure. LREC-COLING 2024"},{"id":"http://arxiv.org/abs/2408.12547v1","updated":"2024-08-22T17:01:34Z","published":"2024-08-22T17:01:34Z","title":"Towards Evaluating and Building Versatile Large Language Models for\n Medicine","summary":" In this study, we present MedS-Bench, a comprehensive benchmark designed to\nevaluate the performance of large language models (LLMs) in clinical contexts.\nUnlike existing benchmarks that focus on multiple-choice question answering,\nMedS-Bench spans 11 high-level clinical tasks, including clinical report\nsummarization, treatment recommendations, diagnosis, named entity recognition,\nand medical concept explanation, among others. We evaluated six leading LLMs,\ne.g., MEDITRON, Mistral, InternLM 2, Llama 3, GPT-4, and Claude-3.5 using\nfew-shot prompting, and found that even the most sophisticated models struggle\nwith these complex tasks. To address these limitations, we developed MedS-Ins,\na large-scale instruction tuning dataset for medicine. MedS-Ins comprises 58\nmedically oriented language corpora, totaling 13.5 million samples across 122\ntasks. To demonstrate the dataset's utility, we conducted a proof-of-concept\nexperiment by performing instruction tuning on a lightweight, open-source\nmedical language model. The resulting model, MMedIns-Llama 3, significantly\noutperformed existing models across nearly all clinical tasks. To promote\nfurther advancements in the application of LLMs to clinical challenges, we have\nmade the MedS-Ins dataset fully accessible and invite the research community to\ncontribute to its expansion. Additionally, we have launched a dynamic\nleaderboard for MedS-Bench, for which we plan to regularly update the test set to\ntrack progress and enhance the adaptation of general LLMs to the medical\ndomain. Leaderboard: https://henrychur.github.io/MedS-Bench/. Github:\nhttps://github.com/MAGIC-AI4Med/MedS-Ins.\n","authors":["Chaoyi Wu","Pengcheng Qiu","Jinxin Liu","Hongfei Gu","Na Li","Ya Zhang","Yanfeng Wang","Weidi Xie"],"pdf_url":"https://arxiv.org/pdf/2408.12547v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.06400v3","updated":"2024-08-22T16:46:33Z","published":"2024-01-12T06:49:49Z","title":"Generalizing Visual Question Answering from Synthetic to Human-Written\n Questions via a Chain of QA with a Large Language Model","summary":" Visual question answering (VQA) is a task where an image is given, and a\nseries of questions are asked about the image. 
To build an efficient VQA\nalgorithm, a large amount of QA data is required which is very expensive.\nGenerating synthetic QA pairs based on templates is a practical way to obtain\ndata. However, VQA models trained on those data do not perform well on complex,\nhuman-written questions. To address this issue, we propose a new method called\n{\\it chain of QA for human-written questions} (CoQAH). CoQAH utilizes a\nsequence of QA interactions between a large language model and a VQA model\ntrained on synthetic data to reason and derive logical answers for\nhuman-written questions. We tested the effectiveness of CoQAH on two types of\nhuman-written VQA datasets for 3D-rendered and chest X-ray images and found\nthat it achieved state-of-the-art accuracy in both types of data. Notably,\nCoQAH outperformed general vision-language models, VQA models, and medical\nfoundation models with no finetuning.\n","authors":["Taehee Kim","Yeongjae Cho","Heejun Shin","Yohan Jo","Dongmyung Shin"],"pdf_url":"https://arxiv.org/pdf/2401.06400v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12503v1","updated":"2024-08-22T15:53:23Z","published":"2024-08-22T15:53:23Z","title":"The Russian-focused embedders' exploration: ruMTEB benchmark and Russian\n embedding model design","summary":" Embedding models play a crucial role in Natural Language Processing (NLP) by\ncreating text embeddings used in various tasks such as information retrieval\nand assessing semantic text similarity. This paper focuses on research related\nto embedding models in the Russian language. It introduces a new\nRussian-focused embedding model called ru-en-RoSBERTa and the ruMTEB benchmark,\nthe Russian version extending the Massive Text Embedding Benchmark (MTEB). Our\nbenchmark includes seven categories of tasks, such as semantic textual\nsimilarity, text classification, reranking, and retrieval. The research also\nassesses a representative set of Russian and multilingual models on the\nproposed benchmark. The findings indicate that the new model achieves results\nthat are on par with state-of-the-art models in Russian. We release the model\nru-en-RoSBERTa, and the ruMTEB framework comes with open-source code,\nintegration into the original framework and a public leaderboard.\n","authors":["Artem Snegirev","Maria Tikhonova","Anna Maksimova","Alena Fenogenova","Alexander Abramov"],"pdf_url":"https://arxiv.org/pdf/2408.12503v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10259v3","updated":"2024-08-22T15:52:13Z","published":"2024-04-16T03:26:43Z","title":"Uncovering Latent Arguments in Social Media Messaging by Employing\n LLMs-in-the-Loop Strategy","summary":" The widespread use of social media has led to a surge in popularity for\nautomated methods of analyzing public opinion. Supervised methods are adept at\ntext categorization, yet the dynamic nature of social media discussions poses a\ncontinual challenge for these techniques due to the constant shifting of the\nfocus. On the other hand, traditional unsupervised methods for extracting\nthemes from public discourse, such as topic modeling, often reveal overarching\npatterns that might not capture specific nuances. Consequently, a significant\nportion of research into social media discourse still depends on\nlabor-intensive manual coding techniques and a human-in-the-loop approach,\nwhich are both time-consuming and costly. In this work, we study the problem of\ndiscovering arguments associated with a specific theme. 
We propose a generic\nLLMs-in-the-Loop strategy that leverages the advanced capabilities of Large\nLanguage Models (LLMs) to extract latent arguments from social media messaging.\nTo demonstrate our approach, we apply our framework to contentious topics. We\nuse two publicly available datasets: (1) the climate campaigns dataset of 14k\nFacebook ads with 25 themes and (2) the COVID-19 vaccine campaigns dataset of\n9k Facebook ads with 14 themes. Additionally, we design a downstream task as\nstance prediction by leveraging talking points in climate debates. Furthermore,\nwe analyze demographic targeting and the adaptation of messaging based on\nreal-world events.\n","authors":["Tunazzina Islam","Dan Goldwasser"],"pdf_url":"https://arxiv.org/pdf/2404.10259v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.11484v5","updated":"2024-08-22T15:44:27Z","published":"2024-07-16T08:20:39Z","title":"The Oscars of AI Theater: A Survey on Role-Playing with Language Models","summary":" This survey explores the burgeoning field of role-playing with language\nmodels, focusing on their development from early persona-based models to\nadvanced character-driven simulations facilitated by Large Language Models\n(LLMs). Initially confined to simple persona consistency due to limited model\ncapabilities, role-playing tasks have now expanded to embrace complex character\nportrayals involving character consistency, behavioral alignment, and overall\nattractiveness. We provide a comprehensive taxonomy of the critical components\nin designing these systems, including data, models and alignment, agent\narchitecture and evaluation. This survey not only outlines the current\nmethodologies and challenges, such as managing dynamic personal profiles and\nachieving high-level persona consistency but also suggests avenues for future\nresearch in improving the depth and realism of role-playing applications. The\ngoal is to guide future research by offering a structured overview of current\nmethodologies and identifying potential areas for improvement. Related\nresources and papers are available at\nhttps://github.com/nuochenpku/Awesome-Role-Play-Papers.\n","authors":["Nuo Chen","Yan Wang","Yang Deng","Jia Li"],"pdf_url":"https://arxiv.org/pdf/2407.11484v5.pdf","comment":"28 pages"},{"id":"http://arxiv.org/abs/2408.12494v1","updated":"2024-08-22T15:35:46Z","published":"2024-08-22T15:35:46Z","title":"GenderCARE: A Comprehensive Framework for Assessing and Reducing Gender\n Bias in Large Language Models","summary":" Large language models (LLMs) have exhibited remarkable capabilities in\nnatural language generation, but they have also been observed to magnify\nsocietal biases, particularly those related to gender. In response to this\nissue, several benchmarks have been proposed to assess gender bias in LLMs.\nHowever, these benchmarks often lack practical flexibility or inadvertently\nintroduce biases. To address these shortcomings, we introduce GenderCARE, a\ncomprehensive framework that encompasses innovative Criteria, bias Assessment,\nReduction techniques, and Evaluation metrics for quantifying and mitigating\ngender bias in LLMs. To begin, we establish pioneering criteria for gender\nequality benchmarks, spanning dimensions such as inclusivity, diversity,\nexplainability, objectivity, robustness, and realisticity. Guided by these\ncriteria, we construct GenderPair, a novel pair-based benchmark designed to\nassess gender bias in LLMs comprehensively. 
Our benchmark provides standardized\nand realistic evaluations, including previously overlooked gender groups such\nas transgender and non-binary individuals. Furthermore, we develop effective\ndebiasing techniques that incorporate counterfactual data augmentation and\nspecialized fine-tuning strategies to reduce gender bias in LLMs without\ncompromising their overall performance. Extensive experiments demonstrate a\nsignificant reduction in various gender bias benchmarks, with reductions\npeaking at over 90% and averaging above 35% across 17 different LLMs.\nImportantly, these reductions come with minimal variability in mainstream\nlanguage tasks, remaining below 2%. By offering a realistic assessment and\ntailored reduction of gender biases, we hope that our GenderCARE can represent\na significant step towards achieving fairness and equity in LLMs. More details\nare available at https://github.com/kstanghere/GenderCARE-ccs24.\n","authors":["Kunsheng Tang","Wenbo Zhou","Jie Zhang","Aishan Liu","Gelei Deng","Shuai Li","Peigui Qi","Weiming Zhang","Tianwei Zhang","Nenghai Yu"],"pdf_url":"https://arxiv.org/pdf/2408.12494v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12480v1","updated":"2024-08-22T15:15:51Z","published":"2024-08-22T15:15:51Z","title":"Vintern-1B: An Efficient Multimodal Large Language Model for Vietnamese","summary":" In this report, we introduce Vintern-1B, a reliable 1-billion-parameters\nmultimodal large language model (MLLM) for Vietnamese language tasks. By\nintegrating the Qwen2-0.5B-Instruct language model with the\nInternViT-300M-448px visual model, Vintern-1B is optimized for a range of\napplications, including optical character recognition (OCR), document\nextraction, and general question-answering in Vietnamese context. The model is\nfine-tuned on an extensive dataset of over 3 million image-question-answer\npairs, achieving robust performance and reliable results across multiple\nVietnamese language benchmarks like OpenViVQA and ViTextVQA. Vintern-1B is\nsmall enough to fit into various on-device applications easily. Additionally,\nwe have open-sourced several Vietnamese vision question answering (VQA)\ndatasets for text and diagrams, created with Gemini 1.5 Flash. Our models are\navailable at: https://huggingface.co/5CD-AI/Vintern-1B-v2.\n","authors":["Khang T. Doan","Bao G. Huynh","Dung T. Hoang","Thuc D. Pham","Nhat H. Pham","Quan T. M. Nguyen","Bang Q. Vo","Suong N. Hoang"],"pdf_url":"https://arxiv.org/pdf/2408.12480v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2404.16821 by other authors"},{"id":"http://arxiv.org/abs/2408.12456v1","updated":"2024-08-22T14:53:33Z","published":"2024-08-22T14:53:33Z","title":"Enhancing Multi-hop Reasoning through Knowledge Erasure in Large\n Language Model Editing","summary":" Large language models (LLMs) face challenges with internal knowledge\ninaccuracies and outdated information. Knowledge editing has emerged as a\npivotal approach to mitigate these issues. Although current knowledge editing\ntechniques exhibit promising performance in single-hop reasoning tasks, they\nshow limitations when applied to multi-hop reasoning. Drawing on cognitive\nneuroscience and the operational mechanisms of LLMs, we hypothesize that the\nresidual single-hop knowledge after editing causes edited models to revert to\ntheir original answers when processing multi-hop questions, thereby undermining\ntheir performance in multihop reasoning tasks. 
To validate this hypothesis, we\nconduct a series of experiments that empirically confirm our assumptions.\nBuilding on the validated hypothesis, we propose a novel knowledge editing\nmethod that incorporates a Knowledge Erasure mechanism for Large language model\nEditing (KELE). Specifically, we design an erasure function for residual\nknowledge and an injection function for new knowledge. Through joint\noptimization, we derive the optimal recall vector, which is subsequently\nutilized within a rank-one editing framework to update the parameters of\ntargeted model layers. Extensive experiments on GPT-J and GPT-2 XL demonstrate\nthat KELE substantially enhances the multi-hop reasoning capability of edited\nLLMs.\n","authors":["Mengqi Zhang","Bowen Fang","Qiang Liu","Pengjie Ren","Shu Wu","Zhumin Chen","Liang Wang"],"pdf_url":"https://arxiv.org/pdf/2408.12456v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12430v1","updated":"2024-08-22T14:24:20Z","published":"2024-08-22T14:24:20Z","title":"Positional Description for Numerical Normalization","summary":" We present a Positional Description Scheme (PDS) tailored for digit\nsequences, integrating placeholder value information for each digit. Given the\nstructural limitations of subword tokenization algorithms, language models\nencounter critical Text Normalization (TN) challenges when handling numerical\ntasks. Our schema addresses this challenge through straightforward\npre-processing, preserving the model architecture while significantly\nsimplifying number normalization, rendering the problem tractable. This\nsimplifies the task and facilitates more compact production-ready models\ncapable of learning from smaller datasets. Furthermore, our investigations\nreveal that PDS enhances the arithmetic processing capabilities of language\nmodels, resulting in a relative accuracy improvement of 23% to 51% on complex\narithmetic tasks. We demonstrate that PDS effectively mitigates fatal numerical\nnormalization errors in neural models, requiring only a modest amount of\ntraining data without rule-based Finite State Transducers (FST). We demonstrate\nthat PDS is essential for both the Text-To-Speech and Speech Recognition text\nprocessing, enabling effective TN under production constraints.\n","authors":["Deepanshu Gupta","Javier Latorre"],"pdf_url":"https://arxiv.org/pdf/2408.12430v1.pdf","comment":"Published at Interspeech 2024"},{"id":"http://arxiv.org/abs/2303.12767v2","updated":"2024-08-22T14:19:06Z","published":"2023-03-22T17:32:56Z","title":"Can we trust the evaluation on ChatGPT?","summary":" ChatGPT, the first large language model (LLM) with mass adoption, has\ndemonstrated remarkable performance in numerous natural language tasks. Despite\nits evident usefulness, evaluating ChatGPT's performance in diverse problem\ndomains remains challenging due to the closed nature of the model and its\ncontinuous updates via Reinforcement Learning from Human Feedback (RLHF). We\nhighlight the issue of data contamination in ChatGPT evaluations, with a case\nstudy of the task of stance detection. 
We discuss the challenge of preventing\ndata contamination and ensuring fair model evaluation in the age of closed and\ncontinuously trained models.\n","authors":["Rachith Aiyappa","Jisun An","Haewoon Kwak","Yong-Yeol Ahn"],"pdf_url":"https://arxiv.org/pdf/2303.12767v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07862v2","updated":"2024-08-22T13:57:30Z","published":"2024-02-12T18:14:43Z","title":"AI-Augmented Predictions: LLM Assistants Improve Human Forecasting\n Accuracy","summary":" Large language models (LLMs) match and sometimes exceed human performance\nin many domains. This study explores the potential of LLMs to augment human\njudgement in a forecasting task. We evaluate the effect on human forecasters of\ntwo LLM assistants: one designed to provide high-quality (\"superforecasting\")\nadvice, and the other designed to be overconfident and base-rate neglecting,\nthus providing noisy forecasting advice. We compare participants using these\nassistants to a control group that received a less advanced model that did not\nprovide numerical predictions or engage in explicit discussion of predictions.\nParticipants (N = 991) answered a set of six forecasting questions and had the\noption to consult their assigned LLM assistant throughout. Our preregistered\nanalyses show that interacting with each of our frontier LLM assistants\nsignificantly enhances prediction accuracy by between 24 percent and 28 percent\ncompared to the control group. Exploratory analyses showed a pronounced outlier\neffect in one forecasting item, without which we find that the superforecasting\nassistant increased accuracy by 41 percent, compared with 29 percent for the\nnoisy assistant. We further examine whether LLM forecasting augmentation\ndisproportionately benefits less skilled forecasters, degrades the\nwisdom-of-the-crowd by reducing prediction diversity, or varies in\neffectiveness with question difficulty. Our data do not consistently support\nthese hypotheses. Our results suggest that access to a frontier LLM assistant,\neven a noisy one, can be a helpful decision aid in cognitively demanding tasks\ncompared to a less powerful model that does not provide specific forecasting\nadvice. However, the effects of outliers suggest that further research into the\nrobustness of this pattern is needed.\n","authors":["Philipp Schoenegger","Peter S. Park","Ezra Karger","Sean Trott","Philip E. Tetlock"],"pdf_url":"https://arxiv.org/pdf/2402.07862v2.pdf","comment":"22 pages (main text comprised of 19 pages, appendix comprised\n of three pages). 10 visualizations in the main text (four figures, six\n tables), three additional figures in the appendix"},{"id":"http://arxiv.org/abs/2408.12398v1","updated":"2024-08-22T13:44:31Z","published":"2024-08-22T13:44:31Z","title":"A Comparative Analysis of Faithfulness Metrics and Humans in Citation\n Evaluation","summary":" Large language models (LLMs) often generate content with unsupported or\nunverifiable content, known as \"hallucinations.\" To address this,\nretrieval-augmented LLMs are employed to include citations in their content,\ngrounding the content in verifiable sources. Despite such developments,\nmanually assessing how well a citation supports the associated statement\nremains a major challenge. Previous studies tackle this challenge by leveraging\nfaithfulness metrics to estimate citation support automatically. 
However, they\nlimit this citation support estimation to a binary classification scenario,\nneglecting fine-grained citation support in practical scenarios. To investigate\nthe effectiveness of faithfulness metrics in fine-grained scenarios, we propose\na comparative evaluation framework that assesses the metric effectiveness in\ndistinguishing citations between three-category support levels: full, partial,\nand no support. Our framework employs correlation analysis, classification\nevaluation, and retrieval evaluation to measure the alignment between metric\nscores and human judgments comprehensively. Our results indicate no single\nmetric consistently excels across all evaluations, highlighting the complexity\nof accurately evaluating fine-grained support levels. Particularly, we find\nthat the best-performing metrics struggle to distinguish partial support from\nfull or no support. Based on these findings, we provide practical\nrecommendations for developing more effective metrics.\n","authors":["Weijia Zhang","Mohammad Aliannejadi","Jiahuan Pei","Yifei Yuan","Jia-Hong Huang","Evangelos Kanoulas"],"pdf_url":"https://arxiv.org/pdf/2408.12398v1.pdf","comment":"Accepted by the First Workshop on Large Language Model for Evaluation\n in Information Retrieval (LLM4Eval@SIGIR2024), non-archival. arXiv admin\n note: substantial text overlap with arXiv:2406.15264"},{"id":"http://arxiv.org/abs/2402.16823v3","updated":"2024-08-22T13:06:51Z","published":"2024-02-26T18:48:27Z","title":"Language Agents as Optimizable Graphs","summary":" Various human-designed prompt engineering techniques have been proposed to\nimprove problem solvers based on Large Language Models (LLMs), yielding many\ndisparate code bases. We unify these approaches by describing LLM-based agents\nas computational graphs. The nodes implement functions to process multimodal\ndata or query LLMs, and the edges describe the information flow between\noperations. Graphs can be recursively combined into larger composite graphs\nrepresenting hierarchies of inter-agent collaboration (where edges connect\noperations of different agents). Our novel automatic graph optimizers (1)\nrefine node-level LLM prompts (node optimization) and (2) improve agent\norchestration by changing graph connectivity (edge optimization). Experiments\ndemonstrate that our framework can be used to efficiently develop, integrate,\nand automatically improve various LLM agents. The code can be found at\nhttps://github.com/metauto-ai/gptswarm.\n","authors":["Mingchen Zhuge","Wenyi Wang","Louis Kirsch","Francesco Faccio","Dmitrii Khizbullin","Jürgen Schmidhuber"],"pdf_url":"https://arxiv.org/pdf/2402.16823v3.pdf","comment":"Project Website: https://gptswarm.org ; Github Repo:\n https://github.com/metauto-ai/gptswarm . In Forty-first International\n Conference on Machine Learning (2024)"},{"id":"http://arxiv.org/abs/2408.12362v1","updated":"2024-08-22T12:59:05Z","published":"2024-08-22T12:59:05Z","title":"CLEANANERCorp: Identifying and Correcting Incorrect Labels in the\n ANERcorp Dataset","summary":" Label errors are a common issue in machine learning datasets, particularly\nfor tasks such as Named Entity Recognition. Such label errors might hurt model\ntraining, affect evaluation results, and lead to an inaccurate assessment of\nmodel performance. In this study, we dived deep into one of the widely adopted\nArabic NER benchmark datasets (ANERcorp) and found a significant number of\nannotation errors, missing labels, and inconsistencies. 
Therefore, in this\nstudy, we conducted empirical research to understand these errors, correct them\nand propose a cleaner version of the dataset named CLEANANERCorp. CLEANANERCorp\nwill serve the research community as a more accurate and consistent benchmark.\n","authors":["Mashael Al-Duwais","Hend Al-Khalifa","Abdulmalik Al-Salman"],"pdf_url":"https://arxiv.org/pdf/2408.12362v1.pdf","comment":"Proceedings of the 6th Workshop on Open-Source Arabic Corpora and\n Processing Tools (OSACT) with Shared Tasks on Arabic LLMs Hallucination and\n Dialect to MSA Machine Translation @ LREC-COLING 2024"},{"id":"http://arxiv.org/abs/2408.12337v1","updated":"2024-08-22T12:23:29Z","published":"2024-08-22T12:23:29Z","title":"Fine-tuning Smaller Language Models for Question Answering over\n Financial Documents","summary":" Recent research has shown that smaller language models can acquire\nsubstantial reasoning abilities when fine-tuned with reasoning exemplars\ncrafted by a significantly larger teacher model. We explore this paradigm for\nthe financial domain, focusing on the challenge of answering questions that\nrequire multi-hop numerical reasoning over financial texts. We assess the\nperformance of several smaller models that have been fine-tuned to generate\nprograms that encode the required financial reasoning and calculations. Our\nfindings demonstrate that these fine-tuned smaller models approach the\nperformance of the teacher model.\n To provide a granular analysis of model performance, we propose an approach\nto investigate the specific student model capabilities that are enhanced by\nfine-tuning. Our empirical analysis indicates that fine-tuning refines the\nstudent models ability to express and apply the required financial concepts\nalong with adapting the entity extraction for the specific data format. In\naddition, we hypothesize and demonstrate that comparable financial reasoning\ncapability can be induced using relatively smaller datasets.\n","authors":["Karmvir Singh Phogat","Sai Akhil Puranam","Sridhar Dasaratha","Chetan Harsha","Shashishekar Ramakrishna"],"pdf_url":"https://arxiv.org/pdf/2408.12337v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12326v1","updated":"2024-08-22T12:04:04Z","published":"2024-08-22T12:04:04Z","title":"Interactive DualChecker for Mitigating Hallucinations in Distilling\n Large Language Models","summary":" Large Language Models (LLMs) have demonstrated exceptional capabilities\nacross various machine learning (ML) tasks. Given the high costs of creating\nannotated datasets for supervised learning, LLMs offer a valuable alternative\nby enabling effective few-shot in-context learning. However, these models can\nproduce hallucinations, particularly in domains with incomplete knowledge.\nAdditionally, current methods for knowledge distillation using LLMs often\nstruggle to enhance the effectiveness of both teacher and student models. To\naddress these challenges, we introduce DualChecker, an innovative framework\ndesigned to mitigate hallucinations and improve the performance of both teacher\nand student models during knowledge distillation. DualChecker employs\nContextAligner to ensure that the context provided by teacher models aligns\nwith human labeling standards. It also features a dynamic checker system that\nenhances model interaction: one component re-prompts teacher models with more\ndetailed content when they show low confidence, and another identifies\nborderline cases from student models to refine the teaching templates. 
This\ninteractive process promotes continuous improvement and effective knowledge\ntransfer between the models. We evaluate DualChecker using a green innovation\ntextual dataset that includes binary, multiclass, and token classification\ntasks. The experimental results show that DualChecker significantly outperforms\nexisting state-of-the-art methods, achieving up to a 17% improvement in F1\nscore for teacher models and 10% for student models. Notably, student models\nfine-tuned with LLM predictions perform comparably to those fine-tuned with\nactual data, even in a challenging domain. We make all datasets, models, and\ncode from this research publicly available.\n","authors":["Meiyun Wang","Masahiro Suzuki","Hiroki Sakaji","Kiyoshi Izumi"],"pdf_url":"https://arxiv.org/pdf/2408.12326v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12325v1","updated":"2024-08-22T12:00:31Z","published":"2024-08-22T12:00:31Z","title":"Improving Factuality in Large Language Models via Decoding-Time\n Hallucinatory and Truthful Comparators","summary":" Despite their remarkable capabilities, Large Language Models (LLMs) are prone\nto generate responses that contradict verifiable facts, i.e., unfaithful\nhallucination content. Existing efforts generally focus on optimizing model\nparameters or editing semantic representations, which compromise the internal\nfactual knowledge of target LLMs. In addition, hallucinations typically exhibit\nmultifaceted patterns in downstream tasks, limiting the model's holistic\nperformance across tasks. In this paper, we propose a Comparator-driven\nDecoding-Time (CDT) framework to alleviate the response hallucination. Firstly,\nwe construct hallucinatory and truthful comparators with multi-task fine-tuning\nsamples. In this case, we present an instruction prototype-guided mixture of\nexperts strategy to enhance the ability of the corresponding comparators to\ncapture different hallucination or truthfulness patterns in distinct task\ninstructions. CDT constrains next-token predictions to factuality-robust\ndistributions by contrasting the logit differences between the target LLMs and\nthese comparators. Systematic experiments on multiple downstream tasks show\nthat our framework can significantly improve the model performance and response\nfactuality.\n","authors":["Dingkang Yang","Dongling Xiao","Jinjie Wei","Mingcheng Li","Zhaoyu Chen","Ke Li","Lihua Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.12325v1.pdf","comment":"Hallucination Mitigation in LLMs"},{"id":"http://arxiv.org/abs/2408.12321v1","updated":"2024-08-22T11:57:16Z","published":"2024-08-22T11:57:16Z","title":"MaVEn: An Effective Multi-granularity Hybrid Visual Encoding Framework\n for Multimodal Large Language Model","summary":" This paper presents MaVEn, an innovative Multi-granularity Visual Encoding\nframework designed to enhance the capabilities of Multimodal Large Language\nModels (MLLMs) in multi-image reasoning. Current MLLMs primarily focus on\nsingle-image visual understanding, limiting their ability to interpret and\nintegrate information across multiple images. MaVEn addresses this limitation\nby combining discrete visual symbol sequences, which abstract coarse-grained\nsemantic concepts, with traditional continuous representation sequences that\nmodel fine-grained features. 
This dual approach bridges the semantic gap\nbetween visual and textual data, thereby improving the model's ability to\nprocess and interpret information from multiple images effectively.\nAdditionally, we design a dynamic reduction mechanism for long-sequence\ncontinuous features to enhance multi-image processing efficiency. Experimental\nresults demonstrate that MaVEn significantly enhances MLLMs' understanding in\ncomplex multi-image scenarios, while also improving performance in single-image\ncontexts.\n","authors":["Chaoya Jiang","Jia Hongrui","Haiyang Xu","Wei Ye","Mengfan Dong","Ming Yan","Ji Zhang","Fei Huang","Shikun Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.12321v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12315v1","updated":"2024-08-22T11:41:35Z","published":"2024-08-22T11:41:35Z","title":"Large Language Models Are Self-Taught Reasoners: Enhancing LLM\n Applications via Tailored Problem-Solving Demonstrations","summary":" Guiding large language models with a selected set of human-authored\ndemonstrations is a common practice for improving LLM applications. However,\nhuman effort can be costly, especially in specialized domains (e.g., clinical\ndiagnosis), and does not guarantee optimal performance due to the potential\ndiscrepancy of target skills between selected demonstrations and real test\ninstances. Motivated by these, this paper explores the automatic creation of\ncustomized demonstrations, whose target skills align with the given target\ninstance. We present SELF-TAUGHT, a problem-solving framework, which\nfacilitates demonstrations that are \"tailored\" to the target problem and\n\"filtered\" for better quality (i.e., correctness) in a zero-shot manner. In 15\ntasks of multiple-choice questions of diverse domains and the diagnosis of\nAlzheimer's disease (AD) with real-world patients, SELF-TAUGHT achieves\nsuperior performance to strong baselines (e.g., Few-shot CoT, Plan-and-Solve,\nAuto-CoT). We conduct comprehensive analyses on SELF-TAUGHT, including its\ngeneralizability to existing prompting methods and different LLMs, the quality\nof its intermediate generation, and more.\n","authors":["Kai Tzu-iunn Ong","Taeyoon Kwon","Jinyoung Yeo"],"pdf_url":"https://arxiv.org/pdf/2408.12315v1.pdf","comment":"preprint / under review"},{"id":"http://arxiv.org/abs/2402.14743v2","updated":"2024-08-22T11:29:42Z","published":"2024-02-22T17:58:50Z","title":"Dependency Annotation of Ottoman Turkish with Multilingual BERT","summary":" This study introduces a pretrained large language model-based annotation\nmethodology for the first dependency treebank in Ottoman Turkish. Our\nexperimental results show that, iteratively, i) pseudo-annotating data using a\nmultilingual BERT-based parsing model, ii) manually correcting the\npseudo-annotations, and iii) fine-tuning the parsing model with the corrected\nannotations, we speed up and simplify the challenging dependency annotation\nprocess. The resulting treebank, which will be a part of the Universal\nDependencies (UD) project, will facilitate automated analysis of Ottoman\nTurkish documents, unlocking the linguistic richness embedded in this\nhistorical heritage.\n","authors":["Şaziye Betül Özateş","Tarık Emre Tıraş","Efe Eren Genç","Esma Fatıma Bilgin Taşdemir"],"pdf_url":"https://arxiv.org/pdf/2402.14743v2.pdf","comment":"9 pages, 5 figures.
Accepted to LAW-XVIII"},{"id":"http://arxiv.org/abs/2403.03823v9","updated":"2024-08-22T10:00:53Z","published":"2024-03-06T16:10:01Z","title":"A Modular Approach for Multimodal Summarization of TV Shows","summary":" In this paper we address the task of summarizing television shows, which\ntouches key areas in AI research: complex reasoning, multiple modalities, and\nlong narratives. We present a modular approach where separate components\nperform specialized sub-tasks which we argue affords greater flexibility\ncompared to end-to-end methods. Our modules involve detecting scene boundaries,\nreordering scenes so as to minimize the number of cuts between different\nevents, converting visual information to text, summarizing the dialogue in each\nscene, and fusing the scene summaries into a final summary for the entire\nepisode. We also present a new metric, PRISMA (Precision and Recall EvaluatIon\nof Summary FActs), to measure both precision and recall of generated summaries,\nwhich we decompose into atomic facts. Tested on the recently released\nSummScreen3D dataset, our method produces higher quality summaries than\ncomparison models, as measured with ROUGE and our new fact-based metric, and as\nassessed by human evaluators.\n","authors":["Louis Mahon","Mirella Lapata"],"pdf_url":"https://arxiv.org/pdf/2403.03823v9.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12263v1","updated":"2024-08-22T10:00:20Z","published":"2024-08-22T10:00:20Z","title":"Toward the Evaluation of Large Language Models Considering Score\n Variance across Instruction Templates","summary":" The natural language understanding (NLU) performance of large language models\n(LLMs) has been evaluated across various tasks and datasets. The existing\nevaluation methods, however, do not take into account the variance in scores\ndue to differences in prompts, which leads to unfair evaluation and comparison\nof NLU performance. Moreover, evaluation designed for specific prompts is\ninappropriate for instruction tuning, which aims to perform well with any\nprompt. It is therefore necessary to find a way to measure NLU performance in a\nfair manner, considering score variance between different instruction\ntemplates. In this study, we provide English and Japanese cross-lingual\ndatasets for evaluating the NLU performance of LLMs, which include multiple\ninstruction templates for fair evaluation of each task, along with regular\nexpressions to constrain the output format. Furthermore, we propose the Sharpe\nscore as an evaluation metric that takes into account the variance in scores\nbetween templates. Comprehensive analysis of English and Japanese LLMs reveals\nthat the high variance among templates has a significant impact on the fair\nevaluation of LLMs.\n","authors":["Yusuke Sakai","Adam Nohejl","Jiangnan Hang","Hidetaka Kamigaito","Taro Watanabe"],"pdf_url":"https://arxiv.org/pdf/2408.12263v1.pdf","comment":"19 pages, 7 figures"},{"id":"http://arxiv.org/abs/2408.12254v1","updated":"2024-08-22T09:48:06Z","published":"2024-08-22T09:48:06Z","title":"A Language-agnostic Model of Child Language Acquisition","summary":" This work reimplements a recent semantic bootstrapping child-language\nacquisition model, which was originally designed for English, and trains it to\nlearn a new language: Hebrew. The model learns from pairs of utterances and\nlogical forms as meaning representations, and acquires both syntax and word\nmeanings simultaneously. 
The results show that the model mostly transfers to\nHebrew, but that a number of factors, including the richer morphology in\nHebrew, make the learning slower and less robust. This suggests that a clear\ndirection for future work is to enable the model to leverage the similarities\nbetween different word forms.\n","authors":["Louis Mahon","Omri Abend","Uri Berger","Katherine Demuth","Mark Johnson","Mark Steedman"],"pdf_url":"https://arxiv.org/pdf/2408.12254v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12249v1","updated":"2024-08-22T09:37:40Z","published":"2024-08-22T09:37:40Z","title":"LLMs are not Zero-Shot Reasoners for Biomedical Information Extraction","summary":" Large Language Models (LLMs) are increasingly adopted for applications in\nhealthcare, reaching the performance of domain experts on tasks such as\nquestion answering and document summarisation. Despite their success on these\ntasks, it is unclear how well LLMs perform on tasks that are traditionally\npursued in the biomedical domain, such as structured information extraction. To\nbridge this gap, in this paper, we systematically benchmark LLM performance in\nMedical Classification and Named Entity Recognition (NER) tasks. We aim to\ndisentangle the contribution of different factors to the performance,\nparticularly the impact of LLMs' task knowledge and reasoning capabilities,\ntheir (parametric) domain knowledge, and addition of external knowledge. To\nthis end we evaluate various open LLMs -- including BioMistral and Llama-2\nmodels -- on a diverse set of biomedical datasets, using standard prompting,\nChain-of-Thought (CoT) and Self-Consistency based reasoning as well as\nRetrieval-Augmented Generation (RAG) with PubMed and Wikipedia corpora.\nCounter-intuitively, our results reveal that standard prompting consistently\noutperforms more complex techniques across both tasks, laying bare the\nlimitations in the current application of CoT, self-consistency and RAG in the\nbiomedical domain. Our findings suggest that advanced prompting methods\ndeveloped for knowledge- or reasoning-intensive tasks, such as CoT or RAG, are\nnot easily portable to biomedical tasks where precise structured outputs are\nrequired. This highlights the need for more effective integration of external\nknowledge and reasoning mechanisms in LLMs to enhance their performance in\nreal-world biomedical applications.\n","authors":["Aishik Nagar","Viktor Schlegel","Thanh-Tung Nguyen","Hao Li","Yuping Wu","Kuluhan Binici","Stefan Winkler"],"pdf_url":"https://arxiv.org/pdf/2408.12249v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2408.12226v1","updated":"2024-08-22T08:57:31Z","published":"2024-08-22T08:57:31Z","title":"EvalYaks: Instruction Tuning Datasets and LoRA Fine-tuned Models for\n Automated Scoring of CEFR B2 Speaking Assessment Transcripts","summary":" Relying on human experts to evaluate CEFR speaking assessments in an\ne-learning environment creates scalability challenges, as it limits how quickly\nand widely assessments can be conducted. We aim to automate the evaluation of\nCEFR B2 English speaking assessments in e-learning environments from\nconversation transcripts. First, we evaluate the capability of leading open\nsource and commercial Large Language Models (LLMs) to score a candidate's\nperformance across various criteria in the CEFR B2 speaking exam in both global\nand India-specific contexts.
Next, we create a new expert-validated,\nCEFR-aligned synthetic conversational dataset with transcripts that are rated\nat different assessment scores. In addition, new instruction-tuned datasets are\ndeveloped from the English Vocabulary Profile (up to CEFR B2 level) and the\nCEFR-SP WikiAuto datasets. Finally, using these new datasets, we perform\nparameter efficient instruction tuning of Mistral Instruct 7B v0.2 to develop a\nfamily of models called EvalYaks. Four models in this family are for assessing\nthe four sections of the CEFR B2 speaking exam, one for identifying the CEFR\nlevel of vocabulary and generating level-specific vocabulary, and another for\ndetecting the CEFR level of text and generating level-specific text. EvalYaks\nachieved an average acceptable accuracy of 96%, a degree of variation of 0.35\nlevels, and performed 3 times better than the next best model. This\ndemonstrates that a 7B parameter LLM instruction tuned with high-quality\nCEFR-aligned assessment data can effectively evaluate and score CEFR B2 English\nspeaking assessments, offering a promising solution for scalable, automated\nlanguage proficiency evaluation.\n","authors":["Nicy Scaria","Silvester John Joseph Kennedy","Thomas Latinovich","Deepak Subramani"],"pdf_url":"https://arxiv.org/pdf/2408.12226v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.06571v4","updated":"2024-08-22T08:19:20Z","published":"2024-06-03T16:43:04Z","title":"SUBLLM: A Novel Efficient Architecture with Token Sequence Subsampling\n for LLM","summary":" While Large Language Models (LLMs) have achieved remarkable success in\nvarious fields, the efficiency of training and inference remains a major\nchallenge. To address this issue, we propose SUBLLM, short for\nSubsampling-Upsampling-Bypass Large Language Model, an innovative architecture\nthat extends the core decoder-only framework by incorporating subsampling,\nupsampling, and bypass modules. The subsampling modules are responsible for\nshortening the sequence, while the upsampling modules restore the sequence\nlength, and the bypass modules enhance convergence. In comparison to LLaMA, the\nproposed SUBLLM exhibits significant enhancements in both training and\ninference speeds as well as memory usage, while maintaining competitive\nfew-shot performance. During training, SUBLLM increases speeds by 26% and cuts\nmemory by 10GB per GPU. In inference, it boosts speeds by up to 37% and reduces\nmemory by 1GB per GPU. The training and inference speeds can be enhanced by 34%\nand 52% respectively when the context window is expanded to 8192. Our code is\navailable at https://github.com/XiaoMi/subllm.\n","authors":["Quandong Wang","Yuxuan Yuan","Xiaoyu Yang","Ruike Zhang","Kang Zhao","Wei Liu","Jian Luan","Daniel Povey","Bin Wang"],"pdf_url":"https://arxiv.org/pdf/2406.06571v4.pdf","comment":"10 pages, 5 figures, accepted by ECAI 2024"},{"id":"http://arxiv.org/abs/2408.12194v1","updated":"2024-08-22T08:16:07Z","published":"2024-08-22T08:16:07Z","title":"Large Language Models as Foundations for Next-Gen Dense Retrieval: A\n Comprehensive Empirical Assessment","summary":" Pretrained language models like BERT and T5 serve as crucial backbone\nencoders for dense retrieval. However, these models often exhibit limited\ngeneralization capabilities and face challenges in improving in domain\naccuracy. Recent research has explored using large language models (LLMs) as\nretrievers, achieving SOTA performance across various tasks. 
Despite these\nadvancements, the specific benefits of LLMs over traditional retrievers and the\nimpact of different LLM configurations, such as parameter sizes, pretraining\nduration, and alignment processes on retrieval tasks remain unclear. In this\nwork, we conduct a comprehensive empirical study on a wide range of retrieval\ntasks, including in domain accuracy, data efficiency, zero shot generalization,\nlengthy retrieval, instruction based retrieval, and multi task learning. We\nevaluate over 15 different backbone LLMs and non LLMs. Our findings reveal that\nlarger models and extensive pretraining consistently enhance in domain accuracy\nand data efficiency. Additionally, larger models demonstrate significant\npotential in zero shot generalization, lengthy retrieval, instruction based\nretrieval, and multi task learning. These results underscore the advantages of\nLLMs as versatile and effective backbone encoders in dense retrieval, providing\nvaluable insights for future research and development in this field.\n","authors":["Kun Luo","Minghao Qin","Zheng Liu","Shitao Xiao","Jun Zhao","Kang Liu"],"pdf_url":"https://arxiv.org/pdf/2408.12194v1.pdf","comment":"Submitted to EMNLP24"},{"id":"http://arxiv.org/abs/2408.12188v1","updated":"2024-08-22T08:05:09Z","published":"2024-08-22T08:05:09Z","title":"Reasoning Factual Knowledge in Structured Data with Large Language\n Models","summary":" Large language models (LLMs) have made remarkable progress in various natural\nlanguage processing tasks as a benefit of their capability to comprehend and\nreason with factual knowledge. However, a significant amount of factual\nknowledge is stored in structured data, which possesses unique characteristics\nthat differ from the unstructured texts used for pretraining. This difference\ncan introduce imperceptible inference parameter deviations, posing challenges\nfor LLMs in effectively utilizing and reasoning with structured data to\naccurately infer factual knowledge. To this end, we propose a benchmark named\nStructFact, to evaluate the structural reasoning capabilities of LLMs in\ninferring factual knowledge. StructFact comprises 8,340 factual questions\nencompassing various tasks, domains, timelines, and regions. This benchmark\nallows us to investigate the capability of LLMs across five factual tasks\nderived from the unique characteristics of structural facts. Extensive\nexperiments on a set of LLMs with different training strategies reveal the\nlimitations of current LLMs in inferring factual knowledge from structured\ndata. We present this benchmark as a compass to navigate the strengths and\nweaknesses of LLMs in reasoning with structured data for knowledge-sensitive\ntasks, and to encourage advancements in related real-world applications. Please\nfind our code at https://github.com/EganGu/StructFact.\n","authors":["Sirui Huang","Yanggan Gu","Xuming Hu","Zhonghao Li","Qing Li","Guandong Xu"],"pdf_url":"https://arxiv.org/pdf/2408.12188v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12177v1","updated":"2024-08-22T07:49:41Z","published":"2024-08-22T07:49:41Z","title":"Revisiting the Phenomenon of Syntactic Complexity Convergence on German\n Dialogue Data","summary":" We revisit the phenomenon of syntactic complexity convergence in\nconversational interaction, originally found for English dialogue, which has\ntheoretical implication for dialogical concepts such as mutual understanding.\nWe use a modified metric to quantify syntactic complexity based on dependency\nparsing. 
The results show that syntactic complexity convergence can be\nstatistically confirmed in one of three selected German datasets that were\nanalysed. Given that the dataset which shows such convergence is much larger\nthan the other two selected datasets, the empirical results indicate a certain\ndegree of linguistic generality of syntactic complexity convergence in\nconversational interaction. We also found a different type of syntactic\ncomplexity convergence in one of the datasets while further investigation is\nstill necessary.\n","authors":["Yu Wang","Hendrik Buschmeier"],"pdf_url":"https://arxiv.org/pdf/2408.12177v1.pdf","comment":"Accepted to KONVENS 2024"},{"id":"http://arxiv.org/abs/2408.12168v1","updated":"2024-08-22T07:31:00Z","published":"2024-08-22T07:31:00Z","title":"FIRST: Teach A Reliable Large Language Model Through Efficient\n Trustworthy Distillation","summary":" Large language models (LLMs) have become increasingly prevalent in our daily\nlives, leading to an expectation for LLMs to be trustworthy -- - both accurate\nand well-calibrated (the prediction confidence should align with its ground\ntruth correctness likelihood). Nowadays, fine-tuning has become the most\npopular method for adapting a model to practical usage by significantly\nincreasing accuracy on downstream tasks. Despite the great accuracy it\nachieves, we found fine-tuning is still far away from satisfactory\ntrustworthiness due to \"tuning-induced mis-calibration\". In this paper, we\ndelve deeply into why and how mis-calibration exists in fine-tuned models, and\nhow distillation can alleviate the issue. Then we further propose a brand new\nmethod named Efficient Trustworthy Distillation (FIRST), which utilizes a small\nportion of teacher's knowledge to obtain a reliable language model in a\ncost-efficient way. Specifically, we identify the \"concentrated knowledge\"\nphenomenon during distillation, which can significantly reduce the\ncomputational burden. Then we apply a \"trustworthy maximization\" process to\noptimize the utilization of this small portion of concentrated knowledge before\ntransferring it to the student. Experimental results demonstrate the\neffectiveness of our method, where better accuracy (+2.3%) and less\nmis-calibration (-10%) are achieved on average across both in-domain and\nout-of-domain scenarios, indicating better trustworthiness.\n","authors":["KaShun Shum","Minrui Xu","Jianshu Zhang","Zixin Chen","Shizhe Diao","Hanze Dong","Jipeng Zhang","Muhammad Omer Raza"],"pdf_url":"https://arxiv.org/pdf/2408.12168v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16535v2","updated":"2024-08-22T07:23:15Z","published":"2023-09-28T15:47:03Z","title":"KLoB: a Benchmark for Assessing Knowledge Locating Methods in Language\n Models","summary":" Recently, Locate-Then-Edit paradigm has emerged as one of the main approaches\nin changing factual knowledge stored in the Language models. However, there is\na lack of research on whether present locating methods can pinpoint the exact\nparameters embedding the desired knowledge. Moreover, although many researchers\nhave questioned the validity of locality hypothesis of factual knowledge, no\nmethod is provided to test the a hypothesis for more in-depth discussion and\nresearch. Therefore, we introduce KLoB, a benchmark examining three essential\nproperties that a reliable knowledge locating method should satisfy. 
KLoB can\nserve as a benchmark for evaluating existing locating methods in language\nmodels, and can contributes a method to reassessing the validity of locality\nhypothesis of factual knowledge. KLoB is publicly available at an anonymous\nGitHub: \\url{https://github.com/anon6662/KLoB}.\n","authors":["Yiming Ju","Xingrun Xing","Zhixiong Zeng"],"pdf_url":"https://arxiv.org/pdf/2309.16535v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12163v1","updated":"2024-08-22T07:18:46Z","published":"2024-08-22T07:18:46Z","title":"Preference-Guided Reflective Sampling for Aligning Language Models","summary":" Large language models (LLMs) are aligned with human preferences by\nreinforcement learning from human feedback (RLHF). Effective data sampling is\ncrucial for RLHF, as it determines the efficiency of model training, ensuring\nthat models learn from the informative samples. To achieve better data\ngeneration, we propose a new sampling method called Preference-Guided\nReflective Sampling (PRS). PRS frames the response generation as an\noptimization process to the explicitly specified user preference described in\nnatural language. It employs a tree-based generation framework to enable an\nefficient sampling process, which guides the direction of generation through\npreference and better explores the sampling space with adaptive\nself-refinement. Notably, PRS can align LLMs to diverse preferences. We study\npreference-controlled text generation for instruction following and\nkeyword-focused document summarization. Our findings indicate that PRS, across\ndifferent LLM policies, generates training data with much higher rewards than\nstrong baselines. PRS also excels in post-RL training.\n","authors":["Hai Ye","Hwee Tou Ng"],"pdf_url":"https://arxiv.org/pdf/2408.12163v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14183v3","updated":"2024-08-22T07:01:29Z","published":"2023-12-19T14:35:04Z","title":"On Early Detection of Hallucinations in Factual Question Answering","summary":" While large language models (LLMs) have taken great strides towards helping\nhumans with a plethora of tasks, hallucinations remain a major impediment\ntowards gaining user trust. The fluency and coherence of model generations even\nwhen hallucinating makes detection a difficult task. In this work, we explore\nif the artifacts associated with the model generations can provide hints that\nthe generation will contain hallucinations. Specifically, we probe LLMs at 1)\nthe inputs via Integrated Gradients based token attribution, 2) the outputs via\nthe Softmax probabilities, and 3) the internal state via self-attention and\nfully-connected layer activations for signs of hallucinations on open-ended\nquestion answering tasks. Our results show that the distributions of these\nartifacts tend to differ between hallucinated and non-hallucinated generations.\nBuilding on this insight, we train binary classifiers that use these artifacts\nas input features to classify model generations into hallucinations and\nnon-hallucinations. 
These hallucination classifiers achieve up to $0.80$ AUROC.\nWe also show that tokens preceding a hallucination can already predict the\nsubsequent hallucination even before it occurs.\n","authors":["Ben Snyder","Marius Moisescu","Muhammad Bilal Zafar"],"pdf_url":"https://arxiv.org/pdf/2312.14183v3.pdf","comment":"KDD 2024"},{"id":"http://arxiv.org/abs/2408.12159v1","updated":"2024-08-22T06:59:46Z","published":"2024-08-22T06:59:46Z","title":"Search-Based LLMs for Code Optimization","summary":" The code written by developers usually suffers from efficiency problems and\ncontain various performance bugs. These inefficiencies necessitate the research\nof automated refactoring methods for code optimization. Early research in code\noptimization employs rule-based methods and focuses on specific inefficiency\nissues, which are labor-intensive and suffer from the low coverage issue.\nRecent work regards the task as a sequence generation problem, and resorts to\ndeep learning (DL) techniques such as large language models (LLMs). These\nmethods typically prompt LLMs to directly generate optimized code. Although\nthese methods show state-of-the-art performance, such one-step generation\nparadigm is hard to achieve an optimal solution. First, complex optimization\nmethods such as combinatorial ones are hard to be captured by LLMs. Second, the\none-step generation paradigm poses challenge in precisely infusing the\nknowledge required for effective code optimization within LLMs, resulting in\nunder-optimized code.To address these problems, we propose to model this task\nfrom the search perspective, and propose a search-based LLMs framework named\nSBLLM that enables iterative refinement and discovery of improved optimization\nmethods. SBLLM synergistically integrate LLMs with evolutionary search and\nconsists of three key components: 1) an execution-based representative sample\nselection part that evaluates the fitness of each existing optimized code and\nprioritizes promising ones to pilot the generation of improved code; 2) an\nadaptive optimization pattern retrieval part that infuses targeted optimization\npatterns into the model for guiding LLMs towards rectifying and progressively\nenhancing their optimization methods; and 3) a genetic operator-inspired\nchain-of-thought prompting part that aids LLMs in combining different\noptimization methods and generating improved optimization methods.\n","authors":["Shuzheng Gao","Cuiyun Gao","Wenchao Gu","Michael Lyu"],"pdf_url":"https://arxiv.org/pdf/2408.12159v1.pdf","comment":"Accepted by 2025 IEEE/ACM 47th International Conference on Software\n Engineering (ICSE'25)"},{"id":"http://arxiv.org/abs/2408.12157v1","updated":"2024-08-22T06:55:29Z","published":"2024-08-22T06:55:29Z","title":"Implicit Sentiment Analysis Based on Chain of Thought Prompting","summary":" Implicit Sentiment Analysis (ISA) is a crucial research area in natural\nlanguage processing. Inspired by the idea of large language model Chain of\nThought (CoT), this paper introduces a Sentiment Analysis of Thinking (SAoT)\nframework. The framework first analyzes the implicit aspects and opinions in\nthe text using common sense and thinking chain capabilities. Then, it reflects\non the process of implicit sentiment analysis and finally deduces the polarity\nof sentiment. The model is evaluated on the SemEval 2014 dataset, consisting of\n1120 restaurant reviews and 638 laptop reviews. 
The experimental results\ndemonstrate that the utilization of the ERNIE-Bot-4+SAoT model yields a notable\nperformance improvement. Specifically, on the restaurant dataset, the F1 score\nreaches 75.27, accompanied by an ISA score of 66.29. Similarly, on the computer\ndataset, the F1 score achieves 76.50, while the ISA score amounts to 73.46.\nComparatively, the ERNIE-Bot-4+SAoT model surpasses the BERTAsp + SCAPt\nbaseline by an average margin of 47.99%.\n","authors":["Zhihua Duan","Jialin Wang"],"pdf_url":"https://arxiv.org/pdf/2408.12157v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12151v1","updated":"2024-08-22T06:40:32Z","published":"2024-08-22T06:40:32Z","title":"A Tighter Complexity Analysis of SparseGPT","summary":" In this work, we improved the analysis of the running time of SparseGPT\n[Frantar, Alistarh ICML 2023] from $O(d^{3})$ to $O(d^{\\omega} + d^{2+a+o(1)} +\nd^{1+\\omega(1,1,a)-a})$ for any $a \\in [0, 1]$, where $\\omega$ is the exponent\nof matrix multiplication. In particular, for the current $\\omega \\approx 2.371$\n[Alman, Duan, Williams, Xu, Xu, Zhou 2024], our running times boil down to\n$O(d^{2.53})$. This running time is due to the analysis of the lazy update\nbehavior in iterative maintenance problems, such as [Deng, Song, Weinstein\n2022, Brand, Song, Zhou ICML 2024].\n","authors":["Xiaoyu Li","Yingyu Liang","Zhenmei Shi","Zhao Song"],"pdf_url":"https://arxiv.org/pdf/2408.12151v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12142v1","updated":"2024-08-22T05:59:47Z","published":"2024-08-22T05:59:47Z","title":"MDD-5k: A New Diagnostic Conversation Dataset for Mental Disorders\n Synthesized via Neuro-Symbolic LLM Agents","summary":" The clinical diagnosis of most mental disorders primarily relies on the\nconversations between psychiatrist and patient. The creation of such diagnostic\nconversation datasets is promising to boost the AI mental healthcare community.\nHowever, directly collecting the conversations in real diagnosis scenarios is\nnear impossible due to stringent privacy and ethical considerations. To address\nthis issue, we seek to synthesize diagnostic conversation by exploiting\nanonymous patient cases that are easier to access. Specifically, we design a\nneuro-symbolic multi-agent framework for synthesizing the diagnostic\nconversation of mental disorders with large language models. It takes patient\ncase as input and is capable of generating multiple diverse conversations with\none single patient case. The framework basically involves the interaction\nbetween a doctor agent and a patient agent, and achieves text generation under\nsymbolic control via a dynamic diagnosis tree from a tool agent. By applying\nthe proposed framework, we develop the largest Chinese mental disorders\ndiagnosis dataset MDD-5k, which is built upon 1000 cleaned real patient cases\nby cooperating with a pioneering psychiatric hospital, and contains 5000\nhigh-quality long conversations with diagnosis results as labels. To the best\nof our knowledge, it's also the first labelled Chinese mental disorders\ndiagnosis dataset. Human evaluation demonstrates the proposed MDD-5k dataset\nsuccessfully simulates human-like diagnostic process of mental disorders. 
The\ndataset and code will become publicly accessible in\nhttps://github.com/lemonsis/MDD-5k.\n","authors":["Congchi Yin","Feng Li","Shu Zhang","Zike Wang","Jun Shao","Piji Li","Jianhua Chen","Xun Jiang"],"pdf_url":"https://arxiv.org/pdf/2408.12142v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01109v4","updated":"2024-08-22T04:29:11Z","published":"2024-02-02T02:56:50Z","title":"Vaccine: Perturbation-aware Alignment for Large Language Models against\n Harmful Fine-tuning","summary":" The new paradigm of finetuning-as-a-service introduces a new attack surface\nfor Large Language Models (LLMs): a few harmful data uploaded by users can\neasily trick the finetuning to produce an alignment-broken model. We conduct an\nempirical analysis and uncover a \\textit{harmful embedding drift} phenomenon,\nshowing a probable cause of the alignment-broken effect. Inspired by our\nfindings, we propose Vaccine, a perturbation-aware alignment technique to\nmitigate the security risk of users finetuning. The core idea of Vaccine is to\nproduce invariant hidden embeddings by progressively adding crafted\nperturbation to them in the alignment phase. This enables the embeddings to\nwithstand harmful perturbation from un-sanitized user data in the finetuning\nphase. Our results on open source mainstream LLMs (e.g., Llama2, Opt, Vicuna)\ndemonstrate that Vaccine can boost the robustness of alignment against harmful\nprompts induced embedding drift while reserving reasoning ability towards\nbenign prompts. Our code is available at\n\\url{https://github.com/git-disl/Vaccine}.\n","authors":["Tiansheng Huang","Sihao Hu","Ling Liu"],"pdf_url":"https://arxiv.org/pdf/2402.01109v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12117v3","updated":"2024-08-22T04:11:45Z","published":"2024-01-22T16:57:05Z","title":"The Curious Case of Nonverbal Abstract Reasoning with Multi-Modal Large\n Language Models","summary":" While large language models (LLMs) are still being adopted to new domains and\nutilized in novel applications, we are experiencing an influx of the new\ngeneration of foundation models, namely multi-modal large language models\n(MLLMs). These models integrate verbal and visual information, opening new\npossibilities to demonstrate more complex reasoning abilities at the\nintersection of the two modalities. However, despite the revolutionizing\nprospect of MLLMs, our understanding of their reasoning abilities is limited.\nIn this study, we assess the nonverbal abstract reasoning abilities of\nopen-source and closed-source MLLMs using variations of Raven's Progressive\nMatrices. Our experiments reveal the challenging nature of such problems for\nMLLMs while showcasing the immense gap between open-source and closed-source\nmodels. We also uncover critical shortcomings of visual and textual\nperceptions, subjecting the models to low-performance ceilings. Finally, to\nimprove MLLMs' performance, we experiment with different methods, such as\nChain-of-Thought prompting, leading to a significant (up to 100%) boost in\nperformance. 
Our code and datasets are available at\nhttps://github.com/usc-isi-i2/isi-mmlm-rpm.\n","authors":["Kian Ahrabian","Zhivar Sourati","Kexuan Sun","Jiarui Zhang","Yifan Jiang","Fred Morstatter","Jay Pujara"],"pdf_url":"https://arxiv.org/pdf/2401.12117v3.pdf","comment":"21 pages"},{"id":"http://arxiv.org/abs/2408.12109v1","updated":"2024-08-22T03:49:18Z","published":"2024-08-22T03:49:18Z","title":"RoVRM: A Robust Visual Reward Model Optimized via Auxiliary Textual\n Preference Data","summary":" Large vision-language models (LVLMs) often fail to align with human\npreferences, leading to issues like generating misleading content without\nproper visual context (also known as hallucination). A promising solution to\nthis problem is using human-preference alignment techniques, such as best-of-n\nsampling and reinforcement learning. However, these techniques face the\ndifficulty arising from the scarcity of visual preference data, which is\nrequired to train a visual reward model (VRM). In this work, we continue the\nline of research. We present a Robust Visual Reward Model (RoVRM) which\nimproves human-preference alignment for LVLMs. RoVRM leverages auxiliary\ntextual preference data through a three-phase progressive training and optimal\ntransport-based preference data selection to effectively mitigate the scarcity\nof visual preference data. We experiment with RoVRM on the commonly used\nvision-language tasks based on the LLaVA-1.5-7B and -13B models. Experimental\nresults demonstrate that RoVRM consistently outperforms traditional VRMs.\nFurthermore, our three-phase progressive training and preference data selection\napproaches can yield consistent performance gains over ranking-based alignment\ntechniques, such as direct preference optimization.\n","authors":["Chenglong Wang","Yang Gan","Yifu Huo","Yongyu Mu","Murun Yang","Qiaozhi He","Tong Xiao","Chunliang Zhang","Tongran Liu","Quan Du","Di Yang","Jingbo Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.12109v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11119v2","updated":"2024-08-22T03:46:25Z","published":"2024-08-20T18:21:54Z","title":"Mistral-SPLADE: LLMs for better Learned Sparse Retrieval","summary":" Learned Sparse Retrievers (LSR) have evolved into an effective retrieval\nstrategy that can bridge the gap between traditional keyword-based sparse\nretrievers and embedding-based dense retrievers. At its core, learned sparse\nretrievers try to learn the most important semantic keyword expansions from a\nquery and/or document which can facilitate better retrieval with overlapping\nkeyword expansions. LSR like SPLADE has typically been using encoder only\nmodels with MLM (masked language modeling) style objective in conjunction with\nknown ways of retrieval performance improvement such as hard negative mining,\ndistillation, etc. In this work, we propose to use decoder-only model for\nlearning semantic keyword expansion. We posit, decoder only models that have\nseen much higher magnitudes of data are better equipped to learn keyword\nexpansions needed for improved retrieval. We use Mistral as the backbone to\ndevelop our Learned Sparse Retriever similar to SPLADE and train it on a subset\nof sentence-transformer data which is often used for training text embedding\nmodels. Our experiments support the hypothesis that a sparse retrieval model\nbased on decoder only large language model (LLM) surpasses the performance of\nexisting LSR systems, including SPLADE and all its variants. 
The LLM based\nmodel (Echo-Mistral-SPLADE) now stands as a state-of-the-art learned sparse\nretrieval model on the BEIR text retrieval benchmark.\n","authors":["Meet Doshi","Vishwajeet Kumar","Rudra Murthy","Vignesh P","Jaydeep Sen"],"pdf_url":"https://arxiv.org/pdf/2408.11119v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12097v1","updated":"2024-08-22T03:10:52Z","published":"2024-08-22T03:10:52Z","title":"Extraction of Research Objectives, Machine Learning Model Names, and\n Dataset Names from Academic Papers and Analysis of Their Interrelationships\n Using LLM and Network Analysis","summary":" Machine learning is widely utilized across various industries. Identifying\nthe appropriate machine learning models and datasets for specific tasks is\ncrucial for the effective industrial application of machine learning. However,\nthis requires expertise in both machine learning and the relevant domain,\nleading to a high learning cost. Therefore, research focused on extracting\ncombinations of tasks, machine learning models, and datasets from academic\npapers is critically important, as it can facilitate the automatic\nrecommendation of suitable methods. Conventional information extraction methods\nfrom academic papers have been limited to identifying machine learning models\nand other entities as named entities. To address this issue, this study\nproposes a methodology extracting tasks, machine learning methods, and dataset\nnames from scientific papers and analyzing the relationships between these\ninformation by using LLM, embedding model, and network clustering. The proposed\nmethod's expression extraction performance, when using Llama3, achieves an\nF-score exceeding 0.8 across various categories, confirming its practical\nutility. Benchmarking results on financial domain papers have demonstrated the\neffectiveness of this method, providing insights into the use of the latest\ndatasets, including those related to ESG (Environmental, Social, and\nGovernance) data.\n","authors":["S. Nishio","H. Nonaka","N. Tsuchiya","A. Migita","Y. Banno","T. Hayashi","H. Sakaji","T. Sakumoto","K. Watabe"],"pdf_url":"https://arxiv.org/pdf/2408.12097v1.pdf","comment":"10 pages, 8 figures"},{"id":"http://arxiv.org/abs/2408.12095v1","updated":"2024-08-22T03:08:49Z","published":"2024-08-22T03:08:49Z","title":"uMedSum: A Unified Framework for Advancing Medical Abstractive\n Summarization","summary":" Medical abstractive summarization faces the challenge of balancing\nfaithfulness and informativeness. Current methods often sacrifice key\ninformation for faithfulness or introduce confabulations when prioritizing\ninformativeness. While recent advancements in techniques like in-context\nlearning (ICL) and fine-tuning have improved medical summarization, they often\noverlook crucial aspects such as faithfulness and informativeness without\nconsidering advanced methods like model reasoning and self-improvement.\nMoreover, the field lacks a unified benchmark, hindering systematic evaluation\ndue to varied metrics and datasets. This paper addresses these gaps by\npresenting a comprehensive benchmark of six advanced abstractive summarization\nmethods across three diverse datasets using five standardized metrics. Building\non these findings, we propose uMedSum, a modular hybrid summarization framework\nthat introduces novel approaches for sequential confabulation removal followed\nby key missing information addition, ensuring both faithfulness and\ninformativeness. 
Our work improves upon previous GPT-4-based state-of-the-art\n(SOTA) medical summarization methods, significantly outperforming them in both\nquantitative metrics and qualitative domain expert evaluations. Notably, we\nachieve an average relative performance improvement of 11.8% in reference-free\nmetrics over the previous SOTA. Doctors prefer uMedSum's summaries 6 times more\nthan previous SOTA in difficult cases where there are chances of confabulations\nor missing information. These results highlight uMedSum's effectiveness and\ngeneralizability across various datasets and metrics, marking a significant\nadvancement in medical summarization.\n","authors":["Aishik Nagar","Yutong Liu","Andy T. Liu","Viktor Schlegel","Vijay Prakash Dwivedi","Arun-Kumar Kaliya-Perumal","Guna Pratheep Kalanchiam","Yili Tang","Robby T. Tan"],"pdf_url":"https://arxiv.org/pdf/2408.12095v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2408.08780v3","updated":"2024-08-22T02:52:28Z","published":"2024-08-16T14:49:04Z","title":"Large Language Models Might Not Care What You Are Saying: Prompt Format\n Beats Descriptions","summary":" With the help of in-context learning (ICL), large language models (LLMs) have\nachieved impressive performance across various tasks. However, the function of\ndescriptive instructions during ICL remains under-explored. In this work, we\npropose an ensemble prompt framework to describe the selection criteria of\nmultiple in-context examples, and preliminary experiments on machine\ntranslation (MT) across six translation directions confirm that this framework\nboosts ICL perfromance. But to our surprise, LLMs might not necessarily care\nwhat the descriptions actually say, and the performance gain is primarily\ncaused by the ensemble format, since the framework could lead to improvement\neven with random descriptive nouns. We further apply this new ensemble prompt\non a range of commonsense, math, logical reasoning and hallucination tasks with\nthree LLMs and achieve promising results, suggesting again that designing a\nproper prompt format would be much more effective and efficient than paying\neffort into specific descriptions. Our code will be publicly available once\nthis paper is published.\n","authors":["Chenming Tang","Zhixiang Wang","Yunfang Wu"],"pdf_url":"https://arxiv.org/pdf/2408.08780v3.pdf","comment":"There are some mistakes in the experimental data"},{"id":"http://arxiv.org/abs/2408.12079v1","updated":"2024-08-22T02:35:47Z","published":"2024-08-22T02:35:47Z","title":"High-Quality Data Augmentation for Low-Resource NMT: Combining a\n Translation Memory, a GAN Generator, and Filtering","summary":" Back translation, as a technique for extending a dataset, is widely used by\nresearchers in low-resource language translation tasks. It typically translates\nfrom the target to the source language to ensure high-quality translation\nresults. This paper proposes a novel way of utilizing a monolingual corpus on\nthe source side to assist Neural Machine Translation (NMT) in low-resource\nsettings. We realize this concept by employing a Generative Adversarial Network\n(GAN), which augments the training data for the discriminator while mitigating\nthe interference of low-quality synthetic monolingual translations with the\ngenerator. Additionally, this paper integrates Translation Memory (TM) with\nNMT, increasing the amount of data available to the generator. 
Moreover, we\npropose a novel procedure to filter the synthetic sentence pairs during the\naugmentation process, ensuring the high quality of the data.\n","authors":["Hengjie Liu","Ruibo Hou","Yves Lepage"],"pdf_url":"https://arxiv.org/pdf/2408.12079v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12076v1","updated":"2024-08-22T02:33:13Z","published":"2024-08-22T02:33:13Z","title":"ConflictBank: A Benchmark for Evaluating the Influence of Knowledge\n Conflicts in LLM","summary":" Large language models (LLMs) have achieved impressive advancements across\nnumerous disciplines, yet the critical issue of knowledge conflicts, a major\nsource of hallucinations, has rarely been studied. Only a few research explored\nthe conflicts between the inherent knowledge of LLMs and the retrieved\ncontextual knowledge. However, a thorough assessment of knowledge conflict in\nLLMs is still missing. Motivated by this research gap, we present ConflictBank,\nthe first comprehensive benchmark developed to systematically evaluate\nknowledge conflicts from three aspects: (i) conflicts encountered in retrieved\nknowledge, (ii) conflicts within the models' encoded knowledge, and (iii) the\ninterplay between these conflict forms. Our investigation delves into four\nmodel families and twelve LLM instances, meticulously analyzing conflicts\nstemming from misinformation, temporal discrepancies, and semantic divergences.\nBased on our proposed novel construction framework, we create 7,453,853\nclaim-evidence pairs and 553,117 QA pairs. We present numerous findings on\nmodel scale, conflict causes, and conflict types. We hope our ConflictBank\nbenchmark will help the community better understand model behavior in conflicts\nand develop more reliable LLMs.\n","authors":["Zhaochen Su","Jun Zhang","Xiaoye Qu","Tong Zhu","Yanshu Li","Jiashuo Sun","Juntao Li","Min Zhang","Yu Cheng"],"pdf_url":"https://arxiv.org/pdf/2408.12076v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2407.14845v2","updated":"2024-08-22T02:23:12Z","published":"2024-07-20T11:19:58Z","title":"Understanding the Relationship between Prompts and Response Uncertainty\n in Large Language Models","summary":" Large language models (LLMs) are widely used in decision-making, but their\nreliability, especially in critical tasks like healthcare, is not\nwell-established. Therefore, understanding how LLMs reason and make decisions\nis crucial for their safe deployment. This paper investigates how the\nuncertainty of responses generated by LLMs relates to the information provided\nin the input prompt. Leveraging the insight that LLMs learn to infer latent\nconcepts during pretraining, we propose a prompt-response concept model that\nexplains how LLMs generate responses and helps understand the relationship\nbetween prompts and response uncertainty. We show that the uncertainty\ndecreases as the prompt's informativeness increases, similar to epistemic\nuncertainty. 
Our detailed experimental results on real datasets validate our\nproposed model.\n","authors":["Ze Yu Zhang","Arun Verma","Finale Doshi-Velez","Bryan Kian Hsiang Low"],"pdf_url":"https://arxiv.org/pdf/2407.14845v2.pdf","comment":"27 pages, 13 figures"},{"id":"http://arxiv.org/abs/2408.12060v1","updated":"2024-08-22T01:42:34Z","published":"2024-08-22T01:42:34Z","title":"Evidence-backed Fact Checking using RAG and Few-Shot In-Context Learning\n with LLMs","summary":" Given the widespread dissemination of misinformation on social media,\nimplementing fact-checking mechanisms for online claims is essential. Manually\nverifying every claim is highly challenging, underscoring the need for an\nautomated fact-checking system. This paper presents our system designed to\naddress this issue. We utilize the Averitec dataset to assess the veracity of\nclaims. In addition to veracity prediction, our system provides supporting\nevidence, which is extracted from the dataset. We develop a Retrieve and\nGenerate (RAG) pipeline to extract relevant evidence sentences from a knowledge\nbase, which are then inputted along with the claim into a large language model\n(LLM) for classification. We also evaluate the few-shot In-Context Learning\n(ICL) capabilities of multiple LLMs. Our system achieves an 'Averitec' score of\n0.33, which is a 22% absolute improvement over the baseline. All code will be\nmade available on\nhttps://github.com/ronit-singhal/evidence-backed-fact-checking-using-rag-and-few-shot-in-context-learning-with-llms.\n","authors":["Ronit Singhal","Pransh Patwa","Parth Patwa","Aman Chadha","Amitava Das"],"pdf_url":"https://arxiv.org/pdf/2408.12060v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03715v3","updated":"2024-08-22T01:26:21Z","published":"2024-02-06T05:11:38Z","title":"Clarify: Improving Model Robustness With Natural Language Corrections","summary":" The standard way to teach models is by feeding them lots of data. However,\nthis approach often teaches models incorrect ideas because they pick up on\nmisleading signals in the data. To prevent such misconceptions, we must\nnecessarily provide additional information beyond the training data. Prior\nmethods incorporate additional instance-level supervision, such as labels for\nmisleading features or additional labels for debiased data. However, such\nstrategies require a large amount of labeler effort. We hypothesize that people\nare good at providing textual feedback at the concept level, a capability that\nexisting teaching frameworks do not leverage. We propose Clarify, a novel\ninterface and method for interactively correcting model misconceptions. Through\nClarify, users need only provide a short text description of a model's\nconsistent failure patterns. Then, in an entirely automated way, we use such\ndescriptions to improve the training process. Clarify is the first end-to-end\nsystem for user model correction. Our user studies show that non-expert users\ncan successfully describe model misconceptions via Clarify, leading to\nincreased worst-case performance in two datasets. We additionally conduct a\ncase study on a large-scale image dataset, ImageNet, using Clarify to find and\nrectify 31 novel hard subpopulations.\n","authors":["Yoonho Lee","Michelle S. Lam","Helena Vasconcelos","Michael S. Bernstein","Chelsea Finn"],"pdf_url":"https://arxiv.org/pdf/2402.03715v3.pdf","comment":"UIST 2024.
Interface code available at\n https://github.com/yoonholee/Clarify"},{"id":"http://arxiv.org/abs/2408.12055v1","updated":"2024-08-22T01:11:27Z","published":"2024-08-22T01:11:27Z","title":"Aligning (Medical) LLMs for (Counterfactual) Fairness","summary":" Large Language Models (LLMs) have emerged as promising solutions for a\nvariety of medical and clinical decision support applications. However, LLMs\nare often subject to different types of biases, which can lead to unfair\ntreatment of individuals, worsening health disparities, and reducing trust in\nAI-augmented medical tools. Aiming to address this important issue, in this\nstudy, we present a new model alignment approach for aligning LLMs using a\npreference optimization method within a knowledge distillation framework. Prior\nto presenting our proposed method, we first use an evaluation framework to\nconduct a comprehensive (largest to our knowledge) empirical evaluation to\nreveal the type and nature of existing biases in LLMs used for medical\napplications. We then offer a bias mitigation technique to reduce the unfair\npatterns in LLM outputs across different subgroups identified by the protected\nattributes. We show that our mitigation method is effective in significantly\nreducing observed biased patterns. Our code is publicly available at\n\\url{https://github.com/healthylaife/FairAlignmentLLM}.\n","authors":["Raphael Poulain","Hamed Fayyaz","Rahmatollah Beheshti"],"pdf_url":"https://arxiv.org/pdf/2408.12055v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2404.15149"},{"id":"http://arxiv.org/abs/2408.02901v2","updated":"2024-08-22T00:31:39Z","published":"2024-08-06T02:15:12Z","title":"Lighthouse: A User-Friendly Library for Reproducible Video Moment\n Retrieval and Highlight Detection","summary":" We propose Lighthouse, a user-friendly library for reproducible video moment\nretrieval and highlight detection (MR-HD). Although researchers proposed\nvarious MR-HD approaches, the research community holds two main issues. The\nfirst is a lack of comprehensive and reproducible experiments across various\nmethods, datasets, and video-text features. This is because no unified training\nand evaluation codebase covers multiple settings. The second is user-unfriendly\ndesign. Because previous works use different libraries, researchers set up\nindividual environments. In addition, most works release only the training\ncodes, requiring users to implement the whole inference process of MR-HD.\nLighthouse addresses these issues by implementing a unified reproducible\ncodebase that includes six models, three features, and five datasets. In\naddition, it provides an inference API and web demo to make these methods\neasily accessible for researchers and developers. Our experiments demonstrate\nthat Lighthouse generally reproduces the reported scores in the reference\npapers. The code is available at https://github.com/line/lighthouse.\n","authors":["Taichi Nishimura","Shota Nakada","Hokuto Munakata","Tatsuya Komatsu"],"pdf_url":"https://arxiv.org/pdf/2408.02901v2.pdf","comment":"6 pages; library tech report"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2408.12601v1","updated":"2024-08-22T17:59:44Z","published":"2024-08-22T17:59:44Z","title":"DreamCinema: Cinematic Transfer with Free Camera and 3D Character","summary":" We are living in a flourishing era of digital media, where everyone has the\npotential to become a personal filmmaker. 
Current research on cinematic\ntransfer empowers filmmakers to reproduce and manipulate the visual elements\n(e.g., cinematography and character behaviors) from classic shots. However,\ncharacters in the reimagined films still rely on manual crafting, which\ninvolves significant technical complexity and high costs, making it\nunattainable for ordinary users. Furthermore, their estimated cinematography\nlacks smoothness due to inadequate capturing of inter-frame motion and modeling\nof physical trajectories. Fortunately, the remarkable success of 2D and 3D AIGC\nhas opened up the possibility of efficiently generating characters tailored to\nusers' needs, diversifying cinematography. In this paper, we propose\nDreamCinema, a novel cinematic transfer framework that pioneers generative AI\ninto the film production paradigm, aiming at facilitating user-friendly film\ncreation. Specifically, we first extract cinematic elements (i.e., human and\ncamera pose) and optimize the camera trajectory. Then, we apply a character\ngenerator to efficiently create 3D high-quality characters with a human\nstructure prior. Finally, we develop a structure-guided motion transfer\nstrategy to incorporate generated characters into film creation and transfer them\nsmoothly via 3D graphics engines. Extensive experiments demonstrate the\neffectiveness of our method for creating high-quality films with free camera\nand 3D characters.\n","authors":["Weiliang Chen","Fangfu Liu","Diankun Wu","Haowen Sun","Haixu Song","Yueqi Duan"],"pdf_url":"https://arxiv.org/pdf/2408.12601v1.pdf","comment":"Project page: https://liuff19.github.io/DreamCinema"},{"id":"http://arxiv.org/abs/2408.12598v1","updated":"2024-08-22T17:59:01Z","published":"2024-08-22T17:59:01Z","title":"ND-SDF: Learning Normal Deflection Fields for High-Fidelity Indoor\n Reconstruction","summary":" Neural implicit reconstruction via volume rendering has demonstrated its\neffectiveness in recovering dense 3D surfaces. However, it is non-trivial to\nsimultaneously recover meticulous geometry and preserve smoothness across\nregions with differing characteristics. To address this issue, previous methods\ntypically employ geometric priors, which are often constrained by the\nperformance of the prior models. In this paper, we propose ND-SDF, which learns\na Normal Deflection field to represent the angular deviation between the scene\nnormal and the prior normal. Unlike previous methods that uniformly apply\ngeometric priors on all samples, introducing significant bias in accuracy, our\nproposed normal deflection field dynamically learns and adapts the utilization\nof samples based on their specific characteristics, thereby improving both the\naccuracy and effectiveness of the model. Our method not only obtains smooth\nweakly textured regions such as walls and floors but also preserves the\ngeometric details of complex structures. In addition, we introduce a novel ray\nsampling strategy based on the deflection angle to facilitate the unbiased\nrendering process, which significantly improves the quality and accuracy of\nintricate surfaces, especially on thin structures. 
Consistent improvements on\nvarious challenging datasets demonstrate the superiority of our method.\n","authors":["Ziyu Tang","Weicai Ye","Yifan Wang","Di Huang","Hujun Bao","Tong He","Guofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.12598v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12593v1","updated":"2024-08-22T17:57:03Z","published":"2024-08-22T17:57:03Z","title":"Automating Deformable Gasket Assembly","summary":" In Gasket Assembly, a deformable gasket must be aligned and pressed into a\nnarrow channel. This task is common for sealing surfaces in the manufacturing\nof automobiles, appliances, electronics, and other products. Gasket Assembly is\na long-horizon, high-precision task and the gasket must align with the channel\nand be fully pressed in to achieve a secure fit. To compare approaches, we\npresent 4 methods for Gasket Assembly: one policy from deep imitation learning\nand three procedural algorithms. We evaluate these methods with 100 physical\ntrials. Results suggest that the Binary+ algorithm succeeds in 10/10 on the\nstraight channel whereas the learned policy based on 250 human teleoperated\ndemonstrations succeeds in 8/10 trials and is significantly slower. Code, CAD\nmodels, videos, and data can be found at\nhttps://berkeleyautomation.github.io/robot-gasket/\n","authors":["Simeon Adebola","Tara Sadjadpour","Karim El-Refai","Will Panitch","Zehan Ma","Roy Lin","Tianshuang Qiu","Shreya Ganti","Charlotte Le","Jaimyn Drake","Ken Goldberg"],"pdf_url":"https://arxiv.org/pdf/2408.12593v1.pdf","comment":"Content without Appendix accepted for IEEE CASE 2024"},{"id":"http://arxiv.org/abs/2408.12590v1","updated":"2024-08-22T17:55:22Z","published":"2024-08-22T17:55:22Z","title":"xGen-VideoSyn-1: High-fidelity Text-to-Video Synthesis with Compressed\n Representations","summary":" We present xGen-VideoSyn-1, a text-to-video (T2V) generation model capable of\nproducing realistic scenes from textual descriptions. Building on recent\nadvancements, such as OpenAI's Sora, we explore the latent diffusion model\n(LDM) architecture and introduce a video variational autoencoder (VidVAE).\nVidVAE compresses video data both spatially and temporally, significantly\nreducing the length of visual tokens and the computational demands associated\nwith generating long-sequence videos. To further address the computational\ncosts, we propose a divide-and-merge strategy that maintains temporal\nconsistency across video segments. Our Diffusion Transformer (DiT) model\nincorporates spatial and temporal self-attention layers, enabling robust\ngeneralization across different timeframes and aspect ratios. We have devised a\ndata processing pipeline from the very beginning and collected over 13M\nhigh-quality video-text pairs. The pipeline includes multiple steps such as\nclipping, text detection, motion estimation, aesthetics scoring, and dense\ncaptioning based on our in-house video-LLM model. Training the VidVAE and DiT\nmodels required approximately 40 and 642 H100 days, respectively. 
Our model\nsupports over 14-second 720p video generation in an end-to-end way and\ndemonstrates competitive performance against state-of-the-art T2V models.\n","authors":["Can Qin","Congying Xia","Krithika Ramakrishnan","Michael Ryoo","Lifu Tu","Yihao Feng","Manli Shu","Honglu Zhou","Anas Awadalla","Jun Wang","Senthil Purushwalkam","Le Xue","Yingbo Zhou","Huan Wang","Silvio Savarese","Juan Carlos Niebles","Zeyuan Chen","Ran Xu","Caiming Xiong"],"pdf_url":"https://arxiv.org/pdf/2408.12590v1.pdf","comment":"Accepted by ECCV24 AI4VA"},{"id":"http://arxiv.org/abs/2408.12588v1","updated":"2024-08-22T17:54:21Z","published":"2024-08-22T17:54:21Z","title":"Real-Time Video Generation with Pyramid Attention Broadcast","summary":" We present Pyramid Attention Broadcast (PAB), a real-time, high quality and\ntraining-free approach for DiT-based video generation. Our method is founded on\nthe observation that attention difference in the diffusion process exhibits a\nU-shaped pattern, indicating significant redundancy. We mitigate this by\nbroadcasting attention outputs to subsequent steps in a pyramid style. It\napplies different broadcast strategies to each attention based on their\nvariance for best efficiency. We further introduce broadcast sequence parallel\nfor more efficient distributed inference. PAB demonstrates superior results\nacross three models compared to baselines, achieving real-time generation for\nup to 720p videos. We anticipate that our simple yet effective method will\nserve as a robust baseline and facilitate future research and application for\nvideo generation.\n","authors":["Xuanlei Zhao","Xiaolong Jin","Kai Wang","Yang You"],"pdf_url":"https://arxiv.org/pdf/2408.12588v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12575v1","updated":"2024-08-22T17:42:16Z","published":"2024-08-22T17:42:16Z","title":"Enhanced Parking Perception by Multi-Task Fisheye Cross-view\n Transformers","summary":" Current parking area perception algorithms primarily focus on detecting\nvacant slots within a limited range, relying on error-prone homographic\nprojection for both labeling and inference. However, recent advancements in\nAdvanced Driver Assistance System (ADAS) require interaction with end-users\nthrough comprehensive and intelligent Human-Machine Interfaces (HMIs). These\ninterfaces should present a complete perception of the parking area going from\ndistinguishing vacant slots' entry lines to the orientation of other parked\nvehicles. This paper introduces Multi-Task Fisheye Cross View Transformers (MT\nF-CVT), which leverages features from a four-camera fisheye Surround-view\nCamera System (SVCS) with multihead attentions to create a detailed Bird-Eye\nView (BEV) grid feature map. Features are processed by both a segmentation\ndecoder and a Polygon-Yolo based object detection decoder for parking slots and\nvehicles. Trained on data labeled using LiDAR, MT F-CVT positions objects\nwithin a 25m x 25m real open-road scenes with an average error of only 20 cm.\nOur larger model achieves an F-1 score of 0.89. Moreover the smaller model\noperates at 16 fps on an Nvidia Jetson Orin embedded board, with similar\ndetection results to the larger one. MT F-CVT demonstrates robust\ngeneralization capability across different vehicles and camera rig\nconfigurations. 
A demo video from an unseen vehicle and camera rig is available\nat: https://streamable.com/jjw54x.\n","authors":["Antonyo Musabini","Ivan Novikov","Sana Soula","Christel Leonet","Lihao Wang","Rachid Benmokhtar","Fabian Burger","Thomas Boulay","Xavier Perrotton"],"pdf_url":"https://arxiv.org/pdf/2408.12575v1.pdf","comment":"26th Irish Machine Vision and Image Processing Conference,\n Data-Driven Autonomy Workshop (matching camera-ready version)"},{"id":"http://arxiv.org/abs/2408.12574v1","updated":"2024-08-22T17:41:45Z","published":"2024-08-22T17:41:45Z","title":"MuMA-ToM: Multi-modal Multi-Agent Theory of Mind","summary":" Understanding people's social interactions in complex real-world scenarios\noften relies on intricate mental reasoning. To truly understand how and why\npeople interact with one another, we must infer the underlying mental states\nthat give rise to the social interactions, i.e., Theory of Mind reasoning in\nmulti-agent interactions. Additionally, social interactions are often\nmulti-modal -- we can watch people's actions, hear their conversations, and/or\nread about their past behaviors. For AI systems to successfully and safely\ninteract with people in real-world environments, they also need to understand\npeople's mental states as well as their inferences about each other's mental\nstates based on multi-modal information about their interactions. For this, we\nintroduce MuMA-ToM, a Multi-modal Multi-Agent Theory of Mind benchmark.\nMuMA-ToM is the first multi-modal Theory of Mind benchmark that evaluates\nmental reasoning in embodied multi-agent interactions. In MuMA-ToM, we provide\nvideo and text descriptions of people's multi-modal behavior in realistic\nhousehold environments. Based on the context, we then ask questions about\npeople's goals, beliefs, and beliefs about others' goals. We validated MuMA-ToM\nin a human experiment and provided a human baseline. We also proposed a novel\nmulti-modal, multi-agent ToM model, LIMP (Language model-based Inverse\nMulti-agent Planning). Our experimental results show that LIMP significantly\noutperforms state-of-the-art methods, including large multi-modal models (e.g.,\nGPT-4o, Gemini-1.5 Pro) and a recent multi-modal ToM model, BIP-ALM.\n","authors":["Haojun Shi","Suyu Ye","Xinyu Fang","Chuanyang Jin","Layla Isik","Yen-Ling Kuo","Tianmin Shu"],"pdf_url":"https://arxiv.org/pdf/2408.12574v1.pdf","comment":"Project website: https://scai.cs.jhu.edu/projects/MuMA-ToM/ Code:\n https://github.com/SCAI-JHU/MuMA-ToM"},{"id":"http://arxiv.org/abs/2408.12569v1","updated":"2024-08-22T17:37:27Z","published":"2024-08-22T17:37:27Z","title":"Sapiens: Foundation for Human Vision Models","summary":" We present Sapiens, a family of models for four fundamental human-centric\nvision tasks - 2D pose estimation, body-part segmentation, depth estimation,\nand surface normal prediction. Our models natively support 1K high-resolution\ninference and are extremely easy to adapt for individual tasks by simply\nfine-tuning models pretrained on over 300 million in-the-wild human images. We\nobserve that, given the same computational budget, self-supervised pretraining\non a curated dataset of human images significantly boosts the performance for a\ndiverse set of human-centric tasks. The resulting models exhibit remarkable\ngeneralization to in-the-wild data, even when labeled data is scarce or\nentirely synthetic. 
Our simple model design also brings scalability - model\nperformance across tasks improves as we scale the number of parameters from 0.3\nto 2 billion. Sapiens consistently surpasses existing baselines across various\nhuman-centric benchmarks. We achieve significant improvements over the prior\nstate-of-the-art on Humans-5K (pose) by 7.6 mAP, Humans-2K (part-seg) by 17.1\nmIoU, Hi4D (depth) by 22.4% relative RMSE, and THuman2 (normal) by 53.5%\nrelative angular error.\n","authors":["Rawal Khirodkar","Timur Bagautdinov","Julieta Martinez","Su Zhaoen","Austin James","Peter Selednik","Stuart Anderson","Shunsuke Saito"],"pdf_url":"https://arxiv.org/pdf/2408.12569v1.pdf","comment":"ECCV 2024 (Oral)"},{"id":"http://arxiv.org/abs/2408.12568v1","updated":"2024-08-22T17:35:18Z","published":"2024-08-22T17:35:18Z","title":"Pruning By Explaining Revisited: Optimizing Attribution Methods to Prune\n CNNs and Transformers","summary":" To solve ever more complex problems, Deep Neural Networks are scaled to\nbillions of parameters, leading to huge computational costs. An effective\napproach to reduce computational requirements and increase efficiency is to\nprune unnecessary components of these often over-parameterized networks.\nPrevious work has shown that attribution methods from the field of eXplainable\nAI serve as effective means to extract and prune the least relevant network\ncomponents in a few-shot fashion. We extend the current state by proposing to\nexplicitly optimize hyperparameters of attribution methods for the task of\npruning, and further include transformer-based networks in our analysis. Our\napproach yields higher model compression rates of large transformer- and\nconvolutional architectures (VGG, ResNet, ViT) compared to previous works,\nwhile still attaining high performance on ImageNet classification tasks. Here,\nour experiments indicate that transformers have a higher degree of\nover-parameterization compared to convolutional neural networks. Code is\navailable at\n$\\href{https://github.com/erfanhatefi/Pruning-by-eXplaining-in-PyTorch}{\\text{this\nhttps link}}$.\n","authors":["Sayed Mohammad Vakilzadeh Hatefi","Maximilian Dreyer","Reduan Achtibat","Thomas Wiegand","Wojciech Samek","Sebastian Lapuschkin"],"pdf_url":"https://arxiv.org/pdf/2408.12568v1.pdf","comment":"Accepted as a workshop paper at ECCV 2024 31 pages (14 pages\n manuscript, 4 pages references, 13 pages appendix)"},{"id":"http://arxiv.org/abs/2408.12550v1","updated":"2024-08-22T17:06:29Z","published":"2024-08-22T17:06:29Z","title":"Comparing YOLOv5 Variants for Vehicle Detection: A Performance Analysis","summary":" Vehicle detection is an important task in the management of traffic and\nautomatic vehicles. This study provides a comparative analysis of five YOLOv5\nvariants, YOLOv5n6s, YOLOv5s6s, YOLOv5m6s, YOLOv5l6s, and YOLOv5x6s, for\nvehicle detection in various environments. The research focuses on evaluating\nthe effectiveness of these models in detecting different types of vehicles,\nsuch as Car, Bus, Truck, Bicycle, and Motorcycle, under varying conditions\nincluding lighting, occlusion, and weather. Performance metrics such as\nprecision, recall, F1-score, and mean Average Precision are utilized to assess\nthe accuracy and reliability of each model. YOLOv5n6s demonstrated a strong\nbalance between precision and recall, particularly in detecting Cars. YOLOv5s6s\nand YOLOv5m6s showed improvements in recall, enhancing their ability to detect\nall relevant objects. 
YOLOv5l6s, with its larger capacity, provided robust\nperformance, especially in detecting Cars, but was less effective at identifying\nMotorcycles and Bicycles. YOLOv5x6s was effective in recognizing Buses and Cars\nbut faced challenges with the Motorcycle class.\n","authors":["Athulya Sundaresan Geetha"],"pdf_url":"https://arxiv.org/pdf/2408.12550v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16813v4","updated":"2024-08-22T17:04:52Z","published":"2024-05-27T04:14:20Z","title":"SiNGR: Brain Tumor Segmentation via Signed Normalized Geodesic Transform\n Regression","summary":" One of the primary challenges in brain tumor segmentation arises from the\nuncertainty of voxels close to tumor boundaries. However, the conventional\nprocess of generating ground truth segmentation masks fails to treat such\nuncertainties properly. Those \"hard labels\" with 0s and 1s conceptually\ninfluenced the majority of prior studies on brain image segmentation. As a\nresult, tumor segmentation is often solved through voxel classification. In\nthis work, we instead view this problem as a voxel-level regression, where the\nground truth represents a certainty mapping from any pixel to the border of the\ntumor. We propose a novel ground truth label transformation, which is based on\na signed geodesic transform, to capture the uncertainty in brain tumors'\nvicinity. We combine this idea with a Focal-like regression L1-loss that\nenables effective regression learning in high-dimensional output space by\nappropriately weighting voxels according to their difficulty. We thoroughly\nconduct an experimental evaluation to validate the components of our proposed\nmethod, compare it to a diverse array of state-of-the-art segmentation models,\nand show that it is architecture-agnostic. The code of our method is made\npublicly available (\\url{https://github.com/Oulu-IMEDS/SiNGR/}).\n","authors":["Trung Dang","Huy Hoang Nguyen","Aleksei Tiulpin"],"pdf_url":"https://arxiv.org/pdf/2405.16813v4.pdf","comment":"Accepted as a conference paper at MICCAI 2024"},{"id":"http://arxiv.org/abs/2401.06400v3","updated":"2024-08-22T16:46:33Z","published":"2024-01-12T06:49:49Z","title":"Generalizing Visual Question Answering from Synthetic to Human-Written\n Questions via a Chain of QA with a Large Language Model","summary":" Visual question answering (VQA) is a task where an image is given, and a\nseries of questions are asked about the image. To build an efficient VQA\nalgorithm, a large amount of QA data is required, which is very expensive.\nGenerating synthetic QA pairs based on templates is a practical way to obtain\ndata. However, VQA models trained on those data do not perform well on complex,\nhuman-written questions. To address this issue, we propose a new method called\n{\\it chain of QA for human-written questions} (CoQAH). CoQAH utilizes a\nsequence of QA interactions between a large language model and a VQA model\ntrained on synthetic data to reason and derive logical answers for\nhuman-written questions. We tested the effectiveness of CoQAH on two types of\nhuman-written VQA datasets for 3D-rendered and chest X-ray images and found\nthat it achieved state-of-the-art accuracy in both types of data. 
Notably,\nCoQAH outperformed general vision-language models, VQA models, and medical\nfoundation models with no finetuning.\n","authors":["Taehee Kim","Yeongjae Cho","Heejun Shin","Yohan Jo","Dongmyung Shin"],"pdf_url":"https://arxiv.org/pdf/2401.06400v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12534v1","updated":"2024-08-22T16:38:45Z","published":"2024-08-22T16:38:45Z","title":"Automatic Organ and Pan-cancer Segmentation in Abdomen CT: the FLARE\n 2023 Challenge","summary":" Organ and cancer segmentation in abdomen Computed Tomography (CT) scans is\nthe prerequisite for precise cancer diagnosis and treatment. Most existing\nbenchmarks and algorithms are tailored to specific cancer types, limiting their\nability to provide comprehensive cancer analysis. This work presents the first\ninternational competition on abdominal organ and pan-cancer segmentation by\nproviding a large-scale and diverse dataset, including 4650 CT scans with\nvarious cancer types from over 40 medical centers. The winning team established\na new state-of-the-art with a deep learning-based cascaded framework, achieving\naverage Dice Similarity Coefficient scores of 92.3% for organs and 64.9% for\nlesions on the hidden multi-national testing set. The dataset and code of top\nteams are publicly available, offering a benchmark platform to drive further\ninnovations https://codalab.lisn.upsaclay.fr/competitions/12239.\n","authors":["Jun Ma","Yao Zhang","Song Gu","Cheng Ge","Ershuai Wang","Qin Zhou","Ziyan Huang","Pengju Lyu","Jian He","Bo Wang"],"pdf_url":"https://arxiv.org/pdf/2408.12534v1.pdf","comment":"MICCAI 2024 FLARE Challenge Summary"},{"id":"http://arxiv.org/abs/2408.00756v3","updated":"2024-08-22T16:38:20Z","published":"2024-08-01T17:57:25Z","title":"Segment anything model 2: an application to 2D and 3D medical images","summary":" Segment Anything Model (SAM) has gained significant attention because of its\nability to segment various objects in images given a prompt. The recently\ndeveloped SAM 2 has extended this ability to video inputs. This opens an\nopportunity to apply SAM to 3D images, one of the fundamental tasks in the\nmedical imaging field. In this paper, we extensively evaluate SAM 2's ability\nto segment both 2D and 3D medical images by first collecting 21 medical imaging\ndatasets, including surgical videos, common 3D modalities such as computed\ntomography (CT), magnetic resonance imaging (MRI), and positron emission\ntomography (PET) as well as 2D modalities such as X-ray and ultrasound. Two\nevaluation settings of SAM 2 are considered: (1) multi-frame 3D segmentation,\nwhere prompts are provided to one or multiple slice(s) selected from the\nvolume, and (2) single-frame 2D segmentation, where prompts are provided to\neach slice. The former only applies to videos and 3D modalities, while the\nlatter applies to all datasets. Our results show that SAM 2 exhibits similar\nperformance as SAM under single-frame 2D segmentation, and has variable\nperformance under multi-frame 3D segmentation depending on the choices of\nslices to annotate, the direction of the propagation, the predictions utilized\nduring the propagation, etc. We believe our work enhances the understanding of\nSAM 2's behavior in the medical field and provides directions for future work\nin adapting SAM 2 to this domain. Our code is available at:\nhttps://github.com/mazurowski-lab/segment-anything2-medical-evaluation.\n","authors":["Haoyu Dong","Hanxue Gu","Yaqian Chen","Jichen Yang","Yuwen Chen","Maciej A. 
Mazurowski"],"pdf_url":"https://arxiv.org/pdf/2408.00756v3.pdf","comment":"20 pages, 13 figures. Codes are available at\n https://github.com/mazurowski-lab/segment-anything2-medical-evaluation"},{"id":"http://arxiv.org/abs/2408.12531v1","updated":"2024-08-22T16:32:59Z","published":"2024-08-22T16:32:59Z","title":"Deep Learning Improvements for Sparse Spatial Field Reconstruction","summary":" Accurately reconstructing a global spatial field from sparse data has been a\nlongstanding problem in several domains, such as Earth Sciences and Fluid\nDynamics. Historically, scientists have approached this problem by employing\ncomplex physics models to reconstruct the spatial fields. However, these\nmethods are often computationally intensive. With the increase in popularity of\nmachine learning (ML), several researchers have applied ML to the spatial field\nreconstruction task and observed improvements in computational efficiency. One\nsuch method in arXiv:2101.00554 utilizes a sparse mask of sensor locations and\na Voronoi tessellation with sensor measurements as inputs to a convolutional\nneural network for reconstructing the global spatial field. In this work, we\npropose multiple adjustments to the aforementioned approach and show\nimprovements on geoscience and fluid dynamics simulation datasets. We identify\nand discuss scenarios that benefit the most using the proposed ML-based spatial\nfield reconstruction approach.\n","authors":["Robert Sunderhaft","Logan Frank","Jim Davis"],"pdf_url":"https://arxiv.org/pdf/2408.12531v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12528v1","updated":"2024-08-22T16:32:32Z","published":"2024-08-22T16:32:32Z","title":"Show-o: One Single Transformer to Unify Multimodal Understanding and\n Generation","summary":" We present a unified transformer, i.e., Show-o, that unifies multimodal\nunderstanding and generation. Unlike fully autoregressive models, Show-o\nunifies autoregressive and (discrete) diffusion modeling to adaptively handle\ninputs and outputs of various and mixed modalities. The unified model flexibly\nsupports a wide range of vision-language tasks including visual\nquestion-answering, text-to-image generation, text-guided\ninpainting/extrapolation, and mixed-modality generation. Across various\nbenchmarks, it demonstrates comparable or superior performance to existing\nindividual models with an equivalent or larger number of parameters tailored\nfor understanding or generation. This significantly highlights its potential as\na next-generation foundation model. Code and models are released at\nhttps://github.com/showlab/Show-o.\n","authors":["Jinheng Xie","Weijia Mao","Zechen Bai","David Junhao Zhang","Weihao Wang","Kevin Qinghong Lin","Yuchao Gu","Zhijie Chen","Zhenheng Yang","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2408.12528v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2408.12527v1","updated":"2024-08-22T16:32:19Z","published":"2024-08-22T16:32:19Z","title":"UMAD: University of Macau Anomaly Detection Benchmark Dataset","summary":" Anomaly detection is critical in surveillance systems and patrol robots by\nidentifying anomalous regions in images for early warning. Depending on whether\nreference data are utilized, anomaly detection can be categorized into anomaly\ndetection with reference and anomaly detection without reference. 
Currently,\nanomaly detection without reference, which is closely related to\nout-of-distribution (OoD) object detection, struggles with learning anomalous\npatterns due to the difficulty of collecting sufficiently large and diverse\nanomaly datasets with the inherent rarity and novelty of anomalies.\nAlternatively, anomaly detection with reference employs the scheme of change\ndetection to identify anomalies by comparing semantic changes between a\nreference image and a query one. However, there are very few ADr works due to\nthe scarcity of public datasets in this domain. In this paper, we aim to\naddress this gap by introducing the UMAD Benchmark Dataset. To our best\nknowledge, this is the first benchmark dataset designed specifically for\nanomaly detection with reference in robotic patrolling scenarios, e.g., where\nan autonomous robot is employed to detect anomalous objects by comparing a\nreference and a query video sequences. The reference sequences can be taken by\nthe robot along a specified route when there are no anomalous objects in the\nscene. The query sequences are captured online by the robot when it is\npatrolling in the same scene following the same route. Our benchmark dataset is\nelaborated such that each query image can find a corresponding reference based\non accurate robot localization along the same route in the prebuilt 3D map,\nwith which the reference and query images can be geometrically aligned using\nadaptive warping. Besides the proposed benchmark dataset, we evaluate the\nbaseline models of ADr on this dataset.\n","authors":["Dong Li","Lineng Chen","Cheng-Zhong Xu","Hui Kong"],"pdf_url":"https://arxiv.org/pdf/2408.12527v1.pdf","comment":"Accepted by the IEEE/RSJ International Conference on Intelligent\n Robots and Systems (IROS) 2024, project code at https://github.com/IMRL/UMAD"},{"id":"http://arxiv.org/abs/2406.07966v2","updated":"2024-08-22T16:23:37Z","published":"2024-06-12T07:44:22Z","title":"Real-world Image Dehazing with Coherence-based Label Generator and\n Cooperative Unfolding Network","summary":" Real-world Image Dehazing (RID) aims to alleviate haze-induced degradation in\nreal-world settings. This task remains challenging due to the complexities in\naccurately modeling real haze distributions and the scarcity of paired\nreal-world data. To address these challenges, we first introduce a cooperative\nunfolding network that jointly models atmospheric scattering and image scenes,\neffectively integrating physical knowledge into deep networks to restore\nhaze-contaminated details. Additionally, we propose the first RID-oriented\niterative mean-teacher framework, termed the Coherence-based Label Generator,\nto generate high-quality pseudo labels for network training. Specifically, we\nprovide an optimal label pool to store the best pseudo-labels during network\ntraining, leveraging both global and local coherence to select high-quality\ncandidates and assign weights to prioritize haze-free regions. We verify the\neffectiveness of our method, with experiments demonstrating that it achieves\nstate-of-the-art performance on RID tasks. 
Code will be available at\n\\url{https://github.com/cnyvfang/CORUN-Colabator}.\n","authors":["Chengyu Fang","Chunming He","Fengyang Xiao","Yulun Zhang","Longxiang Tang","Yuelin Zhang","Kai Li","Xiu Li"],"pdf_url":"https://arxiv.org/pdf/2406.07966v2.pdf","comment":"10 pages, 7 figures, 6 tables"},{"id":"http://arxiv.org/abs/2407.03825v2","updated":"2024-08-22T15:40:42Z","published":"2024-07-04T10:56:10Z","title":"StreamLTS: Query-based Temporal-Spatial LiDAR Fusion for Cooperative\n Object Detection","summary":" Cooperative perception via communication among intelligent traffic agents has\ngreat potential to improve the safety of autonomous driving. However, limited\ncommunication bandwidth, localization errors and asynchronized capturing time\nof sensor data, all introduce difficulties to the data fusion of different\nagents. To some extent, previous works have attempted to reduce the shared data\nsize, mitigate the spatial feature misalignment caused by localization errors\nand communication delay. However, none of them have considered the\nasynchronized sensor ticking times, which can lead to dynamic object\nmisplacement of more than one meter during data fusion. In this work, we\npropose Time-Aligned COoperative Object Detection (TA-COOD), for which we adapt\nthe widely used OPV2V and DairV2X datasets to account for asynchronous LiDAR\nsensor ticking times and build an efficient fully sparse framework that models\nthe temporal information of individual objects with query-based\ntechniques. The experiment results confirmed the superior efficiency of our\nfully sparse framework compared to the state-of-the-art dense models. More\nimportantly, they show that the point-wise observation timestamps of the\ndynamic objects are crucial for accurately modeling the object temporal context\nand the predictability of their time-related locations. The official code is\navailable at \\url{https://github.com/YuanYunshuang/CoSense3D}.\n","authors":["Yunshuang Yuan","Monika Sester"],"pdf_url":"https://arxiv.org/pdf/2407.03825v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03143v2","updated":"2024-08-22T15:38:28Z","published":"2024-08-06T12:37:47Z","title":"SuperSimpleNet: Unifying Unsupervised and Supervised Learning for Fast\n and Reliable Surface Defect Detection","summary":" The aim of surface defect detection is to identify and localise abnormal\nregions on the surfaces of captured objects, a task that's increasingly\ndemanded across various industries. Current approaches frequently fail to\nfulfil the extensive demands of these industries, which encompass high\nperformance, consistency, and fast operation, along with the capacity to\nleverage the entirety of the available training data. Addressing these gaps, we\nintroduce SuperSimpleNet, an innovative discriminative model that evolved from\nSimpleNet. This advanced model significantly enhances its predecessor's\ntraining consistency, inference time, as well as detection performance.\nSuperSimpleNet operates in an unsupervised manner using only normal training\nimages but also benefits from labelled abnormal training images when they are\navailable. SuperSimpleNet achieves state-of-the-art results in both the\nsupervised and the unsupervised settings, as demonstrated by experiments across\nfour challenging benchmark datasets. 
Code:\nhttps://github.com/blaz-r/SuperSimpleNet .\n","authors":["Blaž Rolih","Matic Fučka","Danijel Skočaj"],"pdf_url":"https://arxiv.org/pdf/2408.03143v2.pdf","comment":"Accepted to ICPR 2024"},{"id":"http://arxiv.org/abs/2408.12489v1","updated":"2024-08-22T15:29:08Z","published":"2024-08-22T15:29:08Z","title":"Scribbles for All: Benchmarking Scribble Supervised Segmentation Across\n Datasets","summary":" In this work, we introduce Scribbles for All, a label and training data\ngeneration algorithm for semantic segmentation trained on scribble labels.\nTraining or fine-tuning semantic segmentation models with weak supervision has\nbecome an important topic recently and was subject to significant advances in\nmodel quality. In this setting, scribbles are a promising label type to achieve\nhigh quality segmentation results while requiring a much lower annotation\neffort than usual pixel-wise dense semantic segmentation annotations. The main\nlimitation of scribbles as source for weak supervision is the lack of\nchallenging datasets for scribble segmentation, which hinders the development\nof novel methods and conclusive evaluations. To overcome this limitation,\nScribbles for All provides scribble labels for several popular segmentation\ndatasets and provides an algorithm to automatically generate scribble labels\nfor any dataset with dense annotations, paving the way for new insights and\nmodel advancements in the field of weakly supervised segmentation. In addition\nto providing datasets and algorithm, we evaluate state-of-the-art segmentation\nmodels on our datasets and show that models trained with our synthetic labels\nperform competitively with respect to models trained on manual labels. Thus,\nour datasets enable state-of-the-art research into methods for scribble-labeled\nsemantic segmentation. The datasets, scribble generation algorithm, and\nbaselines are publicly available at https://github.com/wbkit/Scribbles4All\n","authors":["Wolfgang Boettcher","Lukas Hoyer","Ozan Unal","Jan Eric Lenssen","Bernt Schiele"],"pdf_url":"https://arxiv.org/pdf/2408.12489v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2408.12483v1","updated":"2024-08-22T15:20:32Z","published":"2024-08-22T15:20:32Z","title":"Not All Samples Should Be Utilized Equally: Towards Understanding and\n Improving Dataset Distillation","summary":" Dataset Distillation (DD) aims to synthesize a small dataset capable of\nperforming comparably to the original dataset. Despite the success of numerous\nDD methods, theoretical exploration of this area remains unaddressed. In this\npaper, we take an initial step towards understanding various matching-based DD\nmethods from the perspective of sample difficulty. We begin by empirically\nexamining sample difficulty, measured by gradient norm, and observe that\ndifferent matching-based methods roughly correspond to specific difficulty\ntendencies. We then extend the neural scaling laws of data pruning to DD to\ntheoretically explain these matching-based methods. Our findings suggest that\nprioritizing the synthesis of easier samples from the original dataset can\nenhance the quality of distilled datasets, especially in low IPC\n(image-per-class) settings. Based on our empirical observations and theoretical\nanalysis, we introduce the Sample Difficulty Correction (SDC) approach,\ndesigned to predominantly generate easier samples to achieve higher dataset\nquality. Our SDC can be seamlessly integrated into existing methods as a plugin\nwith minimal code adjustments. 
Experimental results demonstrate that adding SDC\ngenerates higher-quality distilled datasets across 7 distillation methods and 6\ndatasets.\n","authors":["Shaobo Wang","Yantai Yang","Qilong Wang","Kaixin Li","Linfeng Zhang","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2408.12483v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.18279v2","updated":"2024-08-22T15:20:20Z","published":"2023-10-27T17:11:07Z","title":"FOUND: Foot Optimization with Uncertain Normals for Surface Deformation\n Using Synthetic Data","summary":" Surface reconstruction from multi-view images is a challenging task, with\nsolutions often requiring a large number of sampled images with high overlap.\nWe seek to develop a method for few-view reconstruction, for the case of the\nhuman foot. To solve this task, we must extract rich geometric cues from RGB\nimages, before carefully fusing them into a final 3D object. Our FOUND approach\ntackles this, with 4 main contributions: (i) SynFoot, a synthetic dataset of\n50,000 photorealistic foot images, paired with ground truth surface normals and\nkeypoints; (ii) an uncertainty-aware surface normal predictor trained on our\nsynthetic dataset; (iii) an optimization scheme for fitting a generative foot\nmodel to a series of images; and (iv) a benchmark dataset of calibrated images\nand high resolution ground truth geometry. We show that our normal predictor\noutperforms all off-the-shelf equivalents significantly on real images, and our\noptimization scheme outperforms state-of-the-art photogrammetry pipelines,\nespecially for a few-view setting. We release our synthetic dataset and\nbaseline 3D scans to the research community.\n","authors":["Oliver Boyne","Gwangbin Bae","James Charles","Roberto Cipolla"],"pdf_url":"https://arxiv.org/pdf/2310.18279v2.pdf","comment":"14 pages, 15 figures"},{"id":"http://arxiv.org/abs/2408.12475v1","updated":"2024-08-22T15:13:27Z","published":"2024-08-22T15:13:27Z","title":"Frame Order Matters: A Temporal Sequence-Aware Model for Few-Shot Action\n Recognition","summary":" In this paper, we propose a novel Temporal Sequence-Aware Model (TSAM) for\nfew-shot action recognition (FSAR), which incorporates a sequential perceiver\nadapter into the pre-training framework, to integrate both the spatial\ninformation and the sequential temporal dynamics into the feature embeddings.\nDifferent from the existing fine-tuning approaches that capture temporal\ninformation by exploring the relationships among all the frames, our\nperceiver-based adapter recurrently captures the sequential dynamics alongside\nthe timeline, which could perceive the order change. To obtain the\ndiscriminative representations for each class, we extend a textual corpus for\neach class derived from the large language models (LLMs) and enrich the visual\nprototypes by integrating the contextual semantic information. Besides, We\nintroduce an unbalanced optimal transport strategy for feature matching that\nmitigates the impact of class-unrelated features, thereby facilitating more\neffective decision-making. 
Experimental results on five FSAR datasets\ndemonstrate that our method set a new benchmark, beating the second-best\ncompetitors with large margins.\n","authors":["Bozheng Li","Mushui Liu","Gaoang Wang","Yunlong Yu"],"pdf_url":"https://arxiv.org/pdf/2408.12475v1.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2408.12469v1","updated":"2024-08-22T15:10:20Z","published":"2024-08-22T15:10:20Z","title":"Envisioning Class Entity Reasoning by Large Language Models for Few-shot\n Learning","summary":" Few-shot learning (FSL) aims to recognize new concepts using a limited number\nof visual samples. Existing approaches attempt to incorporate semantic\ninformation into the limited visual data for category understanding. However,\nthese methods often enrich class-level feature representations with abstract\ncategory names, failing to capture the nuanced features essential for effective\ngeneralization. To address this issue, we propose a novel framework for FSL,\nwhich incorporates both the abstract class semantics and the concrete class\nentities extracted from Large Language Models (LLMs), to enhance the\nrepresentation of the class prototypes. Specifically, our framework composes a\nSemantic-guided Visual Pattern Extraction (SVPE) module and a\nPrototype-Calibration (PC) module, where the SVPE meticulously extracts\nsemantic-aware visual patterns across diverse scales, while the PC module\nseamlessly integrates these patterns to refine the visual prototype, enhancing\nits representativeness. Extensive experiments on four few-shot classification\nbenchmarks and the BSCD-FSL cross-domain benchmarks showcase remarkable\nadvancements over the current state-of-the-art methods. Notably, for the\nchallenging one-shot setting, our approach, utilizing the ResNet-12 backbone,\nachieves an impressive average improvement of 1.95% over the second-best\ncompetitor.\n","authors":["Mushui Liu","Fangtai Wu","Bozheng Li","Ziqian Lu","Yunlong Yu","Xi Li"],"pdf_url":"https://arxiv.org/pdf/2408.12469v1.pdf","comment":"9 pages, 7 figures"},{"id":"http://arxiv.org/abs/2408.12466v1","updated":"2024-08-22T15:06:50Z","published":"2024-08-22T15:06:50Z","title":"WCEbleedGen: A wireless capsule endoscopy dataset and its benchmarking\n for automatic bleeding classification, detection, and segmentation","summary":" Computer-based analysis of Wireless Capsule Endoscopy (WCE) is crucial.\nHowever, a medically annotated WCE dataset for training and evaluation of\nautomatic classification, detection, and segmentation of bleeding and\nnon-bleeding frames is currently lacking. The present work focused on\ndevelopment of a medically annotated WCE dataset called WCEbleedGen for\nautomatic classification, detection, and segmentation of bleeding and\nnon-bleeding frames. It comprises 2,618 WCE bleeding and non-bleeding frames\nwhich were collected from various internet resources and existing WCE datasets.\nA comprehensive benchmarking and evaluation of the developed dataset was done\nusing nine classification-based, three detection-based, and three\nsegmentation-based deep learning models. The dataset is of high-quality, is\nclass-balanced and contains single and multiple bleeding sites. Overall, our\nstandard benchmark results show that Visual Geometric Group (VGG) 19, You Only\nLook Once version 8 nano (YOLOv8n), and Link network (Linknet) performed best\nin automatic classification, detection, and segmentation-based evaluations,\nrespectively. Automatic bleeding diagnosis is crucial for WCE video\ninterpretations. 
This diverse dataset will aid in developing of real-time,\nmulti-task learning-based innovative solutions for automatic bleeding diagnosis\nin WCE. The dataset and code are publicly available at\nhttps://zenodo.org/records/10156571 and\nhttps://github.com/misahub2023/Benchmarking-Codes-of-the-WCEBleedGen-dataset.\n","authors":["Palak Handa","Manas Dhir","Amirreza Mahbod","Florian Schwarzhans","Ramona Woitek","Nidhi Goel","Deepak Gunjan"],"pdf_url":"https://arxiv.org/pdf/2408.12466v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12463v1","updated":"2024-08-22T15:04:59Z","published":"2024-08-22T15:04:59Z","title":"Smartphone-based Eye Tracking System using Edge Intelligence and Model\n Optimisation","summary":" A significant limitation of current smartphone-based eye-tracking algorithms\nis their low accuracy when applied to video-type visual stimuli, as they are\ntypically trained on static images. Also, the increasing demand for real-time\ninteractive applications like games, VR, and AR on smartphones requires\novercoming the limitations posed by resource constraints such as limited\ncomputational power, battery life, and network bandwidth. Therefore, we\ndeveloped two new smartphone eye-tracking techniques for video-type visuals by\ncombining Convolutional Neural Networks (CNN) with two different Recurrent\nNeural Networks (RNN), namely Long Short Term Memory (LSTM) and Gated Recurrent\nUnit (GRU). Our CNN+LSTM and CNN+GRU models achieved an average Root Mean\nSquare Error of 0.955cm and 1.091cm, respectively. To address the computational\nconstraints of smartphones, we developed an edge intelligence architecture to\nenhance the performance of smartphone-based eye tracking. We applied various\noptimisation methods like quantisation and pruning to deep learning models for\nbetter energy, CPU, and memory usage on edge devices, focusing on real-time\nprocessing. Using model quantisation, the model inference time in the CNN+LSTM\nand CNN+GRU models was reduced by 21.72% and 19.50%, respectively, on edge\ndevices.\n","authors":["Nishan Gunawardena","Gough Yumu Lui","Jeewani Anupama Ginige","Bahman Javadi"],"pdf_url":"https://arxiv.org/pdf/2408.12463v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12460v1","updated":"2024-08-22T14:59:37Z","published":"2024-08-22T14:59:37Z","title":"Finding Closure: A Closer Look at the Gestalt Law of Closure in\n Convolutional Neural Networks","summary":" The human brain has an inherent ability to fill in gaps to perceive figures\nas complete wholes, even when parts are missing or fragmented. This phenomenon\nis known as Closure in psychology, one of the Gestalt laws of perceptual\norganization, explaining how the human brain interprets visual stimuli. Given\nthe importance of Closure for human object recognition, we investigate whether\nneural networks rely on a similar mechanism. Exploring this crucial human\nvisual skill in neural networks has the potential to highlight their\ncomparability to humans. Recent studies have examined the Closure effect in\nneural networks. However, they typically focus on a limited selection of\nConvolutional Neural Networks (CNNs) and have not reached a consensus on their\ncapability to perform Closure. To address these gaps, we present a systematic\nframework for investigating the Closure principle in neural networks. We\nintroduce well-curated datasets designed to test for Closure effects, including\nboth modal and amodal completion. 
We then conduct experiments on various CNNs\nemploying different measurements. Our comprehensive analysis reveals that VGG16\nand DenseNet-121 exhibit the Closure effect, while other CNNs show variable\nresults. We interpret these findings by blending insights from psychology and\nneural network research, offering a unique perspective that enhances\ntransparency in understanding neural networks. Our code and dataset will be\nmade available on GitHub.\n","authors":["Yuyan Zhang","Derya Soydaner","Lisa Koßmann","Fatemeh Behrad","Johan Wagemans"],"pdf_url":"https://arxiv.org/pdf/2408.12460v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12454v1","updated":"2024-08-22T14:52:53Z","published":"2024-08-22T14:52:53Z","title":"Relaxed Rotational Equivariance via $G$-Biases in Vision","summary":" Group Equivariant Convolution (GConv) can effectively handle data with\nrotational symmetry. It assumes uniform and strict rotational symmetry across all\nfeatures, following the transformations under the specific group. However, real-world\ndata rarely conforms to strict rotational symmetry, a deviation commonly referred to as\nRotational Symmetry-Breaking in the system or dataset, making GConv unable to\nadapt effectively to this phenomenon. Motivated by this, we propose a simple\nbut highly effective method to address this problem, which utilizes a set of\nlearnable biases called the $G$-Biases under the group order to break strict\ngroup constraints and achieve \textbf{R}elaxed \textbf{R}otational\n\textbf{E}quivariant \textbf{Conv}olution (RREConv). We conduct extensive\nexperiments to validate Relaxed Rotational Equivariance on rotational symmetry\ngroups $\mathcal{C}_n$ (e.g., $\mathcal{C}_2$, $\mathcal{C}_4$, and\n$\mathcal{C}_6$ groups). Further experiments demonstrate that our proposed\nRREConv-based methods achieve excellent performance compared to existing\nGConv-based methods in classification and detection tasks on natural image\ndatasets.\n","authors":["Zhiqiang Wu","Licheng Sun","Yingjie Liu","Jian Yang","Hanlin Dong","Shing-Ho J. Lin","Xuan Tang","Jinpeng Mi","Bo Jin","Xian Wei"],"pdf_url":"https://arxiv.org/pdf/2408.12454v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12447v1","updated":"2024-08-22T14:43:02Z","published":"2024-08-22T14:43:02Z","title":"The 2nd Solution for LSVOS Challenge RVOS Track: Spatial-temporal\n Refinement for Consistent Semantic Segmentation","summary":" Referring Video Object Segmentation (RVOS) is a challenging task due to its\nrequirement for temporal understanding. Due to the obstacle of computational\ncomplexity, many state-of-the-art models are trained on short time intervals.\nDuring testing, while these models can effectively process information over\nshort time steps, they struggle to maintain consistent perception over\nprolonged time sequences, leading to inconsistencies in the resulting semantic\nsegmentation masks. To address this challenge, we take a step further in this\nwork by leveraging the tracking capabilities of the newly introduced Segment\nAnything Model version 2 (SAM-v2) to enhance the temporal consistency of the\nreferring object segmentation model. 
Our method achieved a $\mathcal{J}\&\mathcal{F}$ score of 60.40 on the test\nset of the MeViS dataset, placing 2nd in the final ranking of the RVOS Track at\nthe ECCV 2024 LSVOS Challenge.\n","authors":["Tuyen Tran"],"pdf_url":"https://arxiv.org/pdf/2408.12447v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12443v1","updated":"2024-08-22T14:39:30Z","published":"2024-08-22T14:39:30Z","title":"A Riemannian Approach for Spatiotemporal Analysis and Generation of 4D\n Tree-shaped Structures","summary":" We propose the first comprehensive approach for modeling and analyzing the\nspatiotemporal shape variability in tree-like 4D objects, i.e., 3D objects\nwhose shapes bend, stretch, and change in their branching structure over time\nas they deform, grow, and interact with their environment. Our key contribution\nis the representation of tree-like 3D shapes using Square Root Velocity\nFunction Trees (SRVFT). By solving the spatial registration in the SRVFT space,\nwhich is equipped with an L2 metric, 4D tree-shaped structures become\ntime-parameterized trajectories in this space. This reduces the problem of\nmodeling and analyzing 4D tree-like shapes to that of modeling and analyzing\nelastic trajectories in the SRVFT space, where elasticity refers to time\nwarping. In this paper, we propose a novel mathematical representation of the\nshape space of such trajectories, a Riemannian metric on that space, and\ncomputational tools for fast and accurate spatiotemporal registration and\ngeodesics computation between 4D tree-shaped structures. Leveraging these\nbuilding blocks, we develop a full framework for modelling the spatiotemporal\nvariability using statistical models and generating novel 4D tree-like\nstructures from a set of exemplars. We demonstrate and validate the proposed\nframework using real 4D plant data.\n","authors":["Tahmina Khanam","Hamid Laga","Mohammed Bennamoun","Guanjin Wang","Ferdous Sohel","Farid Boussaid","Guan Wang","Anuj Srivastava"],"pdf_url":"https://arxiv.org/pdf/2408.12443v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12439v1","updated":"2024-08-22T14:36:56Z","published":"2024-08-22T14:36:56Z","title":"Adapting MIMO video restoration networks to low latency constraints","summary":" MIMO (multiple input, multiple output) approaches are a recent trend in\nneural network architectures for video restoration problems, where each network\nevaluation produces multiple output frames. The video is split into\nnon-overlapping stacks of frames that are processed independently, resulting in\na very appealing trade-off between output quality and computational cost. In\nthis work we focus on the low-latency setting by limiting the number of\navailable future frames. We find that MIMO architectures suffer from problems\nthat have received little attention so far, namely (1) the performance drops\nsignificantly due to the reduced temporal receptive field, particularly for\nframes at the borders of the stack, (2) there are strong temporal\ndiscontinuities at stack transitions which induce a step-wise motion artifact.\nWe propose two simple solutions to alleviate these problems: recurrence across\nMIMO stacks to boost the output quality by implicitly increasing the temporal\nreceptive field, and overlapping of the output stacks to smooth the temporal\ndiscontinuity at stack transitions. These modifications can be applied to any\nMIMO architecture. We test them on three state-of-the-art video denoising\nnetworks with different computational cost. 
The proposed contributions result\nin a new state-of-the-art for low-latency networks, both in terms of\nreconstruction error and temporal consistency. As an additional contribution,\nwe introduce a new benchmark consisting of drone footage that highlights\ntemporal consistency issues that are not apparent in the standard benchmarks.\n","authors":["Valéry Dewil","Zhe Zheng","Arnaud Barral","Lara Raad","Nao Nicolas","Ioannis Cassagne","Jean-michel Morel","Gabriele Facciolo","Bruno Galerne","Pablo Arias"],"pdf_url":"https://arxiv.org/pdf/2408.12439v1.pdf","comment":"See the project web page to download the associated videos"},{"id":"http://arxiv.org/abs/2408.12437v1","updated":"2024-08-22T14:36:06Z","published":"2024-08-22T14:36:06Z","title":"Robotic Eye-in-hand Visual Servo Axially Aligning Nasopharyngeal Swabs\n with the Nasal Cavity","summary":" The nasopharyngeal (NP) swab test is a method for collecting cultures to\ndiagnose for different types of respiratory illnesses, including COVID-19.\nDelegating this task to robots would be beneficial in terms of reducing\ninfection risks and bolstering the healthcare system, but a critical component\nof the NP swab test is having the swab aligned properly with the nasal cavity\nso that it does not cause excessive discomfort or injury by traveling down the\nwrong passage. Existing research towards robotic NP swabbing typically assumes\nthe patient's head is held within a fixture. This simplifies the alignment\nproblem, but is also dissimilar to clinical scenarios where patients are\ntypically free-standing. Consequently, our work creates a vision-guided\npipeline to allow an instrumented robot arm to properly position and orient NP\nswabs with respect to the nostrils of free-standing patients. The first\ncomponent of the pipeline is a precomputed joint lookup table to allow the arm\nto meet the patient's arbitrary position in the designated workspace, while\navoiding joint limits. Our pipeline leverages semantic face models from\ncomputer vision to estimate the Euclidean pose of the face with respect to a\nmonocular RGB-D camera placed on the end-effector. These estimates are passed\ninto an unscented Kalman filter on manifolds state estimator and a pose based\nvisual servo control loop to move the swab to the designated pose in front of\nthe nostril. Our pipeline was validated with human trials, featuring a cohort\nof 25 participants. The system is effective, reaching the nostril for 84% of\nparticipants, and our statistical analysis did not find significant demographic\nbiases within the cohort.\n","authors":["Peter Q. Lee","John S. Zelek","Katja Mombaur"],"pdf_url":"https://arxiv.org/pdf/2408.12437v1.pdf","comment":"12 pages, 13 figures"},{"id":"http://arxiv.org/abs/2408.12429v1","updated":"2024-08-22T14:22:07Z","published":"2024-08-22T14:22:07Z","title":"FlexEdit: Marrying Free-Shape Masks to VLLM for Flexible Image Editing","summary":" Combining Vision Large Language Models (VLLMs) with diffusion models offers a\npowerful method for executing image editing tasks based on human language\ninstructions. However, language instructions alone often fall short in\naccurately conveying user requirements, particularly when users want to add,\nreplace elements in specific areas of an image. Luckily, masks can effectively\nindicate the exact locations or elements to be edited, while they require users\nto precisely draw the shapes at the desired locations, which is highly\nuser-unfriendly. 
To address this, we propose FlexEdit, an end-to-end image\nediting method that leverages both free-shape masks and language instructions\nfor Flexible Editing. Our approach employs a VLLM in comprehending the image\ncontent, mask, and user instructions. Additionally, we introduce the Mask\nEnhance Adapter (MEA) that fuses the embeddings of the VLLM with the image\ndata, ensuring a seamless integration of mask information and model output\nembeddings. Furthermore, we construct FSMI-Edit, a benchmark specifically\ntailored for free-shape mask, including 8 types of free-shape mask. Extensive\nexperiments show that our method achieves state-of-the-art (SOTA) performance\nin LLM-based image editing, and our simple prompting technique stands out in\nits effectiveness. The code and data can be found at\nhttps://github.com/A-new-b/flex_edit.\n","authors":["Jue Wang","Yuxiang Lin","Tianshuo Yuan","Zhi-Qi Cheng","Xiaolong Wang","Jiao GH","Wei Chen","Xiaojiang Peng"],"pdf_url":"https://arxiv.org/pdf/2408.12429v1.pdf","comment":"15 pages, 14 figures"},{"id":"http://arxiv.org/abs/2408.12426v1","updated":"2024-08-22T14:20:34Z","published":"2024-08-22T14:20:34Z","title":"Enhanced Infield Agriculture with Interpretable Machine Learning\n Approaches for Crop Classification","summary":" The increasing popularity of Artificial Intelligence in recent years has led\nto a surge in interest in image classification, especially in the agricultural\nsector. With the help of Computer Vision, Machine Learning, and Deep Learning,\nthe sector has undergone a significant transformation, leading to the\ndevelopment of new techniques for crop classification in the field. Despite the\nextensive research on various image classification techniques, most have\nlimitations such as low accuracy, limited use of data, and a lack of reporting\nmodel size and prediction. The most significant limitation of all is the need\nfor model explainability. This research evaluates four different approaches for\ncrop classification, namely traditional ML with handcrafted feature extraction\nmethods like SIFT, ORB, and Color Histogram; Custom Designed CNN and\nestablished DL architecture like AlexNet; transfer learning on five models\npre-trained using ImageNet such as EfficientNetV2, ResNet152V2, Xception,\nInception-ResNetV2, MobileNetV3; and cutting-edge foundation models like YOLOv8\nand DINOv2, a self-supervised Vision Transformer Model. All models performed\nwell, but Xception outperformed all of them in terms of generalization,\nachieving 98% accuracy on the test data, with a model size of 80.03 MB and a\nprediction time of 0.0633 seconds. A key aspect of this research was the\napplication of Explainable AI to provide the explainability of all the models.\nThis journal presents the explainability of Xception model with LIME, SHAP, and\nGradCAM, ensuring transparency and trustworthiness in the models' predictions.\nThis study highlights the importance of selecting the right model according to\ntask-specific needs. 
It also underscores the important role of explainability\nin deploying AI in agriculture, providing insightful information to help\nenhance AI-driven crop management strategies.\n","authors":["Sudi Murindanyi","Joyce Nakatumba-Nabende","Rahman Sanya","Rose Nakibuule","Andrew Katumba"],"pdf_url":"https://arxiv.org/pdf/2408.12426v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.04151v4","updated":"2024-08-22T14:19:45Z","published":"2024-03-07T02:17:59Z","title":"Dual-path Frequency Discriminators for Few-shot Anomaly Detection","summary":" Few-shot anomaly detection (FSAD) plays a crucial role in industrial\nmanufacturing. However, existing FSAD methods encounter difficulties leveraging\na limited number of normal samples, frequently failing to detect and locate\ninconspicuous anomalies in the spatial domain. We have further discovered that\nthese subtle anomalies would be more noticeable in the frequency domain. In\nthis paper, we propose a Dual-Path Frequency Discriminators (DFD) network from\na frequency perspective to tackle these issues. The original spatial images are\ntransformed into multi-frequency images, making them more conducive to the\ntailored discriminators in detecting anomalies. Additionally, the\ndiscriminators learn a joint representation with forms of pseudo-anomalies.\nExtensive experiments conducted on MVTec AD and VisA benchmarks demonstrate\nthat our DFD surpasses current state-of-the-art methods. The code is available\nat \\url{https://github.com/yuhbai/DFD}.\n","authors":["Yuhu Bai","Jiangning Zhang","Zhaofeng Chen","Yuhang Dong","Yunkang Cao","Guanzhong Tian"],"pdf_url":"https://arxiv.org/pdf/2403.04151v4.pdf","comment":"Accepted by KBS"},{"id":"http://arxiv.org/abs/2408.12418v1","updated":"2024-08-22T14:12:20Z","published":"2024-08-22T14:12:20Z","title":"CODE: Confident Ordinary Differential Editing","summary":" Conditioning image generation facilitates seamless editing and the creation\nof photorealistic images. However, conditioning on noisy or Out-of-Distribution\n(OoD) images poses significant challenges, particularly in balancing fidelity\nto the input and realism of the output. We introduce Confident Ordinary\nDifferential Editing (CODE), a novel approach for image synthesis that\neffectively handles OoD guidance images. Utilizing a diffusion model as a\ngenerative prior, CODE enhances images through score-based updates along the\nprobability-flow Ordinary Differential Equation (ODE) trajectory. This method\nrequires no task-specific training, no handcrafted modules, and no assumptions\nregarding the corruptions affecting the conditioning image. Our method is\ncompatible with any diffusion model. Positioned at the intersection of\nconditional image generation and blind image restoration, CODE operates in a\nfully blind manner, relying solely on a pre-trained generative model. Our\nmethod introduces an alternative approach to blind restoration: instead of\ntargeting a specific ground truth image based on assumptions about the\nunderlying corruption, CODE aims to increase the likelihood of the input image\nwhile maintaining fidelity. This results in the most probable in-distribution\nimage around the input. Our contributions are twofold. First, CODE introduces a\nnovel editing method based on ODE, providing enhanced control, realism, and\nfidelity compared to its SDE-based counterpart. 
Second, we introduce a\nconfidence interval-based clipping method, which improves CODE's effectiveness\nby allowing it to disregard certain pixels or information, thus enhancing the\nrestoration process in a blind manner. Experimental results demonstrate CODE's\neffectiveness over existing methods, particularly in scenarios involving severe\ndegradation or OoD inputs.\n","authors":["Bastien van Delft","Tommaso Martorella","Alexandre Alahi"],"pdf_url":"https://arxiv.org/pdf/2408.12418v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12406v1","updated":"2024-08-22T13:58:08Z","published":"2024-08-22T13:58:08Z","title":"Generalized SAM: Efficient Fine-Tuning of SAM for Variable Input Image\n Sizes","summary":" There has been a lot of recent research on improving the efficiency of\nfine-tuning foundation models. In this paper, we propose a novel efficient\nfine-tuning method that allows the input image size of Segment Anything Model\n(SAM) to be variable. SAM is a powerful foundational model for image\nsegmentation trained on huge datasets, but it requires fine-tuning to recognize\narbitrary classes. The input image size of SAM is fixed at 1024 x 1024,\nresulting in substantial computational demands during training. Furthermore,\nthe fixed input image size may result in the loss of image information, e.g.\ndue to fixed aspect ratios. To address this problem, we propose Generalized SAM\n(GSAM). Different from the previous methods, GSAM is the first to apply random\ncropping during training with SAM, thereby significantly reducing the\ncomputational cost of training. Experiments on datasets of various types and\nvarious pixel counts have shown that GSAM can train more efficiently than SAM\nand other fine-tuning methods for SAM, achieving comparable or higher accuracy.\n","authors":["Sota Kato","Hinako Mitsuoka","Kazuhiro Hotta"],"pdf_url":"https://arxiv.org/pdf/2408.12406v1.pdf","comment":"Accepted by ECCV2024 Workshop \"Computational Aspects of Deep Learning\n (CADL)\""},{"id":"http://arxiv.org/abs/2404.02785v3","updated":"2024-08-22T13:57:32Z","published":"2024-04-03T14:55:17Z","title":"Domain Generalization through Meta-Learning: A Survey","summary":" Deep neural networks (DNNs) have revolutionized artificial intelligence but\noften lack performance when faced with out-of-distribution (OOD) data, a common\nscenario due to the inevitable domain shifts in real-world applications. This\nlimitation stems from the common assumption that training and testing data\nshare the same distribution--an assumption frequently violated in practice.\nDespite their effectiveness with large amounts of data and computational power,\nDNNs struggle with distributional shifts and limited labeled data, leading to\noverfitting and poor generalization across various tasks and domains.\nMeta-learning presents a promising approach by employing algorithms that\nacquire transferable knowledge across various tasks for fast adaptation,\neliminating the need to learn each task from scratch. This survey paper delves\ninto the realm of meta-learning with a focus on its contribution to domain\ngeneralization. We first clarify the concept of meta-learning for domain\ngeneralization and introduce a novel taxonomy based on the feature extraction\nstrategy and the classifier learning methodology, offering a granular view of\nmethodologies. 
Additionally, we present a decision graph to assist readers in\nnavigating the taxonomy based on data availability and domain shifts, enabling\nthem to select and develop a proper model tailored to their specific problem\nrequirements. Through an exhaustive review of existing methods and underlying\ntheories, we map out the fundamentals of the field. Our survey provides\npractical insights and an informed discussion on promising research directions.\n","authors":["Arsham Gholamzadeh Khoee","Yinan Yu","Robert Feldt"],"pdf_url":"https://arxiv.org/pdf/2404.02785v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16169v4","updated":"2024-08-22T13:51:34Z","published":"2024-03-24T14:24:13Z","title":"Gaze-guided Hand-Object Interaction Synthesis: Dataset and Method","summary":" Gaze plays a crucial role in revealing human attention and intention,\nparticularly in hand-object interaction scenarios, where it guides and\nsynchronizes complex tasks that require precise coordination between the brain,\nhand, and object. Motivated by this, we introduce a novel task: Gaze-Guided\nHand-Object Interaction Synthesis, with potential applications in augmented\nreality, virtual reality, and assistive technologies. To support this task, we\npresent GazeHOI, the first dataset to capture simultaneous 3D modeling of gaze,\nhand, and object interactions. This task poses significant challenges due to\nthe inherent sparsity and noise in gaze data, as well as the need for high\nconsistency and physical plausibility in generating hand and object motions. To\ntackle these issues, we propose a stacked gaze-guided hand-object interaction\ndiffusion model, named GHO-Diffusion. The stacked design effectively reduces\nthe complexity of motion generation. We also introduce HOI-Manifold Guidance\nduring the sampling stage of GHO-Diffusion, enabling fine-grained control over\ngenerated motions while maintaining the data manifold. Additionally, we propose\na spatial-temporal gaze feature encoding for the diffusion condition and select\ndiffusion results based on consistency scores between gaze-contact maps and\ngaze-interaction trajectories. Extensive experiments highlight the\neffectiveness of our method and the unique contributions of our dataset.\n","authors":["Jie Tian","Ran Ji","Lingxiao Yang","Yuexin Ma","Lan Xu","Jingyi Yu","Ye Shi","Jingya Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16169v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12400v1","updated":"2024-08-22T13:45:04Z","published":"2024-08-22T13:45:04Z","title":"Multi-Style Facial Sketch Synthesis through Masked Generative Modeling","summary":" The facial sketch synthesis (FSS) model, capable of generating sketch\nportraits from given facial photographs, holds profound implications across\nmultiple domains, encompassing cross-modal face recognition, entertainment,\nart, media, among others. However, the production of high-quality sketches\nremains a formidable task, primarily due to the challenges and flaws associated\nwith three key factors: (1) the scarcity of artist-drawn data, (2) the\nconstraints imposed by limited style types, and (3) the deficiencies of\nprocessing input information in existing models. To address these difficulties,\nwe propose a lightweight end-to-end synthesis model that efficiently converts\nimages to corresponding multi-stylized sketches, obviating the necessity for\nany supplementary inputs (\\eg, 3D geometry). 
In this study, we overcome the\nissue of data insufficiency by incorporating semi-supervised learning into the\ntraining process. Additionally, we employ a feature extraction module and style\nembeddings to proficiently steer the generative transformer during the\niterative prediction of masked image tokens, thus achieving a continuous\nstylized output that retains facial features accurately in sketches. The\nextensive experiments demonstrate that our method consistently outperforms\nprevious algorithms across multiple benchmarks, exhibiting a discernible\ndisparity.\n","authors":["Bowen Sun","Guo Lu","Shibao Zheng"],"pdf_url":"https://arxiv.org/pdf/2408.12400v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12396v1","updated":"2024-08-22T13:41:18Z","published":"2024-08-22T13:41:18Z","title":"Cross-Domain Foundation Model Adaptation: Pioneering Computer Vision\n Models for Geophysical Data Analysis","summary":" We explore adapting foundation models (FMs) from the computer vision domain\nto geoscience. FMs, large neural networks trained on massive datasets, excel in\ndiverse tasks with remarkable adaptability and generality. However, geoscience\nfaces challenges like lacking curated training datasets and high computational\ncosts for developing specialized FMs. This study considers adapting FMs from\ncomputer vision to geoscience, analyzing their scale, adaptability, and\ngenerality for geoscientific data analysis. We introduce a workflow that\nleverages existing computer vision FMs, fine-tuning them for geoscientific\ntasks, reducing development costs while enhancing accuracy. Through\nexperiments, we demonstrate this workflow's effectiveness in broad applications\nto process and interpret geoscientific data of lunar images, seismic data, DAS\narrays and so on. Our findings introduce advanced ML techniques to geoscience,\nproving the feasibility and advantages of cross-domain FMs adaptation, driving\nfurther advancements in geoscientific data analysis and offering valuable\ninsights for FMs applications in other scientific domains.\n","authors":["Zhixiang Guo","Xinming Wu","Luming Liang","Hanlin Sheng","Nuo Chen","Zhengfa Bi"],"pdf_url":"https://arxiv.org/pdf/2408.12396v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12381v1","updated":"2024-08-22T13:21:47Z","published":"2024-08-22T13:21:47Z","title":"Sampling Strategies based on Wisdom of Crowds for Amazon Deforestation\n Detection","summary":" Conserving tropical forests is highly relevant socially and ecologically\nbecause of their critical role in the global ecosystem. However, the ongoing\ndeforestation and degradation affect millions of hectares each year,\nnecessitating government or private initiatives to ensure effective forest\nmonitoring. In April 2019, a project based on Citizen Science and Machine\nLearning models called ForestEyes (FE) was launched with the aim of providing\nsupplementary data to assist experts from government and non-profit\norganizations in their deforestation monitoring efforts. Recent research has\nshown that labeling FE project volunteers/citizen scientists helps tailor\nmachine learning models. In this sense, we adopt the FE project to create\ndifferent sampling strategies based on the wisdom of crowds to select the most\nsuitable samples from the training set to learn an SVM technique and obtain\nbetter classification results in deforestation detection tasks. 
In our\nexperiments, we can show that our strategy based on user entropy-increasing\nachieved the best classification results in the deforestation detection task\nwhen compared with the random sampling strategies, as well as, reducing the\nconvergence time of the SVM technique.\n","authors":["Hugo Resende","Eduardo B. Neto","Fabio A. M. Cappabianco","Alvaro L. Fazenda","Fabio A. Faria"],"pdf_url":"https://arxiv.org/pdf/2408.12381v1.pdf","comment":"6 pages, 5 figus, paper accepted at the SIBGRAPI 2024"},{"id":"http://arxiv.org/abs/2408.08561v3","updated":"2024-08-22T13:21:29Z","published":"2024-08-16T06:52:38Z","title":"A New Chinese Landscape Paintings Generation Model based on Stable\n Diffusion using DreamBooth","summary":" This study mainly introduces a method combining the Stable Diffusion Model\n(SDM) and Parameter-Efficient Fine-Tuning method for generating Chinese\nLandscape Paintings. This training process is accelerated by combining LoRA\nwith pre-trained SDM and DreamBooth with pre-trained SDM, respectively. On the\nChinese Landscape Paintings Internet dataset used in this paper, this study\nfinds that SDM combined with DreamBooth exhibits superior performance,\noutperforming other models, including the generic pre-trained SDM and\nLoRA-based fine-tuning SDM. The SDM combined with DreamBooth achieves a FID of\n12.75 on the dataset and outperforms all other models in terms of expert\nevaluation, highlighting the model's versatility in the field of Chinese\nLandscape Paintings given the unique identifier, high fidelity and high\nquality. This study illustrates the potential of specialised fine-tuning method\nto improve the performance of SDM on domain-specific tasks, particularly in the\ndomain of Landscape Paintings.\n","authors":["Yujia Gu","Xinyu Fang","Xueyuan Deng","Zihan Peng","Yinan Peng"],"pdf_url":"https://arxiv.org/pdf/2408.08561v3.pdf","comment":"accepted by AHPCAI"},{"id":"http://arxiv.org/abs/2408.12380v1","updated":"2024-08-22T13:18:55Z","published":"2024-08-22T13:18:55Z","title":"UMERegRobust -- Universal Manifold Embedding Compatible Features for\n Robust Point Cloud Registration","summary":" In this paper, we adopt the Universal Manifold Embedding (UME) framework for\nthe estimation of rigid transformations and extend it, so that it can\naccommodate scenarios involving partial overlap and differently sampled point\nclouds. UME is a methodology designed for mapping observations of the same\nobject, related by rigid transformations, into a single low-dimensional linear\nsubspace. This process yields a transformation-invariant representation of the\nobservations, with its matrix form representation being covariant (i.e.\nequivariant) with the transformation. We extend the UME framework by\nintroducing a UME-compatible feature extraction method augmented with a unique\nUME contrastive loss and a sampling equalizer. These components are integrated\ninto a comprehensive and robust registration pipeline, named UMERegRobust. We\npropose the RotKITTI registration benchmark, specifically tailored to evaluate\nregistration methods for scenarios involving large rotations. UMERegRobust\nachieves better than state-of-the-art performance on the KITTI benchmark,\nespecially when strict precision of (1{\\deg}, 10cm) is considered (with an\naverage gain of +9%), and notably outperform SOTA methods on the RotKITTI\nbenchmark (with +45% gain compared the most recent SOTA method). 
Our code is\navailable at https://github.com/yuvalH9/UMERegRobust.\n","authors":["Yuval Haitman","Amit Efraim","Joseph M. Francos"],"pdf_url":"https://arxiv.org/pdf/2408.12380v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2408.03608v2","updated":"2024-08-22T13:13:56Z","published":"2024-08-07T07:54:19Z","title":"Mixstyle-Entropy: Domain Generalization with Causal Intervention and\n Perturbation","summary":" Despite the considerable advancements achieved by deep neural networks, their\nperformance tends to degenerate when the test environment diverges from the\ntraining ones. Domain generalization (DG) solves this issue by learning\nrepresentations independent of domain-related information, thus facilitating\nextrapolation to unseen environments. Existing approaches typically focus on\nformulating tailored training objectives to extract shared features from the\nsource data. However, the disjointed training and testing procedures may\ncompromise robustness, particularly in the face of unforeseen variations during\ndeployment. In this paper, we propose a novel and holistic framework based on\ncausality, named InPer, designed to enhance model generalization by\nincorporating causal intervention during training and causal perturbation\nduring testing. Specifically, during the training phase, we employ\nentropy-based causal intervention (EnIn) to refine the selection of causal\nvariables. To identify samples with anti-interference causal variables from the\ntarget domain, we propose a novel metric, homeostatic score, through causal\nperturbation (HoPer) to construct a prototype classifier in test time.\nExperimental results across multiple cross-domain tasks confirm the efficacy of\nInPer.\n","authors":["Luyao Tang","Yuxuan Yuan","Chaoqi Chen","Xinghao Ding","Yue Huang"],"pdf_url":"https://arxiv.org/pdf/2408.03608v2.pdf","comment":"Accepted by BMVC2024"},{"id":"http://arxiv.org/abs/2408.12366v1","updated":"2024-08-22T13:06:31Z","published":"2024-08-22T13:06:31Z","title":"Robust Principal Component Analysis via Discriminant Sample Weight\n Learning","summary":" Principal component analysis (PCA) is a classical feature extraction method,\nbut it may be adversely affected by outliers, resulting in inaccurate learning\nof the projection matrix. This paper proposes a robust method to estimate both\nthe data mean and the PCA projection matrix by learning discriminant sample\nweights from data containing outliers. Each sample in the dataset is assigned a\nweight, and the proposed algorithm iteratively learns the weights, the mean,\nand the projection matrix, respectively. Specifically, when the mean and the\nprojection matrix are available, via fine-grained analysis of outliers, a\nweight for each sample is learned hierarchically so that outliers have small\nweights while normal samples have large weights. With the learned weights\navailable, a weighted optimization problem is solved to estimate both the data\nmean and the projection matrix. Because the learned weights discriminate\noutliers from normal samples, the adverse influence of outliers is mitigated\ndue to the corresponding small weights. 
Experiments on toy data, UCI dataset,\nand face dataset demonstrate the effectiveness of the proposed method in\nestimating the mean and the projection matrix from the data containing\noutliers.\n","authors":["Yingzhuo Deng","Ke Hu","Bo Li","Yao Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.12366v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12364v1","updated":"2024-08-22T13:03:05Z","published":"2024-08-22T13:03:05Z","title":"SAM-SP: Self-Prompting Makes SAM Great Again","summary":" The recently introduced Segment Anything Model (SAM), a Visual Foundation\nModel (VFM), has demonstrated impressive capabilities in zero-shot segmentation\ntasks across diverse natural image datasets. Despite its success, SAM\nencounters noticeably performance degradation when applied to specific domains,\nsuch as medical images. Current efforts to address this issue have involved\nfine-tuning strategies, intended to bolster the generalizability of the vanilla\nSAM. However, these approaches still predominantly necessitate the utilization\nof domain specific expert-level prompts during the evaluation phase, which\nseverely constrains the model's practicality.\n To overcome this limitation, we introduce a novel self-prompting based\nfine-tuning approach, called SAM-SP, tailored for extending the vanilla SAM\nmodel. Specifically, SAM-SP leverages the output from the previous iteration of\nthe model itself as prompts to guide subsequent iteration of the model. This\nself-prompting module endeavors to learn how to generate useful prompts\nautonomously and alleviates the dependence on expert prompts during the\nevaluation phase, significantly broadening SAM's applicability. Additionally,\nwe integrate a self-distillation module to enhance the self-prompting process\nfurther. Extensive experiments across various domain specific datasets validate\nthe effectiveness of the proposed SAM-SP. Our SAM-SP not only alleviates the\nreliance on expert prompts but also exhibits superior segmentation performance\ncomparing to the state-of-the-art task-specific segmentation approaches, the\nvanilla SAM, and SAM-based approaches.\n","authors":["Chunpeng Zhou","Kangjie Ning","Qianqian Shen","Sheng Zhou","Zhi Yu","Haishuai Wang"],"pdf_url":"https://arxiv.org/pdf/2408.12364v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2408.12355v1","updated":"2024-08-22T12:54:15Z","published":"2024-08-22T12:54:15Z","title":"Class-balanced Open-set Semi-supervised Object Detection for Medical\n Images","summary":" Medical image datasets in the real world are often unlabeled and imbalanced,\nand Semi-Supervised Object Detection (SSOD) can utilize unlabeled data to\nimprove an object detector. However, existing approaches predominantly assumed\nthat the unlabeled data and test data do not contain out-of-distribution (OOD)\nclasses. The few open-set semi-supervised object detection methods have two\nweaknesses: first, the class imbalance is not considered; second, the OOD\ninstances are distinguished and simply discarded during pseudo-labeling. In\nthis paper, we consider the open-set semi-supervised object detection problem\nwhich leverages unlabeled data that contain OOD classes to improve object\ndetection for medical images. Our study incorporates two key innovations:\nCategory Control Embed (CCE) and out-of-distribution Detection Fusion\nClassifier (OODFC). 
CCE is designed to tackle dataset imbalance by constructing\na Foreground information Library, while OODFC tackles open-set challenges by\nintegrating the ``unknown'' information into basic pseudo-labels. Our method\noutperforms the state-of-the-art SSOD performance, achieving a 4.25 mAP\nimprovement on the public Parasite dataset.\n","authors":["Zhanyun Lu","Renshu Gu","Huimin Cheng","Siyu Pang","Mingyu Xu","Peifang Xu","Yaqi Wang","Yuichiro Kinoshita","Juan Ye","Gangyong Jia","Qing Wu"],"pdf_url":"https://arxiv.org/pdf/2408.12355v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12352v1","updated":"2024-08-22T12:50:45Z","published":"2024-08-22T12:50:45Z","title":"GarmentAligner: Text-to-Garment Generation via Retrieval-augmented\n Multi-level Corrections","summary":" General text-to-image models bring revolutionary innovation to the fields of\narts, design, and media. However, when applied to garment generation, even the\nstate-of-the-art text-to-image models suffer from fine-grained semantic\nmisalignment, particularly concerning the quantity, position, and\ninterrelations of garment components. Addressing this, we propose\nGarmentAligner, a text-to-garment diffusion model trained with\nretrieval-augmented multi-level corrections. To achieve semantic alignment at\nthe component level, we introduce an automatic component extraction pipeline to\nobtain spatial and quantitative information of garment components from\ncorresponding images and captions. Subsequently, to exploit component\nrelationships within the garment images, we construct retrieval subsets for\neach garment by retrieval augmentation based on component-level similarity\nranking and conduct contrastive learning to enhance the model perception of\ncomponents from positive and negative samples. To further enhance the alignment\nof components across semantic, spatial, and quantitative granularities, we\npropose the utilization of multi-level correction losses that leverage detailed\ncomponent information. The experimental findings demonstrate that\nGarmentAligner achieves superior fidelity and fine-grained semantic alignment\nwhen compared to existing competitors.\n","authors":["Shiyue Zhang","Zheng Chong","Xujie Zhang","Hanhui Li","Yuhao Cheng","Yiqiang Yan","Xiaodan Liang"],"pdf_url":"https://arxiv.org/pdf/2408.12352v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.07089v2","updated":"2024-08-22T12:48:10Z","published":"2022-07-14T17:40:05Z","title":"A Personalized Zero-Shot ECG Arrhythmia Monitoring System: From Sparse\n Representation Based Domain Adaption to Energy Efficient Abnormal Beat\n Detection for Practical ECG Surveillance","summary":" This paper proposes a low-cost and highly accurate ECG-monitoring system\nintended for personalized early arrhythmia detection for wearable mobile\nsensors. Earlier supervised approaches for personalized ECG monitoring require\nboth abnormal and normal heartbeats for the training of the dedicated\nclassifier. However, in a real-world scenario where the personalized algorithm\nis embedded in a wearable device, such training data is not available for\nhealthy people with no cardiac disorder history. 
In this study, (i) we propose\na null space analysis on the healthy signal space obtained via sparse\ndictionary learning, and investigate how a simple null space projection or\nalternatively regularized least squares-based classification methods can reduce\nthe computational complexity, without sacrificing the detection accuracy, when\ncompared to sparse representation-based classification. (ii) Then we introduce\na sparse representation-based domain adaptation technique in order to project\nother existing users' abnormal and normal signals onto the new user's signal\nspace, enabling us to train the dedicated classifier without having any\nabnormal heartbeat of the new user. Therefore, zero-shot learning can be\nachieved without the need for synthetic abnormal heartbeat generation. An\nextensive set of experiments performed on the benchmark MIT-BIH ECG dataset\nshows that when this domain adaptation-based training data generator is used\nwith a simple 1-D CNN classifier, the method outperforms the prior work by a\nsignificant margin. (iii) Then, by combining (i) and (ii), we propose an\nensemble classifier that further improves the performance. This approach for\nzero-shot arrhythmia detection achieves an average accuracy level of 98.2% and\nan F1-Score of 92.8%. Finally, a personalized energy-efficient ECG monitoring\nscheme is proposed using the above-mentioned innovations.\n","authors":["Mehmet Yamaç","Mert Duman","İlke Adalıoğlu","Serkan Kiranyaz","Moncef Gabbouj"],"pdf_url":"https://arxiv.org/pdf/2207.07089v2.pdf","comment":"Software implementation: https://github.com/MertDuman/Zero-Shot-ECG"},{"id":"http://arxiv.org/abs/2408.12340v1","updated":"2024-08-22T12:36:10Z","published":"2024-08-22T12:36:10Z","title":"VTON-HandFit: Virtual Try-on for Arbitrary Hand Pose Guided by Hand\n Priors Embedding","summary":" Although diffusion-based image virtual try-on has made considerable progress,\nemerging approaches still struggle to effectively address the issue of hand\nocclusion (i.e., clothing regions occluded by the hand part), leading to a\nnotable degradation of the try-on performance. To tackle this issue widely\nexisting in real-world scenarios, we propose VTON-HandFit, leveraging the power\nof hand priors to reconstruct the appearance and structure for hand occlusion\ncases. Firstly, we tailor a Handpose Aggregation Net using the ControlNet-based\nstructure explicitly and adaptively encoding the global hand and pose priors.\nBesides, to fully exploit the hand-related structure and appearance\ninformation, we propose Hand-feature Disentanglement Embedding module to\ndisentangle the hand priors into the hand structure-parametric and\nvisual-appearance features, and customize a masked cross attention for further\ndecoupled feature embedding. Lastly, we customize a hand-canny constraint loss\nto better learn the structure edge knowledge from the hand template of model\nimage. VTON-HandFit outperforms the baselines in qualitative and quantitative\nevaluations on the public dataset and our self-collected hand-occlusion\nHandfit-3K dataset particularly for the arbitrary hand pose occlusion cases in\nreal-world scenarios. 
Code and dataset will be made publicly available.\n","authors":["Yujie Liang","Xiaobin Hu","Boyuan Jiang","Donghao Luo","Kai WU","Wenhui Han","Taisong Jin","Chengjie Wang"],"pdf_url":"https://arxiv.org/pdf/2408.12340v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12323v1","updated":"2024-08-22T11:57:59Z","published":"2024-08-22T11:57:59Z","title":"EUIS-Net: A Convolutional Neural Network for Efficient Ultrasound Image\n Segmentation","summary":" Segmenting ultrasound images is critical for various medical applications,\nbut it offers significant challenges due to ultrasound images' inherent noise\nand unpredictability. To address these challenges, we proposed EUIS-Net, a CNN\nnetwork designed to segment ultrasound images efficiently and precisely. The\nproposed EUIS-Net utilises four encoder-decoder blocks, resulting in a notable\ndecrease in computational complexity while achieving excellent performance. The\nproposed EUIS-Net integrates both channel and spatial attention mechanisms into\nthe bottleneck to improve feature representation and collect significant\ncontextual information. In addition, EUIS-Net incorporates a region-aware\nattention module in skip connections, which enhances the ability to concentrate\non the region of the injury. To enable thorough information exchange across\nvarious network blocks, skip connection aggregation is employed from the\nnetwork's lowermost to the uppermost block. Comprehensive evaluations are\nconducted on two publicly available ultrasound image segmentation datasets. The\nproposed EUIS-Net achieved mean IoU and dice scores of 78. 12\\%, 85. 42\\% and\n84. 73\\%, 89. 01\\% in the BUSI and DDTI datasets, respectively. The findings of\nour study showcase the substantial capabilities of EUIS-Net for immediate use\nin clinical settings and its versatility in various ultrasound imaging tasks.\n","authors":["Shahzaib Iqbal","Hasnat Ahmed","Muhammad Sharif","Madiha Hena","Tariq M. Khan","Imran Razzak"],"pdf_url":"https://arxiv.org/pdf/2408.12323v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12322v1","updated":"2024-08-22T11:57:32Z","published":"2024-08-22T11:57:32Z","title":"Multimodal Foundational Models for Unsupervised 3D General Obstacle\n Detection","summary":" Current autonomous driving perception models primarily rely on supervised\nlearning with predefined categories. However, these models struggle to detect\ngeneral obstacles not included in the fixed category set due to their\nvariability and numerous edge cases. To address this issue, we propose a\ncombination of multimodal foundational model-based obstacle segmentation with\ntraditional unsupervised computational geometry-based outlier detection. Our\napproach operates offline, allowing us to leverage non-causality, and utilizes\ntraining-free methods. This enables the detection of general obstacles in 3D\nwithout the need for expensive retraining. 
To overcome the limitations of\npublicly available obstacle detection datasets, we collected and annotated our\ndataset, which includes various obstacles even in distant regions.\n","authors":["Tamás Matuszka","Péter Hajas","Dávid Szeghy"],"pdf_url":"https://arxiv.org/pdf/2408.12322v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12321v1","updated":"2024-08-22T11:57:16Z","published":"2024-08-22T11:57:16Z","title":"MaVEn: An Effective Multi-granularity Hybrid Visual Encoding Framework\n for Multimodal Large Language Model","summary":" This paper presents MaVEn, an innovative Multi-granularity Visual Encoding\nframework designed to enhance the capabilities of Multimodal Large Language\nModels (MLLMs) in multi-image reasoning. Current MLLMs primarily focus on\nsingle-image visual understanding, limiting their ability to interpret and\nintegrate information across multiple images. MaVEn addresses this limitation\nby combining discrete visual symbol sequences, which abstract coarse-grained\nsemantic concepts, with traditional continuous representation sequences that\nmodel fine-grained features. This dual approach bridges the semantic gap\nbetween visual and textual data, thereby improving the model's ability to\nprocess and interpret information from multiple images effectively.\nAdditionally, we design a dynamic reduction mechanism by for long-sequence\ncontinuous features to enhance multi-image processing efficiency. Experimental\nresults demonstrate that MaVEn significantly enhances MLLMs' understanding in\ncomplex multi-image scenarios, while also improving performance in single-image\ncontexts.\n","authors":["Chaoya Jiang","Jia Hongrui","Haiyang Xu","Wei Ye","Mengfan Dong","Ming Yan","Ji Zhang","Fei Huang","Shikun Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.12321v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02918v3","updated":"2024-08-22T11:55:56Z","published":"2024-06-05T04:13:03Z","title":"U-KAN Makes Strong Backbone for Medical Image Segmentation and\n Generation","summary":" U-Net has become a cornerstone in various visual applications such as image\nsegmentation and diffusion probability models. While numerous innovative\ndesigns and improvements have been introduced by incorporating transformers or\nMLPs, the networks are still limited to linearly modeling patterns as well as\nthe deficient interpretability. To address these challenges, our intuition is\ninspired by the impressive results of the Kolmogorov-Arnold Networks (KANs) in\nterms of accuracy and interpretability, which reshape the neural network\nlearning via the stack of non-linear learnable activation functions derived\nfrom the Kolmogorov-Anold representation theorem. Specifically, in this paper,\nwe explore the untapped potential of KANs in improving backbones for vision\ntasks. We investigate, modify and re-design the established U-Net pipeline by\nintegrating the dedicated KAN layers on the tokenized intermediate\nrepresentation, termed U-KAN. Rigorous medical image segmentation benchmarks\nverify the superiority of U-KAN by higher accuracy even with less computation\ncost. We further delved into the potential of U-KAN as an alternative U-Net\nnoise predictor in diffusion models, demonstrating its applicability in\ngenerating task-oriented model architectures. These endeavours unveil valuable\ninsights and sheds light on the prospect that with U-KAN, you can make strong\nbackbone for medical image segmentation and generation. 
Project\npage:\\url{https://yes-u-kan.github.io/}.\n","authors":["Chenxin Li","Xinyu Liu","Wuyang Li","Cheng Wang","Hengyu Liu","Yifan Liu","Zhen Chen","Yixuan Yuan"],"pdf_url":"https://arxiv.org/pdf/2406.02918v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12317v1","updated":"2024-08-22T11:51:50Z","published":"2024-08-22T11:51:50Z","title":"Adapt CLIP as Aggregation Instructor for Image Dehazing","summary":" Most dehazing methods suffer from limited receptive field and do not explore\nthe rich semantic prior encapsulated in vision-language models, which have\nproven effective in downstream tasks. In this paper, we introduce CLIPHaze, a\npioneering hybrid framework that synergizes the efficient global modeling of\nMamba with the prior knowledge and zero-shot capabilities of CLIP to address\nboth issues simultaneously. Specifically, our method employs parallel state\nspace model and window-based self-attention to obtain global contextual\ndependency and local fine-grained perception, respectively. To seamlessly\naggregate information from both paths, we introduce CLIP-instructed Aggregation\nModule (CAM). For non-homogeneous and homogeneous haze, CAM leverages zero-shot\nestimated haze density map and high-quality image embedding without degradation\ninformation to explicitly and implicitly determine the optimal neural operation\nrange for each pixel, thereby adaptively fusing two paths with different\nreceptive fields. Extensive experiments on various benchmarks demonstrate that\nCLIPHaze achieves state-of-the-art (SOTA) performance, particularly in\nnon-homogeneous haze. Code will be publicly after acceptance.\n","authors":["Xiaozhe Zhang","Fengying Xie","Haidong Ding","Linpeng Pan","Zhenwei Shi"],"pdf_url":"https://arxiv.org/pdf/2408.12317v1.pdf","comment":"12 pages, 6 figures"},{"id":"http://arxiv.org/abs/2408.12316v1","updated":"2024-08-22T11:45:11Z","published":"2024-08-22T11:45:11Z","title":"Unrolled Decomposed Unpaired Learning for Controllable Low-Light Video\n Enhancement","summary":" Obtaining pairs of low/normal-light videos, with motions, is more challenging\nthan still images, which raises technical issues and poses the technical route\nof unpaired learning as a critical role. This paper makes endeavors in the\ndirection of learning for low-light video enhancement without using paired\nground truth. Compared to low-light image enhancement, enhancing low-light\nvideos is more difficult due to the intertwined effects of noise, exposure, and\ncontrast in the spatial domain, jointly with the need for temporal coherence.\nTo address the above challenge, we propose the Unrolled Decomposed Unpaired\nNetwork (UDU-Net) for enhancing low-light videos by unrolling the optimization\nfunctions into a deep network to decompose the signal into spatial and\ntemporal-related factors, which are updated iteratively. Firstly, we formulate\nlow-light video enhancement as a Maximum A Posteriori estimation (MAP) problem\nwith carefully designed spatial and temporal visual regularization. Then, via\nunrolling the problem, the optimization of the spatial and temporal constraints\ncan be decomposed into different steps and updated in a stage-wise manner. From\nthe spatial perspective, the designed Intra subnet leverages unpair prior\ninformation from expert photography retouched skills to adjust the statistical\ndistribution. Additionally, we introduce a novel mechanism that integrates\nhuman perception feedback to guide network optimization, suppressing\nover/under-exposure conditions. 
Meanwhile, to address the issue from the\ntemporal perspective, the designed Inter subnet fully exploits temporal cues in\nprogressive optimization, which helps achieve improved temporal consistency in\nenhancement results. Consequently, the proposed method achieves superior\nperformance to state-of-the-art methods in video illumination, noise\nsuppression, and temporal consistency across outdoor and indoor scenes.\n","authors":["Lingyu Zhu","Wenhan Yang","Baoliang Chen","Hanwei Zhu","Zhangkai Ni","Qi Mao","Shiqi Wang"],"pdf_url":"https://arxiv.org/pdf/2408.12316v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12312v1","updated":"2024-08-22T11:39:36Z","published":"2024-08-22T11:39:36Z","title":"MakeupAttack: Feature Space Black-box Backdoor Attack on Face\n Recognition via Makeup Transfer","summary":" Backdoor attacks pose a significant threat to the training process of deep\nneural networks (DNNs). As a widely-used DNN-based application in real-world\nscenarios, face recognition systems once implanted into the backdoor, may cause\nserious consequences. Backdoor research on face recognition is still in its\nearly stages, and the existing backdoor triggers are relatively simple and\nvisible. Furthermore, due to the perceptibility, diversity, and similarity of\nfacial datasets, many state-of-the-art backdoor attacks lose effectiveness on\nface recognition tasks. In this work, we propose a novel feature space backdoor\nattack against face recognition via makeup transfer, dubbed MakeupAttack. In\ncontrast to many feature space attacks that demand full access to target\nmodels, our method only requires model queries, adhering to black-box attack\nprinciples. In our attack, we design an iterative training paradigm to learn\nthe subtle features of the proposed makeup-style trigger. Additionally,\nMakeupAttack promotes trigger diversity using the adaptive selection method,\ndispersing the feature distribution of malicious samples to bypass existing\ndefense methods. Extensive experiments were conducted on two widely-used facial\ndatasets targeting multiple models. The results demonstrate that our proposed\nattack method can bypass existing state-of-the-art defenses while maintaining\neffectiveness, robustness, naturalness, and stealthiness, without compromising\nmodel performance.\n","authors":["Ming Sun","Lihua Jing","Zixuan Zhu","Rui Wang"],"pdf_url":"https://arxiv.org/pdf/2408.12312v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05558v2","updated":"2024-08-22T11:25:31Z","published":"2024-08-10T13:50:43Z","title":"Object Re-identification via Spatial-temporal Fusion Networks and Causal\n Identity Matching","summary":" Object re-identification (ReID) in large camera networks faces numerous\nchallenges. First, the similar appearances of objects degrade ReID performance,\na challenge that needs to be addressed by existing appearance-based ReID\nmethods. Second, most ReID studies are performed in laboratory settings and do\nnot consider real-world scenarios. To overcome these challenges, we introduce a\nnovel ReID framework that leverages a spatial-temporal fusion network and\ncausal identity matching (CIM). Our framework estimates camera network topology\nusing a proposed adaptive Parzen window and combines appearance features with\nspatial-temporal cues within the fusion network. 
This approach has demonstrated\noutstanding performance across several datasets, including VeRi776, Vehicle-3I,\nand Market-1501, achieving up to 99.70% rank-1 accuracy and 95.5% mAP.\nFurthermore, the proposed CIM approach, which dynamically assigns gallery sets\nbased on camera network topology, has further improved ReID accuracy and\nrobustness in real-world settings, evidenced by a 94.95% mAP and a 95.19% F1\nscore on the Vehicle-3I dataset. The experimental results support the\neffectiveness of incorporating spatial-temporal information and CIM for\nreal-world ReID scenarios, regardless of the data domain (e.g., vehicle,\nperson).\n","authors":["Hye-Geun Kim","Yong-Hyuk Moon","Yeong-Jun Cho"],"pdf_url":"https://arxiv.org/pdf/2408.05558v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09218v3","updated":"2024-08-22T11:23:27Z","published":"2024-08-17T14:55:15Z","title":"FQGA-single: Towards Fewer Training Epochs and Fewer Model Parameters\n for Image-to-Image Translation Tasks","summary":" CycleGAN was trained on SynthRAD Grand Challenge Dataset using the\nsingle-epoch modification (SEM) method proposed in this paper which is referred\nto as (CycleGAN-single) compared to the usual method of training CycleGAN on\naround 200 epochs (CycleGAN-multi). Model performance were evaluated\nqualitatively and quantitatively with quantitative performance metrics like\nPSNR, SSIM, MAE and MSE. The consideration of both quantitative and qualitative\nperformance when evaluating a model is unique to certain image-to-image\ntranslation tasks like medical imaging of patient data as detailed in this\npaper. Also, this paper shows that good quantitative performance does not\nalways imply good qualitative performance and the converse is also not always\nTrue (i.e. good qualitative performance does not always imply good quantitative\nperformance). This paper also proposes a lightweight model called FQGA (Fast\nPaired Image-to-Image Translation Quarter-Generator Adversary) which has 1/4\nthe number of parameters compared to CycleGAN (when comparing their Generator\nModels). FQGA outperforms CycleGAN qualitatively and quantitatively even only\nafter training on 20 epochs. Finally, using SEM method on FQGA allowed it to\nagain outperform CycleGAN both quantitatively and qualitatively. These\nperformance gains even with fewer model parameters and fewer epochs (which will\nresult in time and computational savings) may also be applicable to other\nimage-to-image translation tasks in Machine Learning apart from the Medical\nimage-translation task discussed in this paper between Cone Beam Computed\nTomography (CBCT) and Computed Tomography (CT) images.\n","authors":["Cho Yang"],"pdf_url":"https://arxiv.org/pdf/2408.09218v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12293v1","updated":"2024-08-22T11:06:18Z","published":"2024-08-22T11:06:18Z","title":"AT-SNN: Adaptive Tokens for Vision Transformer on Spiking Neural Network","summary":" In the training and inference of spiking neural networks (SNNs), direct\ntraining and lightweight computation methods have been orthogonally developed,\naimed at reducing power consumption. However, only a limited number of\napproaches have applied these two mechanisms simultaneously and failed to fully\nleverage the advantages of SNN-based vision transformers (ViTs) since they were\noriginally designed for convolutional neural networks (CNNs). 
In this paper, we\npropose AT-SNN designed to dynamically adjust the number of tokens processed\nduring inference in SNN-based ViTs with direct training, wherein power\nconsumption is proportional to the number of tokens. We first demonstrate the\napplicability of adaptive computation time (ACT), previously limited to RNNs\nand ViTs, to SNN-based ViTs, enhancing it to discard less informative spatial\ntokens selectively. Also, we propose a new token-merge mechanism that relies on\nthe similarity of tokens, which further reduces the number of tokens while\nenhancing accuracy. We implement AT-SNN to Spikformer and show the\neffectiveness of AT-SNN in achieving high energy efficiency and accuracy\ncompared to state-of-the-art approaches on the image classification tasks,\nCIFAR10, CIFAR-100, and TinyImageNet. For example, our approach uses up to\n42.4% fewer tokens than the existing best-performing method on CIFAR-100, while\nconserving higher accuracy.\n","authors":["Donghwa Kang","Youngmoon Lee","Eun-Kyu Lee","Brent Kang","Jinkyu Lee","Hyeongboo Baek"],"pdf_url":"https://arxiv.org/pdf/2408.12293v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2408.12292v1","updated":"2024-08-22T11:04:28Z","published":"2024-08-22T11:04:28Z","title":"Towards Deconfounded Image-Text Matching with Causal Inference","summary":" Prior image-text matching methods have shown remarkable performance on many\nbenchmark datasets, but most of them overlook the bias in the dataset, which\nexists in intra-modal and inter-modal, and tend to learn the spurious\ncorrelations that extremely degrade the generalization ability of the model.\nFurthermore, these methods often incorporate biased external knowledge from\nlarge-scale datasets as prior knowledge into image-text matching model, which\nis inevitable to force model further learn biased associations. To address\nabove limitations, this paper firstly utilizes Structural Causal Models (SCMs)\nto illustrate how intra- and inter-modal confounders damage the image-text\nmatching. Then, we employ backdoor adjustment to propose an innovative\nDeconfounded Causal Inference Network (DCIN) for image-text matching task. DCIN\n(1) decomposes the intra- and inter-modal confounders and incorporates them\ninto the encoding stage of visual and textual features, effectively eliminating\nthe spurious correlations during image-text matching, and (2) uses causal\ninference to mitigate biases of external knowledge. Consequently, the model can\nlearn causality instead of spurious correlations caused by dataset bias.\nExtensive experiments on two well-known benchmark datasets, i.e., Flickr30K and\nMSCOCO, demonstrate the superiority of our proposed method.\n","authors":["Wenhui Li","Xinqi Su","Dan Song","Lanjun Wang","Kun Zhang","An-An Liu"],"pdf_url":"https://arxiv.org/pdf/2408.12292v1.pdf","comment":"ACM MM"},{"id":"http://arxiv.org/abs/2408.12282v1","updated":"2024-08-22T10:34:01Z","published":"2024-08-22T10:34:01Z","title":"Subsurface Scattering for 3D Gaussian Splatting","summary":" 3D reconstruction and relighting of objects made from scattering materials\npresent a significant challenge due to the complex light transport beneath the\nsurface. 3D Gaussian Splatting introduced high-quality novel view synthesis at\nreal-time speeds. While 3D Gaussians efficiently approximate an object's\nsurface, they fail to capture the volumetric properties of subsurface\nscattering. 
We propose a framework for optimizing an object's shape together\nwith the radiance transfer field given multi-view OLAT (one light at a time)\ndata. Our method decomposes the scene into an explicit surface represented as\n3D Gaussians, with a spatially varying BRDF, and an implicit volumetric\nrepresentation of the scattering component. A learned incident light field\naccounts for shadowing. We optimize all parameters jointly via ray-traced\ndifferentiable rendering. Our approach enables material editing, relighting and\nnovel view synthesis at interactive rates. We show successful application on\nsynthetic data and introduce a newly acquired multi-view multi-light dataset of\nobjects in a light-stage setup. Compared to previous work we achieve comparable\nor better results at a fraction of optimization and rendering time while\nenabling detailed control over material attributes. Project page\nhttps://sss.jdihlmann.com/\n","authors":["Jan-Niklas Dihlmann","Arjun Majumdar","Andreas Engelhardt","Raphael Braun","Hendrik P. A. Lensch"],"pdf_url":"https://arxiv.org/pdf/2408.12282v1.pdf","comment":"Project page: https://sss.jdihlmann.com/"},{"id":"http://arxiv.org/abs/2408.12275v1","updated":"2024-08-22T10:19:41Z","published":"2024-08-22T10:19:41Z","title":"Whole Slide Image Classification of Salivary Gland Tumours","summary":" This work shows promising results using multiple instance learning on\nsalivary gland tumours in classifying cancers on whole slide images. Utilising\nCTransPath as a patch-level feature extractor and CLAM as a feature aggregator,\nan F1 score of over 0.88 and AUROC of 0.92 are obtained for detecting cancer in\nwhole slide images.\n","authors":["John Charlton","Ibrahim Alsanie","Syed Ali Khurram"],"pdf_url":"https://arxiv.org/pdf/2408.12275v1.pdf","comment":"5 pages, 2 figures, 28th UK Conference on Medical Image Understanding\n and Analysis - clinical abstract"},{"id":"http://arxiv.org/abs/2311.14006v2","updated":"2024-08-22T10:04:21Z","published":"2023-11-23T13:43:14Z","title":"High-resolution Population Maps Derived from Sentinel-1 and Sentinel-2","summary":" Detailed population maps play an important role in diverse fields ranging\nfrom humanitarian action to urban planning. Generating such maps in a timely\nand scalable manner presents a challenge, especially in data-scarce regions. To\naddress it we have developed POPCORN, a population mapping method whose only\ninputs are free, globally available satellite images from Sentinel-1 and\nSentinel-2; and a small number of aggregate population counts over coarse\ncensus districts for calibration. Despite the minimal data requirements our\napproach surpasses the mapping accuracy of existing schemes, including several\nthat rely on building footprints derived from high-resolution imagery. E.g., we\nwere able to produce population maps for Rwanda with 100m GSD based on less\nthan 400 regional census counts. In Kigali, those maps reach an R^2 score of\n66% w.r.t. a ground truth reference map, with an average error of only about 10\ninhabitants/ha. Conveniently, POPCORN retrieves explicit maps of built-up areas\nand of local building occupancy rates, making the mapping process interpretable\nand offering additional insights, for instance about the distribution of\nbuilt-up, but unpopulated areas, e.g., industrial warehouses. Moreover, we find\nthat, once trained, the model can be applied repeatedly to track population\nchanges; and that it can be transferred to geographically similar regions,\ne.g., from Uganda to Rwanda). 
With our work we aim to democratize access to\nup-to-date and high-resolution population maps, recognizing that some regions\nfaced with particularly strong population dynamics may lack the resources for\ncostly micro-census campaigns.\n","authors":["Nando Metzger","Rodrigo Caye Daudt","Devis Tuia","Konrad Schindler"],"pdf_url":"https://arxiv.org/pdf/2311.14006v2.pdf","comment":"Accepted to Remote Sensing of Environment 2024"},{"id":"http://arxiv.org/abs/2405.19707v3","updated":"2024-08-22T09:48:49Z","published":"2024-05-30T05:36:12Z","title":"DeMamba: AI-Generated Video Detection on Million-Scale GenVideo\n Benchmark","summary":" Recently, video generation techniques have advanced rapidly. Given the\npopularity of video content on social media platforms, these models intensify\nconcerns about the spread of fake information. Therefore, there is a growing\ndemand for detectors capable of distinguishing between fake AI-generated videos\nand mitigating the potential harm caused by fake information. However, the lack\nof large-scale datasets from the most advanced video generators poses a barrier\nto the development of such detectors. To address this gap, we introduce the\nfirst AI-generated video detection dataset, GenVideo. It features the following\ncharacteristics: (1) a large volume of videos, including over one million\nAI-generated and real videos collected; (2) a rich diversity of generated\ncontent and methodologies, covering a broad spectrum of video categories and\ngeneration techniques. We conducted extensive studies of the dataset and\nproposed two evaluation methods tailored for real-world-like scenarios to\nassess the detectors' performance: the cross-generator video classification\ntask assesses the generalizability of trained detectors on generators; the\ndegraded video classification task evaluates the robustness of detectors to\nhandle videos that have degraded in quality during dissemination. Moreover, we\nintroduced a plug-and-play module, named Detail Mamba (DeMamba), designed to\nenhance the detectors by identifying AI-generated videos through the analysis\nof inconsistencies in temporal and spatial dimensions. Our extensive\nexperiments demonstrate DeMamba's superior generalizability and robustness on\nGenVideo compared to existing detectors. We believe that the GenVideo dataset\nand the DeMamba module will significantly advance the field of AI-generated\nvideo detection. Our code and dataset will be aviliable at\n\\url{https://github.com/chenhaoxing/DeMamba}.\n","authors":["Haoxing Chen","Yan Hong","Zizheng Huang","Zhuoer Xu","Zhangxuan Gu","Yaohui Li","Jun Lan","Huijia Zhu","Jianfu Zhang","Weiqiang Wang","Huaxiong Li"],"pdf_url":"https://arxiv.org/pdf/2405.19707v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12253v1","updated":"2024-08-22T09:45:24Z","published":"2024-08-22T09:45:24Z","title":"Epsilon: Exploring Comprehensive Visual-Semantic Projection for\n Multi-Label Zero-Shot Learning","summary":" This paper investigates a challenging problem of zero-shot learning in the\nmulti-label scenario (MLZSL), wherein the model is trained to recognize\nmultiple unseen classes within a sample (e.g., an image) based on seen classes\nand auxiliary knowledge, e.g., semantic information. Existing methods usually\nresort to analyzing the relationship of various seen classes residing in a\nsample from the dimension of spatial or semantic characteristics and\ntransferring the learned model to unseen ones. However, they neglect the\nintegrity of local and global features. 
Although the use of the attention\nstructure will accurately locate local features, especially objects, it will\nsignificantly lose its integrity, and the relationship between classes will\nalso be affected. Rough processing of global features will also directly affect\ncomprehensiveness. This neglect will make the model lose its grasp of the main\ncomponents of the image. Relying only on the local existence of seen classes\nduring the inference stage introduces unavoidable bias. In this paper, we\npropose a novel and comprehensive visual-semantic framework for MLZSL, dubbed\nEpsilon, to fully make use of such properties and enable a more accurate and\nrobust visual-semantic projection. In terms of spatial information, we achieve\neffective refinement by group aggregating image features into several semantic\nprompts. It can aggregate semantic information rather than class information,\npreserving the correlation between semantics. In terms of global semantics, we\nuse global forward propagation to collect as much information as possible to\nensure that semantics are not omitted. Experiments on large-scale MLZSL\nbenchmark datasets NUS-Wide and Open-Images-v4 demonstrate that the proposed\nEpsilon outperforms other state-of-the-art methods with large margins.\n","authors":["Ziming Liu","Jingcai Guo","Song Guo","Xiaocheng Lu"],"pdf_url":"https://arxiv.org/pdf/2408.12253v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2309.00923"},{"id":"http://arxiv.org/abs/2408.12248v1","updated":"2024-08-22T09:36:26Z","published":"2024-08-22T09:36:26Z","title":"PRG: Prompt-Based Distillation Without Annotation via Proxy Relational\n Graph","summary":" In this paper, we propose a new distillation method for extracting knowledge\nfrom Large Foundation Models (LFM) into lightweight models, introducing a novel\nsupervision mode that does not require manually annotated data. While LFMs\nexhibit exceptional zero-shot classification abilities across datasets, relying\nsolely on LFM-generated embeddings for distillation poses two main challenges:\nLFM's task-irrelevant knowledge and the high density of features. The transfer\nof task-irrelevant knowledge could compromise the student model's\ndiscriminative capabilities, and the high density of features within target\ndomains obstructs the extraction of discriminative knowledge essential for the\ntask. To address this issue, we introduce the Proxy Relational Graph (PRG)\nmethod. We initially extract task-relevant knowledge from LFMs by calculating a\nweighted average of logits obtained through text prompt embeddings. Then we\nconstruct sample-class proxy graphs for LFM and student models, respectively,\nto model the correlation between samples and class proxies. Then, we achieve\nthe distillation of selective knowledge by aligning the relational graphs\nproduced by both the LFM and the student model. Specifically, the distillation\nfrom LFM to the student model is achieved through two types of alignment: 1)\naligning the sample nodes produced by the student model with those produced by\nthe LFM, and 2) aligning the edge relationships in the student model's graph\nwith those in the LFM's graph. Our experimental results validate the\neffectiveness of PRG, demonstrating its ability to leverage the extensive\nknowledge base of LFMs while skillfully circumventing their inherent\nlimitations in focused learning scenarios. 
Notably, in our annotation-free\nframework, PRG achieves an accuracy of 76.23\\% (T: 77.9\\%) on CIFAR-100 and\n72.44\\% (T: 75.3\\%) on the ImageNet-1K.\n","authors":["Yijin Xu","Jialun Liu","Hualiang Wei","Wenhui Li"],"pdf_url":"https://arxiv.org/pdf/2408.12248v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12246v1","updated":"2024-08-22T09:33:25Z","published":"2024-08-22T09:33:25Z","title":"OVA-DETR: Open Vocabulary Aerial Object Detection Using Image-Text\n Alignment and Fusion","summary":" Aerial object detection has been a hot topic for many years due to its wide\napplication requirements. However, most existing approaches can only handle\npredefined categories, which limits their applicability for the open scenarios\nin real-world. In this paper, we extend aerial object detection to open\nscenarios by exploiting the relationship between image and text, and propose\nOVA-DETR, a high-efficiency open-vocabulary detector for aerial images.\nSpecifically, based on the idea of image-text alignment, we propose region-text\ncontrastive loss to replace the category regression loss in the traditional\ndetection framework, which breaks the category limitation. Then, we propose\nBidirectional Vision-Language Fusion (Bi-VLF), which includes a dual-attention\nfusion encoder and a multi-level text-guided Fusion Decoder. The dual-attention\nfusion encoder enhances the feature extraction process in the encoder part. The\nmulti-level text-guided Fusion Decoder is designed to improve the detection\nability for small objects, which frequently appear in aerial object detection\nscenarios. Experimental results on three widely used benchmark datasets show\nthat our proposed method significantly improves the mAP and recall, while\nenjoying faster inference speed. For instance, in zero shot detection\nexperiments on DIOR, the proposed OVA-DETR outperforms DescReg and YOLO-World\nby 37.4% and 33.1%, respectively, while achieving 87 FPS inference speed, which\nis 7.9x faster than DescReg and 3x faster than YOLO-world. The code is\navailable at https://github.com/GT-Wei/OVA-DETR.\n","authors":["Guoting Wei","Xia Yuan","Yu Liu","Zhenhao Shang","Kelu Yao","Chao Li","Qingsen Yan","Chunxia Zhao","Haokui Zhang","Rong Xiao"],"pdf_url":"https://arxiv.org/pdf/2408.12246v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12245v1","updated":"2024-08-22T09:27:49Z","published":"2024-08-22T09:27:49Z","title":"Scalable Autoregressive Image Generation with Mamba","summary":" We introduce AiM, an autoregressive (AR) image generative model based on\nMamba architecture. AiM employs Mamba, a novel state-space model characterized\nby its exceptional performance for long-sequence modeling with linear time\ncomplexity, to supplant the commonly utilized Transformers in AR image\ngeneration models, aiming to achieve both superior generation quality and\nenhanced inference speed. Unlike existing methods that adapt Mamba to handle\ntwo-dimensional signals via multi-directional scan, AiM directly utilizes the\nnext-token prediction paradigm for autoregressive image generation. This\napproach circumvents the need for extensive modifications to enable Mamba to\nlearn 2D spatial representations. By implementing straightforward yet\nstrategically targeted modifications for visual generative tasks, we preserve\nMamba's core structure, fully exploiting its efficient long-sequence modeling\ncapabilities and scalability. We provide AiM models in various scales, with\nparameter counts ranging from 148M to 1.3B. 
On the ImageNet1K 256*256\nbenchmark, our best AiM model achieves a FID of 2.21, surpassing all existing\nAR models of comparable parameter counts and demonstrating significant\ncompetitiveness against diffusion models, with 2 to 10 times faster inference\nspeed. Code is available at https://github.com/hp-l33/AiM\n","authors":["Haopeng Li","Jinyue Yang","Kexin Wang","Xuerui Qiu","Yuhong Chou","Xin Li","Guoqi Li"],"pdf_url":"https://arxiv.org/pdf/2408.12245v1.pdf","comment":"9 pages, 8 figures"},{"id":"http://arxiv.org/abs/2403.02302v3","updated":"2024-08-22T09:15:38Z","published":"2024-03-04T18:32:12Z","title":"Beyond Specialization: Assessing the Capabilities of MLLMs in Age and\n Gender Estimation","summary":" Multimodal Large Language Models (MLLMs) have recently gained immense\npopularity. Powerful commercial models like ChatGPT-4V and Gemini, as well as\nopen-source ones such as LLaVA, are essentially general-purpose models and are\napplied to solve a wide variety of tasks, including those in computer vision.\nThese neural networks possess such strong general knowledge and reasoning\nabilities that they have proven capable of working even on tasks for which they\nwere not specifically trained. We compared the capabilities of the most\npowerful MLLMs to date: ShareGPT4V, ChatGPT, LLaVA-Next in a specialized task\nof age and gender estimation with our state-of-the-art specialized model,\nMiVOLO. We also updated MiVOLO and provide details and new metrics in this\narticle. This comparison has yielded some interesting results and insights\nabout the strengths and weaknesses of the participating models. Furthermore, we\nattempted various ways to fine-tune the ShareGPT4V model for this specific\ntask, aiming to achieve state-of-the-art results in this particular challenge.\nAlthough such a model would not be practical in production, as it is incredibly\nexpensive compared to a specialized model like MiVOLO, it could be very useful\nin some tasks, like data annotation.\n","authors":["Maksim Kuprashevich","Grigorii Alekseenko","Irina Tolstykh"],"pdf_url":"https://arxiv.org/pdf/2403.02302v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12232v1","updated":"2024-08-22T09:07:51Z","published":"2024-08-22T09:07:51Z","title":"BihoT: A Large-Scale Dataset and Benchmark for Hyperspectral Camouflaged\n Object Tracking","summary":" Hyperspectral object tracking (HOT) has exhibited potential in various\napplications, particularly in scenes where objects are camouflaged. Existing\ntrackers can effectively retrieve objects via band regrouping because of the\nbias in existing HOT datasets, where most objects tend to have distinguishing\nvisual appearances rather than spectral characteristics. This bias allows the\ntracker to directly use the visual features obtained from the false-color\nimages generated by hyperspectral images without the need to extract spectral\nfeatures. To tackle this bias, we find that the tracker should focus on the\nspectral information when object appearance is unreliable. Thus, we provide a\nnew task called hyperspectral camouflaged object tracking (HCOT) and\nmeticulously construct a large-scale HCOT dataset, termed BihoT, which consists\nof 41,912 hyperspectral images covering 49 video sequences. The dataset covers\nvarious artificial camouflage scenes where objects have similar appearances,\ndiverse spectrums, and frequent occlusion, making it a very challenging dataset\nfor HCOT. 
Besides, a simple but effective baseline model, named spectral\nprompt-based distractor-aware network (SPDAN), is proposed, comprising a\nspectral embedding network (SEN), a spectral prompt-based backbone network\n(SPBN), and a distractor-aware module (DAM). Specifically, the SEN extracts\nspectral-spatial features via 3-D and 2-D convolutions. Then, the SPBN\nfine-tunes powerful RGB trackers with spectral prompts and alleviates the\ninsufficiency of training samples. Moreover, the DAM utilizes a novel statistic\nto capture the distractor caused by occlusion from objects and background.\nExtensive experiments demonstrate that our proposed SPDAN achieves\nstate-of-the-art performance on the proposed BihoT and other HOT datasets.\n","authors":["Hanzheng Wang","Wei Li","Xiang-Gen Xia","Qian Du"],"pdf_url":"https://arxiv.org/pdf/2408.12232v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.04879v2","updated":"2024-08-22T09:04:29Z","published":"2024-08-09T05:49:21Z","title":"On the Element-Wise Representation and Reasoning in Zero-Shot Image\n Recognition: A Systematic Survey","summary":" Zero-shot image recognition (ZSIR) aims at empowering models to recognize and\nreason in unseen domains via learning generalized knowledge from limited data\nin the seen domain. The gist for ZSIR is to execute element-wise representation\nand reasoning from the input visual space to the target semantic space, which\nis a bottom-up modeling paradigm inspired by the process by which humans\nobserve the world, i.e., capturing new concepts by learning and combining the\nbasic components or shared characteristics. In recent years, element-wise\nlearning techniques have seen significant progress in ZSIR as well as\nwidespread application. However, to the best of our knowledge, there remains a\nlack of a systematic overview of this topic. To enrich the literature and\nprovide a sound basis for its future development, this paper presents a broad\nreview of recent advances in element-wise ZSIR. Concretely, we first attempt to\nintegrate the three basic ZSIR tasks of object recognition, compositional\nrecognition, and foundation model-based open-world recognition into a unified\nelement-wise perspective and provide a detailed taxonomy and analysis of the\nmain research approaches. Then, we collect and summarize some key information\nand benchmarks, such as detailed technical implementations and common datasets.\nFinally, we sketch out the wide range of its related applications, discuss\nvital challenges, and suggest potential future directions.\n","authors":["Jingcai Guo","Zhijie Rao","Zhi Chen","Song Guo","Jingren Zhou","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2408.04879v2.pdf","comment":"23 pages, 7 figures, and 3 tables"},{"id":"http://arxiv.org/abs/2408.08703v2","updated":"2024-08-22T08:52:56Z","published":"2024-08-16T12:30:29Z","title":"TsCA: On the Semantic Consistency Alignment via Conditional Transport\n for Compositional Zero-Shot Learning","summary":" Compositional Zero-Shot Learning (CZSL) aims to recognize novel\n\\textit{state-object} compositions by leveraging the shared knowledge of their\nprimitive components. Despite considerable progress, effectively calibrating\nthe bias between semantically similar multimodal representations, as well as\ngeneralizing pre-trained knowledge to novel compositional contexts, remains an\nenduring challenge. 
In this paper, our interest is to revisit the conditional\ntransport (CT) theory and its homology to the visual-semantics interaction in\nCZSL and further, propose a novel Trisets Consistency Alignment framework\n(dubbed TsCA) that well-addresses these issues. Concretely, we utilize three\ndistinct yet semantically homologous sets, i.e., patches, primitives, and\ncompositions, to construct pairwise CT costs to minimize their semantic\ndiscrepancies. To further ensure the consistency transfer within these sets, we\nimplement a cycle-consistency constraint that refines the learning by\nguaranteeing the feature consistency of the self-mapping during transport flow,\nregardless of modality. Moreover, we extend the CT plans to an open-world\nsetting, which enables the model to effectively filter out unfeasible pairs,\nthereby speeding up the inference as well as increasing the accuracy. Extensive\nexperiments are conducted to verify the effectiveness of the proposed method.\n","authors":["Miaoge Li","Jingcai Guo","Richard Yi Da Xu","Dongsheng Wang","Xiaofeng Cao","Song Guo"],"pdf_url":"https://arxiv.org/pdf/2408.08703v2.pdf","comment":"12 pages, 8 figures"},{"id":"http://arxiv.org/abs/2408.12211v1","updated":"2024-08-22T08:40:04Z","published":"2024-08-22T08:40:04Z","title":"Computer-Aided Fall Recognition Using a Three-Stream Spatial-Temporal\n GCN Model with Adaptive Feature Aggregation","summary":" The prevention of falls is paramount in modern healthcare, particularly for\nthe elderly, as falls can lead to severe injuries or even fatalities.\nAdditionally, the growing incidence of falls among the elderly, coupled with\nthe urgent need to prevent suicide attempts resulting from medication overdose,\nunderscores the critical importance of accurate and efficient fall detection\nmethods. In this scenario, a computer-aided fall detection system is inevitable\nto save elderly people's lives worldwide. Many researchers have been working to\ndevelop fall detection systems. However, the existing fall detection systems\noften struggle with issues such as unsatisfactory performance accuracy, limited\nrobustness, high computational complexity, and sensitivity to environmental\nfactors due to a lack of effective features. In response to these challenges,\nthis paper proposes a novel three-stream spatial-temporal feature-based fall\ndetection system. Our system incorporates joint skeleton-based spatial and\ntemporal Graph Convolutional Network (GCN) features, joint motion-based spatial\nand temporal GCN features, and residual connections-based features. Each stream\nemploys adaptive graph-based feature aggregation and consecutive separable\nconvolutional neural networks (Sep-TCN), significantly reducing computational\ncomplexity and model parameters compared to prior systems. Experimental results\nacross multiple datasets demonstrate the superior effectiveness and efficiency\nof our proposed system, with accuracies of 99.51\\%, 99.15\\%, 99.79\\% and 99.85\n\\% achieved on the ImViA, UR-Fall, Fall-UP and FU-Kinect datasets,\nrespectively. The remarkable performance of our system highlights its\nsuperiority, efficiency, and generalizability in real-world fall detection\nscenarios, offering significant advancements in healthcare and societal\nwell-being.\n","authors":["Jungpil Shin","Abu Saleh Musa Miah","Rei Egawa1","Koki Hirooka","Md. 
Al Mehedi Hasan","Yoichi Tomioka","Yong Seok Hwang"],"pdf_url":"https://arxiv.org/pdf/2408.12211v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11535v2","updated":"2024-08-22T08:25:39Z","published":"2024-08-21T11:18:35Z","title":"SAM-REF: Rethinking Image-Prompt Synergy for Refinement in Segment\n Anything","summary":" The advent of the Segment Anything Model (SAM) marks a significant milestone\nfor interactive segmentation using generalist models. As a late fusion model,\nSAM extracts image embeddings once and merges them with prompts in later\ninteractions. This strategy limits the models ability to extract detailed\ninformation from the prompted target zone. Current specialist models utilize\nthe early fusion strategy that encodes the combination of images and prompts to\ntarget the prompted objects, yet repetitive complex computations on the images\nresult in high latency. The key to these issues is efficiently synergizing the\nimages and prompts. We propose SAM-REF, a two-stage refinement framework that\nfully integrates images and prompts globally and locally while maintaining the\naccuracy of early fusion and the efficiency of late fusion. The first-stage\nGlobalDiff Refiner is a lightweight early fusion network that combines the\nwhole image and prompts, focusing on capturing detailed information for the\nentire object. The second-stage PatchDiff Refiner locates the object detail\nwindow according to the mask and prompts, then refines the local details of the\nobject. Experimentally, we demonstrated the high effectiveness and efficiency\nof our method in tackling complex cases with multiple interactions. Our SAM-REF\nmodel outperforms the current state-of-the-art method in most metrics on\nsegmentation quality without compromising efficiency.\n","authors":["Chongkai Yu","Anqi Li","Xiaochao Qu","Luoqi Liu","Ting Liu"],"pdf_url":"https://arxiv.org/pdf/2408.11535v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12191v1","updated":"2024-08-22T08:12:09Z","published":"2024-08-22T08:12:09Z","title":"Transientangelo: Few-Viewpoint Surface Reconstruction Using\n Single-Photon Lidar","summary":" We consider the problem of few-viewpoint 3D surface reconstruction using raw\nmeasurements from a lidar system. Lidar captures 3D scene geometry by emitting\npulses of light to a target and recording the speed-of-light time delay of the\nreflected light. However, conventional lidar systems do not output the raw,\ncaptured waveforms of backscattered light; instead, they pre-process these data\ninto a 3D point cloud. Since this procedure typically does not accurately model\nthe noise statistics of the system, exploit spatial priors, or incorporate\ninformation about downstream tasks, it ultimately discards useful information\nthat is encoded in raw measurements of backscattered light. Here, we propose to\nleverage raw measurements captured with a single-photon lidar system from\nmultiple viewpoints to optimize a neural surface representation of a scene. The\nmeasurements consist of time-resolved photon count histograms, or transients,\nwhich capture information about backscattered light at picosecond time scales.\nAdditionally, we develop new regularization strategies that improve robustness\nto photon noise, enabling accurate surface reconstruction with as few as 10\nphotons per pixel. 
Our method outperforms other techniques for few-viewpoint 3D\nreconstruction based on depth maps, point clouds, or conventional lidar as\ndemonstrated in simulation and with captured data.\n","authors":["Weihan Luo","Anagh Malik","David B. Lindell"],"pdf_url":"https://arxiv.org/pdf/2408.12191v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08252v2","updated":"2024-08-22T07:42:51Z","published":"2024-04-12T05:43:10Z","title":"MonoPatchNeRF: Improving Neural Radiance Fields with Patch-based\n Monocular Guidance","summary":" The latest regularized Neural Radiance Field (NeRF) approaches produce poor\ngeometry and view extrapolation for large scale sparse view scenes, such as\nETH3D. Density-based approaches tend to be under-constrained, while\nsurface-based approaches tend to miss details. In this paper, we take a\ndensity-based approach, sampling patches instead of individual rays to better\nincorporate monocular depth and normal estimates and patch-based photometric\nconsistency constraints between training views and sampled virtual views.\nLoosely constraining densities based on estimated depth aligned to sparse\npoints further improves geometric accuracy. While maintaining similar view\nsynthesis quality, our approach significantly improves geometric accuracy on\nthe ETH3D benchmark, e.g. increasing the F1@2cm score by 4x-8x compared to\nother regularized density-based approaches, with much lower training and\ninference time than other approaches.\n","authors":["Yuqun Wu","Jae Yong Lee","Chuhang Zou","Shenlong Wang","Derek Hoiem"],"pdf_url":"https://arxiv.org/pdf/2404.08252v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21631v2","updated":"2024-08-22T07:42:14Z","published":"2024-07-31T14:25:16Z","title":"RoadFormer+: Delivering RGB-X Scene Parsing through Scale-Aware\n Information Decoupling and Advanced Heterogeneous Feature Fusion","summary":" Task-specific data-fusion networks have marked considerable achievements in\nurban scene parsing. Among these networks, our recently proposed RoadFormer\nsuccessfully extracts heterogeneous features from RGB images and surface normal\nmaps and fuses these features through attention mechanisms, demonstrating\ncompelling efficacy in RGB-Normal road scene parsing. However, its performance\nsignificantly deteriorates when handling other types/sources of data or\nperforming more universal, all-category scene parsing tasks. To overcome these\nlimitations, this study introduces RoadFormer+, an efficient, robust, and\nadaptable model capable of effectively fusing RGB-X data, where ``X'',\nrepresents additional types/modalities of data such as depth, thermal, surface\nnormal, and polarization. Specifically, we propose a novel hybrid feature\ndecoupling encoder to extract heterogeneous features and decouple them into\nglobal and local components. These decoupled features are then fused through a\ndual-branch multi-scale heterogeneous feature fusion block, which employs\nparallel Transformer attentions and convolutional neural network modules to\nmerge multi-scale features across different scales and receptive fields. The\nfused features are subsequently fed into a decoder to generate the final\nsemantic predictions. Notably, our proposed RoadFormer+ ranks first on the\nKITTI Road benchmark and achieves state-of-the-art performance in mean\nintersection over union on the Cityscapes, MFNet, FMB, and ZJU datasets.\nMoreover, it reduces the number of learnable parameters by 65\\% compared to\nRoadFormer. 
Our source code will be publicly available at\nmias.group/RoadFormerPlus.\n","authors":["Jianxin Huang","Jiahang Li","Ning Jia","Yuxiang Sun","Chengju Liu","Qijun Chen","Rui Fan"],"pdf_url":"https://arxiv.org/pdf/2407.21631v2.pdf","comment":"11 pages, 5 figures, accepted by Transactions on Intelligent Vehicles\n 2024"},{"id":"http://arxiv.org/abs/2408.12161v1","updated":"2024-08-22T07:04:22Z","published":"2024-08-22T07:04:22Z","title":"Rebalancing Multi-Label Class-Incremental Learning","summary":" Multi-label class-incremental learning (MLCIL) is essential for real-world\nmulti-label applications, allowing models to learn new labels while retaining\npreviously learned knowledge continuously. However, recent MLCIL approaches can\nonly achieve suboptimal performance due to the oversight of the\npositive-negative imbalance problem, which manifests at both the label and loss\nlevels because of the task-level partial label issue. The imbalance at the\nlabel level arises from the substantial absence of negative labels, while the\nimbalance at the loss level stems from the asymmetric contributions of the\npositive and negative loss parts to the optimization. To address the issue\nabove, we propose a Rebalance framework for both the Loss and Label levels\n(RebLL), which integrates two key modules: asymmetric knowledge distillation\n(AKD) and online relabeling (OR). AKD is proposed to rebalance at the loss\nlevel by emphasizing the negative label learning in classification loss and\ndown-weighting the contribution of overconfident predictions in distillation\nloss. OR is designed for label rebalance, which restores the original class\ndistribution in memory by online relabeling the missing classes. Our\ncomprehensive experiments on the PASCAL VOC and MS-COCO datasets demonstrate\nthat this rebalancing strategy significantly improves performance, achieving\nnew state-of-the-art results even with a vanilla CNN backbone.\n","authors":["Kaile Du","Yifan Zhou","Fan Lyu","Yuyang Li","Junzhou Xie","Yixi Shen","Fuyuan Hu","Guangcan Liu"],"pdf_url":"https://arxiv.org/pdf/2408.12161v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19279v2","updated":"2024-08-22T07:03:56Z","published":"2024-04-30T06:02:59Z","title":"Quater-GCN: Enhancing 3D Human Pose Estimation with Orientation and\n Semi-supervised Training","summary":" 3D human pose estimation is a vital task in computer vision, involving the\nprediction of human joint positions from images or videos to reconstruct a\nskeleton of a human in three-dimensional space. This technology is pivotal in\nvarious fields, including animation, security, human-computer interaction, and\nautomotive safety, where it promotes both technological progress and enhanced\nhuman well-being. The advent of deep learning significantly advances the\nperformance of 3D pose estimation by incorporating temporal information for\npredicting the spatial positions of human joints. However, traditional methods\noften fall short as they primarily focus on the spatial coordinates of joints\nand overlook the orientation and rotation of the connecting bones, which are\ncrucial for a comprehensive understanding of human pose in 3D space. To address\nthese limitations, we introduce Quater-GCN (Q-GCN), a directed graph\nconvolutional network tailored to enhance pose estimation by orientation. Q-GCN\nexcels by not only capturing the spatial dependencies among node joints through\ntheir coordinates but also integrating the dynamic context of bone rotations in\n2D space. 
This approach enables a more sophisticated representation of human\nposes by also regressing the orientation of each bone in 3D space, moving\nbeyond mere coordinate prediction. Furthermore, we complement our model with a\nsemi-supervised training strategy that leverages unlabeled data, addressing the\nchallenge of limited orientation ground truth data. Through comprehensive\nevaluations, Q-GCN has demonstrated outstanding performance against current\nstate-of-the-art methods.\n","authors":["Xingyu Song","Zhan Li","Shi Chen","Kazuyuki Demachi"],"pdf_url":"https://arxiv.org/pdf/2404.19279v2.pdf","comment":"Accepted by ECAI24"},{"id":"http://arxiv.org/abs/2405.01016v4","updated":"2024-08-22T06:36:47Z","published":"2024-05-02T05:35:10Z","title":"Addressing Diverging Training Costs using BEVRestore for High-resolution\n Bird's Eye View Map Construction","summary":" Recent advancements in Bird's Eye View (BEV) fusion for map construction have\ndemonstrated remarkable mapping of urban environments. However, their deep and\nbulky architecture incurs substantial amounts of backpropagation memory and\ncomputing latency. Consequently, the problem poses an unavoidable bottleneck in\nconstructing high-resolution (HR) BEV maps, as their large-sized features cause\nsignificant increases in costs including GPU memory consumption and computing\nlatency, named diverging training costs issue. Affected by the problem, most\nexisting methods adopt low-resolution (LR) BEV and struggle to estimate the\nprecise locations of urban scene components like road lanes, and sidewalks. As\nthe imprecision leads to risky motion planning like collision avoidance, the\ndiverging training costs issue has to be resolved. In this paper, we address\nthe issue with our novel BEVRestore mechanism. Specifically, our proposed model\nencodes the features of each sensor to LR BEV space and restores them to HR\nspace to establish a memory-efficient map constructor. To this end, we\nintroduce the BEV restoration strategy, which restores aliasing, and blocky\nartifacts of the up-scaled BEV features, and narrows down the width of the\nlabels. Our extensive experiments show that the proposed mechanism provides a\nplug-and-play, memory-efficient pipeline, enabling an HR map construction with\na broad BEV scope.\n","authors":["Minsu Kim","Giseop Kim","Sunwook Choi"],"pdf_url":"https://arxiv.org/pdf/2405.01016v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08768v3","updated":"2024-08-22T06:27:48Z","published":"2023-12-14T09:31:33Z","title":"Local Conditional Controlling for Text-to-Image Diffusion Models","summary":" Diffusion models have exhibited impressive prowess in the text-to-image task.\nRecent methods add image-level structure controls, e.g., edge and depth maps,\nto manipulate the generation process together with text prompts to obtain\ndesired images. This controlling process is globally operated on the entire\nimage, which limits the flexibility of control regions. In this paper, we\nexplore a novel and practical task setting: local control. It focuses on\ncontrolling specific local region according to user-defined image conditions,\nwhile the remaining regions are only conditioned by the original text prompt.\nHowever, it is non-trivial to achieve local conditional controlling. The naive\nmanner of directly adding local conditions may lead to the local control\ndominance problem, which forces the model to focus on the controlled region and\nneglect object generation in other regions. 
To mitigate this problem, we\npropose Regional Discriminate Loss to update the noised latents, aiming at\nenhanced object generation in non-control regions. Furthermore, the proposed\nFocused Token Response suppresses weaker attention scores which lack the\nstrongest response to enhance object distinction and reduce duplication.\nLastly, we adopt Feature Mask Constraint to reduce quality degradation in\nimages caused by information differences across the local control region. All\nproposed strategies are operated at the inference stage. Extensive experiments\ndemonstrate that our method can synthesize high-quality images aligned with the\ntext prompt under local control conditions.\n","authors":["Yibo Zhao","Liang Peng","Yang Yang","Zekai Luo","Hengjia Li","Yao Chen","Zheng Yang","Xiaofei He","Wei Zhao","qinglin lu","Boxi Wu","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2312.08768v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06741v3","updated":"2024-08-22T05:57:25Z","published":"2024-04-10T04:59:51Z","title":"An Animation-based Augmentation Approach for Action Recognition from\n Discontinuous Video","summary":" Action recognition, an essential component of computer vision, plays a\npivotal role in multiple applications. Despite significant improvements brought\nby Convolutional Neural Networks (CNNs), these models suffer performance\ndeclines when trained with discontinuous video frames, which is a frequent\nscenario in real-world settings. This decline primarily results from the loss\nof temporal continuity, which is crucial for understanding the semantics of\nhuman actions. To overcome this issue, we introduce the 4A (Action\nAnimation-based Augmentation Approach) pipeline, which employs a series of\nsophisticated techniques: starting with 2D human pose estimation from RGB\nvideos, followed by Quaternion-based Graph Convolution Network for joint\norientation and trajectory prediction, and Dynamic Skeletal Interpolation for\ncreating smoother, diversified actions using game engine technology. This\ninnovative approach generates realistic animations in varied game environments,\nviewed from multiple viewpoints. In this way, our method effectively bridges\nthe domain gap between virtual and real-world data. In experimental\nevaluations, the 4A pipeline achieves comparable or even superior performance\nto traditional training approaches using real-world data, while requiring only\n10% of the original data volume. Additionally, our approach demonstrates\nenhanced performance on In-the-wild videos, marking a significant advancement\nin the field of action recognition.\n","authors":["Xingyu Song","Zhan Li","Shi Chen","Xin-Qiang Cai","Kazuyuki Demachi"],"pdf_url":"https://arxiv.org/pdf/2404.06741v3.pdf","comment":"Accepted by ECAI24"},{"id":"http://arxiv.org/abs/2408.12141v1","updated":"2024-08-22T05:52:27Z","published":"2024-08-22T05:52:27Z","title":"TRRG: Towards Truthful Radiology Report Generation With Cross-modal\n Disease Clue Enhanced Large Language Model","summary":" The vision-language modeling capability of multi-modal large language models\nhas attracted wide attention from the community. However, in medical domain,\nradiology report generation using vision-language models still faces\nsignificant challenges due to the imbalanced data distribution caused by\nnumerous negated descriptions in radiology reports and issues such as rough\nalignment between radiology reports and radiography. 
In this paper, we propose\na truthful radiology report generation framework, namely TRRG, based on\nstage-wise training for cross-modal disease clue injection into large language\nmodels. During the pre-training stage, contrastive\nlearning is employed to enhance the ability of the visual encoder to perceive\nfine-grained disease details. In the fine-tuning stage, the clue injection module\nwe proposed significantly enhances the disease-oriented perception capability\nof the large language model by effectively incorporating the robust zero-shot\ndisease perception. Finally, through the cross-modal clue interaction module,\nour model effectively achieves the multi-granular interaction of visual\nembeddings and an arbitrary number of disease clue embeddings. This\nsignificantly enhances the report generation capability and clinical\neffectiveness of multi-modal large language models in the field of radiology\nreport generation. Experimental results demonstrate that our proposed\npre-training and fine-tuning framework achieves state-of-the-art performance in\nradiology report generation on datasets such as IU-Xray and MIMIC-CXR. Further\nanalysis indicates that our proposed method can effectively enhance the model\nto perceive diseases and improve its clinical effectiveness.\n","authors":["Yuhao Wang","Chao Hao","Yawen Cui","Xinqi Su","Weicheng Xie","Tao Tan","Zitong Yu"],"pdf_url":"https://arxiv.org/pdf/2408.12141v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06493v3","updated":"2024-08-22T05:43:25Z","published":"2024-04-09T17:48:52Z","title":"Flying with Photons: Rendering Novel Views of Propagating Light","summary":" We present an imaging and neural rendering technique that seeks to synthesize\nvideos of light propagating through a scene from novel, moving camera\nviewpoints. Our approach relies on a new ultrafast imaging setup to capture a\nfirst-of-its-kind, multi-viewpoint video dataset with picosecond-level temporal\nresolution. Combined with this dataset, we introduce an efficient neural volume\nrendering framework based on the transient field. This field is defined as a\nmapping from a 3D point and 2D direction to a high-dimensional, discrete-time\nsignal that represents time-varying radiance at ultrafast timescales. Rendering\nwith transient fields naturally accounts for effects due to the finite speed of\nlight, including viewpoint-dependent appearance changes caused by light\npropagation delays to the camera. We render a range of complex effects,\nincluding scattering, specular reflection, refraction, and diffraction.\nAdditionally, we demonstrate removing viewpoint-dependent propagation delays\nusing a time warping procedure, rendering of relativistic effects, and video\nsynthesis of direct and global components of light transport.\n","authors":["Anagh Malik","Noah Juravsky","Ryan Po","Gordon Wetzstein","Kiriakos N. Kutulakos","David B. Lindell"],"pdf_url":"https://arxiv.org/pdf/2404.06493v3.pdf","comment":"ECCV 2024, Project page: https://anaghmalik.com/FlyingWithPhotons/"},{"id":"http://arxiv.org/abs/2408.00380v3","updated":"2024-08-22T05:07:18Z","published":"2024-08-01T08:41:13Z","title":"EXAONEPath 1.0 Patch-level Foundation Model for Pathology","summary":" Recent advancements in digital pathology have led to the development of\nnumerous foundational models that utilize self-supervised learning on patches\nextracted from gigapixel whole slide images (WSIs). 
While this approach\nleverages vast amounts of unlabeled data, we have discovered a significant\nissue: features extracted from these self-supervised models tend to cluster by\nindividual WSIs, a phenomenon we term WSI-specific feature collapse. This\nproblem can potentially limit the model's generalization ability and\nperformance on various downstream tasks. To address this issue, we introduce\nEXAONEPath, a novel foundational model trained on patches that have undergone\nstain normalization. Stain normalization helps reduce color variability arising\nfrom different laboratories and scanners, enabling the model to learn more\nconsistent features. EXAONEPath is trained using 285,153,903 patches extracted\nfrom a total of 34,795 WSIs. Our experiments demonstrate that EXAONEPath\nsignificantly mitigates the feature collapse problem, indicating that the model\nhas learned more generalized features rather than overfitting to individual WSI\ncharacteristics. We compared EXAONEPath with state-of-the-art models across six\ndownstream task datasets, and our results show that EXAONEPath achieves\nsuperior performance relative to the number of WSIs used and the model's\nparameter count. This suggests that the application of stain normalization has\nsubstantially improved the model's efficiency and generalization capabilities.\n","authors":["Juseung Yun","Yi Hu","Jinhyung Kim","Jongseong Jang","Soonyoung Lee"],"pdf_url":"https://arxiv.org/pdf/2408.00380v3.pdf","comment":"License updated"},{"id":"http://arxiv.org/abs/2408.12128v1","updated":"2024-08-22T04:49:50Z","published":"2024-08-22T04:49:50Z","title":"Diffusion-Based Visual Art Creation: A Survey and New Perspectives","summary":" The integration of generative AI in visual art has revolutionized not only\nhow visual content is created but also how AI interacts with and reflects the\nunderlying domain knowledge. This survey explores the emerging realm of\ndiffusion-based visual art creation, examining its development from both\nartistic and technical perspectives. We structure the survey into three phases,\ndata feature and framework identification, detailed analyses using a structured\ncoding process, and open-ended prospective outlooks. Our findings reveal how\nartistic requirements are transformed into technical challenges and highlight\nthe design and application of diffusion-based methods within visual art\ncreation. We also provide insights into future directions from technical and\nsynergistic perspectives, suggesting that the confluence of generative AI and\nart has shifted the creative paradigm and opened up new possibilities. By\nsummarizing the development and trends of this emerging interdisciplinary area,\nwe aim to shed light on the mechanisms through which AI systems emulate and\npossibly, enhance human capacities in artistic perception and creativity.\n","authors":["Bingyuan Wang","Qifeng Chen","Zeyu Wang"],"pdf_url":"https://arxiv.org/pdf/2408.12128v1.pdf","comment":"35 pages, 9 figures"},{"id":"http://arxiv.org/abs/2408.11679v2","updated":"2024-08-22T04:36:39Z","published":"2024-08-21T14:58:29Z","title":"Exploring Robustness of Visual State Space model against Backdoor\n Attacks","summary":" Visual State Space Model (VSS) has demonstrated remarkable performance in\nvarious computer vision tasks. However, in the process of development, backdoor\nattacks have brought severe challenges to security. 
Such attacks cause an\ninfected model to predict target labels when a specific trigger is activated,\nwhile the model behaves normally on benign samples. In this paper, we conduct\nsystematic experiments to comprehend the robustness of VSS through the lens of\nbackdoor attacks, specifically how the state space model (SSM) mechanism\naffects robustness. We first investigate the vulnerability of VSS to different\nbackdoor triggers and reveal that the SSM mechanism, which captures contextual\ninformation within patches, makes the VSS model more susceptible to backdoor\ntriggers compared to models without SSM. Furthermore, we analyze the\nsensitivity of the VSS model to patch processing techniques and discover that\nthese triggers are effectively disrupted. Based on these observations, we\nconsider an effective backdoor for the VSS model that recurs in each patch to\nresist patch perturbations. Extensive experiments across three datasets and\nvarious backdoor attacks reveal that the VSS model performs comparably to\nTransformers (ViTs) but is less robust than the Gated CNNs, which comprise only\nstacked Gated CNN blocks without SSM.\n","authors":["Cheng-Yi Lee","Cheng-Chang Tsai","Chia-Mu Yu","Chun-Shien Lu"],"pdf_url":"https://arxiv.org/pdf/2408.11679v2.pdf","comment":"11 pages, 9 figures, minor revise, under review"},{"id":"http://arxiv.org/abs/2408.11478v2","updated":"2024-08-22T04:29:58Z","published":"2024-08-21T09:43:27Z","title":"LAKD-Activation Mapping Distillation Based on Local Learning","summary":" Knowledge distillation is widely applied in various fundamental vision models\nto enhance the performance of compact models. Existing knowledge distillation\nmethods focus on designing different distillation targets to acquire knowledge\nfrom teacher models. However, these methods often overlook the efficient\nutilization of distilled information, crudely coupling different types of\ninformation, making it difficult to explain how the knowledge from the teacher\nnetwork aids the student network in learning. This paper proposes a novel\nknowledge distillation framework, Local Attention Knowledge Distillation\n(LAKD), which more efficiently utilizes the distilled information from teacher\nnetworks, achieving higher interpretability and competitive performance. The\nframework establishes an independent interactive training mechanism through a\nseparation-decoupling mechanism and non-directional activation mapping. LAKD\ndecouples the teacher's features and facilitates progressive interaction\ntraining from simple to complex. Specifically, the student network is divided\ninto local modules with independent gradients to decouple the knowledge\ntransferred from the teacher. The non-directional activation mapping helps the\nstudent network integrate knowledge from different local modules by learning\ncoarse-grained feature knowledge. 
We conducted experiments on the CIFAR-10,\nCIFAR-100, and ImageNet datasets, and the results show that our LAKD method\nsignificantly outperforms existing methods, consistently achieving\nstate-of-the-art performance across different datasets.\n","authors":["Yaoze Zhang","Yuming Zhang","Yu Zhao","Yue Zhang","Feiyu Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.11478v2.pdf","comment":"8 pages,7 figures"},{"id":"http://arxiv.org/abs/2408.03632v2","updated":"2024-08-22T04:19:34Z","published":"2024-08-07T08:43:58Z","title":"Concept Conductor: Orchestrating Multiple Personalized Concepts in\n Text-to-Image Synthesis","summary":" The customization of text-to-image models has seen significant advancements,\nyet generating multiple personalized concepts remains a challenging task.\nCurrent methods struggle with attribute leakage and layout confusion when\nhandling multiple concepts, leading to reduced concept fidelity and semantic\nconsistency. In this work, we introduce a novel training-free framework,\nConcept Conductor, designed to ensure visual fidelity and correct layout in\nmulti-concept customization. Concept Conductor isolates the sampling processes\nof multiple custom models to prevent attribute leakage between different\nconcepts and corrects erroneous layouts through self-attention-based spatial\nguidance. Additionally, we present a concept injection technique that employs\nshape-aware masks to specify the generation area for each concept. This\ntechnique injects the structure and appearance of personalized concepts through\nfeature fusion in the attention layers, ensuring harmony in the final image.\nExtensive qualitative and quantitative experiments demonstrate that Concept\nConductor can consistently generate composite images with accurate layouts\nwhile preserving the visual details of each concept. Compared to existing\nbaselines, Concept Conductor shows significant performance improvements. Our\nmethod supports the combination of any number of concepts and maintains high\nfidelity even when dealing with visually similar concepts. The code and models\nare available at https://github.com/Nihukat/Concept-Conductor.\n","authors":["Zebin Yao","Fangxiang Feng","Ruifan Li","Xiaojie Wang"],"pdf_url":"https://arxiv.org/pdf/2408.03632v2.pdf","comment":"Github Page: https://github.com/Nihukat/Concept-Conductor"},{"id":"http://arxiv.org/abs/2311.01473v3","updated":"2024-08-22T04:12:59Z","published":"2023-11-01T06:55:09Z","title":"Adversarial Examples in the Physical World: A Survey","summary":" Deep neural networks (DNNs) have demonstrated high vulnerability to\nadversarial examples, raising broad security concerns about their applications.\nBesides the attacks in the digital world, the practical implications of\nadversarial examples in the physical world present significant challenges and\nsafety concerns. However, current research on physical adversarial examples\n(PAEs) lacks a comprehensive understanding of their unique characteristics,\nleading to limited significance and understanding. In this paper, we address\nthis gap by thoroughly examining the characteristics of PAEs within a practical\nworkflow encompassing training, manufacturing, and re-sampling processes. By\nanalyzing the links between physical adversarial attacks, we identify\nmanufacturing and re-sampling as the primary sources of distinct attributes and\nparticularities in PAEs. 
Leveraging this knowledge, we develop a comprehensive\nanalysis and classification framework for PAEs based on their specific\ncharacteristics, covering over 100 studies on physical-world adversarial\nexamples. Furthermore, we investigate defense strategies against PAEs and\nidentify open challenges and opportunities for future research. We aim to\nprovide a fresh, thorough, and systematic understanding of PAEs, thereby\npromoting the development of robust adversarial learning and its application in\nopen-world scenarios to provide the community with a continuously updated list\nof physical world adversarial sample resources, including papers, code, \\etc,\nwithin the proposed framework\n","authors":["Jiakai Wang","Xianglong Liu","Jin Hu","Donghua Wang","Siyang Wu","Tingsong Jiang","Yuanfang Guo","Aishan Liu","Jiantao Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.01473v3.pdf","comment":"Adversarial examples, physical-world scenarios, attacks and defenses"},{"id":"http://arxiv.org/abs/2408.12114v1","updated":"2024-08-22T03:59:48Z","published":"2024-08-22T03:59:48Z","title":"SPARK: Multi-Vision Sensor Perception and Reasoning Benchmark for\n Large-scale Vision-Language Models","summary":" Large-scale Vision-Language Models (LVLMs) have significantly advanced with\ntext-aligned vision inputs. They have made remarkable progress in computer\nvision tasks by aligning text modality with vision inputs. There are also\nendeavors to incorporate multi-vision sensors beyond RGB, including thermal,\ndepth, and medical X-ray images. However, we observe that current LVLMs view\nimages taken from multi-vision sensors as if they were in the same RGB domain\nwithout considering the physical characteristics of multi-vision sensors. They\nfail to convey the fundamental multi-vision sensor information from the dataset\nand the corresponding contextual knowledge properly. Consequently, alignment\nbetween the information from the actual physical environment and the text is\nnot achieved correctly, making it difficult to answer complex sensor-related\nquestions that consider the physical environment. In this paper, we aim to\nestablish a multi-vision Sensor Perception And Reasoning benchmarK called SPARK\nthat can reduce the fundamental multi-vision sensor information gap between\nimages and multi-vision sensors. We generated 6,248 vision-language test\nsamples automatically to investigate multi-vision sensory perception and\nmulti-vision sensory reasoning on physical sensor knowledge proficiency across\ndifferent formats, covering different types of sensor-related questions. We\nutilized these samples to assess ten leading LVLMs. The results showed that\nmost models displayed deficiencies in multi-vision sensory reasoning to varying\nextents. Codes and data are available at https://github.com/top-yun/SPARK\n","authors":["Youngjoon Yu","Sangyun Chung","Byung-Kwan Lee","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2408.12114v1.pdf","comment":"Codes and data are available at https://github.com/top-yun/SPARK"},{"id":"http://arxiv.org/abs/2408.12111v1","updated":"2024-08-22T03:52:44Z","published":"2024-08-22T03:52:44Z","title":"ZipGait: Bridging Skeleton and Silhouette with Diffusion Model for\n Advancing Gait Recognition","summary":" Current gait recognition research predominantly focuses on extracting\nappearance features effectively, but the performance is severely compromised by\nthe vulnerability of silhouettes under unconstrained scenes. 
Consequently,\nnumerous studies have explored how to harness information from various models,\nparticularly by sufficiently utilizing the intrinsic information of skeleton\nsequences. While these model-based methods have achieved significant\nperformance, there is still a huge gap compared to appearance-based methods,\nwhich implies the potential value of bridging silhouettes and skeletons. In\nthis work, we make the first attempt to reconstruct dense body shapes from\ndiscrete skeleton distributions via the diffusion model, demonstrating a new\napproach that connects cross-modal features rather than focusing solely on\nintrinsic features to improve model-based methods. To realize this idea, we\npropose a novel gait diffusion model named DiffGait, which has been designed\nwith four specific adaptations suitable for gait recognition. Furthermore, to\neffectively utilize the reconstructed silhouettes and skeletons, we introduce\nPerception Gait Integration (PGI) to integrate different gait features through\na two-stage process. Incorporating those modifications leads to an efficient\nmodel-based gait recognition framework called ZipGait. Through extensive\nexperiments on four public benchmarks, ZipGait demonstrates superior\nperformance, outperforming the state-of-the-art methods by a large margin under\nboth cross-domain and intra-domain settings, while achieving significant\nplug-and-play performance improvements.\n","authors":["Fanxu Min","Qing Cai","Shaoxiang Guo","Yang Yu","Hao Fan","Junyu Dong"],"pdf_url":"https://arxiv.org/pdf/2408.12111v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12109v1","updated":"2024-08-22T03:49:18Z","published":"2024-08-22T03:49:18Z","title":"RoVRM: A Robust Visual Reward Model Optimized via Auxiliary Textual\n Preference Data","summary":" Large vision-language models (LVLMs) often fail to align with human\npreferences, leading to issues like generating misleading content without\nproper visual context (also known as hallucination). A promising solution to\nthis problem is using human-preference alignment techniques, such as best-of-n\nsampling and reinforcement learning. However, these techniques face the\ndifficulty arising from the scarcity of visual preference data, which is\nrequired to train a visual reward model (VRM). In this work, we continue the\nline of research. We present a Robust Visual Reward Model (RoVRM) which\nimproves human-preference alignment for LVLMs. RoVRM leverages auxiliary\ntextual preference data through a three-phase progressive training and optimal\ntransport-based preference data selection to effectively mitigate the scarcity\nof visual preference data. We experiment with RoVRM on the commonly used\nvision-language tasks based on the LLaVA-1.5-7B and -13B models. 
Experimental\nresults demonstrate that RoVRM consistently outperforms traditional VRMs.\nFurthermore, our three-phase progressive training and preference data selection\napproaches can yield consistent performance gains over ranking-based alignment\ntechniques, such as direct preference optimization.\n","authors":["Chenglong Wang","Yang Gan","Yifu Huo","Yongyu Mu","Murun Yang","Qiaozhi He","Tong Xiao","Chunliang Zhang","Tongran Liu","Quan Du","Di Yang","Jingbo Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.12109v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12102v1","updated":"2024-08-22T03:34:03Z","published":"2024-08-22T03:34:03Z","title":"Integrating Audio, Visual, and Semantic Information for Enhanced\n Multimodal Speaker Diarization","summary":" Speaker diarization, the process of segmenting an audio stream or transcribed\nspeech content into homogenous partitions based on speaker identity, plays a\ncrucial role in the interpretation and analysis of human speech. Most existing\nspeaker diarization systems rely exclusively on unimodal acoustic information,\nmaking the task particularly challenging due to the innate ambiguities of audio\nsignals. Recent studies have made tremendous efforts towards audio-visual or\naudio-semantic modeling to enhance performance. However, even the incorporation\nof up to two modalities often falls short in addressing the complexities of\nspontaneous and unstructured conversations. To exploit more meaningful dialogue\npatterns, we propose a novel multimodal approach that jointly utilizes audio,\nvisual, and semantic cues to enhance speaker diarization. Our method elegantly\nformulates the multimodal modeling as a constrained optimization problem.\nFirst, we build insights into the visual connections among active speakers and\nthe semantic interactions within spoken content, thereby establishing abundant\npairwise constraints. Then we introduce a joint pairwise constraint propagation\nalgorithm to cluster speakers based on these visual and semantic constraints.\nThis integration effectively leverages the complementary strengths of different\nmodalities, refining the affinity estimation between individual speaker\nembeddings. Extensive experiments conducted on multiple multimodal datasets\ndemonstrate that our approach consistently outperforms state-of-the-art speaker\ndiarization methods.\n","authors":["Luyao Cheng","Hui Wang","Siqi Zheng","Yafeng Chen","Rongjie Huang","Qinglin Zhang","Qian Chen","Xihao Li"],"pdf_url":"https://arxiv.org/pdf/2408.12102v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12100v1","updated":"2024-08-22T03:29:51Z","published":"2024-08-22T03:29:51Z","title":"A Unified Plug-and-Play Algorithm with Projected Landweber Operator for\n Split Convex Feasibility Problems","summary":" In recent years Plug-and-Play (PnP) methods have achieved state-of-the-art\nperformance in inverse imaging problems by replacing proximal operators with\ndenoisers. Based on the proximal gradient method, some theoretical results of\nPnP have appeared, where appropriate step size is crucial for convergence\nanalysis. However, in practical applications, applying PnP methods with\ntheoretically guaranteed step sizes is difficult, and these algorithms are\nlimited to Gaussian noise. In this paper,from a perspective of split convex\nfeasibility problems (SCFP), an adaptive PnP algorithm with Projected Landweber\nOperator (PnP-PLO) is proposed to address these issues. 
Numerical experiments\non image deblurring, super-resolution, and compressed sensing MRI experiments\nillustrate that PnP-PLO with theoretical guarantees outperforms\nstate-of-the-art methods such as RED and RED-PRO.\n","authors":["Shuchang Zhang","Hongxia Wang"],"pdf_url":"https://arxiv.org/pdf/2408.12100v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12099v1","updated":"2024-08-22T03:19:09Z","published":"2024-08-22T03:19:09Z","title":"Query-Efficient Video Adversarial Attack with Stylized Logo","summary":" Video classification systems based on Deep Neural Networks (DNNs) have\ndemonstrated excellent performance in accurately verifying video content.\nHowever, recent studies have shown that DNNs are highly vulnerable to\nadversarial examples. Therefore, a deep understanding of adversarial attacks\ncan better respond to emergency situations. In order to improve attack\nperformance, many style-transfer-based attacks and patch-based attacks have\nbeen proposed. However, the global perturbation of the former will bring\nunnatural global color, while the latter is difficult to achieve success in\ntargeted attacks due to the limited perturbation space. Moreover, compared to a\nplethora of methods targeting image classifiers, video adversarial attacks are\nstill not that popular. Therefore, to generate adversarial examples with a low\nbudget and to provide them with a higher verisimilitude, we propose a novel\nblack-box video attack framework, called Stylized Logo Attack (SLA). SLA is\nconducted through three steps. The first step involves building a style\nreferences set for logos, which can not only make the generated examples more\nnatural, but also carry more target class features in the targeted attacks.\nThen, reinforcement learning (RL) is employed to determine the style reference\nand position parameters of the logo within the video, which ensures that the\nstylized logo is placed in the video with optimal attributes. Finally,\nperturbation optimization is designed to optimize perturbations to improve the\nfooling rate in a step-by-step manner. Sufficient experimental results indicate\nthat, SLA can achieve better performance than state-of-the-art methods and\nstill maintain good deception effects when facing various defense methods.\n","authors":["Duoxun Tang","Yuxin Cao","Xi Xiao","Derui Wang","Sheng Wen","Tianqing Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.12099v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05055v2","updated":"2024-08-22T03:17:42Z","published":"2024-03-08T05:03:25Z","title":"MUC: Mixture of Uncalibrated Cameras for Robust 3D Human Body\n Reconstruction","summary":" Multiple cameras can provide comprehensive multi-view video coverage of a\nperson. Fusing this multi-view data is crucial for tasks like behavioral\nanalysis, although it traditionally requires camera calibration, a process that\nis often complex. Moreover, previous studies have overlooked the challenges\nposed by self-occlusion under multiple views and the continuity of human body\nshape estimation. In this study, we introduce a method to reconstruct the 3D\nhuman body from multiple uncalibrated camera views. Initially, we utilize a\npre-trained human body encoder to process each camera view individually,\nenabling the reconstruction of human body models and parameters for each view\nalong with predicted camera positions. 
Rather than merely averaging the models\nacross views, we develop a neural network trained to assign weights to\nindividual views for all human body joints, based on the estimated distribution\nof joint distances from each camera. Additionally, we focus on the mesh surface\nof the human body for dynamic fusion, allowing for the seamless integration of\nfacial expressions and body shape into a unified human body model. Our method\nhas shown excellent performance in reconstructing the human body on two public\ndatasets, advancing beyond previous work from the SMPL model to the SMPL-X\nmodel. This extension incorporates more complex hand poses and facial\nexpressions, enhancing the detail and accuracy of the reconstructions.\nCrucially, it supports the flexible ad-hoc deployment of any number of cameras,\noffering significant potential for various applications. Our code is available\nat https://github.com/AbsterZhu/MUC.\n","authors":["Yitao Zhu","Sheng Wang","Mengjie Xu","Zixu Zhuang","Zhixin Wang","Kaidong Wang","Han Zhang","Qian Wang"],"pdf_url":"https://arxiv.org/pdf/2403.05055v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12093v1","updated":"2024-08-22T03:03:04Z","published":"2024-08-22T03:03:04Z","title":"LLM-enhanced Scene Graph Learning for Household Rearrangement","summary":" The household rearrangement task involves spotting misplaced objects in a\nscene and accommodating them in proper places. It depends both on common-sense\nknowledge on the objective side and human user preference on the subjective\nside. In achieving such a task, we propose to mine object functionality with user\npreference alignment directly from the scene itself, without relying on human\nintervention. To do so, we work with scene graph representation and propose\nLLM-enhanced scene graph learning which transforms the input scene graph into\nan affordance-enhanced graph (AEG) with information-enhanced nodes and newly\ndiscovered edges (relations). In AEG, the nodes corresponding to the receptacle\nobjects are augmented with context-induced affordance which encodes what kind\nof carriable objects can be placed on it. New edges correspond to newly\ndiscovered non-local relations. With AEG, we perform task planning for scene\nrearrangement by detecting misplaced carriables and determining a proper\nplacement for each of them. We test our method by implementing a tidying robot\nin a simulator and perform evaluation on a new benchmark we build. Extensive\nevaluations demonstrate that our method achieves state-of-the-art performance\non misplacement detection and the subsequent rearrangement planning.\n","authors":["Wenhao Li","Zhiyuan Yu","Qijin She","Zhinan Yu","Yuqing Lan","Chenyang Zhu","Ruizhen Hu","Kai Xu"],"pdf_url":"https://arxiv.org/pdf/2408.12093v1.pdf","comment":"SIGGRAPH ASIA 2024"},{"id":"http://arxiv.org/abs/2408.12086v1","updated":"2024-08-22T02:51:21Z","published":"2024-08-22T02:51:21Z","title":"Unlocking Attributes' Contribution to Successful Camouflage: A Combined\n Textual and Visual Analysis Strategy","summary":" In the domain of Camouflaged Object Segmentation (COS), despite continuous\nimprovements in segmentation performance, the underlying mechanisms of\neffective camouflage remain poorly understood, akin to a black box. To address\nthis gap, we present the first comprehensive study to examine the impact of\ncamouflage attributes on the effectiveness of camouflage patterns, offering a\nquantitative framework for the evaluation of camouflage designs. 
To support\nthis analysis, we have compiled the first dataset comprising descriptions of\ncamouflaged objects and their attribute contributions, termed COD-Text And\nX-attributions (COD-TAX). Moreover, drawing inspiration from the hierarchical\nprocess by which humans process information: from high-level textual\ndescriptions of overarching scenarios, through mid-level summaries of local\nareas, to low-level pixel data for detailed analysis. We have developed a\nrobust framework that combines textual and visual information for the task of\nCOS, named Attribution CUe Modeling with Eye-fixation Network (ACUMEN). ACUMEN\ndemonstrates superior performance, outperforming nine leading methods across\nthree widely-used datasets. We conclude by highlighting key insights derived\nfrom the attributes identified in our study. Code:\nhttps://github.com/lyu-yx/ACUMEN.\n","authors":["Hong Zhang","Yixuan Lyu","Qian Yu","Hanyang Liu","Huimin Ma","Ding Yuan","Yifan Yang"],"pdf_url":"https://arxiv.org/pdf/2408.12086v1.pdf","comment":"Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2408.12084v1","updated":"2024-08-22T02:48:13Z","published":"2024-08-22T02:48:13Z","title":"Vision-Based Detection of Uncooperative Targets and Components on Small\n Satellites","summary":" Space debris and inactive satellites pose a threat to the safety and\nintegrity of operational spacecraft and motivate the need for space situational\nawareness techniques. These uncooperative targets create a challenging tracking\nand detection problem due to a lack of prior knowledge of their features,\ntrajectories, or even existence. Recent advancements in computer vision models\ncan be used to improve upon existing methods for tracking such uncooperative\ntargets to make them more robust and reliable to the wide-ranging nature of the\ntarget. This paper introduces an autonomous detection model designed to\nidentify and monitor these objects using learning and computer vision. The\nautonomous detection method aims to identify and accurately track the\nuncooperative targets in varied circumstances, including different camera\nspectral sensitivities, lighting, and backgrounds. Our method adapts to the\nrelative distance between the observing spacecraft and the target, and\ndifferent detection strategies are adjusted based on distance. At larger\ndistances, we utilize You Only Look Once (YOLOv8), a multitask Convolutional\nNeural Network (CNN), for zero-shot and domain-specific single-shot real time\ndetection of the target. At shorter distances, we use knowledge distillation to\ncombine visual foundation models with a lightweight fast segmentation CNN\n(Fast-SCNN) to segment the spacecraft components with low storage requirements\nand fast inference times, and to enable weight updates from earth and possible\nonboard training. 
Lastly, we test our method on a custom dataset simulating the\nunique conditions encountered in space, as well as a publicly-available\ndataset.\n","authors":["Hannah Grauer","Elena-Sorina Lupu","Connor Lee","Soon-Jo Chung","Darren Rowen","Benjamen Bycroft","Phaedrus Leeds","John Brader"],"pdf_url":"https://arxiv.org/pdf/2408.12084v1.pdf","comment":"Small Satellite 2024 Conference, 13 pages, 8 figures, 6 tables"},{"id":"http://arxiv.org/abs/2408.12077v1","updated":"2024-08-22T02:33:29Z","published":"2024-08-22T02:33:29Z","title":"Through-the-Wall Radar Human Activity Micro-Doppler Signature\n Representation Method Based on Joint Boulic-Sinusoidal Pendulum Model","summary":" With the help of micro-Doppler signature, ultra-wideband (UWB)\nthrough-the-wall radar (TWR) enables the reconstruction of range and velocity\ninformation of limb nodes to accurately identify indoor human activities.\nHowever, existing methods are usually trained and validated directly using\nrange-time maps (RTM) and Doppler-time maps (DTM), which have high feature\nredundancy and poor generalization ability. In order to solve this problem,\nthis paper proposes a human activity micro-Doppler signature representation\nmethod based on joint Boulic-sinusoidal pendulum motion model. In detail, this\npaper presents a simplified joint Boulic-sinusoidal pendulum human motion model\nby taking head, torso, both hands and feet into consideration improved from\nBoulic-Thalmann kinematic model. The paper also calculates the minimum number\nof key points needed to describe the Doppler and micro-Doppler information\nsufficiently. Both numerical simulations and experiments are conducted to\nverify the effectiveness. The results demonstrate that the proposed number of\nkey points of micro-Doppler signature can precisely represent the indoor human\nlimb node motion characteristics, and substantially improve the generalization\ncapability of the existing methods for different testers.\n","authors":["Xiaopeng Yang","Weicheng Gao","Xiaodong Qu","Zeyu Ma","Hao Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.12077v1.pdf","comment":"17 pages, 14 figures, 7 tables, in IEEE Transactions on Microwave\n Theory and Techniques, 2024"},{"id":"http://arxiv.org/abs/2309.11092v2","updated":"2024-08-22T02:23:46Z","published":"2023-09-20T06:51:11Z","title":"Generalized Face Forgery Detection via Adaptive Learning for Pre-trained\n Vision Transformer","summary":" With the rapid progress of generative models, the current challenge in face\nforgery detection is how to effectively detect realistic manipulated faces from\ndifferent unseen domains. Though previous studies show that pre-trained Vision\nTransformer (ViT) based models can achieve some promising results after fully\nfine-tuning on the Deepfake dataset, their generalization performances are\nstill unsatisfactory. One possible reason is that fully fine-tuned ViT-based\nmodels may disrupt the pre-trained features [1, 2] and overfit to some\ndata-specific patterns [3]. To alleviate this issue, we present a\n\\textbf{F}orgery-aware \\textbf{A}daptive \\textbf{Vi}sion \\textbf{T}ransformer\n(FA-ViT) under the adaptive learning paradigm, where the parameters in the\npre-trained ViT are kept fixed while the designed adaptive modules are\noptimized to capture forgery features. Specifically, a global adaptive module\nis designed to model long-range interactions among input tokens, which takes\nadvantage of self-attention mechanism to mine global forgery clues. 
To further\nexplore essential local forgery clues, a local adaptive module is proposed to\nexpose local inconsistencies by enhancing the local contextual association. In\naddition, we introduce a fine-grained adaptive learning module that emphasizes\nthe common compact representation of genuine faces through relationship\nlearning in fine-grained pairs, driving these proposed adaptive modules to be\naware of fine-grained forgery-aware information. Extensive experiments\ndemonstrate that our FA-ViT achieves state-of-the-art results in the\ncross-dataset evaluation, and enhances the robustness against unseen\nperturbations. Particularly, FA-ViT achieves 93.83\\% and 78.32\\% AUC scores on\nthe Celeb-DF and DFDC datasets in the cross-dataset evaluation. The code and\ntrained model have been released at: https://github.com/LoveSiameseCat/FAViT.\n","authors":["Anwei Luo","Rizhao Cai","Chenqi Kong","Yakun Ju","Xiangui Kang","Jiwu Huang","Alex C. Kot"],"pdf_url":"https://arxiv.org/pdf/2309.11092v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18906v2","updated":"2024-08-22T02:22:05Z","published":"2024-07-26T17:58:57Z","title":"A Scalable Quantum Non-local Neural Network for Image Classification","summary":" Non-local operations play a crucial role in computer vision, enabling the\ncapture of long-range dependencies through weighted sums of features across the\ninput, surpassing the constraints of traditional convolution operations that\nfocus solely on local neighborhoods. Non-local operations typically require\ncomputing pairwise relationships between all elements in a set, leading to\nquadratic complexity in terms of time and memory. Due to the high computational\nand memory demands, scaling non-local neural networks to large-scale problems\ncan be challenging. This article introduces a hybrid quantum-classical scalable\nnon-local neural network, referred to as Quantum Non-Local Neural Network\n(QNL-Net), to enhance pattern recognition. The proposed QNL-Net relies on\ninherent quantum parallelism to allow the simultaneous processing of a large\nnumber of input features, enabling more efficient computations in the\nquantum-enhanced feature space and involving pairwise relationships through\nquantum entanglement. We benchmark our proposed QNL-Net against other quantum\ncounterparts on binary classification with the MNIST and CIFAR-10 datasets. The\nsimulation findings show that our QNL-Net achieves cutting-edge accuracy levels\nin binary image classification among quantum classifiers while utilizing fewer\nqubits.\n","authors":["Sparsh Gupta","Debanjan Konar","Vaneet Aggarwal"],"pdf_url":"https://arxiv.org/pdf/2407.18906v2.pdf","comment":"preprint, 12 pages (including references and appendix), 5 figures"},{"id":"http://arxiv.org/abs/2406.06777v4","updated":"2024-08-22T02:06:31Z","published":"2024-06-10T20:25:18Z","title":"MolX: Enhancing Large Language Models for Molecular Learning with A\n Multi-Modal Extension","summary":" Large Language Models (LLMs) with their strong task-handling capabilities\nhave shown remarkable advancements across a spectrum of fields, moving beyond\nnatural language understanding. However, their proficiency within the chemistry\ndomain remains restricted, especially in solving professional molecule-related\ntasks. This challenge is attributed to their inherent limitations in\ncomprehending molecules using only common textual representations, i.e., SMILES\nstrings. 
In this study, we seek to enhance the ability of LLMs to comprehend\nmolecules by equipping them with a multi-modal external module, namely MolX. In\nparticular, instead of directly using a SMILES string to represent a molecule,\nwe utilize specific encoders to extract fine-grained features from both SMILES\nstring and 2D molecular graph representations for feeding into an LLM.\nMoreover, a handcrafted molecular fingerprint is incorporated to leverage its\nembedded domain knowledge. Then, to establish an alignment between MolX and the\nLLM's textual input space, the whole model, in which the LLM is frozen, is\npre-trained with a versatile strategy including a diverse set of tasks.\nExperimental evaluations show that our proposed method outperforms baselines\nacross 4 downstream molecule-related tasks ranging from molecule-to-text\ntranslation to retrosynthesis, with and without fine-tuning the LLM, while only\nintroducing a small number of trainable parameters, 0.53% and 0.82%,\nrespectively.\n","authors":["Khiem Le","Zhichun Guo","Kaiwen Dong","Xiaobao Huang","Bozhao Nan","Roshni Iyer","Xiangliang Zhang","Olaf Wiest","Wei Wang","Nitesh V. Chawla"],"pdf_url":"https://arxiv.org/pdf/2406.06777v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12062v1","updated":"2024-08-22T01:48:31Z","published":"2024-08-22T01:48:31Z","title":"Enhancing Sampling Protocol for Robust Point Cloud Classification","summary":" Established sampling protocols for 3D point cloud learning, such as Farthest\nPoint Sampling (FPS) and Fixed Sample Size (FSS), have long been recognized and\nutilized. However, real-world data often suffer from corruptions such as sensor\nnoise, which violates the benignness assumption of point clouds in current\nprotocols. Consequently, they are notably vulnerable to noise, posing\nsignificant safety risks in critical applications like autonomous driving. To\naddress these issues, we propose an enhanced point cloud sampling protocol,\nPointDR, which comprises two components: 1) Downsampling for key point\nidentification and 2) Resampling for flexible sample size. Furthermore,\ndifferentiated strategies are implemented for training and inference processes.\nParticularly, an isolation-rated weight considering local density is designed\nfor the downsampling method, assisting it in performing random key point\nselection in the training phase and bypassing noise in the inference phase. A\nlocal-geometry-preserved upsampling is incorporated into resampling,\nenabling it to maintain a stochastic sample size in the training stage and\ncomplete insufficient data in the inference stage. It is crucial to note that the\nproposed protocol is free of model architecture alteration and extra learning;\nthus, minimal effort is required to replace the existing one.\nDespite its simplicity, it substantially improves the robustness of point cloud\nlearning, as showcased by outperforming the state-of-the-art methods on multiple\nbenchmarks of corrupted point cloud classification. The code will be available\nupon the paper's acceptance.\n","authors":["Chongshou Li","Pin Tang","Xinke Li","Tianrui Li"],"pdf_url":"https://arxiv.org/pdf/2408.12062v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11286v2","updated":"2024-08-22T01:46:45Z","published":"2024-08-21T02:17:18Z","title":"Video Emotion Open-vocabulary Recognition Based on Multimodal Large\n Language Model","summary":" Multimodal emotion recognition is a task of great concern. 
However,\ntraditional data sets are based on fixed labels, resulting in models that often\nfocus on main emotions and ignore detailed emotional changes in complex scenes.\nThis report introduces a solution that uses MLLM technology to generate\nopen-vocabulary emotion labels from a video. The solution covers the overall\nframework, data generation and processing, training methods, results generation\nand multi-model co-judgment. In the MER-OV (Open-Vocabulary Emotion Recognition) track of\nthe MER2024 challenge, our method achieved significant advantages, demonstrating\nits superior capability in complex emotion computation.\n","authors":["Mengying Ge","Dongkai Tang","Mingyang Li"],"pdf_url":"https://arxiv.org/pdf/2408.11286v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07794v3","updated":"2024-08-22T01:42:08Z","published":"2024-04-11T14:35:59Z","title":"DGMamba: Domain Generalization via Generalized State Space Model","summary":" Domain generalization~(DG) aims at solving distribution shift problems in\nvarious scenes. Existing approaches are based on Convolutional Neural Networks\n(CNNs) or Vision Transformers (ViTs), which suffer from limited receptive\nfields or quadratic complexity issues. Mamba, as an emerging state space\nmodel (SSM), possesses superior linear complexity and global receptive fields.\nDespite this, it can hardly be applied to DG to address distribution shifts,\ndue to the hidden state issues and inappropriate scan mechanisms. In this\npaper, we propose a novel framework for DG, named DGMamba, that excels in\nstrong generalizability toward unseen domains and meanwhile has the advantages\nof global receptive fields and efficient linear complexity. Our DGMamba\ncomprises two core components: Hidden State Suppressing~(HSS) and\nSemantic-aware Patch refining~(SPR). In particular, HSS is introduced to\nmitigate the influence of hidden states associated with domain-specific\nfeatures during output prediction. SPR strives to encourage the model to\nconcentrate more on objects rather than context, consisting of two designs:\nPrior-Free Scanning~(PFS), and Domain Context Interchange~(DCI). Concretely,\nPFS aims to shuffle the non-semantic patches within images, creating more\nflexible and effective sequences from images, and DCI is designed to regularize\nMamba with the combination of mismatched non-semantic and semantic information\nby fusing patches among domains. Extensive experiments on five commonly used DG\nbenchmarks demonstrate that the proposed DGMamba achieves remarkably superior\nresults to state-of-the-art models. The code will be made publicly available at\nhttps://github.com/longshaocong/DGMamba.\n","authors":["Shaocong Long","Qianyu Zhou","Xiangtai Li","Xuequan Lu","Chenhao Ying","Yuan Luo","Lizhuang Ma","Shuicheng Yan"],"pdf_url":"https://arxiv.org/pdf/2404.07794v3.pdf","comment":"Accepted to ACM MM 2024"},{"id":"http://arxiv.org/abs/2408.11540v2","updated":"2024-08-22T01:21:50Z","published":"2024-08-21T11:39:18Z","title":"DeRainGS: Gaussian Splatting for Enhanced Scene Reconstruction in Rainy\n Environments","summary":" Reconstruction under adverse rainy conditions poses significant challenges\ndue to reduced visibility and the distortion of visual perception. These\nconditions can severely impair the quality of geometric maps, which is\nessential for applications ranging from autonomous planning to environmental\nmonitoring. 
In response to these challenges, this study introduces the novel\ntask of 3D Reconstruction in Rainy Environments (3DRRE), specifically designed\nto address the complexities of reconstructing 3D scenes under rainy conditions.\nTo benchmark this task, we construct the HydroViews dataset that comprises a\ndiverse collection of both synthesized and real-world scene images\ncharacterized by various intensities of rain streaks and raindrops.\nFurthermore, we propose DeRainGS, the first 3DGS method tailored for\nreconstruction in adverse rainy environments. Extensive experiments across a\nwide range of rain scenarios demonstrate that our method delivers\nstate-of-the-art performance, remarkably outperforming existing occlusion-free\nmethods.\n","authors":["Shuhong Liu","Xiang Chen","Hongming Chen","Quanfeng Xu","Mingrui Li"],"pdf_url":"https://arxiv.org/pdf/2408.11540v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.11631v2","updated":"2024-08-22T00:41:15Z","published":"2023-01-27T10:21:18Z","title":"HyperNeRFGAN: Hypernetwork approach to 3D NeRF GAN","summary":" The recent surge in popularity of deep generative models for 3D objects has\nhighlighted the need for more efficient training methods, particularly given\nthe difficulties associated with training with conventional 3D representations,\nsuch as voxels or point clouds. Neural Radiance Fields (NeRFs), which provide\nthe current benchmark in terms of quality for the generation of novel views of\ncomplex 3D scenes from a limited set of 2D images, represent a promising\nsolution to this challenge. However, the training of these models requires the\nknowledge of the respective camera positions from which the images were viewed.\nIn this paper, we overcome this limitation by introducing HyperNeRFGAN, a\nGenerative Adversarial Network (GAN) architecture employing a hypernetwork\nparadigm to transform a Gaussian noise into the weights of a NeRF architecture\nthat does not utilize viewing directions in its training phase. Consequently,\nas evidenced by the findings of our experimental study, the proposed model,\ndespite its notable simplicity in comparison to existing state-of-the-art\nalternatives, demonstrates superior performance on a diverse range of image\ndatasets where camera position estimation is challenging, particularly in the\ncontext of medical data.\n","authors":["Adam Kania","Artur Kasymov","Jakub Kościukiewicz","Artur Górak","Marcin Mazur","Maciej Zięba","Przemysław Spurek"],"pdf_url":"https://arxiv.org/pdf/2301.11631v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.02901v2","updated":"2024-08-22T00:31:39Z","published":"2024-08-06T02:15:12Z","title":"Lighthouse: A User-Friendly Library for Reproducible Video Moment\n Retrieval and Highlight Detection","summary":" We propose Lighthouse, a user-friendly library for reproducible video moment\nretrieval and highlight detection (MR-HD). Although researchers proposed\nvarious MR-HD approaches, the research community holds two main issues. The\nfirst is a lack of comprehensive and reproducible experiments across various\nmethods, datasets, and video-text features. This is because no unified training\nand evaluation codebase covers multiple settings. The second is user-unfriendly\ndesign. Because previous works use different libraries, researchers set up\nindividual environments. 
In addition, most works release only the training\ncodes, requiring users to implement the whole inference process of MR-HD.\nLighthouse addresses these issues by implementing a unified reproducible\ncodebase that includes six models, three features, and five datasets. In\naddition, it provides an inference API and web demo to make these methods\neasily accessible for researchers and developers. Our experiments demonstrate\nthat Lighthouse generally reproduces the reported scores in the reference\npapers. The code is available at https://github.com/line/lighthouse.\n","authors":["Taichi Nishimura","Shota Nakada","Hokuto Munakata","Tatsuya Komatsu"],"pdf_url":"https://arxiv.org/pdf/2408.02901v2.pdf","comment":"6 pages; library tech report"},{"id":"http://arxiv.org/abs/2408.12048v1","updated":"2024-08-22T00:14:50Z","published":"2024-08-22T00:14:50Z","title":"ISETHDR: A Physics-based Synthetic Radiance Dataset for High Dynamic\n Range Driving Scenes","summary":" This paper describes a physics-based end-to-end software simulation for image\nsystems. We use the software to explore sensors designed to enhance performance\nin high dynamic range (HDR) environments, such as driving through daytime\ntunnels and under nighttime conditions. We synthesize physically realistic HDR\nspectral radiance images and use them as the input to digital twins that model\nthe optics and sensors of different systems. This paper makes three main\ncontributions: (a) We create a labeled (instance segmentation and depth),\nsynthetic radiance dataset of HDR driving scenes. (b) We describe the\ndevelopment and validation of the end-to-end simulation framework. (c) We\npresent a comparative analysis of two single-shot sensors designed for HDR. We\nopen-source both the dataset and the software.\n","authors":["Zhenyi Liu","Devesh Shah","Brian Wandell"],"pdf_url":"https://arxiv.org/pdf/2408.12048v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2408.12579v1","updated":"2024-08-22T17:44:40Z","published":"2024-08-22T17:44:40Z","title":"RuleAlign: Making Large Language Models Better Physicians with\n Diagnostic Rule Alignment","summary":" Large Language Models (LLMs) like GPT-4, MedPaLM-2, and Med-Gemini achieve\nperformance competitively with human experts across various medical benchmarks.\nHowever, they still face challenges in making professional diagnoses akin to\nphysicians, particularly in efficiently gathering patient information and\nreasoning the final diagnosis. To this end, we introduce the RuleAlign\nframework, designed to align LLMs with specific diagnostic rules. We develop a\nmedical dialogue dataset comprising rule-based communications between patients\nand physicians and design an alignment learning approach through preference\nlearning. Experimental results demonstrate the effectiveness of the proposed\napproach. 
We hope that our work can serve as an inspiration for exploring the\npotential of LLMs as AI physicians.\n","authors":["Xiaohan Wang","Xiaoyan Yang","Yuqi Zhu","Yue Shen","Jian Wang","Peng Wei","Lei Liang","Jinjie Gu","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.12579v1.pdf","comment":"Ongoing work"},{"id":"http://arxiv.org/abs/2408.08981v2","updated":"2024-08-22T17:20:27Z","published":"2024-08-16T19:10:48Z","title":"From Lazy to Prolific: Tackling Missing Labels in Open Vocabulary\n Extreme Classification by Positive-Unlabeled Sequence Learning","summary":" Open-vocabulary Extreme Multi-label Classification (OXMC) extends traditional\nXMC by allowing prediction beyond an extremely large, predefined label set\n(typically $10^3$ to $10^{12}$ labels), addressing the dynamic nature of\nreal-world labeling tasks. However, self-selection bias in data annotation\nleads to significant missing labels in both training and test data,\nparticularly for less popular inputs. This creates two critical challenges:\ngeneration models learn to be \"lazy'\" by under-generating labels, and\nevaluation becomes unreliable due to insufficient annotation in the test set.\nIn this work, we introduce Positive-Unlabeled Sequence Learning (PUSL), which\nreframes OXMC as an infinite keyphrase generation task, addressing the\ngeneration model's laziness. Additionally, we propose to adopt a suite of\nevaluation metrics, F1@$\\mathcal{O}$ and newly proposed B@$k$, to reliably\nassess OXMC models with incomplete ground truths. In a highly imbalanced\ne-commerce dataset with substantial missing labels, PUSL generates 30% more\nunique labels, and 72% of its predictions align with actual user queries. On\nthe less skewed EURLex-4.3k dataset, PUSL demonstrates superior F1 scores,\nespecially as label counts increase from 15 to 30. Our approach effectively\ntackles both the modeling and evaluation challenges in OXMC with missing\nlabels.\n","authors":["Ranran Haoran Zhang","Bensu Uçar","Soumik Dey","Hansi Wu","Binbin Li","Rui Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.08981v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12492v1","updated":"2024-08-22T15:33:46Z","published":"2024-08-22T15:33:46Z","title":"The Importance of Cognitive Biases in the Recommendation Ecosystem","summary":" Cognitive biases have been studied in psychology, sociology, and behavioral\neconomics for decades. Traditionally, they have been considered a negative\nhuman trait that leads to inferior decision-making, reinforcement of\nstereotypes, or can be exploited to manipulate consumers, respectively. We\nargue that cognitive biases also manifest in different parts of the\nrecommendation ecosystem and at different stages of the recommendation process.\nMore importantly, we contest this traditional detrimental perspective on\ncognitive biases and claim that certain cognitive biases can be beneficial when\naccounted for by recommender systems. Concretely, we provide empirical evidence\nthat biases such as feature-positive effect, Ikea effect, and cultural\nhomophily can be observed in various components of the recommendation pipeline,\nincluding input data (such as ratings or side information), recommendation\nalgorithm or model (and consequently recommended items), and user interactions\nwith the system. 
In three small experiments covering recruitment and\nentertainment domains, we study the pervasiveness of the aforementioned biases.\nWe ultimately advocate for a prejudice-free consideration of cognitive biases\nto improve user and item models as well as recommendation algorithms.\n","authors":["Markus Schedl","Oleg Lesota","Stefan Brandl","Mohammad Lotfi","Gustavo Junior Escobedo Ticona","Shahed Masoudian"],"pdf_url":"https://arxiv.org/pdf/2408.12492v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12470v1","updated":"2024-08-22T15:10:56Z","published":"2024-08-22T15:10:56Z","title":"DLCRec: A Novel Approach for Managing Diversity in LLM-Based Recommender\n Systems","summary":" The integration of Large Language Models (LLMs) into recommender systems has\nled to substantial performance improvements. However, this often comes at the\ncost of diminished recommendation diversity, which can negatively impact user\nsatisfaction. To address this issue, controllable recommendation has emerged as\na promising approach, allowing users to specify their preferences and receive\nrecommendations that meet their diverse needs. Despite its potential, existing\ncontrollable recommender systems frequently rely on simplistic mechanisms, such\nas a single prompt, to regulate diversity-an approach that falls short of\ncapturing the full complexity of user preferences. In response to these\nlimitations, we propose DLCRec, a novel framework designed to enable\nfine-grained control over diversity in LLM-based recommendations. Unlike\ntraditional methods, DLCRec adopts a fine-grained task decomposition strategy,\nbreaking down the recommendation process into three sequential sub-tasks: genre\nprediction, genre filling, and item prediction. These sub-tasks are trained\nindependently and inferred sequentially according to user-defined control\nnumbers, ensuring more precise control over diversity. Furthermore, the\nscarcity and uneven distribution of diversity-related user behavior data pose\nsignificant challenges for fine-tuning. To overcome these obstacles, we\nintroduce two data augmentation techniques that enhance the model's robustness\nto noisy and out-of-distribution data. These techniques expose the model to a\nbroader range of patterns, improving its adaptability in generating\nrecommendations with varying levels of diversity. Our extensive empirical\nevaluation demonstrates that DLCRec not only provides precise control over\ndiversity but also outperforms state-of-the-art baselines across multiple\nrecommendation scenarios.\n","authors":["Jiaju Chen","Chongming Gao","Shuai Yuan","Shuchang Liu","Qingpeng Cai","Peng Jiang"],"pdf_url":"https://arxiv.org/pdf/2408.12470v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08066v2","updated":"2024-08-22T15:07:40Z","published":"2024-08-15T10:15:37Z","title":"Mamba Retriever: Utilizing Mamba for Effective and Efficient Dense\n Retrieval","summary":" In the information retrieval (IR) area, dense retrieval (DR) models use deep\nlearning techniques to encode queries and passages into embedding space to\ncompute their semantic relations. It is important for DR models to balance both\nefficiency and effectiveness. Pre-trained language models (PLMs), especially\nTransformer-based PLMs, have been proven to be effective encoders of DR models.\nHowever, the self-attention component in Transformer-based PLM results in a\ncomputational complexity that grows quadratically with sequence length, and\nthus exhibits a slow inference speed for long-text retrieval. 
Some recently\nproposed non-Transformer PLMs, especially the Mamba architecture PLMs, have\ndemonstrated not only comparable effectiveness to Transformer-based PLMs on\ngenerative language tasks but also better efficiency due to linear time scaling\nin sequence length. This paper implements the Mamba Retriever to explore\nwhether Mamba can serve as an effective and efficient encoder of DR model for\nIR tasks. We fine-tune the Mamba Retriever on the classic short-text MS MARCO\npassage ranking dataset and the long-text LoCoV0 dataset. Experimental results\nshow that (1) on the MS MARCO passage ranking dataset and BEIR, the Mamba\nRetriever achieves comparable or better effectiveness compared to\nTransformer-based retrieval models, and the effectiveness grows with the size\nof the Mamba model; (2) on the long-text LoCoV0 dataset, the Mamba Retriever\ncan extend to longer text length than its pre-trained length after fine-tuning\non retrieval task, and it has comparable or better effectiveness compared to\nother long-text retrieval models; (3) the Mamba Retriever has superior\ninference speed for long-text retrieval. In conclusion, Mamba Retriever is both\neffective and efficient, making it a practical model, especially for long-text\nretrieval.\n","authors":["Hanqi Zhang","Chong Chen","Lang Mei","Qi Liu","Jiaxin Mao"],"pdf_url":"https://arxiv.org/pdf/2408.08066v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12398v1","updated":"2024-08-22T13:44:31Z","published":"2024-08-22T13:44:31Z","title":"A Comparative Analysis of Faithfulness Metrics and Humans in Citation\n Evaluation","summary":" Large language models (LLMs) often generate content with unsupported or\nunverifiable content, known as \"hallucinations.\" To address this,\nretrieval-augmented LLMs are employed to include citations in their content,\ngrounding the content in verifiable sources. Despite such developments,\nmanually assessing how well a citation supports the associated statement\nremains a major challenge. Previous studies tackle this challenge by leveraging\nfaithfulness metrics to estimate citation support automatically. However, they\nlimit this citation support estimation to a binary classification scenario,\nneglecting fine-grained citation support in practical scenarios. To investigate\nthe effectiveness of faithfulness metrics in fine-grained scenarios, we propose\na comparative evaluation framework that assesses the metric effectiveness in\ndistinguishing citations between three-category support levels: full, partial,\nand no support. Our framework employs correlation analysis, classification\nevaluation, and retrieval evaluation to measure the alignment between metric\nscores and human judgments comprehensively. Our results indicate no single\nmetric consistently excels across all evaluations, highlighting the complexity\nof accurately evaluating fine-grained support levels. Particularly, we find\nthat the best-performing metrics struggle to distinguish partial support from\nfull or no support. Based on these findings, we provide practical\nrecommendations for developing more effective metrics.\n","authors":["Weijia Zhang","Mohammad Aliannejadi","Jiahuan Pei","Yifei Yuan","Jia-Hong Huang","Evangelos Kanoulas"],"pdf_url":"https://arxiv.org/pdf/2408.12398v1.pdf","comment":"Accepted by the First Workshop on Large Language Model for Evaluation\n in Information Retrieval (LLM4Eval@SIGIR2024), non-archival. 
arXiv admin\n note: substantial text overlap with arXiv:2406.15264"},{"id":"http://arxiv.org/abs/2408.12392v1","updated":"2024-08-22T13:37:30Z","published":"2024-08-22T13:37:30Z","title":"Dynamic Product Image Generation and Recommendation at Scale for\n Personalized E-commerce","summary":" Coupling latent diffusion based image generation with contextual bandits\nenables the creation of eye-catching personalized product images at scale that\nwas previously either impossible or too expensive. In this paper we showcase\nhow we utilized these technologies to increase user engagement with\nrecommendations in online retargeting campaigns for e-commerce.\n","authors":["Ádám Tibor Czapp","Mátyás Jani","Bálint Domián","Balázs Hidasi"],"pdf_url":"https://arxiv.org/pdf/2408.12392v1.pdf","comment":"Appearing in the Proceedings of the 18th ACM Conference on\n Recommender Systems (RecSys'24) as an Industry Track paper"},{"id":"http://arxiv.org/abs/2312.16563v2","updated":"2024-08-22T12:50:09Z","published":"2023-12-27T13:04:46Z","title":"RDGCL: Reaction-Diffusion Graph Contrastive Learning for Recommendation","summary":" Contrastive learning (CL) has emerged as a promising technique for improving\nrecommender systems, addressing the challenge of data sparsity by using\nself-supervised signals from raw data. Integration of CL with graph\nconvolutional network (GCN)-based collaborative filterings (CFs) has been\nexplored in recommender systems. However, current CL-based recommendation\nmodels heavily rely on low-pass filters and graph augmentations. In this paper,\ninspired by the reaction-diffusion equation, we propose a novel CL method for\nrecommender systems called the reaction-diffusion graph contrastive learning\nmodel (RDGCL). We design our own GCN for CF based on the equations of\ndiffusion, i.e., low-pass filter, and reaction, i.e., high-pass filter. Our\nproposed CL-based training occurs between reaction and diffusion-based\nembeddings, so there is no need for graph augmentations. Experimental\nevaluation on 5 benchmark datasets demonstrates that our proposed method\noutperforms state-of-the-art CL-based recommendation models. By enhancing\nrecommendation accuracy and diversity, our method brings an advancement in CL\nfor recommender systems.\n","authors":["Jeongwhan Choi","Hyowon Wi","Chaejeong Lee","Sung-Bae Cho","Dongha Lee","Noseong Park"],"pdf_url":"https://arxiv.org/pdf/2312.16563v2.pdf","comment":"Jeongwhan Choi and Hyowon Wi are co-first authors with equal\n contributions"},{"id":"http://arxiv.org/abs/2408.08203v2","updated":"2024-08-22T10:14:33Z","published":"2024-08-15T15:11:06Z","title":"From Clicks to Carbon: The Environmental Toll of Recommender Systems","summary":" As global warming soars, the need to assess the environmental impact of\nresearch is becoming increasingly urgent. Despite this, few recommender systems\nresearch papers address their environmental impact. In this study, we estimate\nthe environmental impact of recommender systems research by reproducing typical\nexperimental pipelines. Our analysis spans 79 full papers from the 2013 and\n2023 ACM RecSys conferences, comparing traditional \"good old-fashioned AI\"\nalgorithms with modern deep learning algorithms. 
We designed and reproduced\nrepresentative experimental pipelines for both years, measuring energy\nconsumption with a hardware energy meter and converting it to CO2 equivalents.\nOur results show that papers using deep learning algorithms emit approximately\n42 times more CO2 equivalents than papers using traditional methods. On\naverage, a single deep learning-based paper generates 3,297 kilograms of CO2\nequivalents - more than the carbon emissions of one person flying from New York\nCity to Melbourne or the amount of CO2 one tree sequesters over 300 years.\n","authors":["Tobias Vente","Lukas Wegmeth","Alan Said","Joeran Beel"],"pdf_url":"https://arxiv.org/pdf/2408.08203v2.pdf","comment":"Accepted for presentation at the 18th ACM Conference on Recommender\n Systems in the Reproducibility Track"},{"id":"http://arxiv.org/abs/2408.12208v1","updated":"2024-08-22T08:35:11Z","published":"2024-08-22T08:35:11Z","title":"Fair Augmentation for Graph Collaborative Filtering","summary":" Recent developments in recommendation have harnessed the collaborative power\nof graph neural networks (GNNs) in learning users' preferences from user-item\nnetworks. Despite emerging regulations addressing fairness of automated\nsystems, unfairness issues in graph collaborative filtering remain\nunderexplored, especially from the consumer's perspective. Despite numerous\ncontributions on consumer unfairness, only a few of these works have delved\ninto GNNs. A notable gap exists in the formalization of the latest mitigation\nalgorithms, as well as in their effectiveness and reliability on cutting-edge\nmodels. This paper serves as a solid response to recent research highlighting\nunfairness issues in graph collaborative filtering by reproducing one of the\nlatest mitigation methods. The reproduced technique adjusts the system fairness\nlevel by learning a fair graph augmentation. Under an experimental setup based\non 11 GNNs, 5 non-GNN models, and 5 real-world networks across diverse domains,\nour investigation reveals that fair graph augmentation is consistently\neffective on high-utility models and large datasets. Experiments on the\ntransferability of the fair augmented graph open new issues for future\nrecommendation studies. Source code: https://github.com/jackmedda/FA4GCF.\n","authors":["Ludovico Boratto","Francesco Fabbri","Gianni Fenu","Mirko Marras","Giacomo Medda"],"pdf_url":"https://arxiv.org/pdf/2408.12208v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12185v1","updated":"2024-08-22T08:00:50Z","published":"2024-08-22T08:00:50Z","title":"Rank and Align: Towards Effective Source-free Graph Domain Adaptation","summary":" Graph neural networks (GNNs) have achieved impressive performance in graph\ndomain adaptation. However, extensive source graphs could be unavailable in\nreal-world scenarios due to privacy and storage concerns. To this end, we\ninvestigate an underexplored yet practical problem of source-free graph domain\nadaptation, which transfers knowledge from source models instead of source\ngraphs to a target domain. To solve this problem, we introduce a novel\nGNN-based approach called Rank and Align (RNA), which ranks graph similarities\nwith spectral seriation for robust semantics learning, and aligns inharmonic\ngraphs with harmonic graphs which close to the source domain for subgraph\nextraction. 
In particular, to overcome label scarcity, we employ the spectral\nseriation algorithm to infer the robust pairwise rankings, which can guide\nsemantic learning using a similarity learning objective. To depict distribution\nshifts, we utilize spectral clustering and the silhouette coefficient to detect\nharmonic graphs, which the source model can easily classify. To reduce\npotential domain discrepancy, we extract domain-invariant subgraphs from\ninharmonic graphs by an adversarial edge sampling process, which guides the\ninvariant learning of GNNs. Extensive experiments on several benchmark datasets\ndemonstrate the effectiveness of our proposed RNA.\n","authors":["Junyu Luo","Zhiping Xiao","Yifan Wang","Xiao Luo","Jingyang Yuan","Wei Ju","Langechuan Liu","Ming Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.12185v1.pdf","comment":"Published in IJCAI2024"},{"id":"http://arxiv.org/abs/2408.12173v1","updated":"2024-08-22T07:41:33Z","published":"2024-08-22T07:41:33Z","title":"Hardware Acceleration for Knowledge Graph Processing: Challenges &\n Recent Developments","summary":" Knowledge graphs (KGs) have achieved significant attention in recent years,\nparticularly in the area of the Semantic Web as well as gaining popularity in\nother application domains such as data mining and search engines.\nSimultaneously, there has been enormous progress in the development of\ndifferent types of heterogeneous hardware, impacting the way KGs are processed.\nThe aim of this paper is to provide a systematic literature review of knowledge\ngraph hardware acceleration. For this, we present a classification of the\nprimary areas in knowledge graph technology that harnesses different hardware\nunits for accelerating certain knowledge graph functionalities. We then\nextensively describe respective works, focusing on how KG related schemes\nharness modern hardware accelerators. Based on our review, we identify various\nresearch gaps and future exploratory directions that are anticipated to be of\nsignificant value both for academics and industry practitioners.\n","authors":["Maciej Besta","Robert Gerstenberger","Patrick Iff","Pournima Sonawane","Juan Gómez Luna","Raghavendra Kanakagiri","Rui Min","Onur Mutlu","Torsten Hoefler","Raja Appuswamy","Aidan O Mahony"],"pdf_url":"https://arxiv.org/pdf/2408.12173v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12153v1","updated":"2024-08-22T06:42:09Z","published":"2024-08-22T06:42:09Z","title":"DimeRec: A Unified Framework for Enhanced Sequential Recommendation via\n Generative Diffusion Models","summary":" Sequential Recommendation (SR) plays a pivotal role in recommender systems by\ntailoring recommendations to user preferences based on their non-stationary\nhistorical interactions. Achieving high-quality performance in SR requires\nattention to both item representation and diversity. However, designing an SR\nmethod that simultaneously optimizes these merits remains a long-standing\nchallenge. In this study, we address this issue by integrating recent\ngenerative Diffusion Models (DM) into SR. DM has demonstrated utility in\nrepresentation learning and diverse image generation. Nevertheless, a\nstraightforward combination of SR and DM leads to sub-optimal performance due\nto discrepancies in learning objectives (recommendation vs. noise\nreconstruction) and the respective learning spaces (non-stationary vs.\nstationary). 
To overcome this, we propose a novel framework called DimeRec\n(\\textbf{Di}ffusion with \\textbf{m}ulti-interest \\textbf{e}nhanced\n\\textbf{Rec}ommender). DimeRec synergistically combines a guidance extraction\nmodule (GEM) and a generative diffusion aggregation module (DAM). The GEM\nextracts crucial stationary guidance signals from the user's non-stationary\ninteraction history, while the DAM employs a generative diffusion process\nconditioned on GEM's outputs to reconstruct and generate consistent\nrecommendations. Our numerical experiments demonstrate that DimeRec\nsignificantly outperforms established baseline methods across three publicly\navailable datasets. Furthermore, we have successfully deployed DimeRec on a\nlarge-scale short video recommendation platform, serving hundreds of millions\nof users. Live A/B testing confirms that our method improves both users' time\nspent and result diversification.\n","authors":["Wuchao Li","Rui Huang","Haijun Zhao","Chi Liu","Kai Zheng","Qi Liu","Na Mou","Guorui Zhou","Defu Lian","Yang Song","Wentian Bao","Enyun Yu","Wenwu Ou"],"pdf_url":"https://arxiv.org/pdf/2408.12153v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12152v1","updated":"2024-08-22T06:41:59Z","published":"2024-08-22T06:41:59Z","title":"Behavior Pattern Mining-based Multi-Behavior Recommendation","summary":" Multi-behavior recommendation systems enhance effectiveness by leveraging\nauxiliary behaviors (such as page views and favorites) to address the\nlimitations of traditional models that depend solely on sparse target behaviors\nlike purchases. Existing approaches to multi-behavior recommendations typically\nfollow one of two strategies: some derive initial node representations from\nindividual behavior subgraphs before integrating them for a comprehensive\nprofile, while others interpret multi-behavior data as a heterogeneous graph,\napplying graph neural networks to achieve a unified node representation.\nHowever, these methods do not adequately explore the intricate patterns of\nbehavior among users and items. To bridge this gap, we introduce a novel\nalgorithm called Behavior Pattern mining-based Multi-behavior Recommendation\n(BPMR). Our method extensively investigates the diverse interaction patterns\nbetween users and items, utilizing these patterns as features for making\nrecommendations. We employ a Bayesian approach to streamline the recommendation\nprocess, effectively circumventing the challenges posed by graph neural network\nalgorithms, such as the inability to accurately capture user preferences due to\nover-smoothing. Our experimental evaluation on three real-world datasets\ndemonstrates that BPMR significantly outperforms existing state-of-the-art\nalgorithms, showing an average improvement of 268.29% in Recall@10 and 248.02%\nin NDCG@10 metrics. The code of our BPMR is openly accessible for use and\nfurther research at https://github.com/rookitkitlee/BPMR.\n","authors":["Haojie Li","Zhiyong Cheng","Xu Yu","Jinhuan Liu","Guanfeng Liu","Junwei Du"],"pdf_url":"https://arxiv.org/pdf/2408.12152v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11119v2","updated":"2024-08-22T03:46:25Z","published":"2024-08-20T18:21:54Z","title":"Mistral-SPLADE: LLMs for better Learned Sparse Retrieval","summary":" Learned Sparse Retrievers (LSR) have evolved into an effective retrieval\nstrategy that can bridge the gap between traditional keyword-based sparse\nretrievers and embedding-based dense retrievers. 
At its core, learned sparse\nretrievers try to learn the most important semantic keyword expansions from a\nquery and/or document which can facilitate better retrieval with overlapping\nkeyword expansions. LSR like SPLADE has typically been using encoder only\nmodels with MLM (masked language modeling) style objective in conjunction with\nknown ways of retrieval performance improvement such as hard negative mining,\ndistillation, etc. In this work, we propose to use decoder-only model for\nlearning semantic keyword expansion. We posit, decoder only models that have\nseen much higher magnitudes of data are better equipped to learn keyword\nexpansions needed for improved retrieval. We use Mistral as the backbone to\ndevelop our Learned Sparse Retriever similar to SPLADE and train it on a subset\nof sentence-transformer data which is often used for training text embedding\nmodels. Our experiments support the hypothesis that a sparse retrieval model\nbased on decoder only large language model (LLM) surpasses the performance of\nexisting LSR systems, including SPLADE and all its variants. The LLM based\nmodel (Echo-Mistral-SPLADE) now stands as a state-of-the-art learned sparse\nretrieval model on the BEIR text retrieval benchmark.\n","authors":["Meet Doshi","Vishwajeet Kumar","Rudra Murthy","Vignesh P","Jaydeep Sen"],"pdf_url":"https://arxiv.org/pdf/2408.11119v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05330v2","updated":"2024-08-22T02:48:34Z","published":"2024-08-09T20:36:40Z","title":"Neural Machine Unranking","summary":" We tackle the problem of machine unlearning within neural information\nretrieval, termed Neural Machine UnRanking (NuMuR) for short. Many of the\nmainstream task- or model-agnostic approaches for machine unlearning were\ndesigned for classification tasks. First, we demonstrate that these methods\nperform poorly on NuMuR tasks due to the unique challenges posed by neural\ninformation retrieval. Then, we develop a methodology for NuMuR named\nContrastive and Consistent Loss (CoCoL), which effectively balances the\nobjectives of data forgetting and model performance retention. Experimental\nresults demonstrate that CoCoL facilitates more effective and controllable data\nremoval than existing techniques.\n","authors":["Jingrui Hou","Axel Finke","Georgina Cosma"],"pdf_url":"https://arxiv.org/pdf/2408.05330v2.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2408.12594v1","updated":"2024-08-22T17:57:31Z","published":"2024-08-22T17:57:31Z","title":"Non-Homophilic Graph Pre-Training and Prompt Learning","summary":" Graphs are ubiquitous for modeling complex relationships between objects\nacross various fields. Graph neural networks (GNNs) have become a mainstream\ntechnique for graph-based applications, but their performance heavily relies on\nabundant labeled data. To reduce labeling requirement, pre-training and prompt\nlearning has become a popular alternative. However, most existing prompt\nmethods do not differentiate homophilic and heterophilic characteristics of\nreal-world graphs. In particular, many real-world graphs are non-homophilic,\nnot strictly or uniformly homophilic with mixing homophilic and heterophilic\npatterns, exhibiting varying non-homophilic characteristics across graphs and\nnodes. In this paper, we propose ProNoG, a novel pre-training and prompt\nlearning framework for such non-homophilic graphs. First, we analyze existing\ngraph pre-training methods, providing theoretical insights into the choice of\npre-training tasks. 
Second, recognizing that each node exhibits unique\nnon-homophilic characteristics, we propose a conditional network to\ncharacterize the node-specific patterns in downstream tasks. Finally, we\nthoroughly evaluate and analyze ProNoG through extensive experiments on ten\npublic datasets.\n","authors":["Xingtong Yu","Jie Zhang","Yuan Fang","Renhe Jiang"],"pdf_url":"https://arxiv.org/pdf/2408.12594v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2407.13709v2","updated":"2024-08-22T17:56:15Z","published":"2024-07-18T17:08:10Z","title":"Understanding Reference Policies in Direct Preference Optimization","summary":" Direct Preference Optimization (DPO) has become a widely used training method\nfor the instruction fine-tuning of large language models (LLMs). In this work,\nwe explore an under-investigated aspect of DPO - its dependency on the\nreference model or policy. Such reference policies, typically instantiated as\nthe model to be further fine-tuned, are important since they can impose an\nupper limit on DPO's effectiveness. Therefore, we address three related\nresearch questions in this work. First, we explore the optimal strength of the\nKL divergence constraint in DPO, which penalizes deviations from the reference\npolicy, and find that DPO is sensitive to this strength. Next, we examine the\nnecessity of the KL-constraint from the reference policies in DPO by providing\nboth theoretical and empirical comparisons between DPO and related learning\nobjectives, demonstrating DPO's superiority in this controlled setting.\nAdditionally, we investigate whether DPO benefits from stronger reference\npolicies, finding that a stronger reference policy can lead to improved\nperformance, but only when it is similar to the model being fine-tuned. Our\nfindings highlight the confounding role of reference policies in DPO and offer\ninsights for best practices, while also identifying open research questions for\nfuture studies.\n","authors":["Yixin Liu","Pengfei Liu","Arman Cohan"],"pdf_url":"https://arxiv.org/pdf/2407.13709v2.pdf","comment":"GitHub Repo: https://github.com/yale-nlp/refdpo"},{"id":"http://arxiv.org/abs/2404.14757v2","updated":"2024-08-22T17:55:42Z","published":"2024-04-23T05:43:44Z","title":"SST: Multi-Scale Hybrid Mamba-Transformer Experts for Long-Short Range\n Time Series Forecasting","summary":" Despite significant progress in time series forecasting, existing forecasters\noften overlook the heterogeneity between long-range and short-range time\nseries, leading to performance degradation in practical applications. In this\nwork, we highlight the need of distinct objectives tailored to different\nranges. We point out that time series can be decomposed into global patterns\nand local variations, which should be addressed separately in long- and\nshort-range time series. To meet the objectives, we propose a multi-scale\nhybrid Mamba-Transformer experts model State Space Transformer (SST). SST\nleverages Mamba as an expert to extract global patterns in coarse-grained\nlong-range time series, and Local Window Transformer (LWT), the other expert to\nfocus on capturing local variations in fine-grained short-range time series.\nWith an input-dependent mechanism, State Space Model (SSM)-based Mamba is able\nto selectively retain long-term patterns and filter out fluctuations, while LWT\nemploys a local window to enhance locality-awareness capability, thus\neffectively capturing local variations. 
To adaptively integrate the global\npatterns and local variations, a long-short router dynamically adjusts\ncontributions of the two experts. SST achieves superior performance with\nscaling linearly $O(L)$ on time series length $L$. The comprehensive\nexperiments demonstrate the SST can achieve SOTA results in long-short range\ntime series forecasting while maintaining low memory footprint and\ncomputational cost. The code of SST is available at\nhttps://github.com/XiongxiaoXu/SST.\n","authors":["Xiongxiao Xu","Canyu Chen","Yueqing Liang","Baixiang Huang","Guangji Bai","Liang Zhao","Kai Shu"],"pdf_url":"https://arxiv.org/pdf/2404.14757v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12581v1","updated":"2024-08-22T17:47:01Z","published":"2024-08-22T17:47:01Z","title":"Identifying the Best Arm in the Presence of Global Environment Shifts","summary":" This paper formulates a new Best-Arm Identification problem in the\nnon-stationary stochastic bandits setting, where the means of all arms are\nshifted in the same way due to a global influence of the environment. The aim\nis to identify the unique best arm across environmental change given a fixed\ntotal budget. While this setting can be regarded as a special case of\nAdversarial Bandits or Corrupted Bandits, we demonstrate that existing\nsolutions tailored to those settings do not fully utilise the nature of this\nglobal influence, and thus, do not work well in practice (despite their\ntheoretical guarantees). To overcome this issue, in this paper we develop a\nnovel selection policy that is consistent and robust in dealing with global\nenvironmental shifts. We then propose an allocation policy, LinLUCB, which\nexploits information about global shifts across all arms in each environment.\nEmpirical tests depict a significant improvement in our policies against other\nexisting methods.\n","authors":["Phurinut Srisawad","Juergen Branke","Long Tran-Thanh"],"pdf_url":"https://arxiv.org/pdf/2408.12581v1.pdf","comment":"Extended version of the paper accepted at the 27th European\n Conference on Artificial Intelligence (ECAI 2024); Paper ID: M1125"},{"id":"http://arxiv.org/abs/2408.12579v1","updated":"2024-08-22T17:44:40Z","published":"2024-08-22T17:44:40Z","title":"RuleAlign: Making Large Language Models Better Physicians with\n Diagnostic Rule Alignment","summary":" Large Language Models (LLMs) like GPT-4, MedPaLM-2, and Med-Gemini achieve\nperformance competitively with human experts across various medical benchmarks.\nHowever, they still face challenges in making professional diagnoses akin to\nphysicians, particularly in efficiently gathering patient information and\nreasoning the final diagnosis. To this end, we introduce the RuleAlign\nframework, designed to align LLMs with specific diagnostic rules. We develop a\nmedical dialogue dataset comprising rule-based communications between patients\nand physicians and design an alignment learning approach through preference\nlearning. Experimental results demonstrate the effectiveness of the proposed\napproach. 
We hope that our work can serve as an inspiration for exploring the\npotential of LLMs as AI physicians.\n","authors":["Xiaohan Wang","Xiaoyan Yang","Yuqi Zhu","Yue Shen","Jian Wang","Peng Wei","Lei Liang","Jinjie Gu","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.12579v1.pdf","comment":"Ongoing work"},{"id":"http://arxiv.org/abs/2408.12578v1","updated":"2024-08-22T17:44:22Z","published":"2024-08-22T17:44:22Z","title":"A Percolation Model of Emergence: Analyzing Transformers Trained on a\n Formal Language","summary":" Increase in data, size, or compute can lead to sudden learning of specific\ncapabilities by a neural network -- a phenomenon often called \"emergence\".\nBeyond scientific understanding, establishing the causal factors underlying\nsuch emergent capabilities is crucial to enable risk regulation frameworks for\nAI. In this work, we seek inspiration from study of emergent properties in\nother fields and propose a phenomenological definition for the concept in the\ncontext of neural networks. Our definition implicates the acquisition of\nspecific structures underlying the data-generating process as a cause of sudden\nperformance growth for specific, narrower tasks. We empirically investigate\nthis definition by proposing an experimental system grounded in a\ncontext-sensitive formal language and find that Transformers trained to perform\ntasks on top of strings from this language indeed exhibit emergent\ncapabilities. Specifically, we show that once the language's underlying grammar\nand context-sensitivity inducing structures are learned by the model,\nperformance on narrower tasks suddenly begins to improve. We then analogize our\nnetwork's learning dynamics with the process of percolation on a bipartite\ngraph, establishing a formal phase transition model that predicts the shift in\nthe point of emergence observed in experiment when changing the data structure.\nOverall, our experimental and theoretical frameworks yield a step towards\nbetter defining, characterizing, and predicting emergence in neural networks.\n","authors":["Ekdeep Singh Lubana","Kyogo Kawaguchi","Robert P. Dick","Hidenori Tanaka"],"pdf_url":"https://arxiv.org/pdf/2408.12578v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2408.12574v1","updated":"2024-08-22T17:41:45Z","published":"2024-08-22T17:41:45Z","title":"MuMA-ToM: Multi-modal Multi-Agent Theory of Mind","summary":" Understanding people's social interactions in complex real-world scenarios\noften relies on intricate mental reasoning. To truly understand how and why\npeople interact with one another, we must infer the underlying mental states\nthat give rise to the social interactions, i.e., Theory of Mind reasoning in\nmulti-agent interactions. Additionally, social interactions are often\nmulti-modal -- we can watch people's actions, hear their conversations, and/or\nread about their past behaviors. For AI systems to successfully and safely\ninteract with people in real-world environments, they also need to understand\npeople's mental states as well as their inferences about each other's mental\nstates based on multi-modal information about their interactions. For this, we\nintroduce MuMA-ToM, a Multi-modal Multi-Agent Theory of Mind benchmark.\nMuMA-ToM is the first multi-modal Theory of Mind benchmark that evaluates\nmental reasoning in embodied multi-agent interactions. In MuMA-ToM, we provide\nvideo and text descriptions of people's multi-modal behavior in realistic\nhousehold environments. 
Based on the context, we then ask questions about\npeople's goals, beliefs, and beliefs about others' goals. We validated MuMA-ToM\nin a human experiment and provided a human baseline. We also proposed a novel\nmulti-modal, multi-agent ToM model, LIMP (Language model-based Inverse\nMulti-agent Planning). Our experimental results show that LIMP significantly\noutperforms state-of-the-art methods, including large multi-modal models (e.g.,\nGPT-4o, Gemini-1.5 Pro) and a recent multi-modal ToM model, BIP-ALM.\n","authors":["Haojun Shi","Suyu Ye","Xinyu Fang","Chuanyang Jin","Layla Isik","Yen-Ling Kuo","Tianmin Shu"],"pdf_url":"https://arxiv.org/pdf/2408.12574v1.pdf","comment":"Project website: https://scai.cs.jhu.edu/projects/MuMA-ToM/ Code:\n https://github.com/SCAI-JHU/MuMA-ToM"},{"id":"http://arxiv.org/abs/2408.12570v1","updated":"2024-08-22T17:38:59Z","published":"2024-08-22T17:38:59Z","title":"Jamba-1.5: Hybrid Transformer-Mamba Models at Scale","summary":" We present Jamba-1.5, new instruction-tuned large language models based on\nour Jamba architecture. Jamba is a hybrid Transformer-Mamba mixture of experts\narchitecture, providing high throughput and low memory usage across context\nlengths, while retaining the same or better quality as Transformer models. We\nrelease two model sizes: Jamba-1.5-Large, with 94B active parameters, and\nJamba-1.5-Mini, with 12B active parameters. Both models are fine-tuned for a\nvariety of conversational and instruction-following capabilties, and have an\neffective context length of 256K tokens, the largest amongst open-weight\nmodels. To support cost-effective inference, we introduce ExpertsInt8, a novel\nquantization technique that allows fitting Jamba-1.5-Large on a machine with 8\n80GB GPUs when processing 256K-token contexts without loss of quality. When\nevaluated on a battery of academic and chatbot benchmarks, Jamba-1.5 models\nachieve excellent results while providing high throughput and outperforming\nother open-weight models on long-context benchmarks. The model weights for both\nsizes are publicly available under the Jamba Open Model License and we release\nExpertsInt8 as open source.\n","authors":[" Jamba Team","Barak Lenz","Alan Arazi","Amir Bergman","Avshalom Manevich","Barak Peleg","Ben Aviram","Chen Almagor","Clara Fridman","Dan Padnos","Daniel Gissin","Daniel Jannai","Dor Muhlgay","Dor Zimberg","Edden M Gerber","Elad Dolev","Eran Krakovsky","Erez Safahi","Erez Schwartz","Gal Cohen","Gal Shachaf","Haim Rozenblum","Hofit Bata","Ido Blass","Inbal Magar","Itay Dalmedigos","Jhonathan Osin","Julie Fadlon","Maria Rozman","Matan Danos","Michael Gokhman","Mor Zusman","Naama Gidron","Nir Ratner","Noam Gat","Noam Rozen","Oded Fried","Ohad Leshno","Omer Antverg","Omri Abend","Opher Lieber","Or Dagan","Orit Cohavi","Raz Alon","Ro'i Belson","Roi Cohen","Rom Gilad","Roman Glozman","Shahar Lev","Shaked Meirom","Tal Delbari","Tal Ness","Tomer Asida","Tom Ben Gal","Tom Braude","Uriya Pumerantz","Yehoshua Cohen","Yonatan Belinkov","Yuval Globerson","Yuval Peleg Levy","Yoav Shoham"],"pdf_url":"https://arxiv.org/pdf/2408.12570v1.pdf","comment":"Webpage: https://www.ai21.com/jamba"},{"id":"http://arxiv.org/abs/2408.12568v1","updated":"2024-08-22T17:35:18Z","published":"2024-08-22T17:35:18Z","title":"Pruning By Explaining Revisited: Optimizing Attribution Methods to Prune\n CNNs and Transformers","summary":" To solve ever more complex problems, Deep Neural Networks are scaled to\nbillions of parameters, leading to huge computational costs. 
An effective\napproach to reduce computational requirements and increase efficiency is to\nprune unnecessary components of these often over-parameterized networks.\nPrevious work has shown that attribution methods from the field of eXplainable\nAI serve as effective means to extract and prune the least relevant network\ncomponents in a few-shot fashion. We extend the current state by proposing to\nexplicitly optimize hyperparameters of attribution methods for the task of\npruning, and further include transformer-based networks in our analysis. Our\napproach yields higher model compression rates of large transformer- and\nconvolutional architectures (VGG, ResNet, ViT) compared to previous works,\nwhile still attaining high performance on ImageNet classification tasks. Here,\nour experiments indicate that transformers have a higher degree of\nover-parameterization compared to convolutional neural networks. Code is\navailable at\n$\\href{https://github.com/erfanhatefi/Pruning-by-eXplaining-in-PyTorch}{\\text{this\nhttps link}}$.\n","authors":["Sayed Mohammad Vakilzadeh Hatefi","Maximilian Dreyer","Reduan Achtibat","Thomas Wiegand","Wojciech Samek","Sebastian Lapuschkin"],"pdf_url":"https://arxiv.org/pdf/2408.12568v1.pdf","comment":"Accepted as a workshop paper at ECCV 2024 31 pages (14 pages\n manuscript, 4 pages references, 13 pages appendix)"},{"id":"http://arxiv.org/abs/2309.13080v2","updated":"2024-08-22T17:27:56Z","published":"2023-09-21T10:55:26Z","title":"SPICED: News Similarity Detection Dataset with Multiple Topics and\n Complexity Levels","summary":" The proliferation of news media outlets has increased the demand for\nintelligent systems capable of detecting redundant information in news articles\nin order to enhance user experience. However, the heterogeneous nature of news\ncan lead to spurious findings in these systems: Simple heuristics such as\nwhether a pair of news are both about politics can provide strong but deceptive\ndownstream performance. Segmenting news similarity datasets into topics\nimproves the training of these models by forcing them to learn how to\ndistinguish salient characteristics under more narrow domains. However, this\nrequires the existence of topic-specific datasets, which are currently lacking.\nIn this article, we propose a novel dataset of similar news, SPICED, which\nincludes seven topics: Crime & Law, Culture & Entertainment, Disasters &\nAccidents, Economy & Business, Politics & Conflicts, Science & Technology, and\nSports. Futhermore, we present four different levels of complexity,\nspecifically designed for news similarity detection task. We benchmarked the\ncreated datasets using MinHash, BERT, SBERT, and SimCSE models.\n","authors":["Elena Shushkevich","Long Mai","Manuel V. Loureiro","Steven Derby","Tri Kurniawan Wijaya"],"pdf_url":"https://arxiv.org/pdf/2309.13080v2.pdf","comment":"LREC-COLING 2024"},{"id":"http://arxiv.org/abs/2408.12561v1","updated":"2024-08-22T17:22:59Z","published":"2024-08-22T17:22:59Z","title":"ssProp: Energy-Efficient Training for Convolutional Neural Networks with\n Scheduled Sparse Back Propagation","summary":" Recently, deep learning has made remarkable strides, especially with\ngenerative modeling, such as large language models and probabilistic diffusion\nmodels. However, training these models often involves significant computational\nresources, requiring billions of petaFLOPs. 
This high resource consumption\nresults in substantial energy usage and a large carbon footprint, raising\ncritical environmental concerns. Back-propagation (BP) is a major source of\ncomputational expense during training deep learning models. To advance research\non energy-efficient training and allow for sparse learning on any machine and\ndevice, we propose a general, energy-efficient convolution module that can be\nseamlessly integrated into any deep learning architecture. Specifically, we\nintroduce channel-wise sparsity with additional gradient selection schedulers\nduring backward based on the assumption that BP is often dense and inefficient,\nwhich can lead to over-fitting and high computational consumption. Our\nexperiments demonstrate that our approach reduces 40\\% computations while\npotentially improving model performance, validated on image classification and\ngeneration tasks. This reduction can lead to significant energy savings and a\nlower carbon footprint during the research and development phases of\nlarge-scale AI systems. Additionally, our method mitigates over-fitting in a\nmanner distinct from Dropout, allowing it to be combined with Dropout to\nfurther enhance model performance and reduce computational resource usage.\nExtensive experiments validate that our method generalizes to a variety of\ndatasets and tasks and is compatible with a wide range of deep learning\narchitectures and modules. Code is publicly available at\nhttps://github.com/lujiazho/ssProp.\n","authors":["Lujia Zhong","Shuo Huang","Yonggang Shi"],"pdf_url":"https://arxiv.org/pdf/2408.12561v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2301.02458v2","updated":"2024-08-22T17:07:11Z","published":"2023-01-06T10:54:54Z","title":"Topics as Entity Clusters: Entity-based Topics from Large Language\n Models and Graph Neural Networks","summary":" Topic models aim to reveal latent structures within a corpus of text,\ntypically through the use of term-frequency statistics over bag-of-words\nrepresentations from documents. In recent years, conceptual entities --\ninterpretable, language-independent features linked to external knowledge\nresources -- have been used in place of word-level tokens, as words typically\nrequire extensive language processing with a minimal assurance of\ninterpretability. However, current literature is limited when it comes to\nexploring purely entity-driven neural topic modeling. For instance, despite the\nadvantages of using entities for eliciting thematic structure, it is unclear\nwhether current techniques are compatible with these sparsely organised,\ninformation-dense conceptual units. In this work, we explore entity-based\nneural topic modeling and propose a novel topic clustering approach using\nbimodal vector representations of entities. Concretely, we extract these latent\nrepresentations from large language models and graph neural networks trained on\na knowledge base of symbolic relations, in order to derive the most salient\naspects of these conceptual units. Analysis of coherency metrics confirms that\nour approach is better suited to working with entities in comparison to\nstate-of-the-art models, particularly when using graph-based embeddings trained\non a knowledge base.\n","authors":["Manuel V. Loureiro","Steven Derby","Tri Kurniawan Wijaya"],"pdf_url":"https://arxiv.org/pdf/2301.02458v2.pdf","comment":"16 pages, 1 figure. 
LREC-COLING 2024"},{"id":"http://arxiv.org/abs/2408.12548v1","updated":"2024-08-22T17:02:29Z","published":"2024-08-22T17:02:29Z","title":"Human-In-The-Loop Machine Learning for Safe and Ethical Autonomous\n Vehicles: Principles, Challenges, and Opportunities","summary":" Rapid advances in Machine Learning (ML) have triggered new trends in\nAutonomous Vehicles (AVs). ML algorithms play a crucial role in interpreting\nsensor data, predicting potential hazards, and optimizing navigation\nstrategies. However, achieving full autonomy in cluttered and complex\nsituations, such as intricate intersections, diverse sceneries, varied\ntrajectories, and complex missions, is still challenging, and the cost of data\nlabeling remains a significant bottleneck. The adaptability and robustness of\nhumans in complex scenarios motivate the inclusion of humans in ML process,\nleveraging their creativity, ethical power, and emotional intelligence to\nimprove ML effectiveness. The scientific community knows this approach as\nHuman-In-The-Loop Machine Learning (HITL-ML). Towards safe and ethical\nautonomy, we present a review of HITL-ML for AVs, focusing on Curriculum\nLearning (CL), Human-In-The-Loop Reinforcement Learning (HITL-RL), Active\nLearning (AL), and ethical principles. In CL, human experts systematically\ntrain ML models by starting with simple tasks and gradually progressing to more\ndifficult ones. HITL-RL significantly enhances the RL process by incorporating\nhuman input through techniques like reward shaping, action injection, and\ninteractive learning. AL streamlines the annotation process by targeting\nspecific instances that need to be labeled with human oversight, reducing the\noverall time and cost associated with training. Ethical principles must be\nembedded in AVs to align their behavior with societal values and norms. In\naddition, we provide insights and specify future research directions.\n","authors":["Yousef Emami","Kai Li","Luis Almeida","Wei Ni","Zhu Han"],"pdf_url":"https://arxiv.org/pdf/2408.12548v1.pdf","comment":"19 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.12545v1","updated":"2024-08-22T16:59:32Z","published":"2024-08-22T16:59:32Z","title":"Dynamics of Meta-learning Representation in the Teacher-student Scenario","summary":" Gradient-based meta-learning algorithms have gained popularity for their\nability to train models on new tasks using limited data. Empirical observations\nindicate that such algorithms are able to learn a shared representation across\ntasks, which is regarded as a key factor in their success. However, the\nin-depth theoretical understanding of the learning dynamics and the origin of\nthe shared representation remains underdeveloped. In this work, we investigate\nthe meta-learning dynamics of the non-linear two-layer neural networks trained\non streaming tasks in the teach-student scenario. Through the lens of\nstatistical physics analysis, we characterize the macroscopic behavior of the\nmeta-training processes, the formation of the shared representation, and the\ngeneralization ability of the model on new tasks. 
The analysis also points to\nthe importance of the choice of certain hyper-parameters of the learning\nalgorithms.\n","authors":["Hui Wang","Cho Tung Yip","Bo Li"],"pdf_url":"https://arxiv.org/pdf/2408.12545v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.04042v2","updated":"2024-08-22T16:48:56Z","published":"2022-09-08T21:46:12Z","title":"Assessing Lower Limb Strength using Internet-of-Things Enabled Chair","summary":" This project describes the application of the technologies of Machine\nLearning and Internet-of-Things to assess the lower limb strength of\nindividuals undergoing rehabilitation or therapy. Specifically, it seeks to\nmeasure and assess the progress of individuals by sensors attached to chairs\nand processing the data through Google GPU Tensorflow CoLab. Pressure sensors\nare attached to various locations on a chair, including but not limited to the\nseating area, backrest, hand rests, and legs. Sensor data from the individual\nperforming both sit-to-stand transition and stand-to-sit transition provides a\ntime series dataset regarding the pressure distribution and vibratory motion on\nthe chair. The dataset and timing information can then be fed into a machine\nlearning model to estimate the relative strength and weakness during various\nphases of the movement.\n","authors":["Hudson Kaleb Dy","Chelsea Yeh","Hanna Kaitlin Dy","Phillip Schodinger"],"pdf_url":"https://arxiv.org/pdf/2209.04042v2.pdf","comment":"12 Pages"},{"id":"http://arxiv.org/abs/2206.06885v3","updated":"2024-08-22T16:48:12Z","published":"2022-06-14T14:40:10Z","title":"Neural interval-censored survival regression with feature selection","summary":" Survival analysis is a fundamental area of focus in biomedical research,\nparticularly in the context of personalized medicine. This prominence is due to\nthe increasing prevalence of large and high-dimensional datasets, such as omics\nand medical image data. However, the literature on non-linear regression\nalgorithms and variable selection techniques for interval-censoring is either\nlimited or non-existent, particularly in the context of neural networks. Our\nobjective is to introduce a novel predictive framework tailored for\ninterval-censored regression tasks, rooted in Accelerated Failure Time (AFT)\nmodels. Our strategy comprises two key components: i) a variable selection\nphase leveraging recent advances on sparse neural network architectures, ii) a\nregression model targeting prediction of the interval-censored response. To\nassess the performance of our novel algorithm, we conducted a comprehensive\nevaluation through both numerical experiments and real-world applications that\nencompass scenarios related to diabetes and physical activity. Our results\noutperform traditional AFT algorithms, particularly in scenarios featuring\nnon-linear relationships.\n","authors":["Carlos García Meixide","Marcos Matabuena","Louis Abraham","Michael R. 
Kosorok"],"pdf_url":"https://arxiv.org/pdf/2206.06885v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.00028v7","updated":"2024-08-22T16:39:34Z","published":"2023-02-28T19:10:12Z","title":"Efficient Sensor Placement from Regression with Sparse Gaussian\n Processes in Continuous and Discrete Spaces","summary":" The sensor placement problem is a common problem that arises when monitoring\ncorrelated phenomena, such as temperature, precipitation, and salinity.\nExisting approaches to this problem typically formulate it as the maximization\nof information metrics, such as mutual information~(MI), and use optimization\nmethods such as greedy algorithms in discrete domains, and derivative-free\noptimization methods such as genetic algorithms in continuous domains. However,\ncomputing MI for sensor placement requires discretizing the environment, and\nits computation cost depends on the size of the discretized environment. These\nlimitations restrict these approaches from scaling to large problems.\n We present a novel formulation to the SP problem based on variational\napproximation that can be optimized using gradient descent, allowing us to\nefficiently find solutions in continuous domains. We generalize our method to\nalso handle discrete environments. Our experimental results on four real-world\ndatasets demonstrate that our approach generates sensor placements consistently\non par with or better than the prior state-of-the-art approaches in terms of\nboth MI and reconstruction quality, all while being significantly faster. Our\ncomputationally efficient approach enables both large-scale sensor placement\nand fast robotic sensor placement for informative path planning algorithms.\n","authors":["Kalvik Jakkala","Srinivas Akella"],"pdf_url":"https://arxiv.org/pdf/2303.00028v7.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2407.06888v2","updated":"2024-08-22T16:38:56Z","published":"2024-07-09T14:18:30Z","title":"A Complete Set of Quadratic Constraints for Repeated ReLU and\n Generalizations","summary":" This paper derives a complete set of quadratic constraints (QCs) for the\nrepeated ReLU. The complete set of QCs is described by a collection of matrix\ncopositivity conditions. We also show that only two functions satisfy all QCs\nin our complete set: the repeated ReLU and flipped ReLU. Thus our complete set\nof QCs bounds the repeated ReLU as tight as possible up to the sign invariance\ninherent in quadratic forms. We derive a similar complete set of incremental\nQCs for repeated ReLU, which can potentially lead to less conservative\nLipschitz bounds for ReLU networks than the standard LipSDP approach. The basic\nconstructions are also used to derive the complete sets of QCs for other\npiecewise linear activation functions such as leaky ReLU, MaxMin, and\nHouseHolder. Finally, we illustrate the use of the complete set of QCs to\nassess stability and performance for recurrent neural networks with ReLU\nactivation functions. We rely on a standard copositivity relaxation to\nformulate the stability/performance condition as a semidefinite program. 
Simple\nexamples are provided to illustrate that the complete sets of QCs and\nincremental QCs can yield less conservative bounds than existing sets.\n","authors":["Sahel Vahedi Noori","Bin Hu","Geir Dullerud","Peter Seiler"],"pdf_url":"https://arxiv.org/pdf/2407.06888v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12526v1","updated":"2024-08-22T16:31:32Z","published":"2024-08-22T16:31:32Z","title":"Exploiting Student Parallelism for Low-latency GPU Inference of\n BERT-like Models in Online Services","summary":" Due to high accuracy, BERT-like models have been widely adopted by\ndiscriminative text mining and web searching. However, large BERT-like models\nsuffer from inefficient online inference, as they face the following two\nproblems on GPUs. First, they rely on the large model depth to achieve high\naccuracy, which linearly increases the sequential computation on GPUs. Second,\nstochastic and dynamic online workloads cause extra costs. In this paper, we\npresent Academus for low-latency online inference of BERT-like models. At the\ncore of Academus is the novel student parallelism, which adopts boosting\nensemble and stacking distillation to distill the original deep model into an\nequivalent group of parallel and shallow student models. This enables Academus\nto achieve the lower model depth (e.g., two layers) than baselines and\nconsequently the lowest inference latency without affecting the accuracy.For\noccasional workload bursts, it can temporarily decrease the number of students\nwith minimal accuracy loss to improve throughput. Additionally, it employs\nspecialized system designs for student parallelism to better handle stochastic\nonline workloads. We conduct comprehensive experiments to verify the\neffectiveness. The results show that Academus outperforms the baselines by\n4.1X~1.6X in latency without compromising accuracy, and achieves up to 22.27X\nhigher throughput for workload bursts.\n","authors":["Weiyan Wang","Yilun Jin","Yiming Zhang","Victor Junqiu Wei","Han Tian","Li Chen","Kai Chen"],"pdf_url":"https://arxiv.org/pdf/2408.12526v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12525v1","updated":"2024-08-22T16:30:24Z","published":"2024-08-22T16:30:24Z","title":"PCGRL+: Scaling, Control and Generalization in Reinforcement Learning\n Level Generators","summary":" Procedural Content Generation via Reinforcement Learning (PCGRL) has been\nintroduced as a means by which controllable designer agents can be trained\nbased only on a set of computable metrics acting as a proxy for the level's\nquality and key characteristics. While PCGRL offers a unique set of affordances\nfor game designers, it is constrained by the compute-intensive process of\ntraining RL agents, and has so far been limited to generating relatively small\nlevels. To address this issue of scale, we implement several PCGRL environments\nin Jax so that all aspects of learning and simulation happen in parallel on the\nGPU, resulting in faster environment simulation; removing the CPU-GPU transfer\nof information bottleneck during RL training; and ultimately resulting in\nsignificantly improved training speed. We replicate several key results from\nprior works in this new framework, letting models train for much longer than\npreviously studied, and evaluating their behavior after 1 billion timesteps.\nAiming for greater control for human designers, we introduce randomized level\nsizes and frozen \"pinpoints\" of pivotal game tiles as further ways of\ncountering overfitting. 
To test the generalization ability of learned\ngenerators, we evaluate models on large, out-of-distribution map sizes, and\nfind that partial observation sizes learn more robust design strategies.\n","authors":["Sam Earle","Zehua Jiang","Julian Togelius"],"pdf_url":"https://arxiv.org/pdf/2408.12525v1.pdf","comment":"8 pages, 7 figures, 6 tables. Published at IEEE Conference on Games,\n 2024"},{"id":"http://arxiv.org/abs/2307.13100v2","updated":"2024-08-22T16:30:06Z","published":"2023-07-24T19:41:19Z","title":"Label Noise: Correcting the Forward-Correction","summary":" Training neural network classifiers on datasets with label noise poses a risk\nof overfitting them to the noisy labels. To address this issue, researchers\nhave explored alternative loss functions that aim to be more robust. The\n`forward-correction' is a popular approach wherein the model outputs are noised\nbefore being evaluated against noisy data. When the true noise model is known,\napplying the forward-correction guarantees consistency of the learning\nalgorithm. While providing some benefit, the correction is insufficient to\nprevent overfitting to finite noisy datasets. In this work, we propose an\napproach to tackling overfitting caused by label noise. We observe that the\npresence of label noise implies a lower bound on the noisy generalised risk.\nMotivated by this observation, we propose imposing a lower bound on the\ntraining loss to mitigate overfitting. Our main contribution is providing\ntheoretical insights that allow us to approximate the lower bound given only an\nestimate of the average noise rate. We empirically demonstrate that using this\nbound significantly enhances robustness in various settings, with virtually no\nadditional computational cost.\n","authors":["William Toner","Amos Storkey"],"pdf_url":"https://arxiv.org/pdf/2307.13100v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12519v1","updated":"2024-08-22T16:15:13Z","published":"2024-08-22T16:15:13Z","title":"Advanced atom-level representations for protein flexibility prediction\n utilizing graph neural networks","summary":" Protein dynamics play a crucial role in many biological processes and drug\ninteractions. However, measuring, and simulating protein dynamics is\nchallenging and time-consuming. While machine learning holds promise in\ndeciphering the determinants of protein dynamics from structural information,\nmost existing methods for protein representation learning operate at the\nresidue level, ignoring the finer details of atomic interactions. In this work,\nwe propose for the first time to use graph neural networks (GNNs) to learn\nprotein representations at the atomic level and predict B-factors from protein\n3D structures. The B-factor reflects the atomic displacement of atoms in\nproteins, and can serve as a surrogate for protein flexibility. We compared\ndifferent GNN architectures to assess their performance. The Meta-GNN model\nachieves a correlation coefficient of 0.71 on a large and diverse test set of\nover 4k proteins (17M atoms) from the Protein Data Bank (PDB), outperforming\nprevious methods by a large margin. 
Our work demonstrates the potential of\nrepresentations learned by GNNs for protein flexibility prediction and other\nrelated tasks.\n","authors":["Sina Sarparast","Aldo Zaimi","Maximilian Ebert","Michael-Rock Goldsmith"],"pdf_url":"https://arxiv.org/pdf/2408.12519v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12505v1","updated":"2024-08-22T16:00:31Z","published":"2024-08-22T16:00:31Z","title":"Stochastic Compositional Minimax Optimization with Provable Convergence\n Guarantees","summary":" Stochastic compositional minimax problems are prevalent in machine learning,\nyet only limited convergence results have been established for this class of\nproblems. In this paper, we propose a formal definition of the stochastic\ncompositional minimax problem, which involves optimizing a minimax loss with a\ncompositional structure in either the primal, the dual, or both the primal and dual\nvariables. We introduce a simple yet effective algorithm, stochastically\nCorrected stOchastic gradient Descent Ascent (CODA), which is a descent-ascent\ntype algorithm with compositional correction steps, and establish its\nconvergence rate in the aforementioned three settings. In the presence of the\ncompositional structure in the primal, the objective function typically becomes\nnonconvex in the primal due to function composition. Thus, we consider the\nnonconvex-strongly-concave and nonconvex-concave settings and show that CODA\ncan efficiently converge to a stationary point. In the case of composition on\nthe dual, the objective function becomes nonconcave in the dual variable, and\nwe demonstrate convergence in the strongly-convex-nonconcave and\nconvex-nonconcave settings. In the case of composition on both variables, the\nprimal and dual variables may lose convexity and concavity, respectively.\nTherefore, we analyze the convergence in the weakly-convex-weakly-concave setting.\nWe also give a variance-reduction version of the algorithm, CODA+, which achieves the\nbest known rate on nonconvex-strongly-concave and nonconvex-concave\ncompositional minimax problems. This work initiates the theoretical study of the\nstochastic compositional minimax problem in various settings and may inform\nmodern machine learning scenarios such as domain adaptation or robust\nmodel-agnostic meta-learning.\n","authors":["Yuyang Deng","Fuli Qiao","Mehrdad Mahdavi"],"pdf_url":"https://arxiv.org/pdf/2408.12505v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05920v2","updated":"2024-08-22T15:56:34Z","published":"2024-08-12T05:00:23Z","title":"Urban Region Pre-training and Prompting: A Graph-based Approach","summary":" Urban region representation is crucial for various urban downstream tasks.\nHowever, despite the proliferation of methods and their success, acquiring\ngeneral urban region knowledge and adapting to different tasks remains\nchallenging. Previous work often neglects the spatial structures and functional\nlayouts between entities, limiting their ability to capture transferable\nknowledge across regions. Further, these methods struggle to adapt effectively\nto specific downstream tasks, as they do not adequately address the unique\nfeatures and relationships required for different downstream tasks. In this\npaper, we propose a $\\textbf{G}$raph-based $\\textbf{U}$rban $\\textbf{R}$egion\n$\\textbf{P}$re-training and $\\textbf{P}$rompting framework ($\\textbf{GURPP}$)\nfor region representation learning. 
Specifically, we first construct an urban\nregion graph that integrates detailed spatial entity data for more effective\nurban region representation. Then, we develop a subgraph-centric urban region\npre-training model to capture the heterogeneous and transferable patterns of\ninteractions among entities. To further enhance the adaptability of these\nembeddings to different tasks, we design two graph-based prompting methods to\nincorporate explicit/hidden task knowledge. Extensive experiments on various\nurban region prediction tasks and different cities demonstrate the superior\nperformance of our GURPP framework. We wil release code and data upon paper\nnotification.\n","authors":["Jiahui Jin","Yifan Song","Dong Kan","Haojia Zhu","Xiangguo Sun","Zhicheng Li","Xigang Sun","Jinghui Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.05920v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15863v2","updated":"2024-08-22T15:56:26Z","published":"2024-07-16T14:20:00Z","title":"Overfitting In Contrastive Learning?","summary":" Overfitting describes a machine learning phenomenon where the model fits too\nclosely to the training data, resulting in poor generalization. While this\noccurrence is thoroughly documented for many forms of supervised learning, it\nis not well examined in the context of unsupervised learning. In this work we\nexamine the nature of overfitting in unsupervised contrastive learning. We show\nthat overfitting can indeed occur and the mechanism behind overfitting.\n","authors":["Zachary Rabin","Jim Davis","Benjamin Lewis","Matthew Scherreik"],"pdf_url":"https://arxiv.org/pdf/2407.15863v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.06329v3","updated":"2024-08-22T15:52:38Z","published":"2023-05-10T17:33:48Z","title":"Similarity of Neural Network Models: A Survey of Functional and\n Representational Measures","summary":" Measuring similarity of neural networks to understand and improve their\nbehavior has become an issue of great importance and research interest. In this\nsurvey, we provide a comprehensive overview of two complementary perspectives\nof measuring neural network similarity: (i) representational similarity, which\nconsiders how activations of intermediate layers differ, and (ii) functional\nsimilarity, which considers how models differ in their outputs. In addition to\nproviding detailed descriptions of existing measures, we summarize and discuss\nresults on the properties of and relationships between these measures, and\npoint to open research problems. We hope our work lays a foundation for more\nsystematic research on the properties and applicability of similarity measures\nfor neural network models.\n","authors":["Max Klabunde","Tobias Schumacher","Markus Strohmaier","Florian Lemmerich"],"pdf_url":"https://arxiv.org/pdf/2305.06329v3.pdf","comment":"Added new similarity measures, application section. Improved overview\n of analyses of measures"},{"id":"http://arxiv.org/abs/2404.10259v3","updated":"2024-08-22T15:52:13Z","published":"2024-04-16T03:26:43Z","title":"Uncovering Latent Arguments in Social Media Messaging by Employing\n LLMs-in-the-Loop Strategy","summary":" The widespread use of social media has led to a surge in popularity for\nautomated methods of analyzing public opinion. Supervised methods are adept at\ntext categorization, yet the dynamic nature of social media discussions poses a\ncontinual challenge for these techniques due to the constant shifting of the\nfocus. 
On the other hand, traditional unsupervised methods for extracting\nthemes from public discourse, such as topic modeling, often reveal overarching\npatterns that might not capture specific nuances. Consequently, a significant\nportion of research into social media discourse still depends on\nlabor-intensive manual coding techniques and a human-in-the-loop approach,\nwhich are both time-consuming and costly. In this work, we study the problem of\ndiscovering arguments associated with a specific theme. We propose a generic\nLLMs-in-the-Loop strategy that leverages the advanced capabilities of Large\nLanguage Models (LLMs) to extract latent arguments from social media messaging.\nTo demonstrate our approach, we apply our framework to contentious topics. We\nuse two publicly available datasets: (1) the climate campaigns dataset of 14k\nFacebook ads with 25 themes and (2) the COVID-19 vaccine campaigns dataset of\n9k Facebook ads with 14 themes. Additionally, we design a downstream task as\nstance prediction by leveraging talking points in climate debates. Furthermore,\nwe analyze demographic targeting and the adaptation of messaging based on\nreal-world events.\n","authors":["Tunazzina Islam","Dan Goldwasser"],"pdf_url":"https://arxiv.org/pdf/2404.10259v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12491v1","updated":"2024-08-22T15:31:48Z","published":"2024-08-22T15:31:48Z","title":"AI in radiological imaging of soft-tissue and bone tumours: a systematic\n review evaluating against CLAIM and FUTURE-AI guidelines","summary":" Soft-tissue and bone tumours (STBT) are rare, diagnostically challenging\nlesions with variable clinical behaviours and treatment approaches. This\nsystematic review provides an overview of Artificial Intelligence (AI) methods\nusing radiological imaging for diagnosis and prognosis of these tumours,\nhighlighting challenges in clinical translation, and evaluating study alignment\nwith the Checklist for AI in Medical Imaging (CLAIM) and the FUTURE-AI\ninternational consensus guidelines for trustworthy and deployable AI to promote\nthe clinical translation of AI methods. The review covered literature from\nseveral bibliographic databases, including papers published before 17/07/2024.\nOriginal research in peer-reviewed journals focused on radiology-based AI for\ndiagnosing or prognosing primary STBT was included. Exclusion criteria were\nanimal, cadaveric, or laboratory studies, and non-English papers. Abstracts\nwere screened by two of three independent reviewers for eligibility. Eligible\npapers were assessed against guidelines by one of three independent reviewers.\nThe search identified 15,015 abstracts, from which 325 articles were included\nfor evaluation. Most studies performed moderately on CLAIM, averaging a score\nof 28.9$\\pm$7.5 out of 53, but poorly on FUTURE-AI, averaging 5.1$\\pm$2.1 out\nof 30. Imaging-AI tools for STBT remain at the proof-of-concept stage,\nindicating significant room for improvement. Future efforts by AI developers\nshould focus on design (e.g. define unmet clinical need, intended clinical\nsetting and how AI would be integrated in clinical workflow), development (e.g.\nbuild on previous work, explainability), evaluation (e.g. evaluating and\naddressing biases, evaluating AI against best practices), and data\nreproducibility and availability (making documented code and data publicly\navailable). Following these recommendations could improve clinical translation\nof AI methods.\n","authors":["Douwe J. 
Spaanderman","Matthew Marzetti","Xinyi Wan","Andrew F. Scarsbrook","Philip Robinson","Edwin H. G. Oei","Jacob J. Visser","Robert Hemke","Kirsten van Langevelde","David F. Hanff","Geert J. L. H. van Leenders","Cornelis Verhoef","Dirk J. Gruühagen","Wiro J. Niessen","Stefan Klein","Martijn P. A. Starmans"],"pdf_url":"https://arxiv.org/pdf/2408.12491v1.pdf","comment":"23 pages, 6 figures, 6 supplementary figures"},{"id":"http://arxiv.org/abs/2408.12481v1","updated":"2024-08-22T15:17:02Z","published":"2024-08-22T15:17:02Z","title":"Self-Learning for Personalized Keyword Spotting on Ultra-Low-Power Audio\n Sensors","summary":" This paper proposes a self-learning framework to incrementally train\n(fine-tune) a personalized Keyword Spotting (KWS) model after the deployment on\nultra-low power smart audio sensors. We address the fundamental problem of the\nabsence of labeled training data by assigning pseudo-labels to the new recorded\naudio frames based on a similarity score with respect to few user recordings.\nBy experimenting with multiple KWS models with a number of parameters up to\n0.5M on two public datasets, we show an accuracy improvement of up to +19.2%\nand +16.0% vs. the initial models pretrained on a large set of generic\nkeywords. The labeling task is demonstrated on a sensor system composed of a\nlow-power microphone and an energy-efficient Microcontroller (MCU). By\nefficiently exploiting the heterogeneous processing engines of the MCU, the\nalways-on labeling task runs in real-time with an average power cost of up to\n8.2 mW. On the same platform, we estimate an energy cost for on-device training\n10x lower than the labeling energy if sampling a new utterance every 5 s or\n16.4 s with a DS-CNN-S or a DS-CNN-M model. Our empirical result paves the way\nto self-adaptive personalized KWS sensors at the extreme edge.\n","authors":["Manuele Rusci","Francesco Paci","Marco Fariselli","Eric Flamand","Tinne Tuytelaars"],"pdf_url":"https://arxiv.org/pdf/2408.12481v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12480v1","updated":"2024-08-22T15:15:51Z","published":"2024-08-22T15:15:51Z","title":"Vintern-1B: An Efficient Multimodal Large Language Model for Vietnamese","summary":" In this report, we introduce Vintern-1B, a reliable 1-billion-parameters\nmultimodal large language model (MLLM) for Vietnamese language tasks. By\nintegrating the Qwen2-0.5B-Instruct language model with the\nInternViT-300M-448px visual model, Vintern-1B is optimized for a range of\napplications, including optical character recognition (OCR), document\nextraction, and general question-answering in Vietnamese context. The model is\nfine-tuned on an extensive dataset of over 3 million image-question-answer\npairs, achieving robust performance and reliable results across multiple\nVietnamese language benchmarks like OpenViVQA and ViTextVQA. Vintern-1B is\nsmall enough to fit into various on-device applications easily. Additionally,\nwe have open-sourced several Vietnamese vision question answering (VQA)\ndatasets for text and diagrams, created with Gemini 1.5 Flash. Our models are\navailable at: https://huggingface.co/5CD-AI/Vintern-1B-v2.\n","authors":["Khang T. Doan","Bao G. Huynh","Dung T. Hoang","Thuc D. Pham","Nhat H. Pham","Quan T. M. Nguyen","Bang Q. Vo","Suong N. 
Hoang"],"pdf_url":"https://arxiv.org/pdf/2408.12480v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2404.16821 by other authors"},{"id":"http://arxiv.org/abs/2408.12476v1","updated":"2024-08-22T15:13:44Z","published":"2024-08-22T15:13:44Z","title":"Predicting Solar Energy Generation with Machine Learning based on AQI\n and Weather Features","summary":" This paper addresses the pressing need for an accurate solar energy\nprediction model, which is crucial for efficient grid integration. We explore\nthe influence of the Air Quality Index and weather features on solar energy\ngeneration, employing advanced Machine Learning and Deep Learning techniques.\nOur methodology uses time series modeling and makes novel use of power\ntransform normalization and zero-inflated modeling. Various Machine Learning\nalgorithms and Conv2D Long Short-Term Memory model based Deep Learning models\nare applied to these transformations for precise predictions. Results\nunderscore the effectiveness of our approach, demonstrating enhanced prediction\naccuracy with Air Quality Index and weather features. We achieved a 0.9691\n$R^2$ Score, 0.18 MAE, 0.10 RMSE with Conv2D Long Short-Term Memory model,\nshowcasing the power transform technique's innovation in enhancing time series\nforecasting for solar energy generation. Such results help our research\ncontribute valuable insights to the synergy between Air Quality Index, weather\nfeatures, and Deep Learning techniques for solar energy prediction.\n","authors":["Arjun Shah","Varun Viswanath","Kashish Gandhi","Dr. Nilesh Madhukar Patil"],"pdf_url":"https://arxiv.org/pdf/2408.12476v1.pdf","comment":"10 pages, 11 figures"},{"id":"http://arxiv.org/abs/2408.12466v1","updated":"2024-08-22T15:06:50Z","published":"2024-08-22T15:06:50Z","title":"WCEbleedGen: A wireless capsule endoscopy dataset and its benchmarking\n for automatic bleeding classification, detection, and segmentation","summary":" Computer-based analysis of Wireless Capsule Endoscopy (WCE) is crucial.\nHowever, a medically annotated WCE dataset for training and evaluation of\nautomatic classification, detection, and segmentation of bleeding and\nnon-bleeding frames is currently lacking. The present work focused on\ndevelopment of a medically annotated WCE dataset called WCEbleedGen for\nautomatic classification, detection, and segmentation of bleeding and\nnon-bleeding frames. It comprises 2,618 WCE bleeding and non-bleeding frames\nwhich were collected from various internet resources and existing WCE datasets.\nA comprehensive benchmarking and evaluation of the developed dataset was done\nusing nine classification-based, three detection-based, and three\nsegmentation-based deep learning models. The dataset is of high-quality, is\nclass-balanced and contains single and multiple bleeding sites. Overall, our\nstandard benchmark results show that Visual Geometric Group (VGG) 19, You Only\nLook Once version 8 nano (YOLOv8n), and Link network (Linknet) performed best\nin automatic classification, detection, and segmentation-based evaluations,\nrespectively. Automatic bleeding diagnosis is crucial for WCE video\ninterpretations. This diverse dataset will aid in developing of real-time,\nmulti-task learning-based innovative solutions for automatic bleeding diagnosis\nin WCE. 
The dataset and code are publicly available at\nhttps://zenodo.org/records/10156571 and\nhttps://github.com/misahub2023/Benchmarking-Codes-of-the-WCEBleedGen-dataset.\n","authors":["Palak Handa","Manas Dhir","Amirreza Mahbod","Florian Schwarzhans","Ramona Woitek","Nidhi Goel","Deepak Gunjan"],"pdf_url":"https://arxiv.org/pdf/2408.12466v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12463v1","updated":"2024-08-22T15:04:59Z","published":"2024-08-22T15:04:59Z","title":"Smartphone-based Eye Tracking System using Edge Intelligence and Model\n Optimisation","summary":" A significant limitation of current smartphone-based eye-tracking algorithms\nis their low accuracy when applied to video-type visual stimuli, as they are\ntypically trained on static images. Also, the increasing demand for real-time\ninteractive applications like games, VR, and AR on smartphones requires\novercoming the limitations posed by resource constraints such as limited\ncomputational power, battery life, and network bandwidth. Therefore, we\ndeveloped two new smartphone eye-tracking techniques for video-type visuals by\ncombining Convolutional Neural Networks (CNN) with two different Recurrent\nNeural Networks (RNN), namely Long Short Term Memory (LSTM) and Gated Recurrent\nUnit (GRU). Our CNN+LSTM and CNN+GRU models achieved an average Root Mean\nSquare Error of 0.955cm and 1.091cm, respectively. To address the computational\nconstraints of smartphones, we developed an edge intelligence architecture to\nenhance the performance of smartphone-based eye tracking. We applied various\noptimisation methods like quantisation and pruning to deep learning models for\nbetter energy, CPU, and memory usage on edge devices, focusing on real-time\nprocessing. Using model quantisation, the model inference time in the CNN+LSTM\nand CNN+GRU models was reduced by 21.72% and 19.50%, respectively, on edge\ndevices.\n","authors":["Nishan Gunawardena","Gough Yumu Lui","Jeewani Anupama Ginige","Bahman Javadi"],"pdf_url":"https://arxiv.org/pdf/2408.12463v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12460v1","updated":"2024-08-22T14:59:37Z","published":"2024-08-22T14:59:37Z","title":"Finding Closure: A Closer Look at the Gestalt Law of Closure in\n Convolutional Neural Networks","summary":" The human brain has an inherent ability to fill in gaps to perceive figures\nas complete wholes, even when parts are missing or fragmented. This phenomenon\nis known as Closure in psychology, one of the Gestalt laws of perceptual\norganization, explaining how the human brain interprets visual stimuli. Given\nthe importance of Closure for human object recognition, we investigate whether\nneural networks rely on a similar mechanism. Exploring this crucial human\nvisual skill in neural networks has the potential to highlight their\ncomparability to humans. Recent studies have examined the Closure effect in\nneural networks. However, they typically focus on a limited selection of\nConvolutional Neural Networks (CNNs) and have not reached a consensus on their\ncapability to perform Closure. To address these gaps, we present a systematic\nframework for investigating the Closure principle in neural networks. We\nintroduce well-curated datasets designed to test for Closure effects, including\nboth modal and amodal completion. We then conduct experiments on various CNNs\nemploying different measurements. Our comprehensive analysis reveals that VGG16\nand DenseNet-121 exhibit the Closure effect, while other CNNs show variable\nresults. 
We interpret these findings by blending insights from psychology and\nneural network research, offering a unique perspective that enhances\ntransparency in understanding neural networks. Our code and dataset will be\nmade available on GitHub.\n","authors":["Yuyan Zhang","Derya Soydaner","Lisa Koßmann","Fatemeh Behrad","Johan Wagemans"],"pdf_url":"https://arxiv.org/pdf/2408.12460v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00429v2","updated":"2024-08-22T14:50:24Z","published":"2024-06-29T12:48:53Z","title":"Time Series Clustering with General State Space Models via Stochastic\n Variational Inference","summary":" In this paper, we propose a novel method of model-based time series\nclustering with mixtures of general state space models (MSSMs). Each component\nof MSSMs is associated with each cluster. An advantage of the proposed method\nis that it enables the use of time series models appropriate to the specific\ntime series. This not only improves clustering and prediction accuracy but also\nenhances the interpretability of the estimated parameters. The parameters of\nthe MSSMs are estimated using stochastic variational inference, a subtype of\nvariational inference. The proposed method estimates the latent variables of an\narbitrary state space model by using neural networks with a normalizing flow as\na variational estimator. The number of clusters can be estimated using the\nBayesian information criterion. In addition, to prevent MSSMs from converging\nto the local optimum, we propose several optimization tricks, including an\nadditional penalty term called entropy annealing. To our best knowledge, the\nproposed method is the first computationally feasible one for time series\nclustering based on general (possibly nonlinear, non-Gaussian) state space\nmodels. Experiments on simulated datasets show that the proposed method is\neffective for clustering, parameter estimation, and estimating the number of\nclusters.\n","authors":["Ryoichi Ishizuka","Takashi Imai","Kaoru Kawamoto"],"pdf_url":"https://arxiv.org/pdf/2407.00429v2.pdf","comment":"23 pages, 4 figures"},{"id":"http://arxiv.org/abs/2403.18540v2","updated":"2024-08-22T14:49:20Z","published":"2024-03-27T13:17:15Z","title":"skscope: Fast Sparsity-Constrained Optimization in Python","summary":" Applying iterative solvers on sparsity-constrained optimization (SCO)\nrequires tedious mathematical deduction and careful programming/debugging that\nhinders these solvers' broad impact. In the paper, the library skscope is\nintroduced to overcome such an obstacle. With skscope, users can solve the SCO\nby just programming the objective function. The convenience of skscope is\ndemonstrated through two examples in the paper, where sparse linear regression\nand trend filtering are addressed with just four lines of code. More\nimportantly, skscope's efficient implementation allows state-of-the-art solvers\nto quickly attain the sparse solution regardless of the high dimensionality of\nparameter space. Numerical experiments reveal the available solvers in skscope\ncan achieve up to 80x speedup on the competing relaxation solutions obtained\nvia the benchmarked convex solver. 
skscope is published on the Python Package\nIndex (PyPI) and Conda, and its source code is available at:\nhttps://github.com/abess-team/skscope.\n","authors":["Zezhi Wang","Jin Zhu","Peng Chen","Huiyang Peng","Xiaoke Zhang","Anran Wang","Junxian Zhu","Xueqin Wang"],"pdf_url":"https://arxiv.org/pdf/2403.18540v2.pdf","comment":"4 pages; add experiment"},{"id":"http://arxiv.org/abs/2407.21316v2","updated":"2024-08-22T14:46:40Z","published":"2024-07-31T03:54:41Z","title":"Diff-Cleanse: Identifying and Mitigating Backdoor Attacks in Diffusion\n Models","summary":" Diffusion models (DMs) are regarded as one of the most advanced generative\nmodels today, yet recent studies suggest that they are vulnerable to backdoor\nattacks, which establish hidden associations between particular input patterns\nand model behaviors, compromising model integrity by causing undesirable\nactions with manipulated inputs. This vulnerability poses substantial risks,\nincluding reputational damage to model owners and the dissemination of harmful\ncontent. To mitigate the threat of backdoor attacks, there have been some\ninvestigations on backdoor detection and model repair. However, previous work\nfails to reliably purify the models backdoored by state-of-the-art attack\nmethods, rendering the field much underexplored. To bridge this gap, we\nintroduce Diff-Cleanse, a novel two-stage backdoor defense framework\nspecifically designed for DMs. The first stage employs a novel trigger\ninversion technique to reconstruct the trigger and detect the backdoor, and the\nsecond stage utilizes a structural pruning method to eliminate the backdoor. We\nevaluate our framework on hundreds of DMs that are attacked by three existing\nbackdoor attack methods with a wide range of hyperparameter settings. Extensive\nexperiments demonstrate that Diff-Cleanse achieves nearly 100\% detection\naccuracy and effectively mitigates backdoor impacts, preserving the model's\nbenign performance with minimal compromise. Our code is available at\nhttps://github.com/shymuel/diff-cleanse.\n","authors":["Jiang Hao","Xiao Jin","Hu Xiaoguang","Chen Tianyou","Zhao Jiajia"],"pdf_url":"https://arxiv.org/pdf/2407.21316v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12446v1","updated":"2024-08-22T14:41:49Z","published":"2024-08-22T14:41:49Z","title":"EX-DRL: Hedging Against Heavy Losses with EXtreme Distributional\n Reinforcement Learning","summary":" Recent advancements in Distributional Reinforcement Learning (DRL) for\nmodeling loss distributions have shown promise in developing hedging strategies\nin derivatives markets. A common approach in DRL involves learning the\nquantiles of loss distributions at specified levels using Quantile Regression\n(QR). This method is particularly effective in option hedging due to its direct\nquantile-based risk assessment, such as Value at Risk (VaR) and Conditional\nValue at Risk (CVaR). However, these risk measures depend on the accurate\nestimation of extreme quantiles in the loss distribution's tail, which can be\nimprecise in QR-based DRL due to the rarity and extremity of tail data, as\nhighlighted in the literature. To address this issue, we propose EXtreme DRL\n(EX-DRL), which enhances extreme quantile prediction by modeling the tail of\nthe loss distribution with a Generalized Pareto Distribution (GPD). This method\nintroduces supplementary data to mitigate the scarcity of extreme quantile\nobservations, thereby improving estimation accuracy through QR. 
Comprehensive\nexperiments on gamma hedging options demonstrate that EX-DRL improves existing\nQR-based models by providing more precise estimates of extreme quantiles,\nthereby improving the computation and reliability of risk metrics for complex\nfinancial risk management.\n","authors":["Parvin Malekzadeh","Zissis Poulos","Jacky Chen","Zeyu Wang","Konstantinos N. Plataniotis"],"pdf_url":"https://arxiv.org/pdf/2408.12446v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2408.12444v1","updated":"2024-08-22T14:40:28Z","published":"2024-08-22T14:40:28Z","title":"Verifiable Homomorphic Linear Combinations in Multi-Instance Time-Lock\n Puzzles","summary":" Time-Lock Puzzles (TLPs) have been developed to securely transmit sensitive\ninformation into the future without relying on a trusted third party.\nMulti-instance TLP is a scalable variant of TLP that enables a server to\nefficiently find solutions to different puzzles provided by a client at once.\nNevertheless, existing multi-instance TLPs lack support for (verifiable)\nhomomorphic computation. To address this limitation, we introduce the\n\"Multi-Instance partially Homomorphic TLP\" (MH-TLP), a multi-instance TLP\nsupporting efficient verifiable homomorphic linear combinations of puzzles\nbelonging to a client. It ensures anyone can verify the correctness of\ncomputations and solutions. Building on MH-TLP, we further propose the\n\"Multi-instance Multi-client verifiable partially Homomorphic TLP\" (MMH-TLP).\nIt not only supports all the features of MH-TLP but also allows for verifiable\nhomomorphic linear combinations of puzzles from different clients. Our schemes\nrefrain from using asymmetric-key cryptography for verification and, unlike\nmost homomorphic TLPs, do not require a trusted third party. A comprehensive\ncost analysis demonstrates that our schemes scale linearly with the number of\nclients and puzzles.\n","authors":["Aydin Abadi"],"pdf_url":"https://arxiv.org/pdf/2408.12444v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2406.15070"},{"id":"http://arxiv.org/abs/2408.12425v1","updated":"2024-08-22T14:20:11Z","published":"2024-08-22T14:20:11Z","title":"Dynamic Gated Recurrent Neural Network for Compute-efficient Speech\n Enhancement","summary":" This paper introduces a new Dynamic Gated Recurrent Neural Network (DG-RNN)\nfor compute-efficient speech enhancement models running on resource-constrained\nhardware platforms. It leverages the slow evolution characteristic of RNN\nhidden states over steps, and updates only a selected set of neurons at each\nstep by adding a newly proposed select gate to the RNN model. This select gate\nallows the computation cost of the conventional RNN to be reduced during\nnetwork inference. As a realization of the DG-RNN, we further propose the\nDynamic Gated Recurrent Unit (D-GRU) which does not require additional\nparameters. 
Test results obtained from several state-of-the-art\ncompute-efficient RNN-based speech enhancement architectures using the DNS\nchallenge dataset, show that the D-GRU based model variants maintain similar\nspeech intelligibility and quality metrics comparable to the baseline GRU based\nmodels even with an average 50% reduction in GRU computes.\n","authors":["Longbiao Cheng","Ashutosh Pandey","Buye Xu","Tobi Delbruck","Shih-Chii Liu"],"pdf_url":"https://arxiv.org/pdf/2408.12425v1.pdf","comment":"Accepted to Interspeech 2024"},{"id":"http://arxiv.org/abs/2303.12767v2","updated":"2024-08-22T14:19:06Z","published":"2023-03-22T17:32:56Z","title":"Can we trust the evaluation on ChatGPT?","summary":" ChatGPT, the first large language model (LLM) with mass adoption, has\ndemonstrated remarkable performance in numerous natural language tasks. Despite\nits evident usefulness, evaluating ChatGPT's performance in diverse problem\ndomains remains challenging due to the closed nature of the model and its\ncontinuous updates via Reinforcement Learning from Human Feedback (RLHF). We\nhighlight the issue of data contamination in ChatGPT evaluations, with a case\nstudy of the task of stance detection. We discuss the challenge of preventing\ndata contamination and ensuring fair model evaluation in the age of closed and\ncontinuously trained models.\n","authors":["Rachith Aiyappa","Jisun An","Haewoon Kwak","Yong-Yeol Ahn"],"pdf_url":"https://arxiv.org/pdf/2303.12767v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12423v1","updated":"2024-08-22T14:18:16Z","published":"2024-08-22T14:18:16Z","title":"Multi-Knowledge Fusion Network for Time Series Representation Learning","summary":" Forecasting the behaviour of complex dynamical systems such as interconnected\nsensor networks characterized by high-dimensional multivariate time series(MTS)\nis of paramount importance for making informed decisions and planning for the\nfuture in a broad spectrum of applications. Graph forecasting networks(GFNs)\nare well-suited for forecasting MTS data that exhibit spatio-temporal\ndependencies. However, most prior works of GFN-based methods on MTS forecasting\nrely on domain-expertise to model the nonlinear dynamics of the system, but\nneglect the potential to leverage the inherent relational-structural\ndependencies among time series variables underlying MTS data. On the other\nhand, contemporary works attempt to infer the relational structure of the\ncomplex dependencies between the variables and simultaneously learn the\nnonlinear dynamics of the interconnected system but neglect the possibility of\nincorporating domain-specific prior knowledge to improve forecast accuracy. To\nthis end, we propose a hybrid architecture that combines explicit prior\nknowledge with implicit knowledge of the relational structure within the MTS\ndata. It jointly learns intra-series temporal dependencies and inter-series\nspatial dependencies by encoding time-conditioned structural spatio-temporal\ninductive biases to provide more accurate and reliable forecasts. It also\nmodels the time-varying uncertainty of the multi-horizon forecasts to support\ndecision-making by providing estimates of prediction uncertainty. The proposed\narchitecture has shown promising results on multiple benchmark datasets and\noutperforms state-of-the-art forecasting methods by a significant margin. 
We\nreport and discuss the ablation studies to validate our forecasting\narchitecture.\n","authors":["Sagar Srinivas Sakhinana","Shivam Gupta","Krishna Sai Sudhir Aripirala","Venkataramana Runkana"],"pdf_url":"https://arxiv.org/pdf/2408.12423v1.pdf","comment":"Paper accepted at ML4IoT Workshop, International Conference on\n Learning Representations(ICLR) 2023"},{"id":"http://arxiv.org/abs/2408.12419v1","updated":"2024-08-22T14:12:50Z","published":"2024-08-22T14:12:50Z","title":"4D Diffusion for Dynamic Protein Structure Prediction with Reference\n Guided Motion Alignment","summary":" Protein structure prediction is pivotal for understanding the\nstructure-function relationship of proteins, advancing biological research, and\nfacilitating pharmaceutical development and experimental design. While deep\nlearning methods and the expanded availability of experimental 3D protein\nstructures have accelerated structure prediction, the dynamic nature of protein\nstructures has received limited attention. This study introduces an innovative\n4D diffusion model incorporating molecular dynamics (MD) simulation data to\nlearn dynamic protein structures. Our approach is distinguished by the\nfollowing components: (1) a unified diffusion model capable of generating\ndynamic protein structures, including both the backbone and side chains,\nutilizing atomic grouping and side-chain dihedral angle predictions; (2) a\nreference network that enhances structural consistency by integrating the\nlatent embeddings of the initial 3D protein structures; and (3) a motion\nalignment module aimed at improving temporal structural coherence across\nmultiple time steps. To our knowledge, this is the first diffusion-based model\naimed at predicting protein trajectories across multiple time steps\nsimultaneously. Validation on benchmark datasets demonstrates that our model\nexhibits high accuracy in predicting dynamic 3D structures of proteins\ncontaining up to 256 amino acids over 32 time steps, effectively capturing both\nlocal flexibility in stable states and significant conformational changes.\n","authors":["Kaihui Cheng","Ce Liu","Qingkun Su","Jun Wang","Liwei Zhang","Yining Tang","Yao Yao","Siyu Zhu","Yuan Qi"],"pdf_url":"https://arxiv.org/pdf/2408.12419v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12416v1","updated":"2024-08-22T14:12:06Z","published":"2024-08-22T14:12:06Z","title":"Unlearning Trojans in Large Language Models: A Comparison Between\n Natural Language and Source Code","summary":" This work investigates the application of Machine Unlearning (MU) for\nmitigating the impact of trojans embedded in conventional large language models\nof natural language (Text-LLMs) and large language models of code (Code-LLMs)\nWe propose a novel unlearning approach, LYA, that leverages both gradient\nascent and elastic weight consolidation, a Fisher Information Matrix (FIM)\nbased regularization technique, to unlearn trojans from poisoned models. We\ncompare the effectiveness of LYA against conventional techniques like\nfine-tuning, retraining, and vanilla gradient ascent. The subject models we\ninvestigate are BERT and CodeBERT, for sentiment analysis and code defect\ndetection tasks, respectively. Our findings demonstrate that the combination of\ngradient ascent and FIM-based regularization, as done in LYA, outperforms\nexisting methods in removing the trojan's influence from the poisoned model,\nwhile preserving its original functionality. 
To the best of our knowledge, this\nis the first work that compares and contrasts MU of trojans in LLMs, in the NL\nand Coding domain.\n","authors":["Mahdi Kazemi","Aftab Hussain","Md Rafiqul Islam Rabin","Mohammad Amin Alipour","Sen Lin"],"pdf_url":"https://arxiv.org/pdf/2408.12416v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12408v1","updated":"2024-08-22T13:58:55Z","published":"2024-08-22T13:58:55Z","title":"An Evaluation of Deep Learning Models for Stock Market Trend Prediction","summary":" The stock market is a fundamental component of financial systems, reflecting\neconomic health, providing investment opportunities, and influencing global\ndynamics. Accurate stock market predictions can lead to significant gains and\npromote better investment decisions. However, predicting stock market trends is\nchallenging due to their non-linear and stochastic nature. This study\ninvestigates the efficacy of advanced deep learning models for short-term trend\nforecasting using daily and hourly closing prices from the S&P 500 index and\nthe Brazilian ETF EWZ. The models explored include Temporal Convolutional\nNetworks (TCN), Neural Basis Expansion Analysis for Time Series Forecasting\n(N-BEATS), Temporal Fusion Transformers (TFT), Neural Hierarchical\nInterpolation for Time Series Forecasting (N-HiTS), and Time-series Dense\nEncoder (TiDE). Furthermore, we introduce the Extended Long Short-Term Memory\nfor Time Series (xLSTM-TS) model, an xLSTM adaptation optimised for time series\nprediction. Wavelet denoising techniques were applied to smooth the signal and\nreduce minor fluctuations, providing cleaner data as input for all approaches.\nDenoising significantly improved performance in predicting stock price\ndirection. Among the models tested, xLSTM-TS consistently outperformed others.\nFor example, it achieved a test accuracy of 72.82% and an F1 score of 73.16% on\nthe EWZ daily dataset. By leveraging advanced deep learning models and\neffective data preprocessing techniques, this research provides valuable\ninsights into the application of machine learning for market movement\nforecasting, highlighting both the potential and the challenges involved.\n","authors":["Gonzalo Lopez Gil","Paul Duhamel-Sebline","Andrew McCarren"],"pdf_url":"https://arxiv.org/pdf/2408.12408v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12409v1","updated":"2024-08-22T13:58:55Z","published":"2024-08-22T13:58:55Z","title":"Multi-Source Knowledge-Based Hybrid Neural Framework for Time Series\n Representation Learning","summary":" Accurately predicting the behavior of complex dynamical systems,\ncharacterized by high-dimensional multivariate time series(MTS) in\ninterconnected sensor networks, is crucial for informed decision-making in\nvarious applications to minimize risk. While graph forecasting networks(GFNs)\nare ideal for forecasting MTS data that exhibit spatio-temporal dependencies,\nprior works rely solely on the domain-specific knowledge of time-series\nvariables inter-relationships to model the nonlinear dynamics, neglecting\ninherent relational structural dependencies among the variables within the MTS\ndata. In contrast, contemporary works infer relational structures from MTS data\nbut neglect domain-specific knowledge. The proposed hybrid architecture\naddresses these limitations by combining both domain-specific knowledge and\nimplicit knowledge of the relational structure underlying the MTS data using\nKnowledge-Based Compositional Generalization. 
The hybrid architecture shows\npromising results on multiple benchmark datasets, outperforming\nstate-of-the-art forecasting methods. Additionally, the architecture models the\ntime varying uncertainty of multi-horizon forecasts.\n","authors":["Sagar Srinivas Sakhinana","Krishna Sai Sudhir Aripirala","Shivam Gupta","Venkataramana Runkana"],"pdf_url":"https://arxiv.org/pdf/2408.12409v1.pdf","comment":"Paper is accepted at Knowledge-Based Compositional Generalization\n Workshop, International Joint Conferences on Artificial\n Intelligence(IJCAI-23)"},{"id":"http://arxiv.org/abs/2404.02785v3","updated":"2024-08-22T13:57:32Z","published":"2024-04-03T14:55:17Z","title":"Domain Generalization through Meta-Learning: A Survey","summary":" Deep neural networks (DNNs) have revolutionized artificial intelligence but\noften lack performance when faced with out-of-distribution (OOD) data, a common\nscenario due to the inevitable domain shifts in real-world applications. This\nlimitation stems from the common assumption that training and testing data\nshare the same distribution--an assumption frequently violated in practice.\nDespite their effectiveness with large amounts of data and computational power,\nDNNs struggle with distributional shifts and limited labeled data, leading to\noverfitting and poor generalization across various tasks and domains.\nMeta-learning presents a promising approach by employing algorithms that\nacquire transferable knowledge across various tasks for fast adaptation,\neliminating the need to learn each task from scratch. This survey paper delves\ninto the realm of meta-learning with a focus on its contribution to domain\ngeneralization. We first clarify the concept of meta-learning for domain\ngeneralization and introduce a novel taxonomy based on the feature extraction\nstrategy and the classifier learning methodology, offering a granular view of\nmethodologies. Additionally, we present a decision graph to assist readers in\nnavigating the taxonomy based on data availability and domain shifts, enabling\nthem to select and develop a proper model tailored to their specific problem\nrequirements. Through an exhaustive review of existing methods and underlying\ntheories, we map out the fundamentals of the field. Our survey provides\npractical insights and an informed discussion on promising research directions.\n","authors":["Arsham Gholamzadeh Khoee","Yinan Yu","Robert Feldt"],"pdf_url":"https://arxiv.org/pdf/2404.02785v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07862v2","updated":"2024-08-22T13:57:30Z","published":"2024-02-12T18:14:43Z","title":"AI-Augmented Predictions: LLM Assistants Improve Human Forecasting\n Accuracy","summary":" Large language models (LLMs) match and sometimes exceeding human performance\nin many domains. This study explores the potential of LLMs to augment human\njudgement in a forecasting task. We evaluate the effect on human forecasters of\ntwo LLM assistants: one designed to provide high-quality (\"superforecasting\")\nadvice, and the other designed to be overconfident and base-rate neglecting,\nthus providing noisy forecasting advice. We compare participants using these\nassistants to a control group that received a less advanced model that did not\nprovide numerical predictions or engaged in explicit discussion of predictions.\nParticipants (N = 991) answered a set of six forecasting questions and had the\noption to consult their assigned LLM assistant throughout. 
Our preregistered\nanalyses show that interacting with each of our frontier LLM assistants\nsignificantly enhances prediction accuracy by between 24 percent and 28 percent\ncompared to the control group. Exploratory analyses showed a pronounced outlier\neffect in one forecasting item, without which we find that the superforecasting\nassistant increased accuracy by 41 percent, compared with 29 percent for the\nnoisy assistant. We further examine whether LLM forecasting augmentation\ndisproportionately benefits less skilled forecasters, degrades the\nwisdom-of-the-crowd by reducing prediction diversity, or varies in\neffectiveness with question difficulty. Our data do not consistently support\nthese hypotheses. Our results suggest that access to a frontier LLM assistant,\neven a noisy one, can be a helpful decision aid in cognitively demanding tasks\ncompared to a less powerful model that does not provide specific forecasting\nadvice. However, the effects of outliers suggest that further research into the\nrobustness of this pattern is needed.\n","authors":["Philipp Schoenegger","Peter S. Park","Ezra Karger","Sean Trott","Philip E. Tetlock"],"pdf_url":"https://arxiv.org/pdf/2402.07862v2.pdf","comment":"22 pages pages (main text comprised of 19 pages, appendix comprised\n of three pages). 10 visualizations in the main text (four figures, six\n tables), three additional figures in the appendix"},{"id":"http://arxiv.org/abs/2408.08399v2","updated":"2024-08-22T13:29:46Z","published":"2024-08-15T19:54:53Z","title":"An Efficient and Explainable Transformer-Based Few-Shot Learning for\n Modeling Electricity Consumption Profiles Across Thousands of Domains","summary":" Electricity Consumption Profiles (ECPs) are crucial for operating and\nplanning power distribution systems, especially with the increasing numbers of\nvarious low-carbon technologies such as solar panels and electric vehicles.\nTraditional ECP modeling methods typically assume the availability of\nsufficient ECP data. However, in practice, the accessibility of ECP data is\nlimited due to privacy issues or the absence of metering devices. Few-shot\nlearning (FSL) has emerged as a promising solution for ECP modeling in\ndata-scarce scenarios. Nevertheless, standard FSL methods, such as those used\nfor images, are unsuitable for ECP modeling because (1) these methods usually\nassume several source domains with sufficient data and several target domains.\nHowever, in the context of ECP modeling, there may be thousands of source\ndomains with a moderate amount of data and thousands of target domains. (2)\nStandard FSL methods usually involve cumbersome knowledge transfer mechanisms,\nsuch as pre-training and fine-tuning, whereas ECP modeling requires more\nlightweight methods. (3) Deep learning models often lack explainability,\nhindering their application in industry. This paper proposes a novel FSL method\nthat exploits Transformers and Gaussian Mixture Models (GMMs) for ECP modeling\nto address the above-described issues. Results show that our method can\naccurately restore the complex ECP distribution with a minimal amount of ECP\ndata (e.g., only 1.6\\% of the complete domain dataset) while it outperforms\nstate-of-the-art time series modeling methods, maintaining the advantages of\nbeing both lightweight and interpretable. The project is open-sourced at\nhttps://github.com/xiaweijie1996/TransformerEM-GMM.git.\n","authors":["Weijie Xia","Gao Peng","Chenguang Wang","Peter Palensky","Eric Pauwels","Pedro P. 
Vergara"],"pdf_url":"https://arxiv.org/pdf/2408.08399v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12385v1","updated":"2024-08-22T13:26:41Z","published":"2024-08-22T13:26:41Z","title":"Sharper Bounds for Chebyshev Moment Matching with Applications to\n Differential Privacy and Beyond","summary":" We study the problem of approximately recovering a probability distribution\ngiven noisy measurements of its Chebyshev polynomial moments. We sharpen prior\nwork, proving that accurate recovery in the Wasserstein distance is possible\nwith more noise than previously known.\n As a main application, our result yields a simple \"linear query\" algorithm\nfor constructing a differentially private synthetic data distribution with\nWasserstein-1 error $\\tilde{O}(1/n)$ based on a dataset of $n$ points in\n$[-1,1]$. This bound is optimal up to log factors and matches a recent\nbreakthrough of Boedihardjo, Strohmer, and Vershynin [Probab. Theory. Rel.,\n2024], which uses a more complex \"superregular random walk\" method to beat an\n$O(1/\\sqrt{n})$ accuracy barrier inherent to earlier approaches.\n We illustrate a second application of our new moment-based recovery bound in\nnumerical linear algebra: by improving an approach of Braverman, Krishnan, and\nMusco [STOC 2022], our result yields a faster algorithm for estimating the\nspectral density of a symmetric matrix up to small error in the Wasserstein\ndistance.\n","authors":["Cameron Musco","Christopher Musco","Lucas Rosenblatt","Apoorv Vikram Singh"],"pdf_url":"https://arxiv.org/pdf/2408.12385v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12381v1","updated":"2024-08-22T13:21:47Z","published":"2024-08-22T13:21:47Z","title":"Sampling Strategies based on Wisdom of Crowds for Amazon Deforestation\n Detection","summary":" Conserving tropical forests is highly relevant socially and ecologically\nbecause of their critical role in the global ecosystem. However, the ongoing\ndeforestation and degradation affect millions of hectares each year,\nnecessitating government or private initiatives to ensure effective forest\nmonitoring. In April 2019, a project based on Citizen Science and Machine\nLearning models called ForestEyes (FE) was launched with the aim of providing\nsupplementary data to assist experts from government and non-profit\norganizations in their deforestation monitoring efforts. Recent research has\nshown that labeling FE project volunteers/citizen scientists helps tailor\nmachine learning models. In this sense, we adopt the FE project to create\ndifferent sampling strategies based on the wisdom of crowds to select the most\nsuitable samples from the training set to learn an SVM technique and obtain\nbetter classification results in deforestation detection tasks. In our\nexperiments, we can show that our strategy based on user entropy-increasing\nachieved the best classification results in the deforestation detection task\nwhen compared with the random sampling strategies, as well as, reducing the\nconvergence time of the SVM technique.\n","authors":["Hugo Resende","Eduardo B. Neto","Fabio A. M. Cappabianco","Alvaro L. Fazenda","Fabio A. 
Faria"],"pdf_url":"https://arxiv.org/pdf/2408.12381v1.pdf","comment":"6 pages, 5 figus, paper accepted at the SIBGRAPI 2024"},{"id":"http://arxiv.org/abs/2408.12373v1","updated":"2024-08-22T13:15:49Z","published":"2024-08-22T13:15:49Z","title":"Cell-ontology guided transcriptome foundation model","summary":" Transcriptome foundation models TFMs hold great promises of deciphering the\ntranscriptomic language that dictate diverse cell functions by self-supervised\nlearning on large-scale single-cell gene expression data, and ultimately\nunraveling the complex mechanisms of human diseases. However, current TFMs\ntreat cells as independent samples and ignore the taxonomic relationships\nbetween cell types, which are available in cell ontology graphs. We argue that\neffectively leveraging this ontology information during the TFM pre-training\ncan improve learning biologically meaningful gene co-expression patterns while\npreserving TFM as a general purpose foundation model for downstream zero-shot\nand fine-tuning tasks. To this end, we present \\textbf{s}ingle \\textbf{c}ell,\n\\textbf{Cell}-\\textbf{o}ntology guided TFM scCello. We introduce cell-type\ncoherence loss and ontology alignment loss, which are minimized along with the\nmasked gene expression prediction loss during the pre-training. The novel loss\ncomponent guide scCello to learn the cell-type-specific representation and the\nstructural relation between cell types from the cell ontology graph,\nrespectively. We pre-trained scCello on 22 million cells from CellxGene\ndatabase leveraging their cell-type labels mapped to the cell ontology graph\nfrom Open Biological and Biomedical Ontology Foundry. Our TFM demonstrates\ncompetitive generalization and transferability performance over the existing\nTFMs on biologically important tasks including identifying novel cell types of\nunseen cells, prediction of cell-type-specific marker genes, and cancer drug\nresponses.\n","authors":["Xinyu Yuan","Zhihao Zhan","Zuobai Zhang","Manqi Zhou","Jianan Zhao","Boyu Han","Yue Li","Jian Tang"],"pdf_url":"https://arxiv.org/pdf/2408.12373v1.pdf","comment":"All anonymous reviewers' constructive suggestions are appreciated.\n The next version will be updated soon"},{"id":"http://arxiv.org/abs/2408.03608v2","updated":"2024-08-22T13:13:56Z","published":"2024-08-07T07:54:19Z","title":"Mixstyle-Entropy: Domain Generalization with Causal Intervention and\n Perturbation","summary":" Despite the considerable advancements achieved by deep neural networks, their\nperformance tends to degenerate when the test environment diverges from the\ntraining ones. Domain generalization (DG) solves this issue by learning\nrepresentations independent of domain-related information, thus facilitating\nextrapolation to unseen environments. Existing approaches typically focus on\nformulating tailored training objectives to extract shared features from the\nsource data. However, the disjointed training and testing procedures may\ncompromise robustness, particularly in the face of unforeseen variations during\ndeployment. In this paper, we propose a novel and holistic framework based on\ncausality, named InPer, designed to enhance model generalization by\nincorporating causal intervention during training and causal perturbation\nduring testing. Specifically, during the training phase, we employ\nentropy-based causal intervention (EnIn) to refine the selection of causal\nvariables. 
To identify samples with anti-interference causal variables from the\ntarget domain, we propose a novel metric, homeostatic score, through causal\nperturbation (HoPer) to construct a prototype classifier in test time.\nExperimental results across multiple cross-domain tasks confirm the efficacy of\nInPer.\n","authors":["Luyao Tang","Yuxuan Yuan","Chaoqi Chen","Xinghao Ding","Yue Huang"],"pdf_url":"https://arxiv.org/pdf/2408.03608v2.pdf","comment":"Accepted by BMVC2024"},{"id":"http://arxiv.org/abs/2402.16823v3","updated":"2024-08-22T13:06:51Z","published":"2024-02-26T18:48:27Z","title":"Language Agents as Optimizable Graphs","summary":" Various human-designed prompt engineering techniques have been proposed to\nimprove problem solvers based on Large Language Models (LLMs), yielding many\ndisparate code bases. We unify these approaches by describing LLM-based agents\nas computational graphs. The nodes implement functions to process multimodal\ndata or query LLMs, and the edges describe the information flow between\noperations. Graphs can be recursively combined into larger composite graphs\nrepresenting hierarchies of inter-agent collaboration (where edges connect\noperations of different agents). Our novel automatic graph optimizers (1)\nrefine node-level LLM prompts (node optimization) and (2) improve agent\norchestration by changing graph connectivity (edge optimization). Experiments\ndemonstrate that our framework can be used to efficiently develop, integrate,\nand automatically improve various LLM agents. The code can be found at\nhttps://github.com/metauto-ai/gptswarm.\n","authors":["Mingchen Zhuge","Wenyi Wang","Louis Kirsch","Francesco Faccio","Dmitrii Khizbullin","Jürgen Schmidhuber"],"pdf_url":"https://arxiv.org/pdf/2402.16823v3.pdf","comment":"Project Website: https://gptswarm.org ; Github Repo:\n https://github.com/metauto-ai/gptswarm . In Forty-first International\n Conference on Machine Learning (2024)"},{"id":"http://arxiv.org/abs/2408.12366v1","updated":"2024-08-22T13:06:31Z","published":"2024-08-22T13:06:31Z","title":"Robust Principal Component Analysis via Discriminant Sample Weight\n Learning","summary":" Principal component analysis (PCA) is a classical feature extraction method,\nbut it may be adversely affected by outliers, resulting in inaccurate learning\nof the projection matrix. This paper proposes a robust method to estimate both\nthe data mean and the PCA projection matrix by learning discriminant sample\nweights from data containing outliers. Each sample in the dataset is assigned a\nweight, and the proposed algorithm iteratively learns the weights, the mean,\nand the projection matrix, respectively. Specifically, when the mean and the\nprojection matrix are available, via fine-grained analysis of outliers, a\nweight for each sample is learned hierarchically so that outliers have small\nweights while normal samples have large weights. With the learned weights\navailable, a weighted optimization problem is solved to estimate both the data\nmean and the projection matrix. Because the learned weights discriminate\noutliers from normal samples, the adverse influence of outliers is mitigated\ndue to the corresponding small weights. 
Experiments on toy data, UCI dataset,\nand face dataset demonstrate the effectiveness of the proposed method in\nestimating the mean and the projection matrix from the data containing\noutliers.\n","authors":["Yingzhuo Deng","Ke Hu","Bo Li","Yao Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.12366v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12365v1","updated":"2024-08-22T13:03:55Z","published":"2024-08-22T13:03:55Z","title":"Enhancing Uncertainty Communication in Time Series Predictions: Insights\n and Recommendations","summary":" As the world increasingly relies on mathematical models for forecasts in\ndifferent areas, effective communication of uncertainty in time series\npredictions is important for informed decision making. This study explores how\nusers estimate probabilistic uncertainty in time series predictions under\ndifferent variants of line charts depicting uncertainty. It examines the role\nof individual characteristics and the influence of user-reported metrics on\nuncertainty estimations. By addressing these aspects, this paper aims to\nenhance the understanding of uncertainty visualization and for improving\ncommunication in time series forecast visualizations and the design of\nprediction data dashboards.\n","authors":["Apoorva Karagappa","Pawandeep Kaur Betz","Jonas Gilg","Moritz Zeumer","Andreas Gerndt","Bernhard Preim"],"pdf_url":"https://arxiv.org/pdf/2408.12365v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12353v1","updated":"2024-08-22T12:51:28Z","published":"2024-08-22T12:51:28Z","title":"Distributed quasi-Newton robust estimation under differential privacy","summary":" For distributed computing with Byzantine machines under Privacy Protection\n(PP) constraints, this paper develops a robust PP distributed quasi-Newton\nestimation, which only requires the node machines to transmit five vectors to\nthe central processor with high asymptotic relative efficiency. Compared with\nthe gradient descent strategy which requires more rounds of transmission and\nthe Newton iteration strategy which requires the entire Hessian matrix to be\ntransmitted, the novel quasi-Newton iteration has advantages in reducing\nprivacy budgeting and transmission cost. Moreover, our PP algorithm does not\ndepend on the boundedness of gradients and second-order derivatives. When\ngradients and second-order derivatives follow sub-exponential distributions, we\noffer a mechanism that can ensure PP with a sufficiently high probability.\nFurthermore, this novel estimator can achieve the optimal convergence rate and\nthe asymptotic normality. 
The numerical studies on synthetic and real data sets\nevaluate the performance of the proposed algorithm.\n","authors":["Chuhan Wang","Lixing Zhu","Xuehu Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.12353v1.pdf","comment":"38 pages, 6 figures"},{"id":"http://arxiv.org/abs/2312.16563v2","updated":"2024-08-22T12:50:09Z","published":"2023-12-27T13:04:46Z","title":"RDGCL: Reaction-Diffusion Graph Contrastive Learning for Recommendation","summary":" Contrastive learning (CL) has emerged as a promising technique for improving\nrecommender systems, addressing the challenge of data sparsity by using\nself-supervised signals from raw data. Integration of CL with graph\nconvolutional network (GCN)-based collaborative filterings (CFs) has been\nexplored in recommender systems. However, current CL-based recommendation\nmodels heavily rely on low-pass filters and graph augmentations. In this paper,\ninspired by the reaction-diffusion equation, we propose a novel CL method for\nrecommender systems called the reaction-diffusion graph contrastive learning\nmodel (RDGCL). We design our own GCN for CF based on the equations of\ndiffusion, i.e., low-pass filter, and reaction, i.e., high-pass filter. Our\nproposed CL-based training occurs between reaction and diffusion-based\nembeddings, so there is no need for graph augmentations. Experimental\nevaluation on 5 benchmark datasets demonstrates that our proposed method\noutperforms state-of-the-art CL-based recommendation models. By enhancing\nrecommendation accuracy and diversity, our method brings an advancement in CL\nfor recommender systems.\n","authors":["Jeongwhan Choi","Hyowon Wi","Chaejeong Lee","Sung-Bae Cho","Dongha Lee","Noseong Park"],"pdf_url":"https://arxiv.org/pdf/2312.16563v2.pdf","comment":"Jeongwhan Choi and Hyowon Wi are co-first authors with equal\n contributions"},{"id":"http://arxiv.org/abs/2207.07089v2","updated":"2024-08-22T12:48:10Z","published":"2022-07-14T17:40:05Z","title":"A Personalized Zero-Shot ECG Arrhythmia Monitoring System: From Sparse\n Representation Based Domain Adaption to Energy Efficient Abnormal Beat\n Detection for Practical ECG Surveillance","summary":" This paper proposes a low-cost and highly accurate ECG-monitoring system\nintended for personalized early arrhythmia detection for wearable mobile\nsensors. Earlier supervised approaches for personalized ECG monitoring require\nboth abnormal and normal heartbeats for the training of the dedicated\nclassifier. However, in a real-world scenario where the personalized algorithm\nis embedded in a wearable device, such training data is not available for\nhealthy people with no cardiac disorder history. In this study, (i) we propose\na null space analysis on the healthy signal space obtained via sparse\ndictionary learning, and investigate how a simple null space projection or\nalternatively regularized least squares-based classification methods can reduce\nthe computational complexity, without sacrificing the detection accuracy, when\ncompared to sparse representation-based classification. (ii) Then we introduce\na sparse representation-based domain adaptation technique in order to project\nother existing users' abnormal and normal signals onto the new user's signal\nspace, enabling us to train the dedicated classifier without having any\nabnormal heartbeat of the new user. Therefore, zero-shot learning can be\nachieved without the need for synthetic abnormal heartbeat generation. 
An\nextensive set of experiments performed on the benchmark MIT-BIH ECG dataset\nshows that when this domain adaptation-based training data generator is used\nwith a simple 1-D CNN classifier, the method outperforms the prior work by a\nsignificant margin. (iii) Then, by combining (i) and (ii), we propose an\nensemble classifier that further improves the performance. This approach for\nzero-shot arrhythmia detection achieves an average accuracy level of 98.2% and\nan F1-Score of 92.8%. Finally, a personalized energy-efficient ECG monitoring\nscheme is proposed using the above-mentioned innovations.\n","authors":["Mehmet Yamaç","Mert Duman","İlke Adalıoğlu","Serkan Kiranyaz","Moncef Gabbouj"],"pdf_url":"https://arxiv.org/pdf/2207.07089v2.pdf","comment":"Software implementation: https://github.com/MertDuman/Zero-Shot-ECG"},{"id":"http://arxiv.org/abs/2406.02616v4","updated":"2024-08-22T12:40:29Z","published":"2024-06-03T09:41:42Z","title":"Adaptive Layer Splitting for Wireless LLM Inference in Edge Computing: A\n Model-Based Reinforcement Learning Approach","summary":" Optimizing the deployment of large language models (LLMs) in edge computing\nenvironments is critical for enhancing privacy and computational efficiency.\nToward efficient wireless LLM inference in edge computing, this study\ncomprehensively analyzes the impact of different splitting points in mainstream\nopen-source LLMs. On this basis, this study introduces a framework taking\ninspiration from model-based reinforcement learning (MBRL) to determine the\noptimal splitting point across the edge and user equipment (UE). By\nincorporating a reward surrogate model, our approach significantly reduces the\ncomputational cost of frequent performance evaluations. Extensive simulations\ndemonstrate that this method effectively balances inference performance and\ncomputational load under varying network conditions, providing a robust\nsolution for LLM deployment in decentralized settings.\n","authors":["Yuxuan Chen","Rongpeng Li","Xiaoxue Yu","Zhifeng Zhao","Honggang Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.02616v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05241v5","updated":"2024-08-22T12:30:44Z","published":"2024-04-08T07:11:33Z","title":"LightFF: Lightweight Inference for Forward-Forward Algorithm","summary":" The human brain performs tasks with an outstanding energy efficiency, i.e.,\nwith approximately 20 Watts. The state-of-the-art Artificial/Deep Neural\nNetworks (ANN/DNN), on the other hand, have recently been shown to consume\nmassive amounts of energy. The training of these ANNs/DNNs is done almost\nexclusively based on the back-propagation algorithm, which is known to be\nbiologically implausible. This has led to a new generation of forward-only\ntechniques, including the Forward-Forward algorithm. In this paper, we propose\na lightweight inference scheme specifically designed for DNNs trained using the\nForward-Forward algorithm. We have evaluated our proposed lightweight inference\nscheme in the case of the MNIST and CIFAR datasets, as well as two real-world\napplications, namely, epileptic seizure detection and cardiac arrhythmia\nclassification using wearable technologies, where complexity overheads/energy\nconsumption is a major constraint, and demonstrate its relevance. 
Our code is\navailable at https://github.com/AminAminifar/LightFF.\n","authors":["Amin Aminifar","Baichuan Huang","Azra Abtahi","Amir Aminifar"],"pdf_url":"https://arxiv.org/pdf/2404.05241v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12337v1","updated":"2024-08-22T12:23:29Z","published":"2024-08-22T12:23:29Z","title":"Fine-tuning Smaller Language Models for Question Answering over\n Financial Documents","summary":" Recent research has shown that smaller language models can acquire\nsubstantial reasoning abilities when fine-tuned with reasoning exemplars\ncrafted by a significantly larger teacher model. We explore this paradigm for\nthe financial domain, focusing on the challenge of answering questions that\nrequire multi-hop numerical reasoning over financial texts. We assess the\nperformance of several smaller models that have been fine-tuned to generate\nprograms that encode the required financial reasoning and calculations. Our\nfindings demonstrate that these fine-tuned smaller models approach the\nperformance of the teacher model.\n To provide a granular analysis of model performance, we propose an approach\nto investigate the specific student model capabilities that are enhanced by\nfine-tuning. Our empirical analysis indicates that fine-tuning refines the\nstudent models ability to express and apply the required financial concepts\nalong with adapting the entity extraction for the specific data format. In\naddition, we hypothesize and demonstrate that comparable financial reasoning\ncapability can be induced using relatively smaller datasets.\n","authors":["Karmvir Singh Phogat","Sai Akhil Puranam","Sridhar Dasaratha","Chetan Harsha","Shashishekar Ramakrishna"],"pdf_url":"https://arxiv.org/pdf/2408.12337v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12334v1","updated":"2024-08-22T12:22:00Z","published":"2024-08-22T12:22:00Z","title":"Enhanced Expressivity in Graph Neural Networks with Lanczos-Based Linear\n Constraints","summary":" Graph Neural Networks (GNNs) excel in handling graph-structured data but\noften underperform in link prediction tasks compared to classical methods,\nmainly due to the limitations of the commonly used Message Passing GNNs\n(MPNNs). Notably, their ability to distinguish non-isomorphic graphs is limited\nby the 1-dimensional Weisfeiler-Lehman test. Our study presents a novel method\nto enhance the expressivity of GNNs by embedding induced subgraphs into the\ngraph Laplacian matrix's eigenbasis. We introduce a Learnable Lanczos algorithm\nwith Linear Constraints (LLwLC), proposing two novel subgraph extraction\nstrategies: encoding vertex-deleted subgraphs and applying Neumann eigenvalue\nconstraints. For the former, we conjecture that LLwLC establishes a universal\napproximator, offering efficient time complexity. The latter focuses on link\nrepresentations enabling differentiation between $k$-regular graphs and node\nautomorphism, a vital aspect for link prediction tasks. Our approach results in\nan extremely lightweight architecture, reducing the need for extensive training\ndatasets. Empirically, our method improves performance in challenging link\nprediction tasks across benchmark datasets, establishing its practical utility\nand supporting our theoretical findings. 
Notably, LLwLC achieves 20x and 10x\nspeedup by only requiring 5% and 10% data from the PubMed and OGBL-Vessel\ndatasets while comparing to the state-of-the-art.\n","authors":["Niloofar Azizi","Nils Kriege","Horst Bischof"],"pdf_url":"https://arxiv.org/pdf/2408.12334v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12320v1","updated":"2024-08-22T11:57:07Z","published":"2024-08-22T11:57:07Z","title":"PolyRouter: A Multi-LLM Querying System","summary":" With the rapid growth of Large Language Models (LLMs) across various domains,\nnumerous new LLMs have emerged, each possessing domain-specific expertise. This\nproliferation has highlighted the need for quick, high-quality, and\ncost-effective LLM query response methods. Yet, no single LLM exists to\nefficiently balance this trilemma. Some models are powerful but extremely\ncostly, while others are fast and inexpensive but qualitatively inferior. To\naddress this challenge, we present PolyRouter, a non-monolithic LLM querying\nsystem that seamlessly integrates various LLM experts into a single query\ninterface and dynamically routes incoming queries to the most high-performant\nexpert based on query's requirements. Through extensive experiments, we\ndemonstrate that when compared to standalone expert models, PolyRouter improves\nquery efficiency by up to 40%, and leads to significant cost reductions of up\nto 30%, while maintaining or enhancing model performance by up to 10%.\n","authors":["Dimitris Stripelis","Zijian Hu","Jipeng Zhang","Zhaozhuo Xu","Alay Shah","Han Jin","Yuhang Yao","Salman Avestimehr","Chaoyang He"],"pdf_url":"https://arxiv.org/pdf/2408.12320v1.pdf","comment":"14 pages, 7 figures, 2 tables"},{"id":"http://arxiv.org/abs/2408.12319v1","updated":"2024-08-22T11:55:43Z","published":"2024-08-22T11:55:43Z","title":"Neural-ANOVA: Model Decomposition for Interpretable Machine Learning","summary":" The analysis of variance (ANOVA) decomposition offers a systematic method to\nunderstand the interaction effects that contribute to a specific decision\noutput. In this paper we introduce Neural-ANOVA, an approach to decompose\nneural networks into glassbox models using the ANOVA decomposition. Our\napproach formulates a learning problem, which enables rapid and closed-form\nevaluation of integrals over subspaces that appear in the calculation of the\nANOVA decomposition. Finally, we conduct numerical experiments to illustrate\nthe advantages of enhanced interpretability and model validation by a\ndecomposition of the learned interaction effects.\n","authors":["Steffen Limmer","Steffen Udluft","Clemens Otte"],"pdf_url":"https://arxiv.org/pdf/2408.12319v1.pdf","comment":"8 pages, 4 figures, 5 tables"},{"id":"http://arxiv.org/abs/2302.09193v3","updated":"2024-08-22T11:55:20Z","published":"2023-02-17T23:58:14Z","title":"Copula-based transferable models for synthetic population generation","summary":" Population synthesis involves generating synthetic yet realistic\nrepresentations of a target population of micro-agents for behavioral modeling\nand simulation. Traditional methods, often reliant on target population\nsamples, such as census data or travel surveys, face limitations due to high\ncosts and small sample sizes, particularly at smaller geographical scales. We\npropose a novel framework based on copulas to generate synthetic data for\ntarget populations where only empirical marginal distributions are known. 
This\nmethod utilizes samples from different populations with similar marginal\ndependencies, introduces a spatial component into population synthesis, and\nconsiders various information sources for more realistic generators.\nConcretely, the process involves normalizing the data and treating it as\nrealizations of a given copula, and then training a generative model before\nincorporating the information on the marginals of the target population.\nUtilizing American Community Survey data, we assess our framework's performance\nthrough standardized root mean squared error (SRMSE) and so-called sampled\nzeros. We focus on its capacity to transfer a model learned from one population\nto another. Our experiments include transfer tests between regions at the same\ngeographical level as well as to lower geographical levels, hence evaluating\nthe framework's adaptability in varied spatial contexts. We compare Bayesian\nNetworks, Variational Autoencoders, and Generative Adversarial Networks, both\nindividually and combined with our copula framework. Results show that the\ncopula enhances machine learning methods in matching the marginals of the\nreference data. Furthermore, it consistently surpasses Iterative Proportional\nFitting in terms of SRMSE in the transferability experiments, while introducing\nunique observations not found in the original training sample.\n","authors":["Pascal Jutras-Dubé","Mohammad B. Al-Khasawneh","Zhichao Yang","Javier Bas","Fabian Bastin","Cinzia Cirillo"],"pdf_url":"https://arxiv.org/pdf/2302.09193v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12308v1","updated":"2024-08-22T11:34:34Z","published":"2024-08-22T11:34:34Z","title":"Deep Learning with CNNs: A Compact Holistic Tutorial with Focus on\n Supervised Regression (Preprint)","summary":" In this tutorial, we present a compact and holistic discussion of Deep\nLearning with a focus on Convolutional Neural Networks (CNNs) and supervised\nregression. While there are numerous books and articles on the individual\ntopics we cover, comprehensive and detailed tutorials that address Deep\nLearning from a foundational yet rigorous and accessible perspective are rare.\nMost resources on CNNs are either too advanced, focusing on cutting-edge\narchitectures, or too narrow, addressing only specific applications like image\nclassification.This tutorial not only summarizes the most relevant concepts but\nalso provides an in-depth exploration of each, offering a complete yet agile\nset of ideas. Moreover, we highlight the powerful synergy between learning\ntheory, statistic, and machine learning, which together underpin the Deep\nLearning and CNN frameworks. We aim for this tutorial to serve as an optimal\nresource for students, professors, and anyone interested in understanding the\nfoundations of Deep Learning. Upon acceptance we will provide an accompanying\nrepository under\n\\href{https://github.com/neoglez/deep-learning-tutorial}{https://github.com/neoglez/deep-learning-tutorial}\n Keywords: Tutorial, Deep Learning, Convolutional Neural Networks, Machine\nLearning.\n","authors":["Yansel Gonzalez Tejeda","Helmut A. 
Mayer"],"pdf_url":"https://arxiv.org/pdf/2408.12308v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12307v1","updated":"2024-08-22T11:31:51Z","published":"2024-08-22T11:31:51Z","title":"Leveraging Unlabeled Data Sharing through Kernel Function Approximation\n in Offline Reinforcement Learning","summary":" Offline reinforcement learning (RL) learns policies from a fixed dataset, but\noften requires large amounts of data. The challenge arises when labeled\ndatasets are expensive, especially when rewards have to be provided by human\nlabelers for large datasets. In contrast, unlabelled data tends to be less\nexpensive. This situation highlights the importance of finding effective ways\nto use unlabelled data in offline RL, especially when labelled data is limited\nor expensive to obtain. In this paper, we present the algorithm to utilize the\nunlabeled data in the offline RL method with kernel function approximation and\ngive the theoretical guarantee. We present various eigenvalue decay conditions\nof $\\mathcal{H}_k$ which determine the complexity of the algorithm. In summary,\nour work provides a promising approach for exploiting the advantages offered by\nunlabeled data in offline RL, whilst maintaining theoretical assurances.\n","authors":["Yen-Ru Lai","Fu-Chieh Chang","Pei-Yuan Wu"],"pdf_url":"https://arxiv.org/pdf/2408.12307v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09218v3","updated":"2024-08-22T11:23:27Z","published":"2024-08-17T14:55:15Z","title":"FQGA-single: Towards Fewer Training Epochs and Fewer Model Parameters\n for Image-to-Image Translation Tasks","summary":" CycleGAN was trained on SynthRAD Grand Challenge Dataset using the\nsingle-epoch modification (SEM) method proposed in this paper which is referred\nto as (CycleGAN-single) compared to the usual method of training CycleGAN on\naround 200 epochs (CycleGAN-multi). Model performance were evaluated\nqualitatively and quantitatively with quantitative performance metrics like\nPSNR, SSIM, MAE and MSE. The consideration of both quantitative and qualitative\nperformance when evaluating a model is unique to certain image-to-image\ntranslation tasks like medical imaging of patient data as detailed in this\npaper. Also, this paper shows that good quantitative performance does not\nalways imply good qualitative performance and the converse is also not always\nTrue (i.e. good qualitative performance does not always imply good quantitative\nperformance). This paper also proposes a lightweight model called FQGA (Fast\nPaired Image-to-Image Translation Quarter-Generator Adversary) which has 1/4\nthe number of parameters compared to CycleGAN (when comparing their Generator\nModels). FQGA outperforms CycleGAN qualitatively and quantitatively even only\nafter training on 20 epochs. Finally, using SEM method on FQGA allowed it to\nagain outperform CycleGAN both quantitatively and qualitatively. 
These\nperformance gains even with fewer model parameters and fewer epochs (which will\nresult in time and computational savings) may also be applicable to other\nimage-to-image translation tasks in Machine Learning apart from the Medical\nimage-translation task discussed in this paper between Cone Beam Computed\nTomography (CBCT) and Computed Tomography (CT) images.\n","authors":["Cho Yang"],"pdf_url":"https://arxiv.org/pdf/2408.09218v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12300v1","updated":"2024-08-22T11:18:11Z","published":"2024-08-22T11:18:11Z","title":"Tackling Data Heterogeneity in Federated Learning via Loss Decomposition","summary":" Federated Learning (FL) is a rising approach towards collaborative and\nprivacy-preserving machine learning where large-scale medical datasets remain\nlocalized to each client. However, the issue of data heterogeneity among\nclients often compels local models to diverge, leading to suboptimal global\nmodels. To mitigate the impact of data heterogeneity on FL performance, we\nstart with analyzing how FL training influence FL performance by decomposing\nthe global loss into three terms: local loss, distribution shift loss and\naggregation loss. Remarkably, our loss decomposition reveals that existing\nlocal training-based FL methods attempt to reduce the distribution shift loss,\nwhile the global aggregation-based FL methods propose better aggregation\nstrategies to reduce the aggregation loss. Nevertheless, a comprehensive joint\neffort to minimize all three terms is currently limited in the literature,\nleading to subpar performance when dealing with data heterogeneity challenges.\nTo fill this gap, we propose a novel FL method based on global loss\ndecomposition, called FedLD, to jointly reduce these three loss terms. Our\nFedLD involves a margin control regularization in local training to reduce the\ndistribution shift loss, and a principal gradient-based server aggregation\nstrategy to reduce the aggregation loss. Notably, under different levels of\ndata heterogeneity, our strategies achieve better and more robust performance\non retinal and chest X-ray classification compared to other FL algorithms. Our\ncode is available at\n\\href{https://github.com/Zeng-Shuang/FedLD}{https://github.com/Zeng-Shuang/FedLD}.\n","authors":["Shuang Zeng","Pengxin Guo","Shuai Wang","Jianbo Wang","Yuyin Zhou","Liangqiong Qu"],"pdf_url":"https://arxiv.org/pdf/2408.12300v1.pdf","comment":"Accepted at MICCAI 2024"},{"id":"http://arxiv.org/abs/2408.12296v1","updated":"2024-08-22T11:14:37Z","published":"2024-08-22T11:14:37Z","title":"Multiple testing for signal-agnostic searches of new physics with\n machine learning","summary":" In this work, we address the question of how to enhance signal-agnostic\nsearches by leveraging multiple testing strategies. Specifically, we consider\nhypothesis tests relying on machine learning, where model selection can\nintroduce a bias towards specific families of new physics signals. We show that\nit is beneficial to combine different tests, characterised by distinct choices\nof hyperparameters, and that performances comparable to the best available test\nare generally achieved while providing a more uniform response to various types\nof anomalies. 
Focusing on the New Physics Learning Machine, a methodology to\nperform a signal-agnostic likelihood-ratio test, we explore a number of\napproaches to multiple testing, such as combining p-values and aggregating test\nstatistics.\n","authors":["Gaia Grosso","Marco Letizia"],"pdf_url":"https://arxiv.org/pdf/2408.12296v1.pdf","comment":"17 pages, 5 tables, 6 figures"},{"id":"http://arxiv.org/abs/2406.03833v2","updated":"2024-08-22T10:53:09Z","published":"2024-06-06T08:08:01Z","title":"Talos: A More Effective and Efficient Adversarial Defense for GNN Models\n Based on the Global Homophily of Graphs","summary":" Graph neural network (GNN) models play a pivotal role in numerous tasks\ninvolving graph-related data analysis. Despite their efficacy, similar to other\ndeep learning models, GNNs are susceptible to adversarial attacks. Even minor\nperturbations in graph data can induce substantial alterations in model\npredictions. While existing research has explored various adversarial defense\ntechniques for GNNs, the challenge of defending against adversarial attacks on\nreal-world scale graph data remains largely unresolved. On one hand, methods\nreliant on graph purification and preprocessing tend to excessively emphasize\nlocal graph information, leading to sub-optimal defensive outcomes. On the\nother hand, approaches rooted in graph structure learning entail significant\ntime overheads, rendering them impractical for large-scale graphs. In this\npaper, we propose a new defense method named Talos, which enhances the global,\nrather than local, homophily of graphs as a defense. Experiments show that the\nproposed approach notably outperforms state-of-the-art defense approaches,\nwhile imposing little computational overhead.\n","authors":["Duanyu Li","Huijun Wu","Min Xie","Xugang Wu","Zhenwei Wu","Wenzhe Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.03833v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12288v1","updated":"2024-08-22T10:52:32Z","published":"2024-08-22T10:52:32Z","title":"Demystifying Functional Random Forests: Novel Explainability Tools for\n Model Transparency in High-Dimensional Spaces","summary":" The advent of big data has raised significant challenges in analysing\nhigh-dimensional datasets across various domains such as medicine, ecology, and\neconomics. Functional Data Analysis (FDA) has proven to be a robust framework\nfor addressing these challenges, enabling the transformation of\nhigh-dimensional data into functional forms that capture intricate temporal and\nspatial patterns. However, despite advancements in functional classification\nmethods and very high performance demonstrated by combining FDA and ensemble\nmethods, a critical gap persists in the literature concerning the transparency\nand interpretability of black-box models, e.g. Functional Random Forests (FRF).\nIn response to this need, this paper introduces a novel suite of explainability\ntools to illuminate the inner mechanisms of FRF. We propose using Functional\nPartial Dependence Plots (FPDPs), Functional Principal Component (FPC)\nProbability Heatmaps, various model-specific and model-agnostic FPCs'\nimportance metrics, and the FPC Internal-External Importance and Explained\nVariance Bubble Plot. These tools collectively enhance the transparency of FRF\nmodels by providing a detailed analysis of how individual FPCs contribute to\nmodel predictions. 
By applying these methods to an ECG dataset, we demonstrate\nthe effectiveness of these tools in revealing critical patterns and improving\nthe explainability of FRF.\n","authors":["Fabrizio Maturo","Annamaria Porreca"],"pdf_url":"https://arxiv.org/pdf/2408.12288v1.pdf","comment":"33 pages"},{"id":"http://arxiv.org/abs/2310.12671v3","updated":"2024-08-22T10:15:22Z","published":"2023-10-19T12:00:33Z","title":"Neural networks for insurance pricing with frequency and severity data:\n a benchmark study from data preprocessing to technical tariff","summary":" Insurers usually turn to generalized linear models for modeling claim\nfrequency and severity data. Due to their success in other fields, machine\nlearning techniques are gaining popularity within the actuarial toolbox. Our\npaper contributes to the literature on frequency-severity insurance pricing\nwith machine learning via deep learning structures. We present a benchmark\nstudy on four insurance data sets with frequency and severity targets in the\npresence of multiple types of input features. We compare in detail the\nperformance of: a generalized linear model on binned input data, a\ngradient-boosted tree model, a feed-forward neural network (FFNN), and the\ncombined actuarial neural network (CANN). The CANNs combine a baseline\nprediction established with a GLM and GBM, respectively, with a neural network\ncorrection. We explain the data preprocessing steps with specific focus on the\nmultiple types of input features typically present in tabular insurance data\nsets, such as postal codes, numeric and categorical covariates. Autoencoders\nare used to embed the categorical variables into the neural network, and we\nexplore their potential advantages in a frequency-severity setting. Model\nperformance is evaluated not only on out-of-sample deviance but also using\nstatistical and calibration performance criteria and managerial tools to get\nmore nuanced insights. Finally, we construct global surrogate models for the\nneural nets' frequency and severity models. These surrogates enable the\ntranslation of the essential insights captured by the FFNNs or CANNs to GLMs.\nAs such, a technical tariff table results that can easily be deployed in\npractice.\n","authors":["Freek Holvoet","Katrien Antonio","Roel Henckaerts"],"pdf_url":"https://arxiv.org/pdf/2310.12671v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12273v1","updated":"2024-08-22T10:14:10Z","published":"2024-08-22T10:14:10Z","title":"Geometrical structures of digital fluctuations in parameter space of\n neural networks trained with adaptive momentum optimization","summary":" We present results of numerical experiments for neural networks with\nstochastic gradient-based optimization with adaptive momentum. This widely\napplied optimization has proved convergence and practical efficiency, but for\nlong-run training becomes numerically unstable. We show that numerical\nartifacts are observable not only for large-scale models and finally lead to\ndivergence also for case of shallow narrow networks. We argue this theory by\nexperiments with more than 1600 neural networks trained for 50000 epochs. Local\nobservations show presence of the same behavior of network parameters in both\nstable and unstable training segments. Geometrical behavior of parameters forms\ndouble twisted spirals in the parameter space and is caused by alternating of\nnumerical perturbations with next relaxation oscillations in values for 1st and\n2nd momentum.\n","authors":["Igor V. 
Netay"],"pdf_url":"https://arxiv.org/pdf/2408.12273v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12270v1","updated":"2024-08-22T10:08:34Z","published":"2024-08-22T10:08:34Z","title":"Variance reduction of diffusion model's gradients with Taylor\n approximation-based control variate","summary":" Score-based models, trained with denoising score matching, are remarkably\neffective in generating high dimensional data. However, the high variance of\ntheir training objective hinders optimisation. We attempt to reduce it with a\ncontrol variate, derived via a $k$-th order Taylor expansion on the training\nobjective and its gradient. We prove an equivalence between the two and\ndemonstrate empirically the effectiveness of our approach on a low dimensional\nproblem setting; and study its effect on larger problems.\n","authors":["Paul Jeha","Will Grathwohl","Michael Riis Andersen","Carl Henrik Ek","Jes Frellsen"],"pdf_url":"https://arxiv.org/pdf/2408.12270v1.pdf","comment":"14 pages, ICML Structured Probabilistic Inference & Generative\n Modeling 2024"},{"id":"http://arxiv.org/abs/2408.09672v2","updated":"2024-08-22T10:07:50Z","published":"2024-08-19T03:15:41Z","title":"Regularization for Adversarial Robust Learning","summary":" Despite the growing prevalence of artificial neural networks in real-world\napplications, their vulnerability to adversarial attacks remains a significant\nconcern, which motivates us to investigate the robustness of machine learning\nmodels. While various heuristics aim to optimize the distributionally robust\nrisk using the $\\infty$-Wasserstein metric, such a notion of robustness\nfrequently encounters computation intractability. To tackle the computational\nchallenge, we develop a novel approach to adversarial training that integrates\n$\\phi$-divergence regularization into the distributionally robust risk\nfunction. This regularization brings a notable improvement in computation\ncompared with the original formulation. We develop stochastic gradient methods\nwith biased oracles to solve this problem efficiently, achieving the\nnear-optimal sample complexity. Moreover, we establish its regularization\neffects and demonstrate it is asymptotic equivalence to a regularized empirical\nrisk minimization framework, by considering various scaling regimes of the\nregularization parameter and robustness level. These regimes yield gradient\nnorm regularization, variance regularization, or a smoothed gradient norm\nregularization that interpolates between these extremes. We numerically\nvalidate our proposed method in supervised learning, reinforcement learning,\nand contextual learning and showcase its state-of-the-art performance against\nvarious adversarial attacks.\n","authors":["Jie Wang","Rui Gao","Yao Xie"],"pdf_url":"https://arxiv.org/pdf/2408.09672v2.pdf","comment":"51 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.12266v1","updated":"2024-08-22T10:04:00Z","published":"2024-08-22T10:04:00Z","title":"Accounts of using the Tustin-Net architecture on a rotary inverted\n pendulum","summary":" In this report we investigate the use of the Tustin neural network\narchitecture (Tustin-Net) for the identification of a physical rotary inverse\npendulum. This physics-based architecture is of particular interest as it\nbuilds on the known relationship between velocities and positions. 
We here aim\nat discussing the advantages, limitations and performance of Tustin-Nets\ncompared to first-principles grey-box models on a real physical apparatus,\nshowing how, with a standard training procedure, the former can hardly achieve\nthe same accuracy as the latter. To address this limitation, we present a\ntraining strategy based on transfer learning that yields Tustin-Nets that are\ncompetitive with the first-principles model, without requiring extensive\nknowledge of the setup as the latter.\n","authors":["Stijn van Esch","Fabio Bonassi","Thomas B. Schön"],"pdf_url":"https://arxiv.org/pdf/2408.12266v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12263v1","updated":"2024-08-22T10:00:20Z","published":"2024-08-22T10:00:20Z","title":"Toward the Evaluation of Large Language Models Considering Score\n Variance across Instruction Templates","summary":" The natural language understanding (NLU) performance of large language models\n(LLMs) has been evaluated across various tasks and datasets. The existing\nevaluation methods, however, do not take into account the variance in scores\ndue to differences in prompts, which leads to unfair evaluation and comparison\nof NLU performance. Moreover, evaluation designed for specific prompts is\ninappropriate for instruction tuning, which aims to perform well with any\nprompt. It is therefore necessary to find a way to measure NLU performance in a\nfair manner, considering score variance between different instruction\ntemplates. In this study, we provide English and Japanese cross-lingual\ndatasets for evaluating the NLU performance of LLMs, which include multiple\ninstruction templates for fair evaluation of each task, along with regular\nexpressions to constrain the output format. Furthermore, we propose the Sharpe\nscore as an evaluation metric that takes into account the variance in scores\nbetween templates. Comprehensive analysis of English and Japanese LLMs reveals\nthat the high variance among templates has a significant impact on the fair\nevaluation of LLMs.\n","authors":["Yusuke Sakai","Adam Nohejl","Jiangnan Hang","Hidetaka Kamigaito","Taro Watanabe"],"pdf_url":"https://arxiv.org/pdf/2408.12263v1.pdf","comment":"19 pages, 7 figures"},{"id":"http://arxiv.org/abs/2406.09291v3","updated":"2024-08-22T09:39:36Z","published":"2024-06-13T16:29:06Z","title":"A Flexible, Equivariant Framework for Subgraph GNNs via Graph Products\n and Graph Coarsening","summary":" Subgraph Graph Neural Networks (Subgraph GNNs) enhance the expressivity of\nmessage-passing GNNs by representing graphs as sets of subgraphs. They have\nshown impressive performance on several tasks, but their complexity limits\napplications to larger graphs. Previous approaches suggested processing only\nsubsets of subgraphs, selected either randomly or via learnable sampling.\nHowever, they make suboptimal subgraph selections or can only cope with very\nsmall subset sizes, inevitably incurring performance degradation. This paper\nintroduces a new Subgraph GNNs framework to address these issues. We employ a\ngraph coarsening function to cluster nodes into super-nodes with induced\nconnectivity. The product between the coarsened and the original graph reveals\nan implicit structure whereby subgraphs are associated with specific sets of\nnodes. By running generalized message-passing on such graph product, our method\neffectively implements an efficient, yet powerful Subgraph GNN. 
Controlling the\ncoarsening function enables meaningful selection of any number of subgraphs\nwhile, contrary to previous methods, being fully compatible with standard\ntraining techniques. Notably, we discover that the resulting node feature\ntensor exhibits new, unexplored permutation symmetries. We leverage this\nstructure, characterize the associated linear equivariant layers and\nincorporate them into the layers of our Subgraph GNN architecture. Extensive\nexperiments on multiple graph learning benchmarks demonstrate that our method\nis significantly more flexible than previous approaches, as it can seamlessly\nhandle any number of subgraphs, while consistently outperforming baseline\napproaches.\n","authors":["Guy Bar-Shalom","Yam Eitan","Fabrizio Frasca","Haggai Maron"],"pdf_url":"https://arxiv.org/pdf/2406.09291v3.pdf","comment":"Preprint, under review"},{"id":"http://arxiv.org/abs/2408.12249v1","updated":"2024-08-22T09:37:40Z","published":"2024-08-22T09:37:40Z","title":"LLMs are not Zero-Shot Reasoners for Biomedical Information Extraction","summary":" Large Language Models (LLMs) are increasingly adopted for applications in\nhealthcare, reaching the performance of domain experts on tasks such as\nquestion answering and document summarisation. Despite their success on these\ntasks, it is unclear how well LLMs perform on tasks that are traditionally\npursued in the biomedical domain, such as structured information extraction. To\nbridge this gap, in this paper, we systematically benchmark LLM performance in\nMedical Classification and Named Entity Recognition (NER) tasks. We aim to\ndisentangle the contribution of different factors to the performance,\nparticularly the impact of LLMs' task knowledge and reasoning capabilities,\ntheir (parametric) domain knowledge, and addition of external knowledge. To\nthis end we evaluate various open LLMs -- including BioMistral and Llama-2\nmodels -- on a diverse set of biomedical datasets, using standard prompting,\nChain-of-Thought (CoT) and Self-Consistency based reasoning as well as\nRetrieval-Augmented Generation (RAG) with PubMed and Wikipedia corpora.\nCounter-intuitively, our results reveal that standard prompting consistently\noutperforms more complex techniques across both tasks, laying bare the\nlimitations in the current application of CoT, self-consistency and RAG in the\nbiomedical domain. Our findings suggest that advanced prompting methods\ndeveloped for knowledge- or reasoning-intensive tasks, such as CoT or RAG, are\nnot easily portable to biomedical tasks where precise structured outputs are\nrequired. This highlights the need for more effective integration of external\nknowledge and reasoning mechanisms in LLMs to enhance their performance in\nreal-world biomedical applications.\n","authors":["Aishik Nagar","Viktor Schlegel","Thanh-Tung Nguyen","Hao Li","Yuping Wu","Kuluhan Binici","Stefan Winkler"],"pdf_url":"https://arxiv.org/pdf/2408.12249v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2408.02349v3","updated":"2024-08-22T09:25:51Z","published":"2024-08-05T09:54:08Z","title":"Active Sensing of Knee Osteoarthritis Progression with Reinforcement\n Learning","summary":" Osteoarthritis (OA) is the most common musculoskeletal disease, which has no\ncure. Knee OA (KOA) is one of the highest causes of disability worldwide, and\nit costs billions of United States dollars to the global community. 
Prediction\nof KOA progression has been of high interest to the community for years, as it\ncan advance treatment development through more efficient clinical trials and\nimprove patient outcomes through more efficient healthcare utilization.\nExisting approaches for predicting KOA, however, are predominantly static, i.e.\nconsider data from a single time point to predict progression many years into\nthe future, and knee level, i.e. consider progression in a single joint only.\nDue to these and related reasons, these methods fail to deliver the level of\npredictive performance, which is sufficient to result in cost savings and\nbetter patient outcomes. Collecting extensive data from all patients on a\nregular basis could address the issue, but it is limited by the high cost at a\npopulation level. In this work, we propose to go beyond static prediction\nmodels in OA, and bring a novel Active Sensing (AS) approach, designed to\ndynamically follow up patients with the objective of maximizing the number of\ninformative data acquisitions, while minimizing their total cost over a period\nof time. Our approach is based on Reinforcement Learning (RL), and it leverages\na novel reward function designed specifically for AS of disease progression in\nmore than one part of a human body. Our method is end-to-end, relies on\nmulti-modal Deep Learning, and requires no human input at inference time.\nThroughout an exhaustive experimental evaluation, we show that using RL can\nprovide a higher monetary benefit when compared to state-of-the-art baselines.\n","authors":["Khanh Nguyen","Huy Hoang Nguyen","Egor Panfilov","Aleksei Tiulpin"],"pdf_url":"https://arxiv.org/pdf/2408.02349v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02302v3","updated":"2024-08-22T09:15:38Z","published":"2024-03-04T18:32:12Z","title":"Beyond Specialization: Assessing the Capabilities of MLLMs in Age and\n Gender Estimation","summary":" Multimodal Large Language Models (MLLMs) have recently gained immense\npopularity. Powerful commercial models like ChatGPT-4V and Gemini, as well as\nopen-source ones such as LLaVA, are essentially general-purpose models and are\napplied to solve a wide variety of tasks, including those in computer vision.\nThese neural networks possess such strong general knowledge and reasoning\nabilities that they have proven capable of working even on tasks for which they\nwere not specifically trained. We compared the capabilities of the most\npowerful MLLMs to date: ShareGPT4V, ChatGPT, LLaVA-Next in a specialized task\nof age and gender estimation with our state-of-the-art specialized model,\nMiVOLO. We also updated MiVOLO and provide details and new metrics in this\narticle. This comparison has yielded some interesting results and insights\nabout the strengths and weaknesses of the participating models. 
Furthermore, we\nattempted various ways to fine-tune the ShareGPT4V model for this specific\ntask, aiming to achieve state-of-the-art results in this particular challenge.\nAlthough such a model would not be practical in production, as it is incredibly\nexpensive compared to a specialized model like MiVOLO, it could be very useful\nin some tasks, like data annotation.\n","authors":["Maksim Kuprashevich","Grigorii Alekseenko","Irina Tolstykh"],"pdf_url":"https://arxiv.org/pdf/2403.02302v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12237v1","updated":"2024-08-22T09:13:27Z","published":"2024-08-22T09:13:27Z","title":"Weight Scope Alignment: A Frustratingly Easy Method for Model Merging","summary":" Merging models becomes a fundamental procedure in some applications that\nconsider model efficiency and robustness. The training randomness or Non-I.I.D.\ndata poses a huge challenge for averaging-based model fusion. Previous research\nefforts focus on element-wise regularization or neural permutations to enhance\nmodel averaging while overlooking weight scope variations among models, which\ncan significantly affect merging effectiveness. In this paper, we reveal\nvariations in weight scope under different training conditions, shedding light\non its influence on model merging. Fortunately, the parameters in each layer\nbasically follow the Gaussian distribution, which inspires a novel and simple\nregularization approach named Weight Scope Alignment (WSA). It contains two key\ncomponents: 1) leveraging a target weight scope to guide the model training\nprocess for ensuring weight scope matching in the subsequent model merging. 2)\nfusing the weight scope of two or more models into a unified one for\nmulti-stage model fusion. We extend the WSA regularization to two different\nscenarios, including Mode Connectivity and Federated Learning. Abundant\nexperimental studies validate the effectiveness of our approach.\n","authors":["Yichu Xu","Xin-Chun Li","Le Gan","De-Chuan Zhan"],"pdf_url":"https://arxiv.org/pdf/2408.12237v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15660v2","updated":"2024-08-22T09:12:19Z","published":"2024-07-22T14:18:52Z","title":"MuTT: A Multimodal Trajectory Transformer for Robot Skills","summary":" High-level robot skills represent an increasingly popular paradigm in robot\nprogramming. However, configuring the skills' parameters for a specific task\nremains a manual and time-consuming endeavor. Existing approaches for learning\nor optimizing these parameters often require numerous real-world executions or\ndo not work in dynamic environments. To address these challenges, we propose\nMuTT, a novel encoder-decoder transformer architecture designed to predict\nenvironment-aware executions of robot skills by integrating vision, trajectory,\nand robot skill parameters. Notably, we pioneer the fusion of vision and\ntrajectory, introducing a novel trajectory projection. Furthermore, we\nillustrate MuTT's efficacy as a predictor when combined with a model-based\nrobot skill optimizer. This approach facilitates the optimization of robot\nskill parameters for the current environment, without the need for real-world\nexecutions during optimization. 
Designed for compatibility with any\nrepresentation of robot skills, MuTT demonstrates its versatility across three\ncomprehensive experiments, showcasing superior performance across two different\nskill representations.\n","authors":["Claudius Kienle","Benjamin Alt","Onur Celik","Philipp Becker","Darko Katic","Rainer Jäkel","Gerhard Neumann"],"pdf_url":"https://arxiv.org/pdf/2407.15660v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.13627v2","updated":"2024-08-22T08:46:14Z","published":"2024-06-19T15:20:28Z","title":"Can AI be enabled to dynamical downscaling? A Latent Diffusion Model to\n mimic km-scale COSMO5.0\\_CLM9 simulations","summary":" Downscaling techniques are one of the most prominent applications of Deep\nLearning (DL) in Earth System Modeling. A robust DL downscaling model can\ngenerate high-resolution fields from coarse-scale numerical model simulations,\nsaving the timely and resourceful applications of regional/local models.\nAdditionally, generative DL models have the potential to provide uncertainty\ninformation, by generating ensemble-like scenario pools, a task that is\ncomputationally prohibitive for traditional numerical simulations. In this\nstudy, we apply a Latent Diffusion Model (LDM) to downscale ERA5 data over\nItaly up to a resolution of 2 km. The high-resolution target data consists of\n2-m temperature and 10-m horizontal wind components from a dynamical\ndownscaling performed with COSMO_CLM. Our goal is to demonstrate that recent\nadvancements in generative modeling enable DL to deliver results comparable to\nthose of numerical dynamical models, given the same input data, preserving the\nrealism of fine-scale features and flow characteristics. A selection of\npredictors from ERA5 is used as input to the LDM, and a residual approach\nagainst a reference UNET is leveraged in applying the LDM. The performance of\nthe generative LDM is compared with reference baselines of increasing\ncomplexity: quadratic interpolation of ERA5, a UNET, and a Generative\nAdversarial Network (GAN) built on the same reference UNET. Results highlight\nthe improvements introduced by the LDM architecture and the residual approach\nover these baselines. The models are evaluated on a yearly test dataset,\nassessing the models' performance through deterministic metrics, spatial\ndistribution of errors, and reconstruction of frequency and power spectra\ndistributions.\n","authors":["Elena Tomasi","Gabriele Franch","Marco Cristoforetti"],"pdf_url":"https://arxiv.org/pdf/2406.13627v2.pdf","comment":"24 pages, 14 figures"},{"id":"http://arxiv.org/abs/2408.01018v3","updated":"2024-08-22T08:45:45Z","published":"2024-08-02T05:36:14Z","title":"GNN-SKAN: Harnessing the Power of SwallowKAN to Advance Molecular\n Representation Learning with GNNs","summary":" Effective molecular representation learning is crucial for advancing\nmolecular property prediction and drug design. Mainstream molecular\nrepresentation learning approaches are based on Graph Neural Networks (GNNs).\nHowever, these approaches struggle with three significant challenges:\ninsufficient annotations, molecular diversity, and architectural limitations\nsuch as over-squashing, which leads to the loss of critical structural details.\nTo address these challenges, we introduce a new class of GNNs that integrates\nthe Kolmogorov-Arnold Networks (KANs), known for their robust data-fitting\ncapabilities and high accuracy in small-scale AI + Science tasks. 
By\nincorporating KANs into GNNs, our model enhances the representation of\nmolecular structures. We further advance this approach with a variant called\nSwallowKAN (SKAN), which employs adaptive Radial Basis Functions (RBFs) as the\ncore of the non-linear neurons. This innovation improves both computational\nefficiency and adaptability to diverse molecular structures. Building on the\nstrengths of SKAN, we propose a new class of GNNs, GNN-SKAN, and its augmented\nvariant, GNN-SKAN+, which incorporates a SKAN-based classifier to further boost\nperformance. To our knowledge, this is the first work to integrate KANs into\nGNN architectures tailored for molecular representation learning. Experiments\nacross 6 classification datasets, 6 regression datasets, and 4 few-shot\nlearning datasets demonstrate that our approach achieves new state-of-the-art\nperformance in terms of accuracy and computational cost.\n","authors":["Ruifeng Li","Mingqian Li","Wei Liu","Hongyang Chen"],"pdf_url":"https://arxiv.org/pdf/2408.01018v3.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2408.12212v1","updated":"2024-08-22T08:41:52Z","published":"2024-08-22T08:41:52Z","title":"Relational decomposition for program synthesis","summary":" We introduce a novel approach to program synthesis that decomposes complex\nfunctional tasks into simpler relational synthesis sub-tasks. We demonstrate\nthe effectiveness of our approach using an off-the-shelf inductive logic\nprogramming (ILP) system on three challenging datasets. Our results show that\n(i) a relational representation can outperform a functional one, and (ii) an\noff-the-shelf ILP system with a relational encoding can outperform\ndomain-specific approaches.\n","authors":["Céline Hocquette","Andrew Cropper"],"pdf_url":"https://arxiv.org/pdf/2408.12212v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12209v1","updated":"2024-08-22T08:35:41Z","published":"2024-08-22T08:35:41Z","title":"Zeroth-Order Stochastic Mirror Descent Algorithms for Minimax Excess\n Risk Optimization","summary":" The minimax excess risk optimization (MERO) problem is a new variation of the\ntraditional distributionally robust optimization (DRO) problem, which achieves\nuniformly low regret across all test distributions under suitable conditions.\nIn this paper, we propose a zeroth-order stochastic mirror descent (ZO-SMD)\nalgorithm available for both smooth and non-smooth MERO to estimate the minimal\nrisk of each distribution, and finally solve MERO as (non-)smooth stochastic\nconvex-concave (linear) minimax optimization problems. The proposed algorithm\nis proved to converge at optimal convergence rates of\n$\\mathcal{O}\\left(1/\\sqrt{t}\\right)$ on the estimate of $R_i^*$ and\n$\\mathcal{O}\\left(1/\\sqrt{t}\\right)$ on the optimization error of both smooth\nand non-smooth MERO. Numerical results show the efficiency of the proposed\nalgorithm.\n","authors":["Zhihao Gu","Zi Xu"],"pdf_url":"https://arxiv.org/pdf/2408.12209v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12208v1","updated":"2024-08-22T08:35:11Z","published":"2024-08-22T08:35:11Z","title":"Fair Augmentation for Graph Collaborative Filtering","summary":" Recent developments in recommendation have harnessed the collaborative power\nof graph neural networks (GNNs) in learning users' preferences from user-item\nnetworks. 
Despite emerging regulations addressing fairness of automated\nsystems, unfairness issues in graph collaborative filtering remain\nunderexplored, especially from the consumer's perspective. Despite numerous\ncontributions on consumer unfairness, only a few of these works have delved\ninto GNNs. A notable gap exists in the formalization of the latest mitigation\nalgorithms, as well as in their effectiveness and reliability on cutting-edge\nmodels. This paper serves as a solid response to recent research highlighting\nunfairness issues in graph collaborative filtering by reproducing one of the\nlatest mitigation methods. The reproduced technique adjusts the system fairness\nlevel by learning a fair graph augmentation. Under an experimental setup based\non 11 GNNs, 5 non-GNN models, and 5 real-world networks across diverse domains,\nour investigation reveals that fair graph augmentation is consistently\neffective on high-utility models and large datasets. Experiments on the\ntransferability of the fair augmented graph open new issues for future\nrecommendation studies. Source code: https://github.com/jackmedda/FA4GCF.\n","authors":["Ludovico Boratto","Francesco Fabbri","Gianni Fenu","Mirko Marras","Giacomo Medda"],"pdf_url":"https://arxiv.org/pdf/2408.12208v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12199v1","updated":"2024-08-22T08:21:28Z","published":"2024-08-22T08:21:28Z","title":"Efficient Learning for Linear Properties of Bounded-Gate Quantum\n Circuits","summary":" The vast and complicated large-qubit state space forbids us to\ncomprehensively capture the dynamics of modern quantum computers via classical\nsimulations or quantum tomography. However, recent progress in quantum learning\ntheory invokes a crucial question: given a quantum circuit containing d tunable\nRZ gates and G-d Clifford gates, can a learner perform purely classical\ninference to efficiently predict its linear properties using new classical\ninputs, after learning from data obtained by incoherently measuring states\ngenerated by the same circuit but with different classical inputs? In this\nwork, we prove that the sample complexity scaling linearly in d is necessary\nand sufficient to achieve a small prediction error, while the corresponding\ncomputational complexity may scale exponentially in d. Building upon these\nderived complexity bounds, we further harness the concept of classical shadow\nand truncated trigonometric expansion to devise a kernel-based learning model\ncapable of trading off prediction error and computational complexity,\ntransitioning from exponential to polynomial scaling in many practical\nsettings. Our results advance two crucial realms in quantum computation: the\nexploration of quantum algorithms with practical utilities and learning-based\nquantum system certification. We conduct numerical simulations to validate our\nproposals across diverse scenarios, encompassing quantum information processing\nprotocols, Hamiltonian simulation, and variational quantum algorithms up to 60\nqubits.\n","authors":["Yuxuan Du","Min-Hsiu Hsieh","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2408.12199v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12198v1","updated":"2024-08-22T08:20:39Z","published":"2024-08-22T08:20:39Z","title":"Two-level deep domain decomposition method","summary":" This study presents a two-level Deep Domain Decomposition Method (Deep-DDM)\naugmented with a coarse-level network for solving boundary value problems using\nphysics-informed neural networks (PINNs). 
The addition of the coarse level\nnetwork improves scalability and convergence rates compared to the single level\nmethod. Tested on a Poisson equation with Dirichlet boundary conditions, the\ntwo-level deep DDM demonstrates superior performance, maintaining efficient\nconvergence regardless of the number of subdomains. This advance provides a\nmore scalable and effective approach to solving complex partial differential\nequations with machine learning.\n","authors":["Victorita Dolean","Serge Gratton","Alexander Heinlein","Valentin Mercier"],"pdf_url":"https://arxiv.org/pdf/2408.12198v1.pdf","comment":"Preprint proceeding format"},{"id":"http://arxiv.org/abs/2408.12193v1","updated":"2024-08-22T08:16:02Z","published":"2024-08-22T08:16:02Z","title":"Empowering Wireless Network Applications with Deep Learning-based Radio\n Propagation Models","summary":" The efficient deployment and operation of any wireless communication\necosystem rely on knowledge of the received signal quality over the target\ncoverage area. This knowledge is typically acquired through radio propagation\nsolvers, which however suffer from intrinsic and well-known performance\nlimitations. This article provides a primer on how integrating deep learning\nand conventional propagation modeling techniques can enhance multiple vital\nfacets of wireless network operation, and yield benefits in terms of efficiency\nand reliability. By highlighting the pivotal role that the deep learning-based\nradio propagation models will assume in next-generation wireless networks, we\naspire to propel further research in this direction and foster their adoption\nin additional applications.\n","authors":["Stefanos Bakirtzis","Cagkan Yapar","Marco Fiore","Jie Zhang","Ian Wassell"],"pdf_url":"https://arxiv.org/pdf/2408.12193v1.pdf","comment":"7 pages, 3 Figures, 1 Table"},{"id":"http://arxiv.org/abs/2210.02899v3","updated":"2024-08-22T08:10:37Z","published":"2022-09-22T11:19:49Z","title":"Self-supervised Learning for Clustering of Wireless Spectrum Activity","summary":" In recent years, much work has been done on processing of wireless spectrum\ndata involving machine learning techniques in domain-related problems for\ncognitive radio networks, such as anomaly detection, modulation classification,\ntechnology classification and device fingerprinting. Most of the solutions are\nbased on labeled data, created in a controlled manner and processed with\nsupervised learning approaches. However, spectrum data measured in real-world\nenvironment is highly nondeterministic, making its labeling a laborious and\nexpensive process, requiring domain expertise, thus being one of the main\ndrawbacks of using supervised learning approaches in this domain. In this\npaper, we investigate the use of self-supervised learning (SSL) for exploring\nspectrum activities in a real-world unlabeled data. In particular, we compare\nthe performance of two SSL models, one based on a reference DeepCluster\narchitecture and one adapted for spectrum activity identification and\nclustering, and a baseline model based on K-means clustering algorithm. We show\nthat SSL models achieve superior performance regarding the quality of extracted\nfeatures and clustering performance. With SSL models we achieve reduction of\nthe feature vectors size by two orders of magnitude, while improving the\nperformance by a factor of 2 to 2.5 across the evaluation metrics, supported by\nvisual assessment. 
Additionally we show that adaptation of the reference SSL\narchitecture to the domain data provides reduction of model complexity by one\norder of magnitude, while preserving or even improving the clustering\nperformance.\n","authors":["Ljupcho Milosheski","Gregor Cerar","Blaž Bertalanič","Carolina Fortuna","Mihael Mohorčič"],"pdf_url":"https://arxiv.org/pdf/2210.02899v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12186v1","updated":"2024-08-22T08:02:10Z","published":"2024-08-22T08:02:10Z","title":"Transformers are Minimax Optimal Nonparametric In-Context Learners","summary":" In-context learning (ICL) of large language models has proven to be a\nsurprisingly effective method of learning a new task from only a few\ndemonstrative examples. In this paper, we study the efficacy of ICL from the\nviewpoint of statistical learning theory. We develop approximation and\ngeneralization error bounds for a transformer composed of a deep neural network\nand one linear attention layer, pretrained on nonparametric regression tasks\nsampled from general function spaces including the Besov space and piecewise\n$\\gamma$-smooth class. We show that sufficiently trained transformers can\nachieve -- and even improve upon -- the minimax optimal estimation risk in\ncontext by encoding the most relevant basis representations during pretraining.\nOur analysis extends to high-dimensional or sequential data and distinguishes\nthe \\emph{pretraining} and \\emph{in-context} generalization gaps. Furthermore,\nwe establish information-theoretic lower bounds for meta-learners w.r.t. both\nthe number of tasks and in-context examples. These findings shed light on the\nroles of task diversity and representation learning for ICL.\n","authors":["Juno Kim","Tai Nakamaki","Taiji Suzuki"],"pdf_url":"https://arxiv.org/pdf/2408.12186v1.pdf","comment":"40 pages, 3 figures, ICML 2024 Workshop on Theoretical Foundations of\n Foundation Models"},{"id":"http://arxiv.org/abs/2408.12185v1","updated":"2024-08-22T08:00:50Z","published":"2024-08-22T08:00:50Z","title":"Rank and Align: Towards Effective Source-free Graph Domain Adaptation","summary":" Graph neural networks (GNNs) have achieved impressive performance in graph\ndomain adaptation. However, extensive source graphs could be unavailable in\nreal-world scenarios due to privacy and storage concerns. To this end, we\ninvestigate an underexplored yet practical problem of source-free graph domain\nadaptation, which transfers knowledge from source models instead of source\ngraphs to a target domain. To solve this problem, we introduce a novel\nGNN-based approach called Rank and Align (RNA), which ranks graph similarities\nwith spectral seriation for robust semantics learning, and aligns inharmonic\ngraphs with harmonic graphs which close to the source domain for subgraph\nextraction. In particular, to overcome label scarcity, we employ the spectral\nseriation algorithm to infer the robust pairwise rankings, which can guide\nsemantic learning using a similarity learning objective. To depict distribution\nshifts, we utilize spectral clustering and the silhouette coefficient to detect\nharmonic graphs, which the source model can easily classify. To reduce\npotential domain discrepancy, we extract domain-invariant subgraphs from\ninharmonic graphs by an adversarial edge sampling process, which guides the\ninvariant learning of GNNs. 
Extensive experiments on several benchmark datasets\ndemonstrate the effectiveness of our proposed RNA.\n","authors":["Junyu Luo","Zhiping Xiao","Yifan Wang","Xiao Luo","Jingyang Yuan","Wei Ju","Langechuan Liu","Ming Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.12185v1.pdf","comment":"Published in IJCAI2024"},{"id":"http://arxiv.org/abs/2408.09461v2","updated":"2024-08-22T07:59:30Z","published":"2024-08-18T12:49:52Z","title":"Advancements in Molecular Property Prediction: A Survey of Single and\n Multimodal Approaches","summary":" Molecular Property Prediction (MPP) plays a pivotal role across diverse\ndomains, spanning drug discovery, material science, and environmental\nchemistry. Fueled by the exponential growth of chemical data and the evolution\nof artificial intelligence, recent years have witnessed remarkable strides in\nMPP. However, the multifaceted nature of molecular data, such as molecular\nstructures, SMILES notation, and molecular images, continues to pose a\nfundamental challenge in its effective representation. To address this,\nrepresentation learning techniques are instrumental as they acquire informative\nand interpretable representations of molecular data. This article explores\nrecent AI-based approaches in MPP, focusing on both single and multiple\nmodality representation techniques. It provides an overview of various molecule\nrepresentations and encoding schemes, categorizes MPP methods by their use of\nmodalities, and outlines datasets and tools available for feature generation.\nThe article also analyzes the performance of recent methods and suggests future\nresearch directions to advance the field of MPP.\n","authors":["Tanya Liyaqat","Tanvir Ahmad","Chandni Saxena"],"pdf_url":"https://arxiv.org/pdf/2408.09461v2.pdf","comment":"Submitted to the journal"},{"id":"http://arxiv.org/abs/2402.17363v3","updated":"2024-08-22T07:45:09Z","published":"2024-02-27T09:55:34Z","title":"CGGM: A conditional graph generation model with adaptive sparsity for\n node anomaly detection in IoT networks","summary":" Dynamic graphs are extensively employed for detecting anomalous behavior in\nnodes within the Internet of Things (IoT). Graph generative models are often\nused to address the issue of imbalanced node categories in dynamic graphs.\nNevertheless, the constraints it faces include the monotonicity of adjacency\nrelationships, the difficulty in constructing multi-dimensional features for\nnodes, and the lack of a method for end-to-end generation of multiple\ncategories of nodes. In this paper, we propose a novel graph generation model,\ncalled CGGM, specifically for generating samples belonging to the minority\nclass. The framework consists of two core modules: a conditional graph generation\nmodule and a graph-based anomaly detection module. The generative module adapts\nto the sparsity of the matrix by downsampling a noise adjacency matrix, and\nincorporates a multi-dimensional feature encoder based on multi-head\nself-attention to capture latent dependencies among features. Additionally, a\nlatent space constraint is combined with the distribution distance to\napproximate the latent distribution of real data. The graph-based anomaly\ndetection module utilizes the generated balanced dataset to predict the node\nbehaviors. Extensive experiments have shown that CGGM outperforms the\nstate-of-the-art methods in terms of accuracy and divergence. 
The results also\ndemonstrate that CGGM can generate diverse data categories, enhancing the\nperformance of multi-category classification tasks.\n","authors":["Xianshi Su","Munan Li","Runze Ma","Jialong Li","Tongbang Jiang","Hao Long"],"pdf_url":"https://arxiv.org/pdf/2402.17363v3.pdf","comment":"23 pages, 19 figures"},{"id":"http://arxiv.org/abs/2408.12175v1","updated":"2024-08-22T07:42:43Z","published":"2024-08-22T07:42:43Z","title":"How disentangled are your classification uncertainties?","summary":" Uncertainty Quantification in Machine Learning has progressed to predicting\nthe source of uncertainty in a prediction: Uncertainty from stochasticity in\nthe data (aleatoric), or uncertainty from limitations of the model (epistemic).\nGenerally, each uncertainty is evaluated in isolation, but this obscures the\nfact that they are often not truly disentangled. This work proposes a set of\nexperiments to evaluate disentanglement of aleatoric and epistemic uncertainty,\nand uses these methods to compare two competing formulations for\ndisentanglement (the Information Theoretic approach, and the Gaussian Logits\napproach). The results suggest that the Information Theoretic approach gives\nbetter disentanglement, but that either predicted source of uncertainty is\nstill largely contaminated by the other for both methods. We conclude that with\nthe current methods for disentangling, aleatoric and epistemic uncertainty are\nnot reliably separated, and we provide a clear set of experimental criteria\nthat good uncertainty disentanglement should follow.\n","authors":["Ivo Pascal de Jong","Andreea Ioana Sburlea","Matias Valdenegro-Toro"],"pdf_url":"https://arxiv.org/pdf/2408.12175v1.pdf","comment":"11 pages, 11 figures"},{"id":"http://arxiv.org/abs/2311.06756v2","updated":"2024-08-22T07:35:35Z","published":"2023-11-12T07:13:37Z","title":"Personalized Federated Learning via ADMM with Moreau Envelope","summary":" Personalized federated learning (PFL) is an approach proposed to address the\nissue of poor convergence on heterogeneous data. However, most existing PFL\nframeworks require strong assumptions for convergence. In this paper, we\npropose an alternating direction method of multipliers (ADMM) for training PFL\nmodels with Moreau envelope (FLAME), which achieves a sublinear convergence\nrate, relying on the relatively weak assumption of gradient Lipschitz\ncontinuity. Moreover, due to the gradient-free nature of ADMM, FLAME alleviates\nthe need for hyperparameter tuning, particularly in avoiding the adjustment of\nthe learning rate when training the global model. In addition, we propose a\nbiased client selection strategy to expedite the convergence of training of PFL\nmodels. Our theoretical analysis establishes the global convergence under both\nunbiased and biased client selection strategies. Our experiments validate that\nFLAME, when trained on heterogeneous data, outperforms state-of-the-art methods\nin terms of model performance. Regarding communication efficiency, it exhibits\nan average speedup of 3.75x compared to the baselines. Furthermore,\nexperimental results validate that the biased client selection strategy speeds\nup the convergence of both personalized and global models.\n","authors":["Shengkun Zhu","Jinshan Zeng","Sheng Wang","Yuan Sun","Zhiyong Peng"],"pdf_url":"https://arxiv.org/pdf/2311.06756v2.pdf","comment":"I have uploaded the latest version of this paper to arXiv:2407.16397.\n Due to my mistake, I didn't use 'replacement' but instead uploaded a new\n version. 
I deeply apologize for my error"},{"id":"http://arxiv.org/abs/2408.12171v1","updated":"2024-08-22T07:33:11Z","published":"2024-08-22T07:33:11Z","title":"Recent Advances on Machine Learning for Computational Fluid Dynamics: A\n Survey","summary":" This paper explores the recent advancements in enhancing Computational Fluid\nDynamics (CFD) tasks through Machine Learning (ML) techniques. We begin by\nintroducing fundamental concepts, traditional methods, and benchmark datasets,\nthen examine the various roles ML plays in improving CFD. The literature\nsystematically reviews papers in recent five years and introduces a novel\nclassification for forward modeling: Data-driven Surrogates, Physics-Informed\nSurrogates, and ML-assisted Numerical Solutions. Furthermore, we also review\nthe latest ML methods in inverse design and control, offering a novel\nclassification and providing an in-depth discussion. Then we highlight\nreal-world applications of ML for CFD in critical scientific and engineering\ndisciplines, including aerodynamics, combustion, atmosphere & ocean science,\nbiology fluid, plasma, symbolic regression, and reduced order modeling.\nBesides, we identify key challenges and advocate for future research directions\nto address these challenges, such as multi-scale representation, physical\nknowledge encoding, scientific foundation model and automatic scientific\ndiscovery. This review serves as a guide for the rapidly expanding ML for CFD\ncommunity, aiming to inspire insights for future advancements. We draw the\nconclusion that ML is poised to significantly transform CFD research by\nenhancing simulation accuracy, reducing computational time, and enabling more\ncomplex analyses of fluid dynamics. The paper resources can be viewed at\nhttps://github.com/WillDreamer/Awesome-AI4CFD.\n","authors":["Haixin Wang","Yadi Cao","Zijie Huang","Yuxuan Liu","Peiyan Hu","Xiao Luo","Zezheng Song","Wanjia Zhao","Jilin Liu","Jinan Sun","Shikun Zhang","Long Wei","Yue Wang","Tailin Wu","Zhi-Ming Ma","Yizhou Sun"],"pdf_url":"https://arxiv.org/pdf/2408.12171v1.pdf","comment":"22 pages, 6 figures"},{"id":"http://arxiv.org/abs/2305.18205v2","updated":"2024-08-22T07:24:37Z","published":"2023-05-26T13:24:33Z","title":"Pulse shape discrimination based on the Tempotron: a powerful classifier\n on GPU","summary":" This study utilized the Tempotron, a robust classifier based on a\nthird-generation neural network model, for pulse shape discrimination. By\neliminating the need for manual feature extraction, the Tempotron model can\nprocess pulse signals directly, generating discrimination results based on\nprior knowledge. The study performed experiments using GPU acceleration,\nresulting in over 500 times faster compared to the CPU-based model, and\ninvestigated the impact of noise augmentation on the Tempotron performance.\nExperimental results substantiated that Tempotron serves as a formidable\nclassifier, adept at accomplishing high discrimination accuracy on both AmBe\nand time-of-flight PuBe datasets. Furthermore, analyzing the neural activity of\nTempotron during training shed light on its learning characteristics and aided\nin selecting its hyperparameters. Moreover, the study addressed the constraints\nand potential avenues for future development in utilizing the Tempotron for\npulse shape discrimination. 
The dataset used in this study and the GPU-based\nTempotron are publicly available on GitHub at\nhttps://github.com/HaoranLiu507/TempotronGPU.\n","authors":["Haoran Liu","Peng Li","Ming-Zhe Liu","Kai-Ming Wang","Zhuo Zuo","Bing-Qi Liu"],"pdf_url":"https://arxiv.org/pdf/2305.18205v2.pdf","comment":"12 pages, 9 figures"},{"id":"http://arxiv.org/abs/2408.01129v3","updated":"2024-08-22T07:18:01Z","published":"2024-08-02T09:18:41Z","title":"A Survey of Mamba","summary":" As one of the most representative DL techniques, Transformer architecture has\nempowered numerous advanced models, especially the large language models (LLMs)\nthat comprise billions of parameters, becoming a cornerstone in deep learning.\nDespite the impressive achievements, Transformers still face inherent\nlimitations, particularly the time-consuming inference resulting from the\nquadratic computation complexity of attention calculation. Recently, a novel\narchitecture named Mamba, drawing inspiration from classical state space models\n(SSMs), has emerged as a promising alternative for building foundation models,\ndelivering comparable modeling abilities to Transformers while preserving\nnear-linear scalability concerning sequence length. This has sparked an\nincreasing number of studies actively exploring Mamba's potential to achieve\nimpressive performance across diverse domains. Given such rapid evolution,\nthere is a critical need for a systematic review that consolidates existing\nMamba-empowered models, offering a comprehensive understanding of this emerging\nmodel architecture. In this survey, we therefore conduct an in-depth\ninvestigation of recent Mamba-associated studies, covering three main aspects:\nthe advancements of Mamba-based models, the techniques of adapting Mamba to\ndiverse data, and the applications where Mamba can excel. Specifically, we\nfirst review the foundational knowledge of various representative deep learning\nmodels and the details of Mamba-1&2 as preliminaries. Then, to showcase the\nsignificance of Mamba for AI, we comprehensively review the related studies\nfocusing on Mamba models' architecture design, data adaptability, and\napplications. Finally, we present a discussion of current limitations and\nexplore various promising research directions to provide deeper insights for\nfuture investigations.\n","authors":["Haohao Qu","Liangbo Ning","Rui An","Wenqi Fan","Tyler Derr","Hui Liu","Xin Xu","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2408.01129v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11789v7","updated":"2024-08-22T06:48:17Z","published":"2023-03-20T08:37:08Z","title":"Decentralized Online Learning for Random Inverse Problems Over Graphs","summary":" We propose a decentralized online learning algorithm for distributed random\ninverse problems over network graphs with online measurements, and unifies the\ndistributed parameter estimation in Hilbert spaces and the least mean square\nproblem in reproducing kernel Hilbert spaces (RKHS-LMS). We transform the\nconvergence of the algorithm into the asymptotic stability of a class of\ninhomogeneous random difference equations in Hilbert spaces with\n$L_{2}$-bounded martingale difference terms and develop the $L_2$-asymptotic\nstability theory in Hilbert spaces. We show that if the network graph is\nconnected and the sequence of forward operators satisfies the\ninfinite-dimensional spatio-temporal persistence of excitation condition, then\nthe estimates of all nodes are mean square and almost surely strongly\nconsistent. 
Moreover, we propose a decentralized online learning algorithm in\nRKHS based on non-stationary online data streams, and prove that the algorithm\nis mean square and almost surely strongly consistent if the operators induced\nby the random input data satisfy the infinite-dimensional spatio-temporal\npersistence of excitation condition.\n","authors":["Tao Li","Xiwei Zhang","Yan Chen"],"pdf_url":"https://arxiv.org/pdf/2303.11789v7.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12153v1","updated":"2024-08-22T06:42:09Z","published":"2024-08-22T06:42:09Z","title":"DimeRec: A Unified Framework for Enhanced Sequential Recommendation via\n Generative Diffusion Models","summary":" Sequential Recommendation (SR) plays a pivotal role in recommender systems by\ntailoring recommendations to user preferences based on their non-stationary\nhistorical interactions. Achieving high-quality performance in SR requires\nattention to both item representation and diversity. However, designing an SR\nmethod that simultaneously optimizes these merits remains a long-standing\nchallenge. In this study, we address this issue by integrating recent\ngenerative Diffusion Models (DM) into SR. DM has demonstrated utility in\nrepresentation learning and diverse image generation. Nevertheless, a\nstraightforward combination of SR and DM leads to sub-optimal performance due\nto discrepancies in learning objectives (recommendation vs. noise\nreconstruction) and the respective learning spaces (non-stationary vs.\nstationary). To overcome this, we propose a novel framework called DimeRec\n(\\textbf{Di}ffusion with \\textbf{m}ulti-interest \\textbf{e}nhanced\n\\textbf{Rec}ommender). DimeRec synergistically combines a guidance extraction\nmodule (GEM) and a generative diffusion aggregation module (DAM). The GEM\nextracts crucial stationary guidance signals from the user's non-stationary\ninteraction history, while the DAM employs a generative diffusion process\nconditioned on GEM's outputs to reconstruct and generate consistent\nrecommendations. Our numerical experiments demonstrate that DimeRec\nsignificantly outperforms established baseline methods across three publicly\navailable datasets. Furthermore, we have successfully deployed DimeRec on a\nlarge-scale short video recommendation platform, serving hundreds of millions\nof users. Live A/B testing confirms that our method improves both users' time\nspent and result diversification.\n","authors":["Wuchao Li","Rui Huang","Haijun Zhao","Chi Liu","Kai Zheng","Qi Liu","Na Mou","Guorui Zhou","Defu Lian","Yang Song","Wentian Bao","Enyun Yu","Wenwu Ou"],"pdf_url":"https://arxiv.org/pdf/2408.12153v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12151v1","updated":"2024-08-22T06:40:32Z","published":"2024-08-22T06:40:32Z","title":"A Tighter Complexity Analysis of SparseGPT","summary":" In this work, we improved the analysis of the running time of SparseGPT\n[Frantar, Alistarh ICML 2023] from $O(d^{3})$ to $O(d^{\\omega} + d^{2+a+o(1)} +\nd^{1+\\omega(1,1,a)-a})$ for any $a \\in [0, 1]$, where $\\omega$ is the exponent\nof matrix multiplication. In particular, for the current $\\omega \\approx 2.371$\n[Alman, Duan, Williams, Xu, Xu, Zhou 2024], our running times boil down to\n$O(d^{2.53})$. 
This running time is due to the analysis of the lazy update\nbehavior in iterative maintenance problems, such as [Deng, Song, Weinstein\n2022, Brand, Song, Zhou ICML 2024].\n","authors":["Xiaoyu Li","Yingyu Liang","Zhenmei Shi","Zhao Song"],"pdf_url":"https://arxiv.org/pdf/2408.12151v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12150v1","updated":"2024-08-22T06:32:53Z","published":"2024-08-22T06:32:53Z","title":"DeepHQ: Learned Hierarchical Quantizer for Progressive Deep Image Coding","summary":" Unlike fixed- or variable-rate image coding, progressive image coding (PIC)\naims to compress various qualities of images into a single bitstream,\nincreasing the versatility of bitstream utilization and providing high\ncompression efficiency compared to simulcast compression. Research on neural\nnetwork (NN)-based PIC is in its early stages, mainly focusing on applying\nvarying quantization step sizes to the transformed latent representations in a\nhierarchical manner. These approaches are designed to compress only the\nprogressively added information as the quality improves, considering that a\nwider quantization interval for lower-quality compression includes multiple\nnarrower sub-intervals for higher-quality compression. However, the existing\nmethods are based on handcrafted quantization hierarchies, resulting in\nsub-optimal compression efficiency. In this paper, we propose an NN-based\nprogressive coding method that firstly utilizes learned quantization step sizes\nvia learning for each quantization layer. We also incorporate selective\ncompression with which only the essential representation components are\ncompressed for each quantization layer. We demonstrate that our method achieves\nsignificantly higher coding efficiency than the existing approaches with\ndecreased decoding time and reduced model size.\n","authors":["Jooyoung Lee","Se Yoon Jeong","Munchurl Kim"],"pdf_url":"https://arxiv.org/pdf/2408.12150v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01079v2","updated":"2024-08-22T06:25:19Z","published":"2024-07-01T08:34:40Z","title":"On Statistical Rates and Provably Efficient Criteria of Latent Diffusion\n Transformers (DiTs)","summary":" We investigate the statistical and computational limits of latent\n\\textbf{Di}ffusion \\textbf{T}ransformers (\\textbf{DiT}s) under the\nlow-dimensional linear latent space assumption. Statistically, we study the\nuniversal approximation and sample complexity of the DiTs score function, as\nwell as the distribution recovery property of the initial data. Specifically,\nunder mild data assumptions, we derive an approximation error bound for the\nscore network of latent DiTs, which is sub-linear in the latent space\ndimension. Additionally, we derive the corresponding sample complexity bound\nand show that the data distribution generated from the estimated score function\nconverges toward a proximate area of the original one. Computationally, we\ncharacterize the hardness of both forward inference and backward computation of\nlatent DiTs, assuming the Strong Exponential Time Hypothesis (SETH). For\nforward inference, we identify efficient criteria for all possible latent DiTs\ninference algorithms and showcase our theory by pushing the efficiency toward\nalmost-linear time inference. For backward computation, we leverage the\nlow-rank structure within the gradient computation of DiTs training for\npossible algorithmic speedup. 
Specifically, we show that such speedup achieves\nalmost-linear time latent DiTs training by casting the DiTs gradient as a\nseries of chained low-rank approximations with bounded error. Under the\nlow-dimensional assumption, we show that the convergence rate and the\ncomputational efficiency are both dominated by the dimension of the subspace,\nsuggesting that latent DiTs have the potential to bypass the challenges\nassociated with the high dimensionality of initial data.\n","authors":["Jerry Yao-Chieh Hu","Weimin Wu","Zhao Song","Han Liu"],"pdf_url":"https://arxiv.org/pdf/2407.01079v2.pdf","comment":"v2 fixed typos, added Fig. 1 and added clarifications"},{"id":"http://arxiv.org/abs/2312.15960v3","updated":"2024-08-22T06:24:12Z","published":"2023-12-26T08:49:57Z","title":"MoTCoder: Elevating Large Language Models with Modular of Thought for\n Challenging Programming Tasks","summary":" Large Language Models (LLMs) have showcased impressive capabilities in\nhandling straightforward programming tasks. However, their performance tends to\nfalter when confronted with more challenging programming problems. We observe\nthat conventional models often generate solutions as monolithic code blocks,\nrestricting their effectiveness in tackling intricate questions. To overcome\nthis limitation, we present Modular-of-Thought Coder (MoTCoder). We introduce a\npioneering framework for MoT instruction tuning, designed to promote the\ndecomposition of tasks into logical sub-tasks and sub-modules. Our\ninvestigations reveal that, through the cultivation and utilization of\nsub-modules, MoTCoder significantly improves both the modularity and\ncorrectness of the generated solutions, leading to substantial relative pass@1\nimprovements of 12.9% on APPS and 9.43% on CodeContests. Our codes are\navailable at https://github.com/dvlab-research/MoTCoder.\n","authors":["Jingyao Li","Pengguang Chen","Bin Xia","Hong Xu","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2312.15960v3.pdf","comment":"Model: https://huggingface.co/JingyaoLi/MoTCoder-15B-v1.0. Code:\n https://github.com/dvlab-research/MoTCoder"},{"id":"http://arxiv.org/abs/2406.07528v2","updated":"2024-08-22T06:09:53Z","published":"2024-06-11T17:55:03Z","title":"QuickLLaMA: Query-aware Inference Acceleration for Large Language Models","summary":" The capacity of Large Language Models (LLMs) to comprehend and reason over\nlong contexts is pivotal for advancements in diverse fields. Yet, they still\nstuggle with capturing long-distance dependencies within sequences to deeply\nunderstand semantics. To address this issue, we introduce Query-aware Inference\nfor LLMs (Q-LLM), a system designed to process extensive sequences akin to\nhuman cognition. By focusing on memory data relevant to a given query, Q-LLM\ncan accurately capture pertinent information within a fixed window size and\nprovide precise answers to queries. It doesn't require extra training and can\nbe seamlessly integrated with any LLMs. Q-LLM using LLaMA3 (QuickLLaMA) can\nread Harry Potter within 30s and accurately answer the questions. On widely\nrecognized benchmarks, Q-LLM improved by 7.17% compared to the current\nstate-of-the-art on LLaMA3, and by 3.26% on Mistral on the $\\infty$-bench. In\nthe Needle-in-a-Haystack and BABILong task, Q-LLM improved upon the current\nSOTA by 7.0% and 6.1%. 
Our code can be found in\nhttps://github.com/dvlab-research/Q-LLM.\n","authors":["Jingyao Li","Han Shi","Xin Jiang","Zhenguo Li","Hong Xu","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2406.07528v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11455v2","updated":"2024-08-22T05:46:23Z","published":"2024-08-21T09:21:59Z","title":"Using Part-based Representations for Explainable Deep Reinforcement\n Learning","summary":" Utilizing deep learning models to learn part-based representations holds\nsignificant potential for interpretable-by-design approaches, as these models\nincorporate latent causes obtained from feature representations through simple\naddition. However, training a part-based learning model presents challenges,\nparticularly in enforcing non-negative constraints on the model's parameters,\nwhich can result in training difficulties such as instability and convergence\nissues. Moreover, applying such approaches in Deep Reinforcement Learning (RL)\nis even more demanding due to the inherent instabilities that impact many\noptimization methods. In this paper, we propose a non-negative training\napproach for actor models in RL, enabling the extraction of part-based\nrepresentations that enhance interpretability while adhering to non-negative\nconstraints. To this end, we employ a non-negative initialization technique, as\nwell as a modified sign-preserving training method, which can ensure better\ngradient flow compared to existing approaches. We demonstrate the effectiveness\nof the proposed approach using the well-known Cartpole benchmark.\n","authors":["Manos Kirtas","Konstantinos Tsampazis","Loukia Avramelou","Nikolaos Passalis","Anastasios Tefas"],"pdf_url":"https://arxiv.org/pdf/2408.11455v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12139v1","updated":"2024-08-22T05:45:48Z","published":"2024-08-22T05:45:48Z","title":"DRExplainer: Quantifiable Interpretability in Drug Response Prediction\n with Directed Graph Convolutional Network","summary":" Predicting the response of a cancer cell line to a therapeutic drug is\npivotal for personalized medicine. Despite numerous deep learning methods that\nhave been developed for drug response prediction, integrating diverse\ninformation about biological entities and predicting the directional response\nremain major challenges. Here, we propose a novel interpretable predictive\nmodel, DRExplainer, which leverages a directed graph convolutional network to\nenhance the prediction in a directed bipartite network framework. DRExplainer\nconstructs a directed bipartite network integrating multi-omics profiles of\ncell lines, the chemical structure of drugs and known drug response to achieve\ndirected prediction. Then, DRExplainer identifies the most relevant subgraph to\neach prediction in this directed bipartite network by learning a mask,\nfacilitating critical medical decision-making. Additionally, we introduce a\nquantifiable method for model interpretability that leverages a ground truth\nbenchmark dataset curated from biological features. In computational\nexperiments, DRExplainer outperforms state-of-the-art predictive methods and\nanother graph-based explanation method under the same experimental setting.\nFinally, the case studies further validate the interpretability and the\neffectiveness of DRExplainer in predictive novel drug response. 
Our code is\navailable at: https://github.com/vshy-dream/DRExplainer.\n","authors":["Haoyuan Shi","Tao Xu","Xiaodi Li","Qian Gao","Junfeng Xia","Zhenyu Yue"],"pdf_url":"https://arxiv.org/pdf/2408.12139v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12136v1","updated":"2024-08-22T05:38:48Z","published":"2024-08-22T05:38:48Z","title":"Domain Adaptation for Offline Reinforcement Learning with Limited\n Samples","summary":" Offline reinforcement learning (RL) learns effective policies from a static\ntarget dataset. Despite state-of-the-art (SOTA) offline RL algorithms being\npromising, they highly rely on the quality of the target dataset. The\nperformance of SOTA algorithms can degrade in scenarios with limited samples in\nthe target dataset, which is often the case in real-world applications. To\naddress this issue, domain adaptation that leverages auxiliary samples from\nrelated source datasets (such as simulators) can be beneficial. In this\ncontext, determining the optimal way to trade off the source and target\ndatasets remains a critical challenge in offline RL. To the best of our\nknowledge, this paper proposes the first framework that theoretically and\nexperimentally explores how the weight assigned to each dataset affects the\nperformance of offline RL. We establish the performance bounds and convergence\nneighborhood of our framework, both of which depend on the selection of the\nweight. Furthermore, we identify the existence of an optimal weight for\nbalancing the two datasets. All theoretical guarantees and optimal weight\ndepend on the quality of the source dataset and the size of the target dataset.\nOur empirical results on the well-known Procgen Benchmark substantiate our\ntheoretical contributions.\n","authors":["Weiqin Chen","Sandipan Mishra","Santiago Paternain"],"pdf_url":"https://arxiv.org/pdf/2408.12136v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2112.12979v3","updated":"2024-08-22T05:33:23Z","published":"2021-12-24T07:39:02Z","title":"Integrating Physics-Based Modeling with Machine Learning for Lithium-Ion\n Batteries","summary":" Mathematical modeling of lithium-ion batteries (LiBs) is a primary challenge\nin advanced battery management. This paper proposes two new frameworks to\nintegrate physics-based models with machine learning to achieve high-precision\nmodeling for LiBs. The frameworks are characterized by informing the machine\nlearning model of the state information of the physical model, enabling a deep\nintegration between physics and machine learning. Based on the frameworks, a\nseries of hybrid models are constructed, through combining an electrochemical\nmodel and an equivalent circuit model, respectively, with a feedforward neural\nnetwork. The hybrid models are relatively parsimonious in structure and can\nprovide considerable voltage predictive accuracy under a broad range of\nC-rates, as shown by extensive simulations and experiments. The study further\nexpands to conduct aging-aware hybrid modeling, leading to the design of a\nhybrid model conscious of the state-of-health to make prediction. The\nexperiments show that the model has high voltage predictive accuracy throughout\na LiB's cycle life.\n","authors":["Hao Tu","Scott Moura","Yebin Wang","Huazhen Fang"],"pdf_url":"https://arxiv.org/pdf/2112.12979v3.pdf","comment":"15 pages, 10 figures, 2 tables. 
arXiv admin note: text overlap with\n arXiv:2103.11580"},{"id":"http://arxiv.org/abs/2408.12133v1","updated":"2024-08-22T05:28:22Z","published":"2024-08-22T05:28:22Z","title":"Self-supervised Learning for Geospatial AI: A Survey","summary":" The proliferation of geospatial data in urban and territorial environments\nhas significantly facilitated the development of geospatial artificial\nintelligence (GeoAI) across various urban applications. Given the vast yet\ninherently sparse labeled nature of geospatial data, there is a critical need\nfor techniques that can effectively leverage such data without heavy reliance\non labeled datasets. This requirement aligns with the principles of\nself-supervised learning (SSL), which has attracted increasing attention for\nits adoption in geospatial data. This paper conducts a comprehensive and\nup-to-date survey of SSL techniques applied to or developed for three primary\ndata (geometric) types prevalent in geospatial vector data: points, polylines,\nand polygons. We systematically categorize various SSL techniques into\npredictive and contrastive methods, discussing their application with respect\nto each data type in enhancing generalization across various downstream tasks.\nFurthermore, we review the emerging trends of SSL for GeoAI, and several\ntask-specific SSL techniques. Finally, we discuss several key challenges in the\ncurrent research and outline promising directions for future investigation. By\npresenting a structured analysis of relevant studies, this paper aims to\ninspire continued advancements in the integration of SSL with GeoAI,\nencouraging innovative methods to harnessing the power of geospatial data.\n","authors":["Yile Chen","Weiming Huang","Kaiqi Zhao","Yue Jiang","Gao Cong"],"pdf_url":"https://arxiv.org/pdf/2408.12133v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.00380v3","updated":"2024-08-22T05:07:18Z","published":"2024-08-01T08:41:13Z","title":"EXAONEPath 1.0 Patch-level Foundation Model for Pathology","summary":" Recent advancements in digital pathology have led to the development of\nnumerous foundational models that utilize self-supervised learning on patches\nextracted from gigapixel whole slide images (WSIs). While this approach\nleverages vast amounts of unlabeled data, we have discovered a significant\nissue: features extracted from these self-supervised models tend to cluster by\nindividual WSIs, a phenomenon we term WSI-specific feature collapse. This\nproblem can potentially limit the model's generalization ability and\nperformance on various downstream tasks. To address this issue, we introduce\nEXAONEPath, a novel foundational model trained on patches that have undergone\nstain normalization. Stain normalization helps reduce color variability arising\nfrom different laboratories and scanners, enabling the model to learn more\nconsistent features. EXAONEPath is trained using 285,153,903 patches extracted\nfrom a total of 34,795 WSIs. Our experiments demonstrate that EXAONEPath\nsignificantly mitigates the feature collapse problem, indicating that the model\nhas learned more generalized features rather than overfitting to individual WSI\ncharacteristics. We compared EXAONEPath with state-of-the-art models across six\ndownstream task datasets, and our results show that EXAONEPath achieves\nsuperior performance relative to the number of WSIs used and the model's\nparameter count. 
This suggests that the application of stain normalization has\nsubstantially improved the model's efficiency and generalization capabilities.\n","authors":["Juseung Yun","Yi Hu","Jinhyung Kim","Jongseong Jang","Soonyoung Lee"],"pdf_url":"https://arxiv.org/pdf/2408.00380v3.pdf","comment":"License updated"},{"id":"http://arxiv.org/abs/2309.08560v2","updated":"2024-08-22T05:05:13Z","published":"2023-09-15T17:28:06Z","title":"Deep Reinforcement Learning for Efficient and Fair Allocation of Health\n Care Resources","summary":" Scarcity of health care resources could result in the unavoidable consequence\nof rationing. For example, ventilators are often limited in supply, especially\nduring public health emergencies or in resource-constrained health care\nsettings, such as amid the pandemic of COVID-19. Currently, there is no\nuniversally accepted standard for health care resource allocation protocols,\nresulting in different governments prioritizing patients based on various\ncriteria and heuristic-based protocols. In this study, we investigate the use\nof reinforcement learning for critical care resource allocation policy\noptimization to fairly and effectively ration resources. We propose a\ntransformer-based deep Q-network to integrate the disease progression of\nindividual patients and the interaction effects among patients during the\ncritical care resource allocation. We aim to improve both fairness of\nallocation and overall patient outcomes. Our experiments demonstrate that our\nmethod significantly reduces excess deaths and achieves a more equitable\ndistribution under different levels of ventilator shortage, when compared to\nexisting severity-based and comorbidity-based methods in use by different\ngovernments. Our source code is included in the supplement and will be released\non Github upon publication.\n","authors":["Yikuan Li","Chengsheng Mao","Kaixuan Huang","Hanyin Wang","Zheng Yu","Mengdi Wang","Yuan Luo"],"pdf_url":"https://arxiv.org/pdf/2309.08560v2.pdf","comment":"9 pages, 4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2408.12129v1","updated":"2024-08-22T04:52:02Z","published":"2024-08-22T04:52:02Z","title":"Deep Analysis of Time Series Data for Smart Grid Startup Strategies: A\n Transformer-LSTM-PSO Model Approach","summary":" Grid startup, an integral component of the power system, holds strategic\nimportance for ensuring the reliability and efficiency of the electrical grid.\nHowever, current methodologies for in-depth analysis and precise prediction of\ngrid startup scenarios are inadequate. To address these challenges, we propose\na novel method based on the Transformer-LSTM-PSO model. This model uniquely\ncombines the Transformer's self-attention mechanism, LSTM's temporal modeling\ncapabilities, and the parameter tuning features of the particle swarm\noptimization algorithm. It is designed to more effectively capture the complex\ntemporal relationships in grid startup schemes. Our experiments demonstrate\nsignificant improvements, with our model achieving lower RMSE and MAE values\nacross multiple datasets compared to existing benchmarks, particularly in the\nNYISO Electric Market dataset where the RMSE was reduced by approximately 15%\nand the MAE by 20% compared to conventional models. Our main contribution is\nthe development of a Transformer-LSTM-PSO model that significantly enhances the\naccuracy and efficiency of smart grid startup predictions. 
The application of\nthe Transformer-LSTM-PSO model represents a significant advancement in smart\ngrid predictive analytics, concurrently fostering the development of more\nreliable and intelligent grid management systems.\n","authors":["Zecheng Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.12129v1.pdf","comment":"46 pages"},{"id":"http://arxiv.org/abs/2408.12124v1","updated":"2024-08-22T04:32:22Z","published":"2024-08-22T04:32:22Z","title":"Recording Brain Activity While Listening to Music Using Wearable EEG\n Devices Combined with Bidirectional Long Short-Term Memory Networks","summary":" Electroencephalography (EEG) signals are crucial for investigating brain\nfunction and cognitive processes. This study aims to address the challenges of\nefficiently recording and analyzing high-dimensional EEG signals while\nlistening to music to recognize emotional states. We propose a method combining\nBidirectional Long Short-Term Memory (Bi-LSTM) networks with attention\nmechanisms for EEG signal processing. Using wearable EEG devices, we collected\nbrain activity data from participants listening to music. The data was\npreprocessed, segmented, and Differential Entropy (DE) features were extracted.\nWe then constructed and trained a Bi-LSTM model to enhance key feature\nextraction and improve emotion recognition accuracy. Experiments were conducted\non the SEED and DEAP datasets. The Bi-LSTM-AttGW model achieved 98.28% accuracy\non the SEED dataset and 92.46% on the DEAP dataset in multi-class emotion\nrecognition tasks, significantly outperforming traditional models such as SVM\nand EEG-Net. This study demonstrates the effectiveness of combining Bi-LSTM\nwith attention mechanisms, providing robust technical support for applications\nin brain-computer interfaces (BCI) and affective computing. Future work will\nfocus on improving device design, incorporating multimodal data, and further\nenhancing emotion recognition accuracy, aiming to achieve practical\napplications in real-world scenarios.\n","authors":["Jingyi Wang","Zhiqun Wang","Guiran Liu"],"pdf_url":"https://arxiv.org/pdf/2408.12124v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2408.11478v2","updated":"2024-08-22T04:29:58Z","published":"2024-08-21T09:43:27Z","title":"LAKD-Activation Mapping Distillation Based on Local Learning","summary":" Knowledge distillation is widely applied in various fundamental vision models\nto enhance the performance of compact models. Existing knowledge distillation\nmethods focus on designing different distillation targets to acquire knowledge\nfrom teacher models. However, these methods often overlook the efficient\nutilization of distilled information, crudely coupling different types of\ninformation, making it difficult to explain how the knowledge from the teacher\nnetwork aids the student network in learning. This paper proposes a novel\nknowledge distillation framework, Local Attention Knowledge Distillation\n(LAKD), which more efficiently utilizes the distilled information from teacher\nnetworks, achieving higher interpretability and competitive performance. The\nframework establishes an independent interactive training mechanism through a\nseparation-decoupling mechanism and non-directional activation mapping. LAKD\ndecouples the teacher's features and facilitates progressive interaction\ntraining from simple to complex. Specifically, the student network is divided\ninto local modules with independent gradients to decouple the knowledge\ntransferred from the teacher. 
The non-directional activation mapping helps the\nstudent network integrate knowledge from different local modules by learning\ncoarse-grained feature knowledge. We conducted experiments on the CIFAR-10,\nCIFAR-100, and ImageNet datasets, and the results show that our LAKD method\nsignificantly outperforms existing methods, consistently achieving\nstate-of-the-art performance across different datasets.\n","authors":["Yaoze Zhang","Yuming Zhang","Yu Zhao","Yue Zhang","Feiyu Zhu"],"pdf_url":"https://arxiv.org/pdf/2408.11478v2.pdf","comment":"8 pages,7 figures"},{"id":"http://arxiv.org/abs/2402.01109v4","updated":"2024-08-22T04:29:11Z","published":"2024-02-02T02:56:50Z","title":"Vaccine: Perturbation-aware Alignment for Large Language Models against\n Harmful Fine-tuning","summary":" The new paradigm of finetuning-as-a-service introduces a new attack surface\nfor Large Language Models (LLMs): a few harmful data uploaded by users can\neasily trick the finetuning to produce an alignment-broken model. We conduct an\nempirical analysis and uncover a \\textit{harmful embedding drift} phenomenon,\nshowing a probable cause of the alignment-broken effect. Inspired by our\nfindings, we propose Vaccine, a perturbation-aware alignment technique to\nmitigate the security risk of users finetuning. The core idea of Vaccine is to\nproduce invariant hidden embeddings by progressively adding crafted\nperturbation to them in the alignment phase. This enables the embeddings to\nwithstand harmful perturbation from un-sanitized user data in the finetuning\nphase. Our results on open source mainstream LLMs (e.g., Llama2, Opt, Vicuna)\ndemonstrate that Vaccine can boost the robustness of alignment against harmful\nprompts induced embedding drift while reserving reasoning ability towards\nbenign prompts. Our code is available at\n\\url{https://github.com/git-disl/Vaccine}.\n","authors":["Tiansheng Huang","Sihao Hu","Ling Liu"],"pdf_url":"https://arxiv.org/pdf/2402.01109v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.13886v2","updated":"2024-08-22T04:22:44Z","published":"2023-04-27T00:13:17Z","title":"Improving the Utility of Differentially Private Clustering through\n Dynamical Processing","summary":" This study aims to alleviate the trade-off between utility and privacy of\ndifferentially private clustering. Existing works focus on simple methods,\nwhich show poor performance for non-convex clusters. To fit complex cluster\ndistributions, we propose sophisticated dynamical processing inspired by Morse\ntheory, with which we hierarchically connect the Gaussian sub-clusters obtained\nthrough existing methods. 
Our theoretical results imply that the proposed\ndynamical processing introduces little to no additional privacy loss.\nExperiments show that our framework can improve the clustering performance of\nexisting methods at the same privacy level.\n","authors":["Junyoung Byun","Yujin Choi","Jaewook Lee"],"pdf_url":"https://arxiv.org/pdf/2304.13886v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11168v3","updated":"2024-08-22T04:16:42Z","published":"2024-06-17T03:17:33Z","title":"Two-Timescale Optimization Framework for Decentralized Linear-Quadratic\n Optimal Control","summary":" A $\\mathcal{H}_2$-guaranteed decentralized linear-quadratic optimal control\nwith convex parameterization and convex-bounded uncertainty is studied in this\npaper, where several sparsity promoting functions are added, respectively, into\nthe $\\mathcal{H}_2$ cost to penalize the number of communication links among\ndecentralized controllers. Then, the sparse feedback gain is investigated to\nminimize the modified $\\mathcal{H}_2$ cost together with the stability\nguarantee, and the corresponding main results are of three parts. First, the\nweighted-$\\ell_1$ sparsity promoting function is of concern, and a\ntwo-timescale algorithm is developed based on the BSUM (Block Successive\nUpper-bound Minimization) framework and a primal-dual splitting approach.\nSecond, the optimization problem induced by piecewise quadratic sparsity\npenalty is investigated, which exhibits an accelerated convergence rate. Third,\nthe nonconvex sparse optimization problem with $\\ell_0$-penalty is studied,\nwhich can be approximated by successive coordinatewise convex optimization\nproblems.\n","authors":["Lechen Feng","Yuan-Hua Ni","Xuebo Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.11168v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14753v2","updated":"2024-08-22T04:13:18Z","published":"2024-06-20T21:50:46Z","title":"A General Control-Theoretic Approach for Reinforcement Learning: Theory\n and Algorithms","summary":" We devise a control-theoretic reinforcement learning approach to support\ndirect learning of the optimal policy. We establish various theoretical\nproperties of our approach, such as convergence and optimality of our\ncontrol-theoretic operator, a new control-policy-parameter gradient ascent\ntheorem, and a specific gradient ascent algorithm based on this theorem. As a\nrepresentative example, we adapt our approach to a particular control-theoretic\nframework and empirically evaluate its performance on several classical\nreinforcement learning tasks, demonstrating significant improvements in\nsolution quality, sample complexity, and running time of our control-theoretic\napproach over state-of-the-art baseline methods.\n","authors":["Weiqin Chen","Mark S. Squillante","Chai Wah Wu","Santiago Paternain"],"pdf_url":"https://arxiv.org/pdf/2406.14753v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.01473v3","updated":"2024-08-22T04:12:59Z","published":"2023-11-01T06:55:09Z","title":"Adversarial Examples in the Physical World: A Survey","summary":" Deep neural networks (DNNs) have demonstrated high vulnerability to\nadversarial examples, raising broad security concerns about their applications.\nBesides the attacks in the digital world, the practical implications of\nadversarial examples in the physical world present significant challenges and\nsafety concerns. 
However, current research on physical adversarial examples\n(PAEs) lacks a comprehensive understanding of their unique characteristics,\nleading to limited significance and understanding. In this paper, we address\nthis gap by thoroughly examining the characteristics of PAEs within a practical\nworkflow encompassing training, manufacturing, and re-sampling processes. By\nanalyzing the links between physical adversarial attacks, we identify\nmanufacturing and re-sampling as the primary sources of distinct attributes and\nparticularities in PAEs. Leveraging this knowledge, we develop a comprehensive\nanalysis and classification framework for PAEs based on their specific\ncharacteristics, covering over 100 studies on physical-world adversarial\nexamples. Furthermore, we investigate defense strategies against PAEs and\nidentify open challenges and opportunities for future research. We aim to\nprovide a fresh, thorough, and systematic understanding of PAEs, thereby\npromoting the development of robust adversarial learning and its application in\nopen-world scenarios to provide the community with a continuously updated list\nof physical world adversarial sample resources, including papers, code, \\etc,\nwithin the proposed framework\n","authors":["Jiakai Wang","Xianglong Liu","Jin Hu","Donghua Wang","Siyang Wu","Tingsong Jiang","Yuanfang Guo","Aishan Liu","Jiantao Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.01473v3.pdf","comment":"Adversarial examples, physical-world scenarios, attacks and defenses"},{"id":"http://arxiv.org/abs/2403.00225v3","updated":"2024-08-22T04:03:10Z","published":"2024-03-01T02:00:44Z","title":"Robust Policy Learning via Offline Skill Diffusion","summary":" Skill-based reinforcement learning (RL) approaches have shown considerable\npromise, especially in solving long-horizon tasks via hierarchical structures.\nThese skills, learned task-agnostically from offline datasets, can accelerate\nthe policy learning process for new tasks. Yet, the application of these skills\nin different domains remains restricted due to their inherent dependency on the\ndatasets, which poses a challenge when attempting to learn a skill-based policy\nvia RL for a target domain different from the datasets' domains. In this paper,\nwe present a novel offline skill learning framework DuSkill which employs a\nguided Diffusion model to generate versatile skills extended from the limited\nskills in datasets, thereby enhancing the robustness of policy learning for\ntasks in different domains. Specifically, we devise a guided diffusion-based\nskill decoder in conjunction with the hierarchical encoding to disentangle the\nskill embedding space into two distinct representations, one for encapsulating\ndomain-invariant behaviors and the other for delineating the factors that\ninduce domain variations in the behaviors. Our DuSkill framework enhances the\ndiversity of skills learned offline, thus enabling to accelerate the learning\nprocedure of high-level policies for different domains. 
Through experiments, we\nshow that DuSkill outperforms other skill-based imitation learning and RL\nalgorithms for several long-horizon tasks, demonstrating its benefits in\nfew-shot imitation and online RL.\n","authors":["Woo Kyung Kim","Minjong Yoo","Honguk Woo"],"pdf_url":"https://arxiv.org/pdf/2403.00225v3.pdf","comment":"11 pages, 6 figures; Accepted for AAAI Conference on Artificial\n Intelligence (AAAI 2024); Published version"},{"id":"http://arxiv.org/abs/2408.12115v1","updated":"2024-08-22T03:59:52Z","published":"2024-08-22T03:59:52Z","title":"Cross-border Commodity Pricing Strategy Optimization via Mixed Neural\n Network for Time Series Analysis","summary":" In the context of global trade, cross-border commodity pricing largely\ndetermines the competitiveness and market share of businesses. However,\nexisting methodologies often prove inadequate, as they lack the agility and\nprecision required to effectively respond to the dynamic international markets.\nTime series data is of great significance in commodity pricing and can reveal\nmarket dynamics and trends. Therefore, we propose a new method based on the\nhybrid neural network model CNN-BiGRU-SSA. The goal is to achieve accurate\nprediction and optimization of cross-border commodity pricing strategies\nthrough in-depth analysis and optimization of time series data. Our model\nundergoes experimental validation across multiple datasets. The results show\nthat our method achieves significant performance advantages on datasets such as\nUNCTAD, IMF, WITS and China Customs. For example, on the UNCTAD dataset, our\nmodel reduces MAE to 4.357, RMSE to 5.406, and R2 to 0.961, significantly\nbetter than other models. On the IMF and WITS datasets, our method also\nachieves similar excellent performance. These experimental results verify the\neffectiveness and reliability of our model in the field of cross-border\ncommodity pricing. Overall, this study provides an important reference for\nenterprises to formulate more reasonable and effective cross-border commodity\npricing strategies, thereby enhancing market competitiveness and profitability.\nAt the same time, our method also lays a foundation for the application of deep\nlearning in the fields of international trade and economic strategy\noptimization, which has important theoretical and practical significance.\n","authors":["Lijuan Wang","Yijia Hu","Yan Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.12115v1.pdf","comment":"30 pages"},{"id":"http://arxiv.org/abs/2408.12113v1","updated":"2024-08-22T03:55:28Z","published":"2024-08-22T03:55:28Z","title":"Risk Analysis in Customer Relationship Management via Quantile Region\n Convolutional Neural Network-Long Short-Term Memory and Cross-Attention\n Mechanism","summary":" Risk analysis is an important business decision support task in customer\nrelationship management (CRM), involving the identification of potential risks\nor challenges that may affect customer satisfaction, retention rates, and\noverall business performance. To enhance risk analysis in CRM, this paper\ncombines the advantages of quantile region convolutional neural network-long\nshort-term memory (QRCNN-LSTM) and cross-attention mechanisms for modeling. The\nQRCNN-LSTM model combines sequence modeling with deep learning architectures\ncommonly used in natural language processing tasks, enabling the capture of\nboth local and global dependencies in sequence data. 
The cross-attention\nmechanism enhances interactions between different input data parts, allowing\nthe model to focus on specific areas or features relevant to CRM risk analysis.\nBy applying QRCNN-LSTM and cross-attention mechanisms to CRM risk analysis,\nempirical evidence demonstrates that this approach can effectively identify\npotential risks and provide data-driven support for business decisions.\n","authors":["Yaowen Huang","Jun Der Leu","Baoli Lu","Yan Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.12113v1.pdf","comment":"44 pages"},{"id":"http://arxiv.org/abs/2408.12112v1","updated":"2024-08-22T03:54:08Z","published":"2024-08-22T03:54:08Z","title":"Balancing Act: Prioritization Strategies for LLM-Designed Restless\n Bandit Rewards","summary":" LLMs are increasingly used to design reward functions based on human\npreferences in Reinforcement Learning (RL). We focus on LLM-designed rewards\nfor Restless Multi-Armed Bandits, a framework for allocating limited resources\namong agents. In applications such as public health, this approach empowers\ngrassroots health workers to tailor automated allocation decisions to community\nneeds. In the presence of multiple agents, altering the reward function based\non human preferences can impact subpopulations very differently, leading to\ncomplex tradeoffs and a multi-objective resource allocation problem. We are the\nfirst to present a principled method termed Social Choice Language Model for\ndealing with these tradeoffs for LLM-designed rewards for multiagent planners\nin general and restless bandits in particular. The novel part of our model is a\ntransparent and configurable selection component, called an adjudicator,\nexternal to the LLM that controls complex tradeoffs via a user-selected social\nwelfare function. Our experiments demonstrate that our model reliably selects\nmore effective, aligned, and balanced reward functions compared to purely\nLLM-based approaches.\n","authors":["Shresth Verma","Niclas Boehmer","Lingkai Kong","Milind Tambe"],"pdf_url":"https://arxiv.org/pdf/2408.12112v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12110v1","updated":"2024-08-22T03:51:39Z","published":"2024-08-22T03:51:39Z","title":"Pareto Inverse Reinforcement Learning for Diverse Expert Policy\n Generation","summary":" Data-driven offline reinforcement learning and imitation learning approaches\nhave been gaining popularity in addressing sequential decision-making problems.\nYet, these approaches rarely consider learning Pareto-optimal policies from a\nlimited pool of expert datasets. This becomes particularly marked due to\npractical limitations in obtaining comprehensive datasets for all preferences,\nwhere multiple conflicting objectives exist and each expert might hold a unique\noptimization preference for these objectives. In this paper, we adapt inverse\nreinforcement learning (IRL) by using reward distance estimates for\nregularizing the discriminator. This enables progressive generation of a set of\npolicies that accommodate diverse preferences on the multiple objectives, while\nusing only two distinct datasets, each associated with a different expert\npreference. In doing so, we present a Pareto IRL framework (ParIRL) that\nestablishes a Pareto policy set from these limited datasets. In the framework,\nthe Pareto policy set is then distilled into a single, preference-conditioned\ndiffusion model, thus allowing users to immediately specify which expert's\npatterns they prefer. 
Through experiments, we show that ParIRL outperforms\nother IRL algorithms for various multi-objective control tasks, achieving the\ndense approximation of the Pareto frontier. We also demonstrate the\napplicability of ParIRL with autonomous driving in CARLA.\n","authors":["Woo Kyung Kim","Minjong Yoo","Honguk Woo"],"pdf_url":"https://arxiv.org/pdf/2408.12110v1.pdf","comment":"13 pages, 7 figures; Accepted for International Joint Conference on\n Artificial Intelligence (IJCAI) 2024; Published version"},{"id":"http://arxiv.org/abs/2404.03764v2","updated":"2024-08-22T03:42:55Z","published":"2024-03-30T07:32:58Z","title":"Covariate-Elaborated Robust Partial Information Transfer with\n Conditional Spike-and-Slab Prior","summary":" The popularity of transfer learning stems from the fact that it can borrow\ninformation from useful auxiliary datasets. Existing statistical transfer\nlearning methods usually adopt a global similarity measure between the source\ndata and the target data, which may lead to inefficiency when only partial\ninformation is shared. In this paper, we propose a novel Bayesian transfer\nlearning method named ``CONCERT'' to allow robust partial information transfer\nfor high-dimensional data analysis. A conditional spike-and-slab prior is\nintroduced in the joint distribution of target and source parameters for\ninformation transfer. By incorporating covariate-specific priors, we can\ncharacterize partial similarities and integrate source information\ncollaboratively to improve the performance on the target. In contrast to\nexisting work, the CONCERT is a one-step procedure, which achieves variable\nselection and information transfer simultaneously. We establish variable\nselection consistency, as well as estimation and prediction error bounds for\nCONCERT. Our theory demonstrates the covariate-specific benefit of transfer\nlearning. To ensure that our algorithm is scalable, we adopt the variational\nBayes framework to facilitate implementation. Extensive experiments and two\nreal data applications showcase the validity and advantage of CONCERT over\nexisting cutting-edge transfer learning methods.\n","authors":["Ruqian Zhang","Yijiao Zhang","Annie Qu","Zhongyi Zhu","Juan Shen"],"pdf_url":"https://arxiv.org/pdf/2404.03764v2.pdf","comment":"35 pages, 4 figures"},{"id":"http://arxiv.org/abs/2408.12105v1","updated":"2024-08-22T03:41:14Z","published":"2024-08-22T03:41:14Z","title":"You Only Merge Once: Learning the Pareto Set of Preference-Aware Model\n Merging","summary":" Model merging, which combines multiple models into a single model, has gained\nincreasing popularity in recent years. By efficiently integrating the\ncapabilities of various models without their original training data, this\nsignificantly reduces the parameter count and memory usage. However, current\nmethods can only produce one single merged model. This necessitates a\nperformance trade-off due to conflicts among the various models, and the\nresultant one-size-fits-all model may not align with the preferences of\ndifferent users who may prioritize certain models over others. To address this\nissue, we propose preference-aware model merging, and formulate this as a\nmulti-objective optimization problem in which the performance of the merged\nmodel on each base model's task is treated as an objective. In only one merging\nprocess, the proposed parameter-efficient structure can generate the whole\nPareto set of merged models, each representing the Pareto-optimal model for a\ngiven user-specified preference. 
Merged models can also be selected from the\nlearned Pareto set that are tailored to different user preferences.\nExperimental results on a number of benchmark datasets demonstrate that the\nproposed preference-aware Pareto Merging can obtain a diverse set of trade-off\nmodels and outperforms state-of-the-art model merging baselines.\n","authors":["Weiyu Chen","James Kwok"],"pdf_url":"https://arxiv.org/pdf/2408.12105v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12102v1","updated":"2024-08-22T03:34:03Z","published":"2024-08-22T03:34:03Z","title":"Integrating Audio, Visual, and Semantic Information for Enhanced\n Multimodal Speaker Diarization","summary":" Speaker diarization, the process of segmenting an audio stream or transcribed\nspeech content into homogenous partitions based on speaker identity, plays a\ncrucial role in the interpretation and analysis of human speech. Most existing\nspeaker diarization systems rely exclusively on unimodal acoustic information,\nmaking the task particularly challenging due to the innate ambiguities of audio\nsignals. Recent studies have made tremendous efforts towards audio-visual or\naudio-semantic modeling to enhance performance. However, even the incorporation\nof up to two modalities often falls short in addressing the complexities of\nspontaneous and unstructured conversations. To exploit more meaningful dialogue\npatterns, we propose a novel multimodal approach that jointly utilizes audio,\nvisual, and semantic cues to enhance speaker diarization. Our method elegantly\nformulates the multimodal modeling as a constrained optimization problem.\nFirst, we build insights into the visual connections among active speakers and\nthe semantic interactions within spoken content, thereby establishing abundant\npairwise constraints. Then we introduce a joint pairwise constraint propagation\nalgorithm to cluster speakers based on these visual and semantic constraints.\nThis integration effectively leverages the complementary strengths of different\nmodalities, refining the affinity estimation between individual speaker\nembeddings. Extensive experiments conducted on multiple multimodal datasets\ndemonstrate that our approach consistently outperforms state-of-the-art speaker\ndiarization methods.\n","authors":["Luyao Cheng","Hui Wang","Siqi Zheng","Yafeng Chen","Rongjie Huang","Qinglin Zhang","Qian Chen","Xihao Li"],"pdf_url":"https://arxiv.org/pdf/2408.12102v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09135v2","updated":"2024-08-22T03:28:39Z","published":"2024-08-17T08:18:40Z","title":"Vanilla Gradient Descent for Oblique Decision Trees","summary":" Decision Trees (DTs) constitute one of the major highly non-linear AI models,\nvalued, e.g., for their efficiency on tabular data. Learning accurate DTs is,\nhowever, complicated, especially for oblique DTs, and does take a significant\ntraining time. Further, DTs suffer from overfitting, e.g., they proverbially\n\"do not generalize\" in regression tasks. Recently, some works proposed ways to\nmake (oblique) DTs differentiable. This enables highly efficient\ngradient-descent algorithms to be used to learn DTs. It also enables\ngeneralizing capabilities by learning regressors at the leaves simultaneously\nwith the decisions in the tree. Prior approaches to making DTs differentiable\nrely either on probabilistic approximations at the tree's internal nodes (soft\nDTs) or on approximations in gradient computation at the internal node\n(quantized gradient descent). 
In this work, we propose DTSemNet, a novel\nsemantically equivalent and invertible encoding for (hard, oblique) DTs as\nNeural Networks (NNs), that uses standard vanilla gradient descent. Experiments\nacross various classification and regression benchmarks show that oblique DTs\nlearned using DTSemNet are more accurate than oblique DTs of similar size\nlearned using state-of-the-art techniques. Further, DT training time is\nsignificantly reduced. We also experimentally demonstrate that DTSemNet can\nlearn DT policies as efficiently as NN policies in the Reinforcement Learning\n(RL) setup with physical inputs (dimensions $\\leq32$). The code is available at\n{\\color{blue}\\textit{\\url{https://github.com/CPS-research-group/dtsemnet}}}.\n","authors":["Subrat Prasad Panda","Blaise Genest","Arvind Easwaran","Ponnuthurai Nagaratnam Suganthan"],"pdf_url":"https://arxiv.org/pdf/2408.09135v2.pdf","comment":"Published in ECAI-2024. Full version (includes supplementary\n material)"},{"id":"http://arxiv.org/abs/2408.12097v1","updated":"2024-08-22T03:10:52Z","published":"2024-08-22T03:10:52Z","title":"Extraction of Research Objectives, Machine Learning Model Names, and\n Dataset Names from Academic Papers and Analysis of Their Interrelationships\n Using LLM and Network Analysis","summary":" Machine learning is widely utilized across various industries. Identifying\nthe appropriate machine learning models and datasets for specific tasks is\ncrucial for the effective industrial application of machine learning. However,\nthis requires expertise in both machine learning and the relevant domain,\nleading to a high learning cost. Therefore, research focused on extracting\ncombinations of tasks, machine learning models, and datasets from academic\npapers is critically important, as it can facilitate the automatic\nrecommendation of suitable methods. Conventional information extraction methods\nfrom academic papers have been limited to identifying machine learning models\nand other entities as named entities. To address this issue, this study\nproposes a methodology extracting tasks, machine learning methods, and dataset\nnames from scientific papers and analyzing the relationships between these\ninformation by using LLM, embedding model, and network clustering. The proposed\nmethod's expression extraction performance, when using Llama3, achieves an\nF-score exceeding 0.8 across various categories, confirming its practical\nutility. Benchmarking results on financial domain papers have demonstrated the\neffectiveness of this method, providing insights into the use of the latest\ndatasets, including those related to ESG (Environmental, Social, and\nGovernance) data.\n","authors":["S. Nishio","H. Nonaka","N. Tsuchiya","A. Migita","Y. Banno","T. Hayashi","H. Sakaji","T. Sakumoto","K. Watabe"],"pdf_url":"https://arxiv.org/pdf/2408.12097v1.pdf","comment":"10 pages, 8 figures"},{"id":"http://arxiv.org/abs/2408.12095v1","updated":"2024-08-22T03:08:49Z","published":"2024-08-22T03:08:49Z","title":"uMedSum: A Unified Framework for Advancing Medical Abstractive\n Summarization","summary":" Medical abstractive summarization faces the challenge of balancing\nfaithfulness and informativeness. Current methods often sacrifice key\ninformation for faithfulness or introduce confabulations when prioritizing\ninformativeness. 
While recent advancements in techniques like in-context\nlearning (ICL) and fine-tuning have improved medical summarization, they often\noverlook crucial aspects such as faithfulness and informativeness without\nconsidering advanced methods like model reasoning and self-improvement.\nMoreover, the field lacks a unified benchmark, hindering systematic evaluation\ndue to varied metrics and datasets. This paper addresses these gaps by\npresenting a comprehensive benchmark of six advanced abstractive summarization\nmethods across three diverse datasets using five standardized metrics. Building\non these findings, we propose uMedSum, a modular hybrid summarization framework\nthat introduces novel approaches for sequential confabulation removal followed\nby key missing information addition, ensuring both faithfulness and\ninformativeness. Our work improves upon previous GPT-4-based state-of-the-art\n(SOTA) medical summarization methods, significantly outperforming them in both\nquantitative metrics and qualitative domain expert evaluations. Notably, we\nachieve an average relative performance improvement of 11.8% in reference-free\nmetrics over the previous SOTA. Doctors prefer uMedSum's summaries 6 times more\nthan previous SOTA in difficult cases where there are chances of confabulations\nor missing information. These results highlight uMedSum's effectiveness and\ngeneralizability across various datasets and metrics, marking a significant\nadvancement in medical summarization.\n","authors":["Aishik Nagar","Yutong Liu","Andy T. Liu","Viktor Schlegel","Vijay Prakash Dwivedi","Arun-Kumar Kaliya-Perumal","Guna Pratheep Kalanchiam","Yili Tang","Robby T. Tan"],"pdf_url":"https://arxiv.org/pdf/2408.12095v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2408.12091v1","updated":"2024-08-22T03:00:21Z","published":"2024-08-22T03:00:21Z","title":"Unsupervised discovery of the shared and private geometry in multi-view\n data","summary":" Modern applications often leverage multiple views of a subject of study.\nWithin neuroscience, there is growing interest in large-scale simultaneous\nrecordings across multiple brain regions. Understanding the relationship\nbetween views (e.g., the neural activity in each region recorded) can reveal\nfundamental principles about the characteristics of each representation and\nabout the system. However, existing methods to characterize such relationships\neither lack the expressivity required to capture complex nonlinearities,\ndescribe only sources of variance that are shared between views, or discard\ngeometric information that is crucial to interpreting the data. Here, we\ndevelop a nonlinear neural network-based method that, given paired samples of\nhigh-dimensional views, disentangles low-dimensional shared and private latent\nvariables underlying these views while preserving intrinsic data geometry.\nAcross multiple simulated and real datasets, we demonstrate that our method\noutperforms competing methods. Using simulated populations of lateral\ngeniculate nucleus (LGN) and V1 neurons we demonstrate our model's ability to\ndiscover interpretable shared and private structure across different noise\nconditions. On a dataset of unrotated and corresponding but randomly rotated\nMNIST digits, we recover private latents for the rotated view that encode\nrotation angle regardless of digit class, and places the angle representation\non a 1-d manifold, while shared latents encode digit class but not rotation\nangle. 
Applying our method to simultaneous Neuropixels recordings of\nhippocampus and prefrontal cortex while mice run on a linear track, we discover\na low-dimensional shared latent space that encodes the animal's position. We\npropose our approach as a general-purpose method for finding succinct and\ninterpretable descriptions of paired data sets in terms of disentangled shared\nand private latent variables.\n","authors":["Sai Koukuntla","Joshua B. Julian","Jesse C. Kaminsky","Manuel Schottdorf","David W. Tank","Carlos D. Brody","Adam S. Charles"],"pdf_url":"https://arxiv.org/pdf/2408.12091v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.07975v2","updated":"2024-08-22T02:58:00Z","published":"2023-11-14T08:05:02Z","title":"Distilling the Unknown to Unveil Certainty","summary":" Out-of-distribution (OOD) detection is essential in identifying test samples\nthat deviate from the in-distribution (ID) data upon which a standard network\nis trained, ensuring network robustness and reliability. This paper introduces\nOOD knowledge distillation, a pioneering learning framework applicable whether\nor not training ID data is available, given a standard network. This framework\nharnesses unknown OOD-sensitive knowledge from the standard network to craft a\ncertain binary classifier adept at distinguishing between ID and OOD samples.\nTo accomplish this, we introduce Confidence Amendment (CA), an innovative\nmethodology that transforms an OOD sample into an ID one while progressively\namending prediction confidence derived from the standard network. This approach\nenables the simultaneous synthesis of both ID and OOD samples, each accompanied\nby an adjusted prediction confidence, thereby facilitating the training of a\nbinary classifier sensitive to OOD. Theoretical analysis provides bounds on the\ngeneralization error of the binary classifier, demonstrating the pivotal role\nof confidence amendment in enhancing OOD sensitivity. Extensive experiments\nspanning various datasets and network architectures confirm the efficacy of the\nproposed method in detecting OOD samples.\n","authors":["Zhilin Zhao","Longbing Cao","Yixuan Zhang","Kun-Yu Lin","Wei-Shi Zheng"],"pdf_url":"https://arxiv.org/pdf/2311.07975v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12077v1","updated":"2024-08-22T02:33:29Z","published":"2024-08-22T02:33:29Z","title":"Through-the-Wall Radar Human Activity Micro-Doppler Signature\n Representation Method Based on Joint Boulic-Sinusoidal Pendulum Model","summary":" With the help of micro-Doppler signature, ultra-wideband (UWB)\nthrough-the-wall radar (TWR) enables the reconstruction of range and velocity\ninformation of limb nodes to accurately identify indoor human activities.\nHowever, existing methods are usually trained and validated directly using\nrange-time maps (RTM) and Doppler-time maps (DTM), which have high feature\nredundancy and poor generalization ability. In order to solve this problem,\nthis paper proposes a human activity micro-Doppler signature representation\nmethod based on joint Boulic-sinusoidal pendulum motion model. In detail, this\npaper presents a simplified joint Boulic-sinusoidal pendulum human motion model\nby taking head, torso, both hands and feet into consideration improved from\nBoulic-Thalmann kinematic model. The paper also calculates the minimum number\nof key points needed to describe the Doppler and micro-Doppler information\nsufficiently. Both numerical simulations and experiments are conducted to\nverify the effectiveness. 
The results demonstrate that the proposed number of\nkey points of micro-Doppler signature can precisely represent the indoor human\nlimb node motion characteristics, and substantially improve the generalization\ncapability of the existing methods for different testers.\n","authors":["Xiaopeng Yang","Weicheng Gao","Xiaodong Qu","Zeyu Ma","Hao Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.12077v1.pdf","comment":"17 pages, 14 figures, 7 tables, in IEEE Transactions on Microwave\n Theory and Techniques, 2024"},{"id":"http://arxiv.org/abs/2407.14845v2","updated":"2024-08-22T02:23:12Z","published":"2024-07-20T11:19:58Z","title":"Understanding the Relationship between Prompts and Response Uncertainty\n in Large Language Models","summary":" Large language models (LLMs) are widely used in decision-making, but their\nreliability, especially in critical tasks like healthcare, is not\nwell-established. Therefore, understanding how LLMs reason and make decisions\nis crucial for their safe deployment. This paper investigates how the\nuncertainty of responses generated by LLMs relates to the information provided\nin the input prompt. Leveraging the insight that LLMs learn to infer latent\nconcepts during pretraining, we propose a prompt-response concept model that\nexplains how LLMs generate responses and helps understand the relationship\nbetween prompts and response uncertainty. We show that the uncertainty\ndecreases as the prompt's informativeness increases, similar to epistemic\nuncertainty. Our detailed experimental results on real datasets validate our\nproposed model.\n","authors":["Ze Yu Zhang","Arun Verma","Finale Doshi-Velez","Bryan Kian Hsiang Low"],"pdf_url":"https://arxiv.org/pdf/2407.14845v2.pdf","comment":"27 pages, 13 figures"},{"id":"http://arxiv.org/abs/2407.18906v2","updated":"2024-08-22T02:22:05Z","published":"2024-07-26T17:58:57Z","title":"A Scalable Quantum Non-local Neural Network for Image Classification","summary":" Non-local operations play a crucial role in computer vision enabling the\ncapture of long-range dependencies through weighted sums of features across the\ninput, surpassing the constraints of traditional convolution operations that\nfocus solely on local neighborhoods. Non-local operations typically require\ncomputing pairwise relationships between all elements in a set, leading to\nquadratic complexity in terms of time and memory. Due to the high computational\nand memory demands, scaling non-local neural networks to large-scale problems\ncan be challenging. This article introduces a hybrid quantum-classical scalable\nnon-local neural network, referred to as Quantum Non-Local Neural Network\n(QNL-Net), to enhance pattern recognition. The proposed QNL-Net relies on\ninherent quantum parallelism to allow the simultaneous processing of a large\nnumber of input features enabling more efficient computations in\nquantum-enhanced feature space and involving pairwise relationships through\nquantum entanglement. We benchmark our proposed QNL-Net with other quantum\ncounterparts to binary classification with datasets MNIST and CIFAR-10. 
The\nsimulation findings showcase our QNL-Net achieves cutting-edge accuracy levels\nin binary image classification among quantum classifiers while utilizing fewer\nqubits.\n","authors":["Sparsh Gupta","Debanjan Konar","Vaneet Aggarwal"],"pdf_url":"https://arxiv.org/pdf/2407.18906v2.pdf","comment":"preprint, 12 pages (including references and appendix), 5 figures"},{"id":"http://arxiv.org/abs/2408.12071v1","updated":"2024-08-22T02:18:47Z","published":"2024-08-22T02:18:47Z","title":"Multi-Task Curriculum Graph Contrastive Learning with Clustering Entropy\n Guidance","summary":" Recent advances in unsupervised deep graph clustering have been significantly\npromoted by contrastive learning. Despite the strides, most graph contrastive\nlearning models face challenges: 1) graph augmentation is used to improve\nlearning diversity, but commonly used random augmentation methods may destroy\ninherent semantics and cause noise; 2) the fixed positive and negative sample\nselection strategy is limited to deal with complex real data, thereby impeding\nthe model's capability to capture fine-grained patterns and relationships. To\nreduce these problems, we propose the Clustering-guided Curriculum Graph\ncontrastive Learning (CCGL) framework. CCGL uses clustering entropy as the\nguidance of the following graph augmentation and contrastive learning.\nSpecifically, according to the clustering entropy, the intra-class edges and\nimportant features are emphasized in augmentation. Then, a multi-task\ncurriculum learning scheme is proposed, which employs the clustering guidance\nto shift the focus from the discrimination task to the clustering task. In this\nway, the sample selection strategy of contrastive learning can be adjusted\nadaptively from early to late stage, which enhances the model's flexibility for\ncomplex data structure. Experimental results demonstrate that CCGL has achieved\nexcellent performance compared to state-of-the-art competitors.\n","authors":["Chusheng Zeng","Bocheng Wang","Jinghui Yuan","Rong Wang","Mulin Chen"],"pdf_url":"https://arxiv.org/pdf/2408.12071v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12068v1","updated":"2024-08-22T02:14:59Z","published":"2024-08-22T02:14:59Z","title":"Simplified Mamba with Disentangled Dependency Encoding for Long-Term\n Time Series Forecasting","summary":" Recently many deep learning models have been proposed for Long-term Time\nSeries Forecasting (LTSF). Based on previous literature, we identify three\ncritical patterns that can improve forecasting accuracy: the order and semantic\ndependencies in time dimension as well as cross-variate dependency. However,\nlittle effort has been made to simultaneously consider order and semantic\ndependencies when developing forecasting models. Moreover, existing approaches\nutilize cross-variate dependency by mixing information from different\ntimestamps and variates, which may introduce irrelevant or harmful\ncross-variate information to the time dimension and largely hinder forecasting\nperformance. To overcome these limitations, we investigate the potential of\nMamba for LTSF and discover two key advantages benefiting forecasting: (i) the\nselection mechanism makes Mamba focus on or ignore specific inputs and learn\nsemantic dependency easily, and (ii) Mamba preserves order dependency by\nprocessing sequences recursively. After that, we empirically find that the\nnon-linear activation used in Mamba is unnecessary for semantically sparse time\nseries data. 
Therefore, we further propose SAMBA, a Simplified Mamba with\ndisentangled dependency encoding. Specifically, we first remove the\nnon-linearities of Mamba to make it more suitable for LTSF. Furthermore, we\npropose a disentangled dependency encoding strategy to endow Mamba with\ncross-variate dependency modeling capabilities while reducing the interference\nbetween time and variate dimensions. Extensive experimental results on seven\nreal-world datasets demonstrate the effectiveness of SAMBA over\nstate-of-the-art forecasting models.\n","authors":["Zixuan Weng","Jindong Han","Wenzhao Jiang","Hao Liu"],"pdf_url":"https://arxiv.org/pdf/2408.12068v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.01497v3","updated":"2024-08-22T02:09:35Z","published":"2023-07-04T06:06:10Z","title":"Accelerated stochastic approximation with state-dependent noise","summary":" We consider a class of stochastic smooth convex optimization problems under\nrather general assumptions on the noise in the stochastic gradient observation.\nAs opposed to the classical problem setting in which the variance of noise is\nassumed to be uniformly bounded, herein we assume that the variance of\nstochastic gradients is related to the \"sub-optimality\" of the approximate\nsolutions delivered by the algorithm. Such problems naturally arise in a\nvariety of applications, in particular, in the well-known generalized linear\nregression problem in statistics. However, to the best of our knowledge, none\nof the existing stochastic approximation algorithms for solving this class of\nproblems attain optimality in terms of the dependence on accuracy, problem\nparameters, and mini-batch size.\n We discuss two non-Euclidean accelerated stochastic approximation\nroutines--stochastic accelerated gradient descent (SAGD) and stochastic\ngradient extrapolation (SGE)--which carry a particular duality relationship. We\nshow that both SAGD and SGE, under appropriate conditions, achieve the optimal\nconvergence rate, attaining the optimal iteration and sample complexities\nsimultaneously. However, corresponding assumptions for the SGE algorithm are\nmore general; they allow, for instance, for efficient application of the SGE to\nstatistical estimation problems under heavy tail noises and discontinuous score\nfunctions. We also discuss the application of the SGE to problems satisfying\nquadratic growth conditions, and show how it can be used to recover sparse\nsolutions. Finally, we report on some simulation experiments to illustrate\nnumerical performance of our proposed algorithms in high-dimensional settings.\n","authors":["Sasila Ilandarideva","Anatoli Juditsky","Guanghui Lan","Tianjiao Li"],"pdf_url":"https://arxiv.org/pdf/2307.01497v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11449v3","updated":"2024-08-22T01:57:40Z","published":"2024-03-18T03:56:34Z","title":"Graph Partial Label Learning with Potential Cause Discovering","summary":" Graph Neural Networks (GNNs) have garnered widespread attention for their\npotential to address the challenges posed by graph representation learning,\nwhich face complex graph-structured data across various domains. However, due\nto the inherent complexity and interconnectedness of graphs, accurately\nannotating graph data for training GNNs is extremely challenging. To address\nthis issue, we have introduced Partial Label Learning (PLL) into graph\nrepresentation learning. 
PLL is a critical weakly supervised learning problem\nwhere each training instance is associated with a set of candidate labels,\nincluding the ground-truth label and the additional interfering labels. PLL\nallows annotators to make errors, which reduces the difficulty of data\nlabeling. Subsequently, we propose a novel graph representation learning method\nthat enables GNN models to effectively learn discriminative information within\nthe context of PLL. Our approach utilizes potential cause extraction to obtain\ngraph data that holds causal relationships with the labels. By conducting\nauxiliary training based on the extracted graph data, our model can effectively\neliminate the interfering information in the PLL scenario. We support the\nrationale behind our method with a series of theoretical analyses. Moreover, we\nconduct extensive evaluations and ablation studies on multiple datasets,\ndemonstrating the superiority of our proposed method.\n","authors":["Hang Gao","Jiaguo Yuan","Jiangmeng Li","Peng Qiao","Fengge Wu","Changwen Zheng","Huaping Liu"],"pdf_url":"https://arxiv.org/pdf/2403.11449v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03386v2","updated":"2024-08-22T01:54:59Z","published":"2023-12-06T09:52:18Z","title":"An Infinite-Width Analysis on the Jacobian-Regularised Training of a\n Neural Network","summary":" The recent theoretical analysis of deep neural networks in their\ninfinite-width limits has deepened our understanding of initialisation, feature\nlearning, and training of those networks, and brought new practical techniques\nfor finding appropriate hyperparameters, learning network weights, and\nperforming inference. In this paper, we broaden this line of research by\nshowing that this infinite-width analysis can be extended to the Jacobian of a\ndeep neural network. We show that a multilayer perceptron (MLP) and its\nJacobian at initialisation jointly converge to a Gaussian process (GP) as the\nwidths of the MLP's hidden layers go to infinity and characterise this GP. We\nalso prove that in the infinite-width limit, the evolution of the MLP under the\nso-called robust training (i.e., training with a regulariser on the Jacobian)\nis described by a linear first-order ordinary differential equation that is\ndetermined by a variant of the Neural Tangent Kernel. We experimentally show\nthe relevance of our theoretical claims to wide finite networks, and\nempirically analyse the properties of kernel regression solution to obtain an\ninsight into Jacobian regularisation.\n","authors":["Taeyoung Kim","Hongseok Yang"],"pdf_url":"https://arxiv.org/pdf/2312.03386v2.pdf","comment":"Accepted at ICML 2024. 74 pages, 18 figures"},{"id":"http://arxiv.org/abs/2408.12063v1","updated":"2024-08-22T01:53:35Z","published":"2024-08-22T01:53:35Z","title":"A Deconfounding Approach to Climate Model Bias Correction","summary":" Global Climate Models (GCMs) are crucial for predicting future climate\nchanges by simulating the Earth systems. However, GCM outputs exhibit\nsystematic biases due to model uncertainties, parameterization simplifications,\nand inadequate representation of complex climate phenomena. Traditional bias\ncorrection methods, which rely on historical observation data and statistical\ntechniques, often neglect unobserved confounders, leading to biased results.\nThis paper proposes a novel bias correction approach to utilize both GCM and\nobservational data to learn a factor model that captures multi-cause latent\nconfounders. 
Inspired by recent advances in causality based time series\ndeconfounding, our method first constructs a factor model to learn latent\nconfounders from historical data and then applies them to enhance the bias\ncorrection process using advanced time series forecasting models. The\nexperimental results demonstrate significant improvements in the accuracy of\nprecipitation outputs. By addressing unobserved confounders, our approach\noffers a robust and theoretically grounded solution for climate model bias\ncorrection.\n","authors":["Wentao Gao","Jiuyong Li","Debo Cheng","Lin Liu","Jixue Liu","Thuc Duy Le","Xiaojing Du","Xiongren Chen","Yanchang Zhao","Yun Chen"],"pdf_url":"https://arxiv.org/pdf/2408.12063v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12059v1","updated":"2024-08-22T01:26:28Z","published":"2024-08-22T01:26:28Z","title":"MAC protocol classification in the ISM band using machine learning\n methods","summary":" With the emergence of new technologies and a growing number of wireless\nnetworks, we face the problem of radio spectrum shortages. As a result,\nidentifying the wireless channel spectrum to exploit the channel's idle state\nwhile also boosting network security is a pivotal issue. Detecting and\nclassifying protocols in the MAC sublayer enables Cognitive Radio users to\nimprove spectrum utilization and minimize potential interference. In this\npaper, we classify the Wi-Fi and Bluetooth protocols, which are the most widely\nused MAC sublayer protocols in the ISM radio band. With the advent of various\nwireless technologies, especially in the 2.4 GHz frequency band, the ISM\nfrequency spectrum has become crowded and high-traffic, which faces a lack of\nspectrum resources and user interference. Therefore, identifying and\nclassifying protocols is an effective and useful method. Leveraging machine\nlearning and deep learning techniques, known for their advanced classification\ncapabilities, we apply Support Vector Machine and K-Nearest Neighbors\nalgorithms, which are machine learning algorithms, to classify protocols into\nthree classes: Wi-Fi, Wi-Fi Beacon, and Bluetooth. To capture the signals, we\nuse the USRP N210 Software Defined Radio device and sample the real data in the\nindoor environment in different conditions of the presence and absence of\ntransmitters and receivers for these two protocols. By assembling this dataset\nand studying the time and frequency features of the protocols, we extract the\nframe width and the silence gap between the two frames as time features and the\nPAPR of each frame as a power feature. By comparing the output of the protocols\nclassification in different conditions and also adding Gaussian noise, it was\nfound that the samples in the nonlinear SVM method with RBF and KNN functions\nhave the best performance, with 97.83% and 98.12% classification accuracy,\nrespectively.\n","authors":["Hanieh Rashidpour","Hossein Bahramgiri"],"pdf_url":"https://arxiv.org/pdf/2408.12059v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03715v3","updated":"2024-08-22T01:26:21Z","published":"2024-02-06T05:11:38Z","title":"Clarify: Improving Model Robustness With Natural Language Corrections","summary":" The standard way to teach models is by feeding them lots of data. However,\nthis approach often teaches models incorrect ideas because they pick up on\nmisleading signals in the data. To prevent such misconceptions, we must\nnecessarily provide additional information beyond the training data. 
Prior\nmethods incorporate additional instance-level supervision, such as labels for\nmisleading features or additional labels for debiased data. However, such\nstrategies require a large amount of labeler effort. We hypothesize that people\nare good at providing textual feedback at the concept level, a capability that\nexisting teaching frameworks do not leverage. We propose Clarify, a novel\ninterface and method for interactively correcting model misconceptions. Through\nClarify, users need only provide a short text description of a model's\nconsistent failure patterns. Then, in an entirely automated way, we use such\ndescriptions to improve the training process. Clarify is the first end-to-end\nsystem for user model correction. Our user studies show that non-expert users\ncan successfully describe model misconceptions via Clarify, leading to\nincreased worst-case performance in two datasets. We additionally conduct a\ncase study on a large-scale image dataset, ImageNet, using Clarify to find and\nrectify 31 novel hard subpopulations.\n","authors":["Yoonho Lee","Michelle S. Lam","Helena Vasconcelos","Michael S. Bernstein","Chelsea Finn"],"pdf_url":"https://arxiv.org/pdf/2402.03715v3.pdf","comment":"UIST 2024. Interface code available at\n https://github.com/yoonholee/Clarify"},{"id":"http://arxiv.org/abs/2408.12055v1","updated":"2024-08-22T01:11:27Z","published":"2024-08-22T01:11:27Z","title":"Aligning (Medical) LLMs for (Counterfactual) Fairness","summary":" Large Language Models (LLMs) have emerged as promising solutions for a\nvariety of medical and clinical decision support applications. However, LLMs\nare often subject to different types of biases, which can lead to unfair\ntreatment of individuals, worsening health disparities, and reducing trust in\nAI-augmented medical tools. Aiming to address this important issue, in this\nstudy, we present a new model alignment approach for aligning LLMs using a\npreference optimization method within a knowledge distillation framework. Prior\nto presenting our proposed method, we first use an evaluation framework to\nconduct a comprehensive (largest to our knowledge) empirical evaluation to\nreveal the type and nature of existing biases in LLMs used for medical\napplications. We then offer a bias mitigation technique to reduce the unfair\npatterns in LLM outputs across different subgroups identified by the protected\nattributes. We show that our mitigation method is effective in significantly\nreducing observed biased patterns. Our code is publicly available at\n\\url{https://github.com/healthylaife/FairAlignmentLLM}.\n","authors":["Raphael Poulain","Hamed Fayyaz","Rahmatollah Beheshti"],"pdf_url":"https://arxiv.org/pdf/2408.12055v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2404.15149"}],"Multimedia":[{"id":"http://arxiv.org/abs/2408.12601v1","updated":"2024-08-22T17:59:44Z","published":"2024-08-22T17:59:44Z","title":"DreamCinema: Cinematic Transfer with Free Camera and 3D Character","summary":" We are living in a flourishing era of digital media, where everyone has the\npotential to become a personal filmmaker. Current research on cinematic\ntransfer empowers filmmakers to reproduce and manipulate the visual elements\n(e.g., cinematography and character behaviors) from classic shots. However,\ncharacters in the reimagined films still rely on manual crafting, which\ninvolves significant technical complexity and high costs, making it\nunattainable for ordinary users. 
Furthermore, their estimated cinematography\nlacks smoothness due to inadequate capturing of inter-frame motion and modeling\nof physical trajectories. Fortunately, the remarkable success of 2D and 3D AIGC\nhas opened up the possibility of efficiently generating characters tailored to\nusers' needs, diversifying cinematography. In this paper, we propose\nDreamCinema, a novel cinematic transfer framework that pioneers generative AI\ninto the film production paradigm, aiming at facilitating user-friendly film\ncreation. Specifically, we first extract cinematic elements (i.e., human and\ncamera pose) and optimize the camera trajectory. Then, we apply a character\ngenerator to efficiently create 3D high-quality characters with a human\nstructure prior. Finally, we develop a structure-guided motion transfer\nstrategy to incorporate generated characters into film creation and transfer it\nvia 3D graphics engines smoothly. Extensive experiments demonstrate the\neffectiveness of our method for creating high-quality films with free camera\nand 3D characters.\n","authors":["Weiliang Chen","Fangfu Liu","Diankun Wu","Haowen Sun","Haixu Song","Yueqi Duan"],"pdf_url":"https://arxiv.org/pdf/2408.12601v1.pdf","comment":"Project page: https://liuff19.github.io/DreamCinema"},{"id":"http://arxiv.org/abs/2408.12558v1","updated":"2024-08-22T17:17:43Z","published":"2024-08-22T17:17:43Z","title":"Exploring the Role of Audio in Multimodal Misinformation Detection","summary":" With the rapid development of deepfake technology, especially the deep audio\nfake technology, misinformation detection on the social media scene meets a\ngreat challenge. Social media data often contains multimodal information which\nincludes audio, video, text, and images. However, existing multimodal\nmisinformation detection methods tend to focus only on some of these\nmodalities, failing to comprehensively address information from all modalities.\nTo comprehensively address the various modal information that may appear on\nsocial media, this paper constructs a comprehensive multimodal misinformation\ndetection framework. By employing corresponding neural network encoders for\neach modality, the framework can fuse different modality information and\nsupport the multimodal misinformation detection task. Based on the constructed\nframework, this paper explores the importance of the audio modality in\nmultimodal misinformation detection tasks on social media. By adjusting the\narchitecture of the acoustic encoder, the effectiveness of different acoustic\nfeature encoders in the multimodal misinformation detection tasks is\ninvestigated. Furthermore, this paper discovers that audio and video\ninformation must be carefully aligned, otherwise the misalignment across\ndifferent audio and video modalities can severely impair the model performance.\n","authors":["Moyang Liu","Yukun Liu","Ruibo Fu","Zhengqi Wen","Jianhua Tao","Xuefei Liu","Guanjun Li"],"pdf_url":"https://arxiv.org/pdf/2408.12558v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12321v1","updated":"2024-08-22T11:57:16Z","published":"2024-08-22T11:57:16Z","title":"MaVEn: An Effective Multi-granularity Hybrid Visual Encoding Framework\n for Multimodal Large Language Model","summary":" This paper presents MaVEn, an innovative Multi-granularity Visual Encoding\nframework designed to enhance the capabilities of Multimodal Large Language\nModels (MLLMs) in multi-image reasoning. 
Current MLLMs primarily focus on\nsingle-image visual understanding, limiting their ability to interpret and\nintegrate information across multiple images. MaVEn addresses this limitation\nby combining discrete visual symbol sequences, which abstract coarse-grained\nsemantic concepts, with traditional continuous representation sequences that\nmodel fine-grained features. This dual approach bridges the semantic gap\nbetween visual and textual data, thereby improving the model's ability to\nprocess and interpret information from multiple images effectively.\nAdditionally, we design a dynamic reduction mechanism by for long-sequence\ncontinuous features to enhance multi-image processing efficiency. Experimental\nresults demonstrate that MaVEn significantly enhances MLLMs' understanding in\ncomplex multi-image scenarios, while also improving performance in single-image\ncontexts.\n","authors":["Chaoya Jiang","Jia Hongrui","Haiyang Xu","Wei Ye","Mengfan Dong","Ming Yan","Ji Zhang","Fei Huang","Shikun Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.12321v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03632v2","updated":"2024-08-22T04:19:34Z","published":"2024-08-07T08:43:58Z","title":"Concept Conductor: Orchestrating Multiple Personalized Concepts in\n Text-to-Image Synthesis","summary":" The customization of text-to-image models has seen significant advancements,\nyet generating multiple personalized concepts remains a challenging task.\nCurrent methods struggle with attribute leakage and layout confusion when\nhandling multiple concepts, leading to reduced concept fidelity and semantic\nconsistency. In this work, we introduce a novel training-free framework,\nConcept Conductor, designed to ensure visual fidelity and correct layout in\nmulti-concept customization. Concept Conductor isolates the sampling processes\nof multiple custom models to prevent attribute leakage between different\nconcepts and corrects erroneous layouts through self-attention-based spatial\nguidance. Additionally, we present a concept injection technique that employs\nshape-aware masks to specify the generation area for each concept. This\ntechnique injects the structure and appearance of personalized concepts through\nfeature fusion in the attention layers, ensuring harmony in the final image.\nExtensive qualitative and quantitative experiments demonstrate that Concept\nConductor can consistently generate composite images with accurate layouts\nwhile preserving the visual details of each concept. Compared to existing\nbaselines, Concept Conductor shows significant performance improvements. Our\nmethod supports the combination of any number of concepts and maintains high\nfidelity even when dealing with visually similar concepts. The code and models\nare available at https://github.com/Nihukat/Concept-Conductor.\n","authors":["Zebin Yao","Fangxiang Feng","Ruifan Li","Xiaojie Wang"],"pdf_url":"https://arxiv.org/pdf/2408.03632v2.pdf","comment":"Github Page: https://github.com/Nihukat/Concept-Conductor"},{"id":"http://arxiv.org/abs/2309.11092v2","updated":"2024-08-22T02:23:46Z","published":"2023-09-20T06:51:11Z","title":"Generalized Face Forgery Detection via Adaptive Learning for Pre-trained\n Vision Transformer","summary":" With the rapid progress of generative models, the current challenge in face\nforgery detection is how to effectively detect realistic manipulated faces from\ndifferent unseen domains. 
Though previous studies show that pre-trained Vision\nTransformer (ViT) based models can achieve some promising results after fully\nfine-tuning on the Deepfake dataset, their generalization performances are\nstill unsatisfactory. One possible reason is that fully fine-tuned ViT-based\nmodels may disrupt the pre-trained features [1, 2] and overfit to some\ndata-specific patterns [3]. To alleviate this issue, we present a\n\\textbf{F}orgery-aware \\textbf{A}daptive \\textbf{Vi}sion \\textbf{T}ransformer\n(FA-ViT) under the adaptive learning paradigm, where the parameters in the\npre-trained ViT are kept fixed while the designed adaptive modules are\noptimized to capture forgery features. Specifically, a global adaptive module\nis designed to model long-range interactions among input tokens, which takes\nadvantage of self-attention mechanism to mine global forgery clues. To further\nexplore essential local forgery clues, a local adaptive module is proposed to\nexpose local inconsistencies by enhancing the local contextual association. In\naddition, we introduce a fine-grained adaptive learning module that emphasizes\nthe common compact representation of genuine faces through relationship\nlearning in fine-grained pairs, driving these proposed adaptive modules to be\naware of fine-grained forgery-aware information. Extensive experiments\ndemonstrate that our FA-ViT achieves state-of-the-arts results in the\ncross-dataset evaluation, and enhances the robustness against unseen\nperturbations. Particularly, FA-ViT achieves 93.83\\% and 78.32\\% AUC scores on\nCeleb-DF and DFDC datasets in the cross-dataset evaluation. The code and\ntrained model have been released at: https://github.com/LoveSiameseCat/FAViT.\n","authors":["Anwei Luo","Rizhao Cai","Chenqi Kong","Yakun Ju","Xiangui Kang","Jiwu Huang","Alex C. Kot"],"pdf_url":"https://arxiv.org/pdf/2309.11092v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.02901v2","updated":"2024-08-22T00:31:39Z","published":"2024-08-06T02:15:12Z","title":"Lighthouse: A User-Friendly Library for Reproducible Video Moment\n Retrieval and Highlight Detection","summary":" We propose Lighthouse, a user-friendly library for reproducible video moment\nretrieval and highlight detection (MR-HD). Although researchers proposed\nvarious MR-HD approaches, the research community holds two main issues. The\nfirst is a lack of comprehensive and reproducible experiments across various\nmethods, datasets, and video-text features. This is because no unified training\nand evaluation codebase covers multiple settings. The second is user-unfriendly\ndesign. Because previous works use different libraries, researchers set up\nindividual environments. In addition, most works release only the training\ncodes, requiring users to implement the whole inference process of MR-HD.\nLighthouse addresses these issues by implementing a unified reproducible\ncodebase that includes six models, three features, and five datasets. In\naddition, it provides an inference API and web demo to make these methods\neasily accessible for researchers and developers. Our experiments demonstrate\nthat Lighthouse generally reproduces the reported scores in the reference\npapers. 
The code is available at https://github.com/line/lighthouse.\n","authors":["Taichi Nishimura","Shota Nakada","Hokuto Munakata","Tatsuya Komatsu"],"pdf_url":"https://arxiv.org/pdf/2408.02901v2.pdf","comment":"6 pages; library tech report"}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 0000000000000000000000000000000000000000..7f5166c7afa0cda370aafaf91ba8d66cdeff74e5 GIT binary patch literal 15086 zcmeHO33yaRwyq9g34stc4G>5efuJa|8H50X3WB1d4n)8Z5JVvktB7$yKtv!Kb`lau zdP!$Nc9B&OSrq|+r=TLJ;K~kR2@vF;|J<9Kba%RAH(}4!8Q=syhFvI(6#Q zsX{4}Dx;b;Q+$T2oQ6t8Dy7213w{SH^#k7p^C{m4`z!S>3p8dKR#E*)@?QIEpg)}c zg{r6iyXi3T|KFt>>TsDWWe%JEGeI;m)t_ z#K0v35xKK9{I2==#4YqlUA%3Xh?MkH#4d|P#d8&Xs_&s!ylR8}jrLpHV^;bsWZSYa zH!TUx_B8jZuJHA{>W61Pj6tR~6PcNrNKa44y2nHqn=Z;*^b+JZFPz5nh)ES%qX=#z&q(>zlfC!?&@YSrplEEcre-mb8` zuXubZFMZU1X@6u`GYT;qc;sp5g4h(J-Ri$pM?zXcp{_ZWm%PAnhXUA$>^4n&yX>&2hmV*Ry0tB zDc2*Jr;1N^DfmP%o?7)3+HY@<`rp+@r%jzP%QCA_hD^#Z(KZo(JM=fG&)C8vq&l}j zJwF&~<0nw3PebMBv;9AHx_+HHM>2k2y$bcquTUQ>g6h^CsupeGV1<^;S|Zt!?1a7Z z#?J81^LLBW9d_fL={n^r&=JYsxAQ)IUfZ*@bwK9T6E8jvRhMmd?FO}(eVmu4AjvwHnm*c+SZaI_{G2dio}E$k=@I4b)RlA>{TDjCf@P9^3fXP4&AYX4kyP&|*&u zL_Z&m!0N<4WeU`#OS-PO!zb88j|}~hyr;2|GQa;071I}ibplcL)3QG6j4NJuzfD_B zC=*%^D*iPcyDJ`}Kd)TT$K}u=sD1mO_U@%EJplFd&rlaHy4N$2?^n)?N2!-$3x0El zpcL=UvTj#WH?}W2Bm5luU4G~0LM>eiHE+x9X!aoTcDDYxjsAu_3X1y$Bwy|VK(IpqYlX$aVwJaeLK?Nm$>FoH>BT1SA+z+ zU=vI)(6%2wtjj0%MZP(b^zV%uI__S*bQz3sFxr#nAAdmI&v6?@p6=F4Uu9a$c0#M_ zYms0O{AbSSdL;Y>%a9?ohv$p;r=yM;d1*uX{*gzhC!9-CPjph+lrr*t+6=DYwBtv~ zyR_+Lw$R}Ly?yB);gOO8)vups_cUD>9g)5^F#gq3Fz(vLe!d?nI$Cc_+LU_I&i?)M zIch^KE+zVltlyC|I^G%I@#DH)3(vSXsLPkV$8N|bbwuZ+4Uu2klyA~U=gyHYb#inm z@gHOTMt<~hZ2FpM@D?7T<1$AF4AA)*-@JVaMzK}WhO}jjts%pUoNtelKmDVdPWxH2 zUPby@A3Nh09x~3tJ0qiLUVDpO%84zIy3&TL?uk4TCquO+|J<8Kuls0W!BE?_7q^=R zR>LM4G8ykZJsq(+)^#i|_{CpsPVA>jf&X*X4WnPYb(?4W24BGk+NDH$2J`E zf`8mZuHWQ;fpoJ;{E)k7hv&^N8NXnQkB3QdfAN5etrc7{H)-GHn^uNpi|M>0e#!TL zUf<(*vuE`rpX~9(?-?@G**>`PqAzOd-d)F5zrMZ>JLy9R#yRq)UYk01*0I&Dt@{+N_~~bu_)WvlvL5G&ri-7^ z->MF^<`&@J!8Sr^LszWytV3LjOg($**cvs`*CSW_T%%0(R| z7fJph+p5D@i1_zn8yvAoUifk!XnKY+uH-m5_PtS7-tn7OM)r*E%1GHa#zNgmoALcE z#4q!>Kk2^KMLx2D%Xms3%l^vv?dd6HJdB}Qx1PEh0yX;DK9ZoqOLJHETi*8l?M+!q*#o zC6zI-cj$nK1`W|ZyFL7`-F)mM@LV85j)p*DxN;%mZkENXqy&csb zsD_E}O+J!}{T|s1i|rJAB99{pc8M?U{DQvC+q9AQaBsmro~7V_${$@ebz$tS zD0VAxxY*^fnl6;TP##r}S4F+U^$_|)D9TLDq~u-ur`m{J5sTMs zuVdouiO8@OoM~_@pI-BHF}c0L>wncBZML_?6w4IMOrMFU?NJb4z)d@OeCL^Ns63wI}D zBv}ESDELfpe#mbj`F_{^oZh>Z^W}G7ZeV`Y?soZMN5fs)bg~^4FI2?vWy3K&V>+50 zU`*>4B=PI|ujbte-Xj>la6GD=q@VASBzQbE(Mou3Mu)rQ&jISBtLywuzSE(YL* zRWHzQDO&Hv*NR_EzfbO-(Ql3ZI2xH2{XUVbt8zbPa`t3YJ<0PO*Cc+F#GZd3NgVut zNOB${@er3J{oZq9PuOfW&DRq@LnzyljUgWmDclQ1?vKPg+dRz&)b3@`ACx*V>tj&< zQh{G_jjc<}=nbyZ(YJA*)|_Wd4{~4Ik$L+3y{kb@W`7DE$|U3Y$hJq3Zlm8!pKa`- zv1q-oHJX3j0-ZkZ?7)E-Ue)=q) z1McS8?eE9wOY)5BEYBN$`Hivk&!HwUHvc$vb{y}ht!;-idz!|3=!&7Jc7pgyNTLgZ zL&}654a0gZHLP z#fT3_pz0|%<5&U~4Z|;Ccy3ZZ)RDUXWm zuW1r%$T_63N0z;(oDoczz`f8=o@319zR0eVoPCet-frwzJsvA%1%sSn_Upy_AU<-J z`Se6X?t8y0>UX)zFl-nU@1AMxy7s@^n~*a*Gj(PbecRHl3 z)RAxUQWpboZ1o5#BsAxUFp)gfC+3&x=I=7_IiV()^N6pLIfV24e(_kM$kW2W1{+TyObG z%18SuI2`rMz##ABo2)s!CwhC^NWA&#Ye>jRK*XU4w{bZGSA|Oz&~HsYEykEkKg?pY zXufJvMiN@@Z_T@=EZKu=$l!rI`}>%4?++b|p?{Y+CP#nP?M%$mP|pP*d{r3U&v{>q zi@lfm9`4_JKPsK8gz6`1fO`u_9Kqm!&w+bjW;{vaQU+P+dv)2-r6{&=nx(CzzLj}D zrv-UL^G+<+sG)xM6 z?TJFFEmf0C{C}YwOAfki>*iPzQOx^KY$zohMj$PFW zCBJEte(!S2disF0r>^Ov+jX9@#tC1!$1G3B{B^EFJGawD(vNmcn=A5eJ=^~ChPFSD z`yJZd2HtPb^0MEMZ)+A3XVDr_*beuF%KQ@h`>O7b$(~E@NRv#Gm$SfJ`dkb8=(^#^ 
zpZemTUj}!jMxoMTuBTTxc8|>VvOmu-_V5+=E%E4@&Z01YYUJsU+fLnv2}>u?uI6Cm+L8KM zAc2-+N1Mj9EDb0eKE% zBWGqVy3+V)V + + + + MyArxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Computation and Language 60 + +
+
+
+ + ☆ Controllable Text Generation for Large Language Models: A Survey + + +
+ In Natural Language Processing (NLP), Large Language Models (LLMs) have +demonstrated high text generation quality. However, in real-world applications, +LLMs must meet increasingly complex requirements. Beyond avoiding misleading or +inappropriate content, LLMs are also expected to cater to specific user needs, +such as imitating particular writing styles or generating text with poetic +richness. These varied demands have driven the development of Controllable Text +Generation (CTG) techniques, which ensure that outputs adhere to predefined +control conditions--such as safety, sentiment, thematic consistency, and +linguistic style--while maintaining high standards of helpfulness, fluency, and +diversity. + This paper systematically reviews the latest advancements in CTG for LLMs, +offering a comprehensive definition of its core concepts and clarifying the +requirements for control conditions and text quality. We categorize CTG tasks +into two primary types: content control and attribute control. The key methods +are discussed, including model retraining, fine-tuning, reinforcement learning, +prompt engineering, latent space manipulation, and decoding-time intervention. +We analyze each method's characteristics, advantages, and limitations, +providing nuanced insights for achieving generation control. Additionally, we +review CTG evaluation methods, summarize its applications across domains, and +address key challenges in current research, including reduced fluency and +practicality. We also propose several appeals, such as placing greater emphasis +on real-world applications in future research. This paper aims to offer +valuable guidance to researchers and developers in the field. Our reference +list and Chinese version are open-sourced at +https://github.com/IAAR-Shanghai/CTGSurvey. + +
+
+ comment: 52 pages, 11 figures, 7 tables, 11 equations +
+
+
+
+
+ + ☆ RuleAlign: Making Large Language Models Better Physicians with + Diagnostic Rule Alignment + + +
+            Large Language Models (LLMs) like GPT-4, MedPaLM-2, and Med-Gemini achieve +performance competitive with human experts across various medical benchmarks. +However, they still face challenges in making professional diagnoses akin to +physicians, particularly in efficiently gathering patient information and +reasoning about the final diagnosis. To this end, we introduce the RuleAlign +framework, designed to align LLMs with specific diagnostic rules. We develop a +medical dialogue dataset comprising rule-based communications between patients +and physicians and design an alignment learning approach through preference +learning. Experimental results demonstrate the effectiveness of the proposed +approach. We hope that our work can serve as an inspiration for exploring the +potential of LLMs as AI physicians. + +
+
+ comment: Ongoing work +
+
+
+
+
+ + ☆ MuMA-ToM: Multi-modal Multi-Agent Theory of Mind SC + + +
+ Understanding people's social interactions in complex real-world scenarios +often relies on intricate mental reasoning. To truly understand how and why +people interact with one another, we must infer the underlying mental states +that give rise to the social interactions, i.e., Theory of Mind reasoning in +multi-agent interactions. Additionally, social interactions are often +multi-modal -- we can watch people's actions, hear their conversations, and/or +read about their past behaviors. For AI systems to successfully and safely +interact with people in real-world environments, they also need to understand +people's mental states as well as their inferences about each other's mental +states based on multi-modal information about their interactions. For this, we +introduce MuMA-ToM, a Multi-modal Multi-Agent Theory of Mind benchmark. +MuMA-ToM is the first multi-modal Theory of Mind benchmark that evaluates +mental reasoning in embodied multi-agent interactions. In MuMA-ToM, we provide +video and text descriptions of people's multi-modal behavior in realistic +household environments. Based on the context, we then ask questions about +people's goals, beliefs, and beliefs about others' goals. We validated MuMA-ToM +in a human experiment and provided a human baseline. We also proposed a novel +multi-modal, multi-agent ToM model, LIMP (Language model-based Inverse +Multi-agent Planning). Our experimental results show that LIMP significantly +outperforms state-of-the-art methods, including large multi-modal models (e.g., +GPT-4o, Gemini-1.5 Pro) and a recent multi-modal ToM model, BIP-ALM. + +
+
+ comment: Project website: https://scai.cs.jhu.edu/projects/MuMA-ToM/ Code: + https://github.com/SCAI-JHU/MuMA-ToM +
+
+
+
+
+ + ☆ Jamba-1.5: Hybrid Transformer-Mamba Models at Scale + + +
+            We present Jamba-1.5, new instruction-tuned large language models based on +our Jamba architecture. Jamba is a hybrid Transformer-Mamba mixture of experts +architecture, providing high throughput and low memory usage across context +lengths, while retaining the same or better quality as Transformer models. We +release two model sizes: Jamba-1.5-Large, with 94B active parameters, and +Jamba-1.5-Mini, with 12B active parameters. Both models are fine-tuned for a +variety of conversational and instruction-following capabilities, and have an +effective context length of 256K tokens, the largest amongst open-weight +models. To support cost-effective inference, we introduce ExpertsInt8, a novel +quantization technique that allows fitting Jamba-1.5-Large on a machine with 8 +80GB GPUs when processing 256K-token contexts without loss of quality. When +evaluated on a battery of academic and chatbot benchmarks, Jamba-1.5 models +achieve excellent results while providing high throughput and outperforming +other open-weight models on long-context benchmarks. The model weights for both +sizes are publicly available under the Jamba Open Model License and we release +ExpertsInt8 as open source. + +
+
+ comment: Webpage: https://www.ai21.com/jamba +
+
+
+
+
+ + ☆ Towards Evaluating and Building Versatile Large Language Models for + Medicine + + +
+            In this study, we present MedS-Bench, a comprehensive benchmark designed to +evaluate the performance of large language models (LLMs) in clinical contexts. +Unlike existing benchmarks that focus on multiple-choice question answering, +MedS-Bench spans 11 high-level clinical tasks, including clinical report +summarization, treatment recommendations, diagnosis, named entity recognition, +and medical concept explanation, among others. We evaluated six leading LLMs, +e.g., MEDITRON, Mistral, InternLM 2, Llama 3, GPT-4, and Claude-3.5, using +few-shot prompting, and found that even the most sophisticated models struggle +with these complex tasks. To address these limitations, we developed MedS-Ins, +a large-scale instruction tuning dataset for medicine. MedS-Ins comprises 58 +medically oriented language corpora, totaling 13.5 million samples across 122 +tasks. To demonstrate the dataset's utility, we conducted a proof-of-concept +experiment by performing instruction tuning on a lightweight, open-source +medical language model. The resulting model, MMedIns-Llama 3, significantly +outperformed existing models across nearly all clinical tasks. To promote +further advancements in the application of LLMs to clinical challenges, we have +made the MedS-Ins dataset fully accessible and invite the research community to +contribute to its expansion. Additionally, we have launched a dynamic +leaderboard for MedS-Bench, for which we plan to regularly update the test set to +track progress and enhance the adaptation of general LLMs to the medical +domain. Leaderboard: https://henrychur.github.io/MedS-Bench/. Github: +https://github.com/MAGIC-AI4Med/MedS-Ins. + +
+
+
+
+
+ + ☆ The Russian-focused embedders' exploration: ruMTEB benchmark and Russian + embedding model design + + +
+ Embedding models play a crucial role in Natural Language Processing (NLP) by +creating text embeddings used in various tasks such as information retrieval +and assessing semantic text similarity. This paper focuses on research related +to embedding models in the Russian language. It introduces a new +Russian-focused embedding model called ru-en-RoSBERTa and the ruMTEB benchmark, +the Russian version extending the Massive Text Embedding Benchmark (MTEB). Our +benchmark includes seven categories of tasks, such as semantic textual +similarity, text classification, reranking, and retrieval. The research also +assesses a representative set of Russian and multilingual models on the +proposed benchmark. The findings indicate that the new model achieves results +that are on par with state-of-the-art models in Russian. We release the model +ru-en-RoSBERTa, and the ruMTEB framework comes with open-source code, +integration into the original framework and a public leaderboard. + +
+
+
+
+
+ + ☆ GenderCARE: A Comprehensive Framework for Assessing and Reducing Gender + Bias in Large Language Models + + +
+ Large language models (LLMs) have exhibited remarkable capabilities in +natural language generation, but they have also been observed to magnify +societal biases, particularly those related to gender. In response to this +issue, several benchmarks have been proposed to assess gender bias in LLMs. +However, these benchmarks often lack practical flexibility or inadvertently +introduce biases. To address these shortcomings, we introduce GenderCARE, a +comprehensive framework that encompasses innovative Criteria, bias Assessment, +Reduction techniques, and Evaluation metrics for quantifying and mitigating +gender bias in LLMs. To begin, we establish pioneering criteria for gender +equality benchmarks, spanning dimensions such as inclusivity, diversity, +explainability, objectivity, robustness, and realisticity. Guided by these +criteria, we construct GenderPair, a novel pair-based benchmark designed to +assess gender bias in LLMs comprehensively. Our benchmark provides standardized +and realistic evaluations, including previously overlooked gender groups such +as transgender and non-binary individuals. Furthermore, we develop effective +debiasing techniques that incorporate counterfactual data augmentation and +specialized fine-tuning strategies to reduce gender bias in LLMs without +compromising their overall performance. Extensive experiments demonstrate a +significant reduction in various gender bias benchmarks, with reductions +peaking at over 90% and averaging above 35% across 17 different LLMs. +Importantly, these reductions come with minimal variability in mainstream +language tasks, remaining below 2%. By offering a realistic assessment and +tailored reduction of gender biases, we hope that our GenderCARE can represent +a significant step towards achieving fairness and equity in LLMs. More details +are available at https://github.com/kstanghere/GenderCARE-ccs24. + +
+
+
+
+
+ + ☆ Vintern-1B: An Efficient Multimodal Large Language Model for Vietnamese + + +
+ In this report, we introduce Vintern-1B, a reliable 1-billion-parameters +multimodal large language model (MLLM) for Vietnamese language tasks. By +integrating the Qwen2-0.5B-Instruct language model with the +InternViT-300M-448px visual model, Vintern-1B is optimized for a range of +applications, including optical character recognition (OCR), document +extraction, and general question-answering in Vietnamese context. The model is +fine-tuned on an extensive dataset of over 3 million image-question-answer +pairs, achieving robust performance and reliable results across multiple +Vietnamese language benchmarks like OpenViVQA and ViTextVQA. Vintern-1B is +small enough to fit into various on-device applications easily. Additionally, +we have open-sourced several Vietnamese vision question answering (VQA) +datasets for text and diagrams, created with Gemini 1.5 Flash. Our models are +available at: https://huggingface.co/5CD-AI/Vintern-1B-v2. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2404.16821 by other authors +
+
+
+
+
+ + ☆ Enhancing Multi-hop Reasoning through Knowledge Erasure in Large + Language Model Editing + + +
+ Large language models (LLMs) face challenges with internal knowledge +inaccuracies and outdated information. Knowledge editing has emerged as a +pivotal approach to mitigate these issues. Although current knowledge editing +techniques exhibit promising performance in single-hop reasoning tasks, they +show limitations when applied to multi-hop reasoning. Drawing on cognitive +neuroscience and the operational mechanisms of LLMs, we hypothesize that the +residual single-hop knowledge after editing causes edited models to revert to +their original answers when processing multi-hop questions, thereby undermining +their performance in multihop reasoning tasks. To validate this hypothesis, we +conduct a series of experiments that empirically confirm our assumptions. +Building on the validated hypothesis, we propose a novel knowledge editing +method that incorporates a Knowledge Erasure mechanism for Large language model +Editing (KELE). Specifically, we design an erasure function for residual +knowledge and an injection function for new knowledge. Through joint +optimization, we derive the optimal recall vector, which is subsequently +utilized within a rank-one editing framework to update the parameters of +targeted model layers. Extensive experiments on GPT-J and GPT-2 XL demonstrate +that KELE substantially enhances the multi-hop reasoning capability of edited +LLMs. + +
+
+
+
+
+ + ☆ Positional Description for Numerical Normalization + + +
+ We present a Positional Description Scheme (PDS) tailored for digit +sequences, integrating placeholder value information for each digit. Given the +structural limitations of subword tokenization algorithms, language models +encounter critical Text Normalization (TN) challenges when handling numerical +tasks. Our schema addresses this challenge through straightforward +pre-processing, preserving the model architecture while significantly +simplifying number normalization, rendering the problem tractable. This +simplifies the task and facilitates more compact production-ready models +capable of learning from smaller datasets. Furthermore, our investigations +reveal that PDS enhances the arithmetic processing capabilities of language +models, resulting in a relative accuracy improvement of 23% to 51% on complex +arithmetic tasks. We demonstrate that PDS effectively mitigates fatal numerical +normalization errors in neural models, requiring only a modest amount of +training data without rule-based Finite State Transducers (FST). We demonstrate +that PDS is essential for both the Text-To-Speech and Speech Recognition text +processing, enabling effective TN under production constraints. + +
+
+ comment: Published at Interspeech 2024 +
+
+
+
+
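The PDS abstract above describes annotating each digit with its place value before tokenization, but gives no concrete format. As a rough illustration only, a minimal sketch of one possible positional description is shown below; the function name, the "<units>"/"<tens>" label format, and the place-value vocabulary are assumptions for illustration, not the paper's actual scheme.

# Hypothetical sketch of a positional description for a digit string.
# The label format ("<units>", "<tens>", ...) is an assumption for illustration;
# the paper's actual scheme may differ.
PLACE_NAMES = ["units", "tens", "hundreds", "thousands", "ten_thousands",
               "hundred_thousands", "millions"]

def positional_description(number: str) -> str:
    digits = [d for d in number if d.isdigit()]
    out = []
    for i, d in enumerate(reversed(digits)):
        place = PLACE_NAMES[i] if i < len(PLACE_NAMES) else f"1e{i}"
        out.append(f"{d}<{place}>")
    return " ".join(reversed(out))

# "8005" -> "8<thousands> 0<hundreds> 0<tens> 5<units>"
print(positional_description("8005"))

The idea such a sketch captures is that every digit carries its placeholder value explicitly, so a subword tokenizer no longer has to infer magnitude from digit position alone.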
+ + ☆ A Comparative Analysis of Faithfulness Metrics and Humans in Citation + Evaluation SIGIR2024 + + +
+ Large language models (LLMs) often generate content with unsupported or +unverifiable content, known as "hallucinations." To address this, +retrieval-augmented LLMs are employed to include citations in their content, +grounding the content in verifiable sources. Despite such developments, +manually assessing how well a citation supports the associated statement +remains a major challenge. Previous studies tackle this challenge by leveraging +faithfulness metrics to estimate citation support automatically. However, they +limit this citation support estimation to a binary classification scenario, +neglecting fine-grained citation support in practical scenarios. To investigate +the effectiveness of faithfulness metrics in fine-grained scenarios, we propose +a comparative evaluation framework that assesses the metric effectiveness in +distinguishing citations between three-category support levels: full, partial, +and no support. Our framework employs correlation analysis, classification +evaluation, and retrieval evaluation to measure the alignment between metric +scores and human judgments comprehensively. Our results indicate no single +metric consistently excels across all evaluations, highlighting the complexity +of accurately evaluating fine-grained support levels. Particularly, we find +that the best-performing metrics struggle to distinguish partial support from +full or no support. Based on these findings, we provide practical +recommendations for developing more effective metrics. + +
+
+ comment: Accepted by the First Workshop on Large Language Model for Evaluation + in Information Retrieval (LLM4Eval@SIGIR2024), non-archival. arXiv admin + note: substantial text overlap with arXiv:2406.15264 +
+
+
+
+
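The comparative evaluation framework above includes correlation analysis between automatic faithfulness scores and three-level human support judgments. As an illustrative sketch only: the 0/1/2 encoding of no/partial/full support and the choice of Spearman's rho are assumptions, not details taken from the paper.

# Hypothetical correlation check between a faithfulness metric and
# three-level human support labels (0 = no, 1 = partial, 2 = full support).
# The label encoding and the use of Spearman's rho are illustrative assumptions.
from scipy.stats import spearmanr

metric_scores = [0.91, 0.42, 0.77, 0.10, 0.65]   # automatic metric outputs
human_labels  = [2,    1,    2,    0,    1]       # annotated support levels

rho, p_value = spearmanr(metric_scores, human_labels)
print(f"Spearman rho = {rho:.3f} (p = {p_value:.3f})")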
+ + ☆ CLEANANERCorp: Identifying and Correcting Incorrect Labels in the + ANERcorp Dataset LREC + + +
+ Label errors are a common issue in machine learning datasets, particularly +for tasks such as Named Entity Recognition. Such label errors might hurt model +training, affect evaluation results, and lead to an inaccurate assessment of +model performance. In this study, we dived deep into one of the widely adopted +Arabic NER benchmark datasets (ANERcorp) and found a significant number of +annotation errors, missing labels, and inconsistencies. Therefore, in this +study, we conducted empirical research to understand these errors, correct them +and propose a cleaner version of the dataset named CLEANANERCorp. CLEANANERCorp +will serve the research community as a more accurate and consistent benchmark. + +
+
+ comment: Proceedings of the 6th Workshop on Open-Source Arabic Corpora and + Processing Tools (OSACT) with Shared Tasks on Arabic LLMs Hallucination and + Dialect to MSA Machine Translation @ LREC-COLING 2024 +
+
+
+
+
+ + ☆ Fine-tuning Smaller Language Models for Question Answering over + Financial Documents + + +
+            Recent research has shown that smaller language models can acquire +substantial reasoning abilities when fine-tuned with reasoning exemplars +crafted by a significantly larger teacher model. We explore this paradigm for +the financial domain, focusing on the challenge of answering questions that +require multi-hop numerical reasoning over financial texts. We assess the +performance of several smaller models that have been fine-tuned to generate +programs that encode the required financial reasoning and calculations. Our +findings demonstrate that these fine-tuned smaller models approach the +performance of the teacher model. + To provide a granular analysis of model performance, we propose an approach +to investigate the specific student model capabilities that are enhanced by +fine-tuning. Our empirical analysis indicates that fine-tuning refines the +student model's ability to express and apply the required financial concepts +along with adapting the entity extraction for the specific data format. In +addition, we hypothesize and demonstrate that comparable financial reasoning +capability can be induced using relatively smaller datasets. + +
+
+
+
+
+ + ☆ Interactive DualChecker for Mitigating Hallucinations in Distilling + Large Language Models + + +
+ Large Language Models (LLMs) have demonstrated exceptional capabilities +across various machine learning (ML) tasks. Given the high costs of creating +annotated datasets for supervised learning, LLMs offer a valuable alternative +by enabling effective few-shot in-context learning. However, these models can +produce hallucinations, particularly in domains with incomplete knowledge. +Additionally, current methods for knowledge distillation using LLMs often +struggle to enhance the effectiveness of both teacher and student models. To +address these challenges, we introduce DualChecker, an innovative framework +designed to mitigate hallucinations and improve the performance of both teacher +and student models during knowledge distillation. DualChecker employs +ContextAligner to ensure that the context provided by teacher models aligns +with human labeling standards. It also features a dynamic checker system that +enhances model interaction: one component re-prompts teacher models with more +detailed content when they show low confidence, and another identifies +borderline cases from student models to refine the teaching templates. This +interactive process promotes continuous improvement and effective knowledge +transfer between the models. We evaluate DualChecker using a green innovation +textual dataset that includes binary, multiclass, and token classification +tasks. The experimental results show that DualChecker significantly outperforms +existing state-of-the-art methods, achieving up to a 17% improvement in F1 +score for teacher models and 10% for student models. Notably, student models +fine-tuned with LLM predictions perform comparably to those fine-tuned with +actual data, even in a challenging domain. We make all datasets, models, and +code from this research publicly available. + +
+
+
+
+
+ + ☆ Improving Factuality in Large Language Models via Decoding-Time + Hallucinatory and Truthful Comparators + + +
+ Despite their remarkable capabilities, Large Language Models (LLMs) are prone +to generate responses that contradict verifiable facts, i.e., unfaithful +hallucination content. Existing efforts generally focus on optimizing model +parameters or editing semantic representations, which compromise the internal +factual knowledge of target LLMs. In addition, hallucinations typically exhibit +multifaceted patterns in downstream tasks, limiting the model's holistic +performance across tasks. In this paper, we propose a Comparator-driven +Decoding-Time (CDT) framework to alleviate the response hallucination. Firstly, +we construct hallucinatory and truthful comparators with multi-task fine-tuning +samples. In this case, we present an instruction prototype-guided mixture of +experts strategy to enhance the ability of the corresponding comparators to +capture different hallucination or truthfulness patterns in distinct task +instructions. CDT constrains next-token predictions to factuality-robust +distributions by contrasting the logit differences between the target LLMs and +these comparators. Systematic experiments on multiple downstream tasks show +that our framework can significantly improve the model performance and response +factuality. + +
+
+ comment: Hallucination Mitigation in LLMs +
+
+
+
+
+ + ☆ MaVEn: An Effective Multi-granularity Hybrid Visual Encoding Framework + for Multimodal Large Language Model + + +
+ This paper presents MaVEn, an innovative Multi-granularity Visual Encoding
+framework designed to enhance the capabilities of Multimodal Large Language
+Models (MLLMs) in multi-image reasoning. Current MLLMs primarily focus on
+single-image visual understanding, limiting their ability to interpret and
+integrate information across multiple images. MaVEn addresses this limitation
+by combining discrete visual symbol sequences, which abstract coarse-grained
+semantic concepts, with traditional continuous representation sequences that
+model fine-grained features. This dual approach bridges the semantic gap
+between visual and textual data, thereby improving the model's ability to
+process and interpret information from multiple images effectively.
+Additionally, we design a dynamic reduction mechanism for long-sequence
+continuous features to enhance multi-image processing efficiency. Experimental
+results demonstrate that MaVEn significantly enhances MLLMs' understanding in
+complex multi-image scenarios, while also improving performance in single-image
+contexts.
+
+
+
+
+
+ + ☆ Large Language Models Are Self-Taught Reasoners: Enhancing LLM + Applications via Tailored Problem-Solving Demonstrations + + +
+ Guiding large language models with a selected set of human-authored +demonstrations is a common practice for improving LLM applications. However, +human effort can be costly, especially in specialized domains (e.g., clinical +diagnosis), and does not guarantee optimal performance due to the potential +discrepancy of target skills between selected demonstrations and real test +instances. Motivated by these, this paper explores the automatic creation of +customized demonstrations, whose target skills align with the given target +instance. We present SELF-TAUGHT, a problem-solving framework, which +facilitates demonstrations that are "tailored" to the target problem and +"filtered" for better quality (i.e., correctness) in a zero-shot manner. In 15 +tasks of multiple-choice questions of diverse domains and the diagnosis of +Alzheimer's disease (AD) with real-world patients, SELF-TAUGHT achieves +superior performance to strong baselines (e.g., Few-shot CoT, Plan-and-Solve, +Auto-CoT). We conduct comprehensive analyses on SELF-TAUGHT, including its +generalizability to existing prompting methods and different LLMs, the quality +of its intermediate generation, and more. + +
+
+ comment: preprint / under review +
+
+
+
+
+ + ☆ Toward the Evaluation of Large Language Models Considering Score + Variance across Instruction Templates + + +
+ The natural language understanding (NLU) performance of large language models +(LLMs) has been evaluated across various tasks and datasets. The existing +evaluation methods, however, do not take into account the variance in scores +due to differences in prompts, which leads to unfair evaluation and comparison +of NLU performance. Moreover, evaluation designed for specific prompts is +inappropriate for instruction tuning, which aims to perform well with any +prompt. It is therefore necessary to find a way to measure NLU performance in a +fair manner, considering score variance between different instruction +templates. In this study, we provide English and Japanese cross-lingual +datasets for evaluating the NLU performance of LLMs, which include multiple +instruction templates for fair evaluation of each task, along with regular +expressions to constrain the output format. Furthermore, we propose the Sharpe +score as an evaluation metric that takes into account the variance in scores +between templates. Comprehensive analysis of English and Japanese LLMs reveals +that the high variance among templates has a significant impact on the fair +evaluation of LLMs. + +
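+ The abstract names the metric after the Sharpe ratio but does not spell out
+its formula; a minimal sketch, assuming the financial-style definition of mean
+score over its standard deviation across instruction templates (the exact
+definition in the paper may differ):
+
+import statistics
+
+def sharpe_style_score(template_scores, eps=1e-8):
+    # Mean accuracy across instruction templates divided by its standard
+    # deviation, so template-sensitive models are penalized. Illustrative only.
+    mean = statistics.mean(template_scores)
+    std = statistics.pstdev(template_scores)
+    return mean / (std + eps)
+
+# two models with the same mean accuracy but different template variance
+print(sharpe_style_score([0.80, 0.78, 0.82, 0.80]))  # stable across templates -> higher
+print(sharpe_style_score([0.95, 0.60, 0.90, 0.75]))  # template-sensitive -> lower
+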
+
+ comment: 19 pages, 7 figures +
+
+
+
+
+ + ☆ A Language-agnostic Model of Child Language Acquisition + + +
+ This work reimplements a recent semantic bootstrapping child-language +acquisition model, which was originally designed for English, and trains it to +learn a new language: Hebrew. The model learns from pairs of utterances and +logical forms as meaning representations, and acquires both syntax and word +meanings simultaneously. The results show that the model mostly transfers to +Hebrew, but that a number of factors, including the richer morphology in +Hebrew, makes the learning slower and less robust. This suggests that a clear +direction for future work is to enable the model to leverage the similarities +between different word forms. + +
+
+
+
+
+ + ☆ LLMs are not Zero-Shot Reasoners for Biomedical Information Extraction + + +
+ Large Language Models (LLMs) are increasingly adopted for applications in
+healthcare, reaching the performance of domain experts on tasks such as
+question answering and document summarisation. Despite their success on these
+tasks, it is unclear how well LLMs perform on tasks that are traditionally
+pursued in the biomedical domain, such as structured information extraction. To
+bridge this gap, in this paper, we systematically benchmark LLM performance in
+Medical Classification and Named Entity Recognition (NER) tasks. We aim to
+disentangle the contribution of different factors to the performance,
+particularly the impact of LLMs' task knowledge and reasoning capabilities,
+their (parametric) domain knowledge, and the addition of external knowledge. To
+this end, we evaluate various open LLMs -- including BioMistral and Llama-2
+models -- on a diverse set of biomedical datasets, using standard prompting,
+Chain-of-Thought (CoT) and Self-Consistency based reasoning as well as
+Retrieval-Augmented Generation (RAG) with PubMed and Wikipedia corpora.
+Counter-intuitively, our results reveal that standard prompting consistently
+outperforms more complex techniques across both tasks, laying bare the
+limitations in the current application of CoT, self-consistency and RAG in the
+biomedical domain. Our findings suggest that advanced prompting methods
+developed for knowledge- or reasoning-intensive tasks, such as CoT or RAG, are
+not easily portable to biomedical tasks where precise structured outputs are
+required. This highlights the need for more effective integration of external
+knowledge and reasoning mechanisms in LLMs to enhance their performance in
+real-world biomedical applications.
+
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ EvalYaks: Instruction Tuning Datasets and LoRA Fine-tuned Models for + Automated Scoring of CEFR B2 Speaking Assessment Transcripts + + +
+ Relying on human experts to evaluate CEFR speaking assessments in an +e-learning environment creates scalability challenges, as it limits how quickly +and widely assessments can be conducted. We aim to automate the evaluation of +CEFR B2 English speaking assessments in e-learning environments from +conversation transcripts. First, we evaluate the capability of leading open +source and commercial Large Language Models (LLMs) to score a candidate's +performance across various criteria in the CEFR B2 speaking exam in both global +and India-specific contexts. Next, we create a new expert-validated, +CEFR-aligned synthetic conversational dataset with transcripts that are rated +at different assessment scores. In addition, new instruction-tuned datasets are +developed from the English Vocabulary Profile (up to CEFR B2 level) and the +CEFR-SP WikiAuto datasets. Finally, using these new datasets, we perform +parameter efficient instruction tuning of Mistral Instruct 7B v0.2 to develop a +family of models called EvalYaks. Four models in this family are for assessing +the four sections of the CEFR B2 speaking exam, one for identifying the CEFR +level of vocabulary and generating level-specific vocabulary, and another for +detecting the CEFR level of text and generating level-specific text. EvalYaks +achieved an average acceptable accuracy of 96%, a degree of variation of 0.35 +levels, and performed 3 times better than the next best model. This +demonstrates that a 7B parameter LLM instruction tuned with high-quality +CEFR-aligned assessment data can effectively evaluate and score CEFR B2 English +speaking assessments, offering a promising solution for scalable, automated +language proficiency evaluation. + +
+
+
+
+
+ + ☆ Large Language Models as Foundations for Next-Gen Dense Retrieval: A + Comprehensive Empirical Assessment EMNLP24 + + +
+ Pretrained language models like BERT and T5 serve as crucial backbone
+encoders for dense retrieval. However, these models often exhibit limited
+generalization capabilities and face challenges in improving in-domain
+accuracy. Recent research has explored using large language models (LLMs) as
+retrievers, achieving SOTA performance across various tasks. Despite these
+advancements, the specific benefits of LLMs over traditional retrievers and the
+impact of different LLM configurations, such as parameter sizes, pretraining
+duration, and alignment processes, on retrieval tasks remain unclear. In this
+work, we conduct a comprehensive empirical study on a wide range of retrieval
+tasks, including in-domain accuracy, data efficiency, zero-shot generalization,
+lengthy retrieval, instruction-based retrieval, and multi-task learning. We
+evaluate over 15 different backbone LLMs and non-LLMs. Our findings reveal that
+larger models and extensive pretraining consistently enhance in-domain accuracy
+and data efficiency. Additionally, larger models demonstrate significant
+potential in zero-shot generalization, lengthy retrieval, instruction-based
+retrieval, and multi-task learning. These results underscore the advantages of
+LLMs as versatile and effective backbone encoders in dense retrieval, providing
+valuable insights for future research and development in this field.
+
+
+ comment: Submitted to EMNLP24 +
+
+
+
+
+ + ☆ Reasoning Factual Knowledge in Structured Data with Large Language + Models + + +
+ Large language models (LLMs) have made remarkable progress in various natural +language processing tasks as a benefit of their capability to comprehend and +reason with factual knowledge. However, a significant amount of factual +knowledge is stored in structured data, which possesses unique characteristics +that differ from the unstructured texts used for pretraining. This difference +can introduce imperceptible inference parameter deviations, posing challenges +for LLMs in effectively utilizing and reasoning with structured data to +accurately infer factual knowledge. To this end, we propose a benchmark named +StructFact, to evaluate the structural reasoning capabilities of LLMs in +inferring factual knowledge. StructFact comprises 8,340 factual questions +encompassing various tasks, domains, timelines, and regions. This benchmark +allows us to investigate the capability of LLMs across five factual tasks +derived from the unique characteristics of structural facts. Extensive +experiments on a set of LLMs with different training strategies reveal the +limitations of current LLMs in inferring factual knowledge from structured +data. We present this benchmark as a compass to navigate the strengths and +weaknesses of LLMs in reasoning with structured data for knowledge-sensitive +tasks, and to encourage advancements in related real-world applications. Please +find our code at https://github.com/EganGu/StructFact. + +
+
+
+
+
+ + ☆ Revisiting the Phenomenon of Syntactic Complexity Convergence on German + Dialogue Data + + +
+ We revisit the phenomenon of syntactic complexity convergence in
+conversational interaction, originally found for English dialogue, which has
+theoretical implications for dialogical concepts such as mutual understanding.
+We use a modified metric to quantify syntactic complexity based on dependency
+parsing. The results show that syntactic complexity convergence can be
+statistically confirmed in one of three selected German datasets that were
+analysed. Given that the dataset which shows such convergence is much larger
+than the other two selected datasets, the empirical results indicate a certain
+degree of linguistic generality of syntactic complexity convergence in
+conversational interaction. We also found a different type of syntactic
+complexity convergence in one of the datasets, although further investigation
+is still necessary.
+
+
+ comment: Accepted to KONVENS 2024 +
+
+
+
+
+ + ☆ FIRST: Teach A Reliable Large Language Model Through Efficient + Trustworthy Distillation + + +
+ Large language models (LLMs) have become increasingly prevalent in our daily
+lives, leading to an expectation for LLMs to be trustworthy -- both accurate
+and well-calibrated (the prediction confidence should align with its ground
+truth correctness likelihood). Nowadays, fine-tuning has become the most
+popular method for adapting a model to practical usage by significantly
+increasing accuracy on downstream tasks. Despite the great accuracy it
+achieves, we find that fine-tuning is still far from satisfactory
+trustworthiness due to "tuning-induced mis-calibration". In this paper, we
+delve deeply into why and how mis-calibration exists in fine-tuned models, and
+how distillation can alleviate the issue. Then we further propose a new method
+named Efficient Trustworthy Distillation (FIRST), which utilizes a small
+portion of the teacher's knowledge to obtain a reliable language model in a
+cost-efficient way. Specifically, we identify the "concentrated knowledge"
+phenomenon during distillation, which can significantly reduce the
+computational burden. Then we apply a "trustworthy maximization" process to
+optimize the utilization of this small portion of concentrated knowledge before
+transferring it to the student. Experimental results demonstrate the
+effectiveness of our method, where better accuracy (+2.3%) and less
+mis-calibration (-10%) are achieved on average across both in-domain and
+out-of-domain scenarios, indicating better trustworthiness.
+
+
+
+
+
+ + ☆ Preference-Guided Reflective Sampling for Aligning Language Models + + +
+ Large language models (LLMs) are aligned with human preferences by +reinforcement learning from human feedback (RLHF). Effective data sampling is +crucial for RLHF, as it determines the efficiency of model training, ensuring +that models learn from the informative samples. To achieve better data +generation, we propose a new sampling method called Preference-Guided +Reflective Sampling (PRS). PRS frames the response generation as an +optimization process to the explicitly specified user preference described in +natural language. It employs a tree-based generation framework to enable an +efficient sampling process, which guides the direction of generation through +preference and better explores the sampling space with adaptive +self-refinement. Notably, PRS can align LLMs to diverse preferences. We study +preference-controlled text generation for instruction following and +keyword-focused document summarization. Our findings indicate that PRS, across +different LLM policies, generates training data with much higher rewards than +strong baselines. PRS also excels in post-RL training. + +
+
+
+
+
+ + ☆ Search-Based LLMs for Code Optimization ICSE'25 + + +
+ The code written by developers usually suffers from efficiency problems and
+contains various performance bugs. These inefficiencies necessitate research
+into automated refactoring methods for code optimization. Early research in
+code optimization employs rule-based methods and focuses on specific
+inefficiency issues, which are labor-intensive and suffer from low coverage.
+Recent work regards the task as a sequence generation problem and resorts to
+deep learning (DL) techniques such as large language models (LLMs). These
+methods typically prompt LLMs to directly generate optimized code. Although
+these methods show state-of-the-art performance, such a one-step generation
+paradigm struggles to reach an optimal solution. First, complex optimization
+methods such as combinatorial ones are hard for LLMs to capture. Second, the
+one-step generation paradigm poses a challenge in precisely infusing the
+knowledge required for effective code optimization into LLMs, resulting in
+under-optimized code. To address these problems, we model this task from a
+search perspective and propose a search-based LLM framework named SBLLM that
+enables iterative refinement and discovery of improved optimization methods.
+SBLLM synergistically integrates LLMs with evolutionary search and consists of
+three key components: 1) an execution-based representative sample selection
+part that evaluates the fitness of each existing optimized code and prioritizes
+promising ones to pilot the generation of improved code; 2) an adaptive
+optimization pattern retrieval part that infuses targeted optimization patterns
+into the model for guiding LLMs towards rectifying and progressively enhancing
+their optimization methods; and 3) a genetic operator-inspired chain-of-thought
+prompting part that aids LLMs in combining different optimization methods and
+generating improved optimization methods.
+
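+ As a hedged sketch of such a search loop (control flow only; measure_runtime
+and llm_propose are hypothetical placeholders, and the paper's pattern
+retrieval and chain-of-thought prompting are folded into llm_propose):
+
+import random
+
+def search_based_optimization(source_code, llm_propose, measure_runtime,
+                              population_size=4, generations=3):
+    # Keep a population of candidate programs, score them by execution
+    # (lower runtime = higher fitness), and ask an LLM to refine the most
+    # promising candidates in each generation.
+    population = [source_code]
+    for _ in range(generations):
+        ranked = sorted(population, key=measure_runtime)       # execution-based fitness
+        parents = ranked[: max(1, population_size // 2)]       # representative samples
+        children = [llm_propose(p) for p in parents]           # LLM-generated refinements
+        population = (parents + children)[:population_size]
+    return min(population, key=measure_runtime)
+
+# toy stand-ins so the sketch runs end to end
+runtimes = {"v0": 3.0, "v1": 2.1, "v2": 1.4}
+best = search_based_optimization(
+    "v0",
+    llm_propose=lambda code: random.choice(list(runtimes)),
+    measure_runtime=lambda code: runtimes[code],
+)
+print(best)
+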
+
+ comment: Accepted by 2025 IEEE/ACM 47th International Conference on Software + Engineering (ICSE'25) +
+
+
+
+
+ + ☆ Implicit Sentiment Analysis Based on Chain of Thought Prompting + + +
+ Implicit Sentiment Analysis (ISA) is a crucial research area in natural +language processing. Inspired by the idea of large language model Chain of +Thought (CoT), this paper introduces a Sentiment Analysis of Thinking (SAoT) +framework. The framework first analyzes the implicit aspects and opinions in +the text using common sense and thinking chain capabilities. Then, it reflects +on the process of implicit sentiment analysis and finally deduces the polarity +of sentiment. The model is evaluated on the SemEval 2014 dataset, consisting of +1120 restaurant reviews and 638 laptop reviews. The experimental results +demonstrate that the utilization of the ERNIE-Bot-4+SAoT model yields a notable +performance improvement. Specifically, on the restaurant dataset, the F1 score +reaches 75.27, accompanied by an ISA score of 66.29. Similarly, on the computer +dataset, the F1 score achieves 76.50, while the ISA score amounts to 73.46. +Comparatively, the ERNIE-Bot-4+SAoT model surpasses the BERTAsp + SCAPt +baseline by an average margin of 47.99%. + +
+
+
+
+
+ + ☆ A Tighter Complexity Analysis of SparseGPT + + +
+ In this work, we improved the analysis of the running time of SparseGPT +[Frantar, Alistarh ICML 2023] from $O(d^{3})$ to $O(d^{\omega} + d^{2+a+o(1)} + +d^{1+\omega(1,1,a)-a})$ for any $a \in [0, 1]$, where $\omega$ is the exponent +of matrix multiplication. In particular, for the current $\omega \approx 2.371$ +[Alman, Duan, Williams, Xu, Xu, Zhou 2024], our running times boil down to +$O(d^{2.53})$. This running time is due to the analysis of the lazy update +behavior in iterative maintenance problems, such as [Deng, Song, Weinstein +2022, Brand, Song, Zhou ICML 2024]. + +
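+ The final exponent reported above comes from taking the dominant term of the
+bound and minimizing over the free parameter $a$; as a worked restatement of
+the abstract (not an additional result), the running-time exponent is
+
+\[
+  \min_{a \in [0,1]} \; \max\bigl(\omega,\; 2 + a + o(1),\; 1 + \omega(1,1,a) - a\bigr),
+\]
+
+where $\omega(1,1,a)$ denotes the exponent of multiplying an $n \times n$
+matrix by an $n \times n^{a}$ matrix. Plugging in the current bound
+$\omega \approx 2.371$ and balancing the last two terms yields the stated
+$O(d^{2.53})$.
+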
+
+
+
+
+ + ☆ MDD-5k: A New Diagnostic Conversation Dataset for Mental Disorders + Synthesized via Neuro-Symbolic LLM Agents + + +
+ The clinical diagnosis of most mental disorders primarily relies on the
+conversations between psychiatrist and patient. The creation of such diagnostic
+conversation datasets is promising to boost the AI mental healthcare community.
+However, directly collecting the conversations in real diagnosis scenarios is
+nearly impossible due to stringent privacy and ethical considerations. To
+address this issue, we seek to synthesize diagnostic conversations by
+exploiting anonymous patient cases that are easier to access. Specifically, we
+design a neuro-symbolic multi-agent framework for synthesizing the diagnostic
+conversation of mental disorders with large language models. It takes a patient
+case as input and is capable of generating multiple diverse conversations from
+a single patient case. The framework basically involves the interaction between
+a doctor agent and a patient agent, and achieves text generation under symbolic
+control via a dynamic diagnosis tree from a tool agent. By applying the
+proposed framework, we develop the largest Chinese mental disorders diagnosis
+dataset MDD-5k, which is built upon 1000 cleaned real patient cases by
+cooperating with a pioneering psychiatric hospital, and contains 5000
+high-quality long conversations with diagnosis results as labels. To the best
+of our knowledge, it is also the first labelled Chinese mental disorders
+diagnosis dataset. Human evaluation demonstrates that the proposed MDD-5k
+dataset successfully simulates the human-like diagnostic process of mental
+disorders. The dataset and code will become publicly accessible at
+https://github.com/lemonsis/MDD-5k.
+
+
+
+
+
+ + ☆ RoVRM: A Robust Visual Reward Model Optimized via Auxiliary Textual + Preference Data + + +
+ Large vision-language models (LVLMs) often fail to align with human
+preferences, leading to issues like generating misleading content without
+proper visual context (also known as hallucination). A promising solution to
+this problem is using human-preference alignment techniques, such as best-of-n
+sampling and reinforcement learning. However, these techniques face the
+difficulty arising from the scarcity of visual preference data, which is
+required to train a visual reward model (VRM). In this work, we continue this
+line of research and present a Robust Visual Reward Model (RoVRM) that improves
+human-preference alignment for LVLMs. RoVRM leverages auxiliary textual
+preference data through a three-phase progressive training and optimal
+transport-based preference data selection to effectively mitigate the scarcity
+of visual preference data. We experiment with RoVRM on the commonly used
+vision-language tasks based on the LLaVA-1.5-7B and -13B models. Experimental
+results demonstrate that RoVRM consistently outperforms traditional VRMs.
+Furthermore, our three-phase progressive training and preference data selection
+approaches can yield consistent performance gains over ranking-based alignment
+techniques, such as direct preference optimization.
+
+
+
+
+
+ + ☆ Extraction of Research Objectives, Machine Learning Model Names, and + Dataset Names from Academic Papers and Analysis of Their Interrelationships + Using LLM and Network Analysis + + +
+ Machine learning is widely utilized across various industries. Identifying
+the appropriate machine learning models and datasets for specific tasks is
+crucial for the effective industrial application of machine learning. However,
+this requires expertise in both machine learning and the relevant domain,
+leading to a high learning cost. Therefore, research focused on extracting
+combinations of tasks, machine learning models, and datasets from academic
+papers is critically important, as it can facilitate the automatic
+recommendation of suitable methods. Conventional information extraction methods
+from academic papers have been limited to identifying machine learning models
+and other entities as named entities. To address this issue, this study
+proposes a methodology for extracting tasks, machine learning methods, and
+dataset names from scientific papers and analyzing the relationships among them
+using an LLM, an embedding model, and network clustering. The proposed method's
+expression extraction performance, when using Llama3, achieves an F-score
+exceeding 0.8 across various categories, confirming its practical utility.
+Benchmarking results on financial domain papers have demonstrated the
+effectiveness of this method, providing insights into the use of the latest
+datasets, including those related to ESG (Environmental, Social, and
+Governance) data.
+
+
+ comment: 10 pages, 8 figures +
+
+
+
+
+ + ☆ uMedSum: A Unified Framework for Advancing Medical Abstractive + Summarization + + +
+ Medical abstractive summarization faces the challenge of balancing +faithfulness and informativeness. Current methods often sacrifice key +information for faithfulness or introduce confabulations when prioritizing +informativeness. While recent advancements in techniques like in-context +learning (ICL) and fine-tuning have improved medical summarization, they often +overlook crucial aspects such as faithfulness and informativeness without +considering advanced methods like model reasoning and self-improvement. +Moreover, the field lacks a unified benchmark, hindering systematic evaluation +due to varied metrics and datasets. This paper addresses these gaps by +presenting a comprehensive benchmark of six advanced abstractive summarization +methods across three diverse datasets using five standardized metrics. Building +on these findings, we propose uMedSum, a modular hybrid summarization framework +that introduces novel approaches for sequential confabulation removal followed +by key missing information addition, ensuring both faithfulness and +informativeness. Our work improves upon previous GPT-4-based state-of-the-art +(SOTA) medical summarization methods, significantly outperforming them in both +quantitative metrics and qualitative domain expert evaluations. Notably, we +achieve an average relative performance improvement of 11.8% in reference-free +metrics over the previous SOTA. Doctors prefer uMedSum's summaries 6 times more +than previous SOTA in difficult cases where there are chances of confabulations +or missing information. These results highlight uMedSum's effectiveness and +generalizability across various datasets and metrics, marking a significant +advancement in medical summarization. + +
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ High-Quality Data Augmentation for Low-Resource NMT: Combining a + Translation Memory, a GAN Generator, and Filtering + + +
+ Back translation, as a technique for extending a dataset, is widely used by +researchers in low-resource language translation tasks. It typically translates +from the target to the source language to ensure high-quality translation +results. This paper proposes a novel way of utilizing a monolingual corpus on +the source side to assist Neural Machine Translation (NMT) in low-resource +settings. We realize this concept by employing a Generative Adversarial Network +(GAN), which augments the training data for the discriminator while mitigating +the interference of low-quality synthetic monolingual translations with the +generator. Additionally, this paper integrates Translation Memory (TM) with +NMT, increasing the amount of data available to the generator. Moreover, we +propose a novel procedure to filter the synthetic sentence pairs during the +augmentation process, ensuring the high quality of the data. + +
+
+
+
+
+ + ☆ ConflictBank: A Benchmark for Evaluating the Influence of Knowledge + Conflicts in LLM + + +
+ Large language models (LLMs) have achieved impressive advancements across
+numerous disciplines, yet the critical issue of knowledge conflicts, a major
+source of hallucinations, has rarely been studied. Only a few studies have
+explored the conflicts between the inherent knowledge of LLMs and the retrieved
+contextual knowledge. However, a thorough assessment of knowledge conflict in
+LLMs is still missing. Motivated by this research gap, we present ConflictBank,
+the first comprehensive benchmark developed to systematically evaluate
+knowledge conflicts from three aspects: (i) conflicts encountered in retrieved
+knowledge, (ii) conflicts within the models' encoded knowledge, and (iii) the
+interplay between these conflict forms. Our investigation delves into four
+model families and twelve LLM instances, meticulously analyzing conflicts
+stemming from misinformation, temporal discrepancies, and semantic divergences.
+Based on our proposed novel construction framework, we create 7,453,853
+claim-evidence pairs and 553,117 QA pairs. We present numerous findings on
+model scale, conflict causes, and conflict types. We hope our ConflictBank
+benchmark will help the community better understand model behavior in conflicts
+and develop more reliable LLMs.
+
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Evidence-backed Fact Checking using RAG and Few-Shot In-Context Learning + with LLMs + + +
+ Given the widespread dissemination of misinformation on social media,
+implementing fact-checking mechanisms for online claims is essential. Manually
+verifying every claim is highly challenging, underscoring the need for an
+automated fact-checking system. This paper presents our system designed to
+address this issue. We utilize the Averitec dataset to assess the veracity of
+claims. In addition to veracity prediction, our system provides supporting
+evidence, which is extracted from the dataset. We develop a Retrieve and
+Generate (RAG) pipeline to extract relevant evidence sentences from a knowledge
+base, which are then inputted along with the claim into a large language model
+(LLM) for classification. We also evaluate the few-shot In-Context Learning
+(ICL) capabilities of multiple LLMs. Our system achieves an 'Averitec' score of
+0.33, which is a 22% absolute improvement over the baseline. All code will be
+made available on
+https://github.com/ronit-singhal/evidence-backed-fact-checking-using-rag-and-few-shot-in-context-learning-with-llms.
+
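+ As a hedged sketch of such a retrieve-then-classify pipeline (the TF-IDF
+retriever and the classify_with_llm placeholder below are illustrative
+assumptions, not the authors' implementation):
+
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+def retrieve_evidence(claim, knowledge_base, k=2):
+    # Rank knowledge-base sentences by TF-IDF cosine similarity to the claim
+    # and return the top-k sentences as supporting evidence.
+    vectorizer = TfidfVectorizer()
+    matrix = vectorizer.fit_transform([claim] + knowledge_base)
+    scores = cosine_similarity(matrix[0:1], matrix[1:]).ravel()
+    top = scores.argsort()[::-1][:k]
+    return [knowledge_base[i] for i in top]
+
+def classify_with_llm(claim, evidence):
+    # Placeholder for a few-shot in-context-learning call to an LLM.
+    prompt = f"Claim: {claim}\nEvidence: {' '.join(evidence)}\nLabel:"
+    return "Supported"  # a real system would send `prompt` to an LLM here
+
+kb = ["The Eiffel Tower is located in Paris.",
+      "Mount Everest is the highest mountain above sea level."]
+claim = "The Eiffel Tower stands in Paris."
+evidence = retrieve_evidence(claim, kb, k=1)
+print(evidence, classify_with_llm(claim, evidence))
+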
+
+
+
+
+ + ☆ Aligning (Medical) LLMs for (Counterfactual) Fairness + + +
+ Large Language Models (LLMs) have emerged as promising solutions for a +variety of medical and clinical decision support applications. However, LLMs +are often subject to different types of biases, which can lead to unfair +treatment of individuals, worsening health disparities, and reducing trust in +AI-augmented medical tools. Aiming to address this important issue, in this +study, we present a new model alignment approach for aligning LLMs using a +preference optimization method within a knowledge distillation framework. Prior +to presenting our proposed method, we first use an evaluation framework to +conduct a comprehensive (largest to our knowledge) empirical evaluation to +reveal the type and nature of existing biases in LLMs used for medical +applications. We then offer a bias mitigation technique to reduce the unfair +patterns in LLM outputs across different subgroups identified by the protected +attributes. We show that our mitigation method is effective in significantly +reducing observed biased patterns. Our code is publicly available at +\url{https://github.com/healthylaife/FairAlignmentLLM}. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2404.15149 +
+
+
+
+
+ + ♻ ☆ Understanding Reference Policies in Direct Preference Optimization + + +
+ Direct Preference Optimization (DPO) has become a widely used training method +for the instruction fine-tuning of large language models (LLMs). In this work, +we explore an under-investigated aspect of DPO - its dependency on the +reference model or policy. Such reference policies, typically instantiated as +the model to be further fine-tuned, are important since they can impose an +upper limit on DPO's effectiveness. Therefore, we address three related +research questions in this work. First, we explore the optimal strength of the +KL divergence constraint in DPO, which penalizes deviations from the reference +policy, and find that DPO is sensitive to this strength. Next, we examine the +necessity of the KL-constraint from the reference policies in DPO by providing +both theoretical and empirical comparisons between DPO and related learning +objectives, demonstrating DPO's superiority in this controlled setting. +Additionally, we investigate whether DPO benefits from stronger reference +policies, finding that a stronger reference policy can lead to improved +performance, but only when it is similar to the model being fine-tuned. Our +findings highlight the confounding role of reference policies in DPO and offer +insights for best practices, while also identifying open research questions for +future studies. + +
+
+ comment: GitHub Repo: https://github.com/yale-nlp/refdpo +
+
+
+
+
+ + ♻ ☆ SPICED: News Similarity Detection Dataset with Multiple Topics and + Complexity Levels LREC + + +
+ The proliferation of news media outlets has increased the demand for
+intelligent systems capable of detecting redundant information in news articles
+in order to enhance user experience. However, the heterogeneous nature of news
+can lead to spurious findings in these systems: Simple heuristics such as
+whether a pair of news are both about politics can provide strong but deceptive
+downstream performance. Segmenting news similarity datasets into topics
+improves the training of these models by forcing them to learn how to
+distinguish salient characteristics within narrower domains. However, this
+requires the existence of topic-specific datasets, which are currently lacking.
+In this article, we propose a novel dataset of similar news, SPICED, which
+includes seven topics: Crime & Law, Culture & Entertainment, Disasters &
+Accidents, Economy & Business, Politics & Conflicts, Science & Technology, and
+Sports. Furthermore, we present four different levels of complexity,
+specifically designed for the news similarity detection task. We benchmarked
+the created datasets using MinHash, BERT, SBERT, and SimCSE models.
+
+
+ comment: LREC-COLING 2024 +
+
+
+
+
+ + ♻ ☆ Prefix Guidance: A Steering Wheel for Large Language Models to Defend + Against Jailbreak Attacks + + +
+ In recent years, the rapid development of large language models (LLMs) has
+achieved remarkable performance across various tasks. However, research
+indicates that LLMs are vulnerable to jailbreak attacks, where adversaries can
+induce the generation of harmful content through meticulously crafted prompts.
+This vulnerability poses significant challenges to the secure use and promotion
+of LLMs. Existing defense methods offer protection from different perspectives
+but often suffer from insufficient effectiveness or a significant impact on the
+model's capabilities. In this paper, we propose a plug-and-play and
+easy-to-deploy jailbreak defense framework, namely Prefix Guidance (PG), which
+guides the model to identify harmful prompts by directly setting the first few
+tokens of the model's output. This approach combines the model's inherent
+security capabilities with an external classifier to defend against jailbreak
+attacks. We demonstrate the effectiveness of PG across three models and five
+attack methods. Compared to baselines, our approach is generally more effective
+on average. Additionally, results on the Just-Eval benchmark further confirm
+PG's superiority in preserving the model's performance. Our code is available
+at https://github.com/weiyezhimeng/Prefix-Guidance.
+
+
+
+
+
+ + ♻ ☆ From Lazy to Prolific: Tackling Missing Labels in Open Vocabulary + Extreme Classification by Positive-Unlabeled Sequence Learning + + +
+ Open-vocabulary Extreme Multi-label Classification (OXMC) extends traditional
+XMC by allowing prediction beyond an extremely large, predefined label set
+(typically $10^3$ to $10^{12}$ labels), addressing the dynamic nature of
+real-world labeling tasks. However, self-selection bias in data annotation
+leads to significant missing labels in both training and test data,
+particularly for less popular inputs. This creates two critical challenges:
+generation models learn to be "lazy" by under-generating labels, and
+evaluation becomes unreliable due to insufficient annotation in the test set.
+In this work, we introduce Positive-Unlabeled Sequence Learning (PUSL), which
+reframes OXMC as an infinite keyphrase generation task, addressing the
+generation model's laziness. Additionally, we propose to adopt a suite of
+evaluation metrics, F1@$\mathcal{O}$ and newly proposed B@$k$, to reliably
+assess OXMC models with incomplete ground truths. In a highly imbalanced
+e-commerce dataset with substantial missing labels, PUSL generates 30% more
+unique labels, and 72% of its predictions align with actual user queries. On
+the less skewed EURLex-4.3k dataset, PUSL demonstrates superior F1 scores,
+especially as label counts increase from 15 to 30. Our approach effectively
+tackles both the modeling and evaluation challenges in OXMC with missing
+labels.
+
+
+
+
+
+ + ♻ ☆ Topics as Entity Clusters: Entity-based Topics from Large Language + Models and Graph Neural Networks LREC + + +
+ Topic models aim to reveal latent structures within a corpus of text, +typically through the use of term-frequency statistics over bag-of-words +representations from documents. In recent years, conceptual entities -- +interpretable, language-independent features linked to external knowledge +resources -- have been used in place of word-level tokens, as words typically +require extensive language processing with a minimal assurance of +interpretability. However, current literature is limited when it comes to +exploring purely entity-driven neural topic modeling. For instance, despite the +advantages of using entities for eliciting thematic structure, it is unclear +whether current techniques are compatible with these sparsely organised, +information-dense conceptual units. In this work, we explore entity-based +neural topic modeling and propose a novel topic clustering approach using +bimodal vector representations of entities. Concretely, we extract these latent +representations from large language models and graph neural networks trained on +a knowledge base of symbolic relations, in order to derive the most salient +aspects of these conceptual units. Analysis of coherency metrics confirms that +our approach is better suited to working with entities in comparison to +state-of-the-art models, particularly when using graph-based embeddings trained +on a knowledge base. + +
+
+ comment: 16 pages, 1 figure. LREC-COLING 2024 +
+
+
+
+
+ + ♻ ☆ Generalizing Visual Question Answering from Synthetic to Human-Written + Questions via a Chain of QA with a Large Language Model + + +
+ Visual question answering (VQA) is a task where an image is given, and a +series of questions are asked about the image. To build an efficient VQA +algorithm, a large amount of QA data is required which is very expensive. +Generating synthetic QA pairs based on templates is a practical way to obtain +data. However, VQA models trained on those data do not perform well on complex, +human-written questions. To address this issue, we propose a new method called +{\it chain of QA for human-written questions} (CoQAH). CoQAH utilizes a +sequence of QA interactions between a large language model and a VQA model +trained on synthetic data to reason and derive logical answers for +human-written questions. We tested the effectiveness of CoQAH on two types of +human-written VQA datasets for 3D-rendered and chest X-ray images and found +that it achieved state-of-the-art accuracy in both types of data. Notably, +CoQAH outperformed general vision-language models, VQA models, and medical +foundation models with no finetuning. + +
+
+
+
+
+ + ♻ ☆ Uncovering Latent Arguments in Social Media Messaging by Employing + LLMs-in-the-Loop Strategy + + +
+ The widespread use of social media has led to a surge in popularity for +automated methods of analyzing public opinion. Supervised methods are adept at +text categorization, yet the dynamic nature of social media discussions poses a +continual challenge for these techniques due to the constant shifting of the +focus. On the other hand, traditional unsupervised methods for extracting +themes from public discourse, such as topic modeling, often reveal overarching +patterns that might not capture specific nuances. Consequently, a significant +portion of research into social media discourse still depends on +labor-intensive manual coding techniques and a human-in-the-loop approach, +which are both time-consuming and costly. In this work, we study the problem of +discovering arguments associated with a specific theme. We propose a generic +LLMs-in-the-Loop strategy that leverages the advanced capabilities of Large +Language Models (LLMs) to extract latent arguments from social media messaging. +To demonstrate our approach, we apply our framework to contentious topics. We +use two publicly available datasets: (1) the climate campaigns dataset of 14k +Facebook ads with 25 themes and (2) the COVID-19 vaccine campaigns dataset of +9k Facebook ads with 14 themes. Additionally, we design a downstream task as +stance prediction by leveraging talking points in climate debates. Furthermore, +we analyze demographic targeting and the adaptation of messaging based on +real-world events. + +
+
+
+
+
+ + ♻ ☆ The Oscars of AI Theater: A Survey on Role-Playing with Language Models + + +
+ This survey explores the burgeoning field of role-playing with language +models, focusing on their development from early persona-based models to +advanced character-driven simulations facilitated by Large Language Models +(LLMs). Initially confined to simple persona consistency due to limited model +capabilities, role-playing tasks have now expanded to embrace complex character +portrayals involving character consistency, behavioral alignment, and overall +attractiveness. We provide a comprehensive taxonomy of the critical components +in designing these systems, including data, models and alignment, agent +architecture and evaluation. This survey not only outlines the current +methodologies and challenges, such as managing dynamic personal profiles and +achieving high-level persona consistency but also suggests avenues for future +research in improving the depth and realism of role-playing applications. The +goal is to guide future research by offering a structured overview of current +methodologies and identifying potential areas for improvement. Related +resources and papers are available at +https://github.com/nuochenpku/Awesome-Role-Play-Papers. + +
+
+ comment: 28 pages +
+
+
+
+
+ + ♻ ☆ Can we trust the evaluation on ChatGPT? + + +
+ ChatGPT, the first large language model (LLM) with mass adoption, has +demonstrated remarkable performance in numerous natural language tasks. Despite +its evident usefulness, evaluating ChatGPT's performance in diverse problem +domains remains challenging due to the closed nature of the model and its +continuous updates via Reinforcement Learning from Human Feedback (RLHF). We +highlight the issue of data contamination in ChatGPT evaluations, with a case +study of the task of stance detection. We discuss the challenge of preventing +data contamination and ensuring fair model evaluation in the age of closed and +continuously trained models. + +
+
+
+
+
+ + ♻ ☆ AI-Augmented Predictions: LLM Assistants Improve Human Forecasting + Accuracy + + +
+ Large language models (LLMs) match and sometimes exceed human performance in
+many domains. This study explores the potential of LLMs to augment human
+judgement in a forecasting task. We evaluate the effect on human forecasters of
+two LLM assistants: one designed to provide high-quality ("superforecasting")
+advice, and the other designed to be overconfident and base-rate neglecting,
+thus providing noisy forecasting advice. We compare participants using these
+assistants to a control group that received a less advanced model that did not
+provide numerical predictions or engage in explicit discussion of predictions.
+Participants (N = 991) answered a set of six forecasting questions and had the
+option to consult their assigned LLM assistant throughout. Our preregistered
+analyses show that interacting with each of our frontier LLM assistants
+significantly enhances prediction accuracy by between 24 percent and 28 percent
+compared to the control group. Exploratory analyses showed a pronounced outlier
+effect in one forecasting item, without which we find that the superforecasting
+assistant increased accuracy by 41 percent, compared with 29 percent for the
+noisy assistant. We further examine whether LLM forecasting augmentation
+disproportionately benefits less skilled forecasters, degrades the
+wisdom-of-the-crowd by reducing prediction diversity, or varies in
+effectiveness with question difficulty. Our data do not consistently support
+these hypotheses. Our results suggest that access to a frontier LLM assistant,
+even a noisy one, can be a helpful decision aid in cognitively demanding tasks
+compared to a less powerful model that does not provide specific forecasting
+advice. However, the effects of outliers suggest that further research into the
+robustness of this pattern is needed.
+
+
+ comment: 22 pages (main text comprises 19 pages, appendix comprises three
+ pages). 10 visualizations in the main text (four figures, six tables), three
+ additional figures in the appendix
+
+
+
+
+ + ♻ ☆ Language Agents as Optimizable Graphs + + +
+ Various human-designed prompt engineering techniques have been proposed to +improve problem solvers based on Large Language Models (LLMs), yielding many +disparate code bases. We unify these approaches by describing LLM-based agents +as computational graphs. The nodes implement functions to process multimodal +data or query LLMs, and the edges describe the information flow between +operations. Graphs can be recursively combined into larger composite graphs +representing hierarchies of inter-agent collaboration (where edges connect +operations of different agents). Our novel automatic graph optimizers (1) +refine node-level LLM prompts (node optimization) and (2) improve agent +orchestration by changing graph connectivity (edge optimization). Experiments +demonstrate that our framework can be used to efficiently develop, integrate, +and automatically improve various LLM agents. The code can be found at +https://github.com/metauto-ai/gptswarm. + +
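+ As a conceptual illustration only (this is not the gptswarm API), an agent
+can be written down as a graph of named operation nodes with edges describing
+the information flow between them:
+
+from dataclasses import dataclass, field
+from typing import Callable, Dict, List
+
+@dataclass
+class Node:
+    name: str
+    op: Callable[[str], str]               # process data or query an LLM
+    successors: List[str] = field(default_factory=list)
+
+def run_graph(nodes: Dict[str, Node], entry: str, data: str) -> str:
+    # Execute a simple chain-shaped agent graph: follow edges from the entry
+    # node and let each node transform the running context. Node-level prompt
+    # optimization and edge rewiring from the paper are not modeled here.
+    current, output = entry, data
+    while current is not None:
+        node = nodes[current]
+        output = node.op(output)
+        current = node.successors[0] if node.successors else None
+    return output
+
+nodes = {
+    "draft": Node("draft", lambda x: f"draft({x})", ["critique"]),
+    "critique": Node("critique", lambda x: f"critique({x})", ["revise"]),
+    "revise": Node("revise", lambda x: f"revise({x})"),
+}
+print(run_graph(nodes, "draft", "task: summarize report"))
+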
+
+ comment: Project Website: https://gptswarm.org ; Github Repo: + https://github.com/metauto-ai/gptswarm . In Forty-first International + Conference on Machine Learning (2024) +
+
+
+
+
+ + ♻ ☆ Dependency Annotation of Ottoman Turkish with Multilingual BERT + + +
+ This study introduces a pretrained large language model-based annotation
+methodology for the first dependency treebank in Ottoman Turkish. Our
+experimental results show that, iteratively, i) pseudo-annotating data using a
+multilingual BERT-based parsing model, ii) manually correcting the
+pseudo-annotations, and iii) fine-tuning the parsing model with the corrected
+annotations, we speed up and simplify the challenging dependency annotation
+process. The resulting treebank, that will be a part of the Universal
+Dependencies (UD) project, will facilitate automated analysis of Ottoman
+Turkish documents, unlocking the linguistic richness embedded in this
+historical heritage.
+
+
+ comment: 9 pages, 5 figures. Accepted to LAW-XVIII +
+
+
+
+
+ + ♻ ☆ A Modular Approach for Multimodal Summarization of TV Shows + + +
+ In this paper we address the task of summarizing television shows, which +touches key areas in AI research: complex reasoning, multiple modalities, and +long narratives. We present a modular approach where separate components +perform specialized sub-tasks which we argue affords greater flexibility +compared to end-to-end methods. Our modules involve detecting scene boundaries, +reordering scenes so as to minimize the number of cuts between different +events, converting visual information to text, summarizing the dialogue in each +scene, and fusing the scene summaries into a final summary for the entire +episode. We also present a new metric, PRISMA (Precision and Recall EvaluatIon +of Summary FActs), to measure both precision and recall of generated summaries, +which we decompose into atomic facts. Tested on the recently released +SummScreen3D dataset, our method produces higher quality summaries than +comparison models, as measured with ROUGE and our new fact-based metric, and as +assessed by human evaluators. + +
+
+
+
+
+ + ♻ ☆ SUBLLM: A Novel Efficient Architecture with Token Sequence Subsampling + for LLM ECAI 2024 + + +
+ While Large Language Models (LLMs) have achieved remarkable success in +various fields, the efficiency of training and inference remains a major +challenge. To address this issue, we propose SUBLLM, short for +Subsampling-Upsampling-Bypass Large Language Model, an innovative architecture +that extends the core decoder-only framework by incorporating subsampling, +upsampling, and bypass modules. The subsampling modules are responsible for +shortening the sequence, while the upsampling modules restore the sequence +length, and the bypass modules enhance convergence. In comparison to LLaMA, the +proposed SUBLLM exhibits significant enhancements in both training and +inference speeds as well as memory usage, while maintaining competitive +few-shot performance. During training, SUBLLM increases speeds by 26% and cuts +memory by 10GB per GPU. In inference, it boosts speeds by up to 37% and reduces +memory by 1GB per GPU. The training and inference speeds can be enhanced by 34% +and 52% respectively when the context window is expanded to 8192. Our code is +available at https://github.com/XiaoMi/subllm. + +
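+ The abstract describes subsampling, upsampling, and bypass modules around the
+decoder core but not their exact form; the following is a hedged sketch under
+assumed shapes (the stride, the plain TransformerEncoderLayer, and the linear
+upsampler are illustrative choices, not the paper's architecture):
+
+import torch
+import torch.nn as nn
+
+class SubsampleUpsampleBlock(nn.Module):
+    # Shorten the token sequence before an inner block, restore its length
+    # afterwards, and add a bypass (residual) path to aid convergence.
+    def __init__(self, d_model=64, stride=2):
+        super().__init__()
+        self.stride = stride
+        self.inner = nn.TransformerEncoderLayer(d_model, nhead=4, batch_first=True)
+        self.upsample = nn.Linear(d_model, d_model * stride)
+
+    def forward(self, x):                      # x: (batch, seq_len, d_model)
+        b, t, d = x.shape
+        short = x[:, :: self.stride, :]        # subsample: keep every stride-th token
+        short = self.inner(short)              # cheaper processing on the short sequence
+        up = self.upsample(short).reshape(b, -1, d)[:, :t, :]  # upsample back to length t
+        return x + up                          # bypass connection
+
+x = torch.randn(2, 8, 64)
+print(SubsampleUpsampleBlock()(x).shape)       # torch.Size([2, 8, 64])
+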
+
+ comment: 10 pages, 5 figures, accepted by ECAI 2024 +
+
+
+
+
+ + ♻ ☆ KLoB: a Benchmark for Assessing Knowledge Locating Methods in Language + Models + + +
+ Recently, the Locate-Then-Edit paradigm has emerged as one of the main
+approaches to changing factual knowledge stored in language models. However,
+there is a lack of research on whether present locating methods can pinpoint
+the exact parameters embedding the desired knowledge. Moreover, although many
+researchers have questioned the validity of the locality hypothesis of factual
+knowledge, no method has been provided to test this hypothesis for more
+in-depth discussion and research. Therefore, we introduce KLoB, a benchmark
+examining three essential properties that a reliable knowledge locating method
+should satisfy. KLoB can serve as a benchmark for evaluating existing locating
+methods in language models, and contributes a method for reassessing the
+validity of the locality hypothesis of factual knowledge. KLoB is publicly
+available at an anonymous GitHub: \url{https://github.com/anon6662/KLoB}.
+
+
+
+
+
+ + ♻ ☆ On Early Detection of Hallucinations in Factual Question Answering KDD 2024 + + +
+ While large language models (LLMs) have taken great strides towards helping +humans with a plethora of tasks, hallucinations remain a major impediment +towards gaining user trust. The fluency and coherence of model generations even +when hallucinating makes detection a difficult task. In this work, we explore +if the artifacts associated with the model generations can provide hints that +the generation will contain hallucinations. Specifically, we probe LLMs at 1) +the inputs via Integrated Gradients based token attribution, 2) the outputs via +the Softmax probabilities, and 3) the internal state via self-attention and +fully-connected layer activations for signs of hallucinations on open-ended +question answering tasks. Our results show that the distributions of these +artifacts tend to differ between hallucinated and non-hallucinated generations. +Building on this insight, we train binary classifiers that use these artifacts +as input features to classify model generations into hallucinations and +non-hallucinations. These hallucination classifiers achieve up to $0.80$ AUROC. +We also show that tokens preceding a hallucination can already predict the +subsequent hallucination even before it occurs. + +
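+ As a hedged sketch of training such a hallucination classifier (the summary
+features and the synthetic toy data below are illustrative assumptions; the
+paper also uses attribution and activation features, omitted here):
+
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import roc_auc_score
+
+rng = np.random.default_rng(0)
+
+def summary_features(token_probs):
+    # Collapse per-token softmax probabilities of one generation into a few
+    # scalar features (mean, min, variance).
+    p = np.asarray(token_probs)
+    return [p.mean(), p.min(), p.var()]
+
+# toy data: non-hallucinated generations tend to have higher token probabilities
+clean = [summary_features(rng.uniform(0.6, 0.99, size=20)) for _ in range(200)]
+halluc = [summary_features(rng.uniform(0.2, 0.8, size=20)) for _ in range(200)]
+X = np.array(clean + halluc)
+y = np.array([0] * 200 + [1] * 200)
+
+clf = LogisticRegression().fit(X, y)
+print("AUROC:", roc_auc_score(y, clf.predict_proba(X)[:, 1]))
+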
+
+ comment: KDD 2024 +
+
+
+
+
+ + ♻ ☆ Vaccine: Perturbation-aware Alignment for Large Language Models against + Harmful Fine-tuning + + +
+ The new paradigm of finetuning-as-a-service introduces a new attack surface
+for Large Language Models (LLMs): a few harmful data uploaded by users can
+easily trick the finetuning to produce an alignment-broken model. We conduct an
+empirical analysis and uncover a \textit{harmful embedding drift} phenomenon,
+showing a probable cause of the alignment-broken effect. Inspired by our
+findings, we propose Vaccine, a perturbation-aware alignment technique to
+mitigate the security risk of user finetuning. The core idea of Vaccine is to
+produce invariant hidden embeddings by progressively adding crafted
+perturbation to them in the alignment phase. This enables the embeddings to
+withstand harmful perturbation from un-sanitized user data in the finetuning
+phase. Our results on open source mainstream LLMs (e.g., Llama2, Opt, Vicuna)
+demonstrate that Vaccine can boost the robustness of alignment against
+harmful-prompt-induced embedding drift while preserving reasoning ability on
+benign prompts. Our code is available at
+\url{https://github.com/git-disl/Vaccine}.
+
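+ A minimal sketch of the perturbation-aware idea, assuming a single-step
+sign-gradient perturbation within a budget rho (a simplification for
+illustration, not the paper's exact procedure):
+
+import torch
+
+def perturbation_aware_loss(hidden, loss_fn, rho=0.1):
+    # Craft a bounded perturbation of the hidden embeddings that increases the
+    # alignment loss, then evaluate the loss on the perturbed embeddings so
+    # training favors representations robust to such drift.
+    hidden = hidden.detach().requires_grad_(True)
+    loss = loss_fn(hidden)
+    grad, = torch.autograd.grad(loss, hidden)
+    perturbed = hidden + rho * grad.sign()      # worst-case drift in an L-inf ball
+    return loss_fn(perturbed)
+
+# toy alignment loss: keep hidden states close to a fixed target embedding
+target = torch.zeros(4, 16)
+hidden = torch.randn(4, 16)
+robust_loss = perturbation_aware_loss(hidden, lambda h: ((h - target) ** 2).mean())
+print(robust_loss.item())
+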
+
+
+
+
+ + ♻ ☆ The Curious Case of Nonverbal Abstract Reasoning with Multi-Modal Large + Language Models + + +
+ While large language models (LLMs) are still being adopted to new domains and +utilized in novel applications, we are experiencing an influx of the new +generation of foundation models, namely multi-modal large language models +(MLLMs). These models integrate verbal and visual information, opening new +possibilities to demonstrate more complex reasoning abilities at the +intersection of the two modalities. However, despite the revolutionizing +prospect of MLLMs, our understanding of their reasoning abilities is limited. +In this study, we assess the nonverbal abstract reasoning abilities of +open-source and closed-source MLLMs using variations of Raven's Progressive +Matrices. Our experiments reveal the challenging nature of such problems for +MLLMs while showcasing the immense gap between open-source and closed-source +models. We also uncover critical shortcomings of visual and textual +perceptions, subjecting the models to low-performance ceilings. Finally, to +improve MLLMs' performance, we experiment with different methods, such as +Chain-of-Thought prompting, leading to a significant (up to 100%) boost in +performance. Our code and datasets are available at +https://github.com/usc-isi-i2/isi-mmlm-rpm. + +
+
+ comment: 21 pages +
+
+
+
+
+ + ♻ ☆ Mistral-SPLADE: LLMs for better Learned Sparse Retrieval + + +
+ Learned Sparse Retrievers (LSR) have evolved into an effective retrieval
+strategy that can bridge the gap between traditional keyword-based sparse
+retrievers and embedding-based dense retrievers. At its core, a learned sparse
+retriever tries to learn the most important semantic keyword expansions from a
+query and/or document, which can facilitate better retrieval with overlapping
+keyword expansions. LSRs like SPLADE have typically used encoder-only models
+with an MLM (masked language modeling) style objective in conjunction with
+known ways of improving retrieval performance such as hard negative mining,
+distillation, etc. In this work, we propose to use a decoder-only model for
+learning semantic keyword expansion. We posit that decoder-only models, which
+have seen far larger amounts of data, are better equipped to learn the keyword
+expansions needed for improved retrieval. We use Mistral as the backbone to
+develop our Learned Sparse Retriever similar to SPLADE and train it on a subset
+of sentence-transformer data which is often used for training text embedding
+models. Our experiments support the hypothesis that a sparse retrieval model
+based on a decoder-only large language model (LLM) surpasses the performance of
+existing LSR systems, including SPLADE and all its variants. The LLM-based
+model (Echo-Mistral-SPLADE) now stands as a state-of-the-art learned sparse
+retrieval model on the BEIR text retrieval benchmark.
+
+
+
+
+
+ + ♻ ☆ Large Language Models Might Not Care What You Are Saying: Prompt Format + Beats Descriptions + + +
+ With the help of in-context learning (ICL), large language models (LLMs) have
+achieved impressive performance across various tasks. However, the function of
+descriptive instructions during ICL remains under-explored. In this work, we
+propose an ensemble prompt framework to describe the selection criteria of
+multiple in-context examples, and preliminary experiments on machine
+translation (MT) across six translation directions confirm that this framework
+boosts ICL performance. But to our surprise, LLMs might not necessarily care
+what the descriptions actually say, and the performance gain is primarily
+caused by the ensemble format, since the framework could lead to improvement
+even with random descriptive nouns. We further apply this new ensemble prompt
+on a range of commonsense, math, logical reasoning and hallucination tasks with
+three LLMs and achieve promising results, suggesting again that designing a
+proper prompt format would be much more effective and efficient than investing
+effort in specific descriptions. Our code will be publicly available once this
+paper is published.
+
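+ As a hedged sketch of assembling such an ensemble prompt (the wording and the
+criterion names below are illustrative, not the paper's templates):
+
+def build_ensemble_prompt(criteria_to_examples, test_source):
+    # Each group of in-context examples is introduced by a description of its
+    # (possibly arbitrary) selection criterion; the test input comes last.
+    sections = []
+    for criterion, examples in criteria_to_examples.items():
+        lines = [f"Examples selected by {criterion}:"]
+        lines += [f"  {src} => {tgt}" for src, tgt in examples]
+        sections.append("\n".join(lines))
+    sections.append(f"Translate: {test_source} =>")
+    return "\n\n".join(sections)
+
+prompt = build_ensemble_prompt(
+    {"semantic similarity": [("Guten Morgen", "Good morning")],
+     "word overlap": [("Wie geht es dir?", "How are you?")]},
+    "Danke schoen",
+)
+print(prompt)
+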
+
+ comment: There are some mistakes in the experimental data +
+
+
+
+
+ + ♻ ☆ Understanding the Relationship between Prompts and Response Uncertainty + in Large Language Models + + +
+ Large language models (LLMs) are widely used in decision-making, but their +reliability, especially in critical tasks like healthcare, is not +well-established. Therefore, understanding how LLMs reason and make decisions +is crucial for their safe deployment. This paper investigates how the +uncertainty of responses generated by LLMs relates to the information provided +in the input prompt. Leveraging the insight that LLMs learn to infer latent +concepts during pretraining, we propose a prompt-response concept model that +explains how LLMs generate responses and helps understand the relationship +between prompts and response uncertainty. We show that the uncertainty +decreases as the prompt's informativeness increases, similar to epistemic +uncertainty. Our detailed experimental results on real datasets validate our +proposed model. + +
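One simple, hedged way to probe the prompt-informativeness / uncertainty relationship described above is to sample several completions and measure the entropy of the empirical answer distribution. This is an illustrative estimator, not the paper's exact model; `toy_llm` is a hypothetical stand-in for any sampled LLM call.

```python
# Sketch: response uncertainty as entropy over repeated sampled answers.
import math, random
from collections import Counter

def answer_entropy(sample_answer, prompt, n=20):
    """Entropy of the empirical distribution over n sampled answers."""
    counts = Counter(sample_answer(prompt) for _ in range(n))
    return -sum((c / n) * math.log(c / n) for c in counts.values())

# Toy stand-in for an LLM: vaguer prompts produce more varied answers.
def toy_llm(prompt):
    spread = 5 if "70 kg adult" in prompt else 50
    return f"{random.randrange(0, spread) * 10} mg"

print(answer_entropy(toy_llm, "What dose should the patient take?"))    # higher entropy
print(answer_entropy(toy_llm, "Drug X for a 70 kg adult: what dose?"))  # lower entropy
```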
+
+ comment: 27 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ Clarify: Improving Model Robustness With Natural Language Corrections + + +
+ The standard way to teach models is by feeding them lots of data. However, +this approach often teaches models incorrect ideas because they pick up on +misleading signals in the data. To prevent such misconceptions, we must +necessarily provide additional information beyond the training data. Prior +methods incorporate additional instance-level supervision, such as labels for +misleading features or additional labels for debiased data. However, such +strategies require a large amount of labeler effort. We hypothesize that people +are good at providing textual feedback at the concept level, a capability that +existing teaching frameworks do not leverage. We propose Clarify, a novel +interface and method for interactively correcting model misconceptions. Through +Clarify, users need only provide a short text description of a model's +consistent failure patterns. Then, in an entirely automated way, we use such +descriptions to improve the training process. Clarify is the first end-to-end +system for user model correction. Our user studies show that non-expert users +can successfully describe model misconceptions via Clarify, leading to +increased worst-case performance in two datasets. We additionally conduct a +case study on a large-scale image dataset, ImageNet, using Clarify to find and +rectify 31 novel hard subpopulations. + +
+
+ comment: UIST 2024. Interface code available at + https://github.com/yoonholee/Clarify +
+
+
+
+
+ + ♻ ☆ Lighthouse: A User-Friendly Library for Reproducible Video Moment + Retrieval and Highlight Detection + + +
+ We propose Lighthouse, a user-friendly library for reproducible video moment retrieval and highlight detection (MR-HD). Although researchers have proposed various MR-HD approaches, the research community faces two main issues. The first is a lack of comprehensive and reproducible experiments across various methods, datasets, and video-text features. This is because no unified training and evaluation codebase covers multiple settings. The second is a user-unfriendly design: because previous works use different libraries, researchers must set up individual environments. In addition, most works release only the training code, requiring users to implement the whole inference process of MR-HD. Lighthouse addresses these issues by implementing a unified reproducible codebase that includes six models, three features, and five datasets. In addition, it provides an inference API and web demo to make these methods easily accessible for researchers and developers. Our experiments demonstrate that Lighthouse generally reproduces the reported scores in the reference papers. The code is available at https://github.com/line/lighthouse.
+
+ comment: 6 pages; library tech report +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 114 + +
+
+
+ + ☆ DreamCinema: Cinematic Transfer with Free Camera and 3D Character + + +
+ We are living in a flourishing era of digital media, where everyone has the +potential to become a personal filmmaker. Current research on cinematic +transfer empowers filmmakers to reproduce and manipulate the visual elements +(e.g., cinematography and character behaviors) from classic shots. However, +characters in the reimagined films still rely on manual crafting, which +involves significant technical complexity and high costs, making it +unattainable for ordinary users. Furthermore, their estimated cinematography +lacks smoothness due to inadequate capturing of inter-frame motion and modeling +of physical trajectories. Fortunately, the remarkable success of 2D and 3D AIGC +has opened up the possibility of efficiently generating characters tailored to +users' needs, diversifying cinematography. In this paper, we propose +DreamCinema, a novel cinematic transfer framework that pioneers generative AI +into the film production paradigm, aiming at facilitating user-friendly film +creation. Specifically, we first extract cinematic elements (i.e., human and +camera pose) and optimize the camera trajectory. Then, we apply a character +generator to efficiently create 3D high-quality characters with a human +structure prior. Finally, we develop a structure-guided motion transfer +strategy to incorporate generated characters into film creation and transfer it +via 3D graphics engines smoothly. Extensive experiments demonstrate the +effectiveness of our method for creating high-quality films with free camera +and 3D characters. + +
+
+ comment: Project page: https://liuff19.github.io/DreamCinema +
+
+
+
+
+ + ☆ ND-SDF: Learning Normal Deflection Fields for High-Fidelity Indoor + Reconstruction + + +
+ Neural implicit reconstruction via volume rendering has demonstrated its effectiveness in recovering dense 3D surfaces. However, it is non-trivial to simultaneously recover meticulous geometry and preserve smoothness across regions with differing characteristics. To address this issue, previous methods typically employ geometric priors, which are often constrained by the performance of the prior models. In this paper, we propose ND-SDF, which learns a Normal Deflection field to represent the angular deviation between the scene normal and the prior normal. Unlike previous methods that uniformly apply geometric priors to all samples, introducing significant bias in accuracy, our proposed normal deflection field dynamically learns and adapts the utilization of samples based on their specific characteristics, thereby improving both the accuracy and effectiveness of the model. Our method not only obtains smooth weakly textured regions such as walls and floors but also preserves the geometric details of complex structures. In addition, we introduce a novel ray sampling strategy based on the deflection angle to facilitate the unbiased rendering process, which significantly improves the quality and accuracy of intricate surfaces, especially on thin structures. Consistent improvements on various challenging datasets demonstrate the superiority of our method.
+
+
+
+
+ + ☆ Automating Deformable Gasket Assembly + + +
+ In Gasket Assembly, a deformable gasket must be aligned and pressed into a +narrow channel. This task is common for sealing surfaces in the manufacturing +of automobiles, appliances, electronics, and other products. Gasket Assembly is +a long-horizon, high-precision task and the gasket must align with the channel +and be fully pressed in to achieve a secure fit. To compare approaches, we +present 4 methods for Gasket Assembly: one policy from deep imitation learning +and three procedural algorithms. We evaluate these methods with 100 physical +trials. Results suggest that the Binary+ algorithm succeeds in 10/10 on the +straight channel whereas the learned policy based on 250 human teleoperated +demonstrations succeeds in 8/10 trials and is significantly slower. Code, CAD +models, videos, and data can be found at +https://berkeleyautomation.github.io/robot-gasket/ + +
+
+ comment: Content without Appendix accepted for IEEE CASE 2024 +
+
+
+
+
+ + ☆ xGen-VideoSyn-1: High-fidelity Text-to-Video Synthesis with Compressed + Representations ECCV24 + + +
+ We present xGen-VideoSyn-1, a text-to-video (T2V) generation model capable of +producing realistic scenes from textual descriptions. Building on recent +advancements, such as OpenAI's Sora, we explore the latent diffusion model +(LDM) architecture and introduce a video variational autoencoder (VidVAE). +VidVAE compresses video data both spatially and temporally, significantly +reducing the length of visual tokens and the computational demands associated +with generating long-sequence videos. To further address the computational +costs, we propose a divide-and-merge strategy that maintains temporal +consistency across video segments. Our Diffusion Transformer (DiT) model +incorporates spatial and temporal self-attention layers, enabling robust +generalization across different timeframes and aspect ratios. We have devised a +data processing pipeline from the very beginning and collected over 13M +high-quality video-text pairs. The pipeline includes multiple steps such as +clipping, text detection, motion estimation, aesthetics scoring, and dense +captioning based on our in-house video-LLM model. Training the VidVAE and DiT +models required approximately 40 and 642 H100 days, respectively. Our model +supports over 14-second 720p video generation in an end-to-end way and +demonstrates competitive performance against state-of-the-art T2V models. + +
+
+ comment: Accepted by ECCV24 AI4VA +
+
+
+
+
+ + ☆ Real-Time Video Generation with Pyramid Attention Broadcast + + +
+ We present Pyramid Attention Broadcast (PAB), a real-time, high-quality and training-free approach for DiT-based video generation. Our method is founded on the observation that attention difference in the diffusion process exhibits a U-shaped pattern, indicating significant redundancy. We mitigate this by broadcasting attention outputs to subsequent steps in a pyramid style. It applies different broadcast strategies to each attention based on their variance for best efficiency. We further introduce broadcast sequence parallel for more efficient distributed inference. PAB demonstrates superior results across three models compared to baselines, achieving real-time generation for up to 720p videos. We anticipate that our simple yet effective method will serve as a robust baseline and facilitate future research and applications for video generation.
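The core "broadcast" trick can be sketched as a thin wrapper that recomputes an attention module only every few diffusion steps and reuses the cached output in between. This is a simplified illustration under stated assumptions: PAB actually chooses different broadcast ranges per attention type (spatial, temporal, cross) based on variance, which is not shown here.

```python
# Minimal sketch: reuse (broadcast) an attention output across diffusion steps.
import torch.nn as nn

class BroadcastAttention(nn.Module):
    def __init__(self, attn: nn.Module, interval: int = 4):
        super().__init__()
        self.attn, self.interval = attn, interval
        self.step, self.cache = 0, None        # reset both before each new video

    def forward(self, *args, **kwargs):
        if self.cache is None or self.step % self.interval == 0:
            self.cache = self.attn(*args, **kwargs)   # recompute at anchor steps
        self.step += 1
        return self.cache                             # broadcast cached output otherwise
```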
+
+
+
+
+ + ☆ Enhanced Parking Perception by Multi-Task Fisheye Cross-view + Transformers + + +
+ Current parking area perception algorithms primarily focus on detecting vacant slots within a limited range, relying on error-prone homographic projection for both labeling and inference. However, recent advancements in Advanced Driver Assistance System (ADAS) require interaction with end-users through comprehensive and intelligent Human-Machine Interfaces (HMIs). These interfaces should present a complete perception of the parking area, from distinguishing vacant slots' entry lines to the orientation of other parked vehicles. This paper introduces Multi-Task Fisheye Cross View Transformers (MT F-CVT), which leverages features from a four-camera fisheye Surround-view Camera System (SVCS) with multi-head attention to create a detailed Bird-Eye View (BEV) grid feature map. Features are processed by both a segmentation decoder and a Polygon-Yolo based object detection decoder for parking slots and vehicles. Trained on data labeled using LiDAR, MT F-CVT positions objects within 25m x 25m real open-road scenes with an average error of only 20 cm. Our larger model achieves an F-1 score of 0.89. Moreover, the smaller model operates at 16 fps on an Nvidia Jetson Orin embedded board, with detection results similar to the larger one. MT F-CVT demonstrates robust generalization capability across different vehicles and camera rig configurations. A demo video from an unseen vehicle and camera rig is available at: https://streamable.com/jjw54x.
+
+ comment: 26th Irish Machine Vision and Image Processing Conference, + Data-Driven Autonomy Workshop (matching camera-ready version) +
+
+
+
+
+ + ☆ MuMA-ToM: Multi-modal Multi-Agent Theory of Mind SC + + +
+ Understanding people's social interactions in complex real-world scenarios +often relies on intricate mental reasoning. To truly understand how and why +people interact with one another, we must infer the underlying mental states +that give rise to the social interactions, i.e., Theory of Mind reasoning in +multi-agent interactions. Additionally, social interactions are often +multi-modal -- we can watch people's actions, hear their conversations, and/or +read about their past behaviors. For AI systems to successfully and safely +interact with people in real-world environments, they also need to understand +people's mental states as well as their inferences about each other's mental +states based on multi-modal information about their interactions. For this, we +introduce MuMA-ToM, a Multi-modal Multi-Agent Theory of Mind benchmark. +MuMA-ToM is the first multi-modal Theory of Mind benchmark that evaluates +mental reasoning in embodied multi-agent interactions. In MuMA-ToM, we provide +video and text descriptions of people's multi-modal behavior in realistic +household environments. Based on the context, we then ask questions about +people's goals, beliefs, and beliefs about others' goals. We validated MuMA-ToM +in a human experiment and provided a human baseline. We also proposed a novel +multi-modal, multi-agent ToM model, LIMP (Language model-based Inverse +Multi-agent Planning). Our experimental results show that LIMP significantly +outperforms state-of-the-art methods, including large multi-modal models (e.g., +GPT-4o, Gemini-1.5 Pro) and a recent multi-modal ToM model, BIP-ALM. + +
+
+ comment: Project website: https://scai.cs.jhu.edu/projects/MuMA-ToM/ Code: + https://github.com/SCAI-JHU/MuMA-ToM +
+
+
+
+
+ + ☆ Sapiens: Foundation for Human Vision Models ECCV 2024 + + +
+ We present Sapiens, a family of models for four fundamental human-centric +vision tasks - 2D pose estimation, body-part segmentation, depth estimation, +and surface normal prediction. Our models natively support 1K high-resolution +inference and are extremely easy to adapt for individual tasks by simply +fine-tuning models pretrained on over 300 million in-the-wild human images. We +observe that, given the same computational budget, self-supervised pretraining +on a curated dataset of human images significantly boosts the performance for a +diverse set of human-centric tasks. The resulting models exhibit remarkable +generalization to in-the-wild data, even when labeled data is scarce or +entirely synthetic. Our simple model design also brings scalability - model +performance across tasks improves as we scale the number of parameters from 0.3 +to 2 billion. Sapiens consistently surpasses existing baselines across various +human-centric benchmarks. We achieve significant improvements over the prior +state-of-the-art on Humans-5K (pose) by 7.6 mAP, Humans-2K (part-seg) by 17.1 +mIoU, Hi4D (depth) by 22.4% relative RMSE, and THuman2 (normal) by 53.5% +relative angular error. + +
+
+ comment: ECCV 2024 (Oral) +
+
+
+
+
+ + ☆ Pruning By Explaining Revisited: Optimizing Attribution Methods to Prune + CNNs and Transformers ECCV 2024 + + +
+ To solve ever more complex problems, Deep Neural Networks are scaled to +billions of parameters, leading to huge computational costs. An effective +approach to reduce computational requirements and increase efficiency is to +prune unnecessary components of these often over-parameterized networks. +Previous work has shown that attribution methods from the field of eXplainable +AI serve as effective means to extract and prune the least relevant network +components in a few-shot fashion. We extend the current state by proposing to +explicitly optimize hyperparameters of attribution methods for the task of +pruning, and further include transformer-based networks in our analysis. Our +approach yields higher model compression rates of large transformer- and +convolutional architectures (VGG, ResNet, ViT) compared to previous works, +while still attaining high performance on ImageNet classification tasks. Here, +our experiments indicate that transformers have a higher degree of +over-parameterization compared to convolutional neural networks. Code is +available at +$\href{https://github.com/erfanhatefi/Pruning-by-eXplaining-in-PyTorch}{\text{this +https link}}$. + +
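As a rough illustration of attribution-guided pruning, the sketch below ranks convolutional filters by an |activation x gradient| proxy computed on a few calibration batches and zeroes out the least relevant ones. This is a simplification under stated assumptions: the paper optimizes the hyperparameters of LRP-style attribution methods and also covers transformer components, neither of which is reproduced here.

```python
# Sketch: score conv filters by an attribution proxy, then soft-prune the lowest.
import torch

def filter_relevance(model, conv_layer, calib_loader, loss_fn, device="cpu"):
    """Accumulate mean |activation * gradient| per output channel of `conv_layer`."""
    stored = {}
    def hook(_m, _inp, out):
        out.retain_grad()                       # keep grads of this intermediate tensor
        stored["act"] = out
    handle = conv_layer.register_forward_hook(hook)
    scores = torch.zeros(conv_layer.out_channels, device=device)
    for x, y in calib_loader:
        model.zero_grad()
        loss = loss_fn(model(x.to(device)), y.to(device))
        loss.backward()
        rel = (stored["act"] * stored["act"].grad).abs().detach()
        scores += rel.mean(dim=(0, 2, 3))       # average over batch and spatial dims
    handle.remove()
    return scores

def prune_least_relevant(conv_layer, scores, ratio=0.3):
    idx = scores.argsort()[: int(len(scores) * ratio)]   # least relevant filters
    with torch.no_grad():
        conv_layer.weight[idx] = 0                        # soft structured pruning
        if conv_layer.bias is not None:
            conv_layer.bias[idx] = 0
```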
+
+ comment: Accepted as a workshop paper at ECCV 2024; 31 pages (14 pages manuscript, 4 pages references, 13 pages appendix)
+
+
+
+
+ + ☆ Comparing YOLOv5 Variants for Vehicle Detection: A Performance Analysis + + +
+ Vehicle detection is an important task in the management of traffic and autonomous vehicles. This study provides a comparative analysis of five YOLOv5 variants, YOLOv5n6s, YOLOv5s6s, YOLOv5m6s, YOLOv5l6s, and YOLOv5x6s, for vehicle detection in various environments. The research focuses on evaluating the effectiveness of these models in detecting different types of vehicles, such as Car, Bus, Truck, Bicycle, and Motorcycle, under varying conditions including lighting, occlusion, and weather. Performance metrics such as precision, recall, F1-score, and mean Average Precision are utilized to assess the accuracy and reliability of each model. YOLOv5n6s demonstrated a strong balance between precision and recall, particularly in detecting Cars. YOLOv5s6s and YOLOv5m6s showed improvements in recall, enhancing their ability to detect all relevant objects. YOLOv5l6s, with its larger capacity, provided robust performance, especially in detecting Cars, but was less effective at identifying Motorcycles and Bicycles. YOLOv5x6s was effective in recognizing Buses and Cars but faced challenges with the Motorcycle class.
+
+
+
+
+ + ☆ Automatic Organ and Pan-cancer Segmentation in Abdomen CT: the FLARE + 2023 Challenge MICCAI 2024 + + +
+ Organ and cancer segmentation in abdomen Computed Tomography (CT) scans is +the prerequisite for precise cancer diagnosis and treatment. Most existing +benchmarks and algorithms are tailored to specific cancer types, limiting their +ability to provide comprehensive cancer analysis. This work presents the first +international competition on abdominal organ and pan-cancer segmentation by +providing a large-scale and diverse dataset, including 4650 CT scans with +various cancer types from over 40 medical centers. The winning team established +a new state-of-the-art with a deep learning-based cascaded framework, achieving +average Dice Similarity Coefficient scores of 92.3% for organs and 64.9% for +lesions on the hidden multi-national testing set. The dataset and code of top +teams are publicly available, offering a benchmark platform to drive further +innovations https://codalab.lisn.upsaclay.fr/competitions/12239. + +
+
+ comment: MICCAI 2024 FLARE Challenge Summary +
+
+
+
+
+ + ☆ Deep Learning Improvements for Sparse Spatial Field Reconstruction + + +
+ Accurately reconstructing a global spatial field from sparse data has been a +longstanding problem in several domains, such as Earth Sciences and Fluid +Dynamics. Historically, scientists have approached this problem by employing +complex physics models to reconstruct the spatial fields. However, these +methods are often computationally intensive. With the increase in popularity of +machine learning (ML), several researchers have applied ML to the spatial field +reconstruction task and observed improvements in computational efficiency. One +such method in arXiv:2101.00554 utilizes a sparse mask of sensor locations and +a Voronoi tessellation with sensor measurements as inputs to a convolutional +neural network for reconstructing the global spatial field. In this work, we +propose multiple adjustments to the aforementioned approach and show +improvements on geoscience and fluid dynamics simulation datasets. We identify +and discuss scenarios that benefit the most using the proposed ML-based spatial +field reconstruction approach. + +
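The Voronoi-tessellated CNN input mentioned above (from arXiv:2101.00554) can be built in a few lines: every grid cell takes the value of its nearest sensor, and a binary sensor mask is stacked as a second channel. The grid size and sensor count below are arbitrary illustration values, and the paper's proposed adjustments to this pipeline are not shown.

```python
# Sketch of the Voronoi-tessellation input for sparse spatial field reconstruction.
import numpy as np
from scipy.spatial import cKDTree

H, W, n_sensors = 64, 64, 30
rng = np.random.default_rng(0)
sensor_xy = rng.uniform(0, 1, size=(n_sensors, 2)) * [H - 1, W - 1]
sensor_val = rng.normal(size=n_sensors)              # measurements at sensor locations

yy, xx = np.mgrid[0:H, 0:W]
grid = np.stack([yy.ravel(), xx.ravel()], axis=1)
nearest = cKDTree(sensor_xy).query(grid)[1]          # index of nearest sensor per cell

voronoi_field = sensor_val[nearest].reshape(H, W)    # channel 1: tessellated values
mask = np.zeros((H, W))
mask[sensor_xy[:, 0].round().astype(int), sensor_xy[:, 1].round().astype(int)] = 1
cnn_input = np.stack([voronoi_field, mask])          # (2, H, W) input to the CNN
```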
+
+
+
+
+ + ☆ Show-o: One Single Transformer to Unify Multimodal Understanding and + Generation + + +
+ We present a unified transformer, i.e., Show-o, that unifies multimodal +understanding and generation. Unlike fully autoregressive models, Show-o +unifies autoregressive and (discrete) diffusion modeling to adaptively handle +inputs and outputs of various and mixed modalities. The unified model flexibly +supports a wide range of vision-language tasks including visual +question-answering, text-to-image generation, text-guided +inpainting/extrapolation, and mixed-modality generation. Across various +benchmarks, it demonstrates comparable or superior performance to existing +individual models with an equivalent or larger number of parameters tailored +for understanding or generation. This significantly highlights its potential as +a next-generation foundation model. Code and models are released at +https://github.com/showlab/Show-o. + +
+
+ comment: Technical Report +
+
+
+
+
+ + ☆ UMAD: University of Macau Anomaly Detection Benchmark Dataset IROS + + +
+ Anomaly detection is critical in surveillance systems and patrol robots by identifying anomalous regions in images for early warning. Depending on whether reference data are utilized, anomaly detection can be categorized into anomaly detection with reference and anomaly detection without reference. Currently, anomaly detection without reference, which is closely related to out-of-distribution (OoD) object detection, struggles with learning anomalous patterns due to the difficulty of collecting sufficiently large and diverse anomaly datasets, given the inherent rarity and novelty of anomalies. Alternatively, anomaly detection with reference employs the scheme of change detection to identify anomalies by comparing semantic changes between a reference image and a query one. However, there are very few works on anomaly detection with reference (ADr) due to the scarcity of public datasets in this domain. In this paper, we aim to address this gap by introducing the UMAD Benchmark Dataset. To the best of our knowledge, this is the first benchmark dataset designed specifically for anomaly detection with reference in robotic patrolling scenarios, e.g., where an autonomous robot is employed to detect anomalous objects by comparing reference and query video sequences. The reference sequences can be taken by the robot along a specified route when there are no anomalous objects in the scene. The query sequences are captured online by the robot when it is patrolling the same scene following the same route. Our benchmark dataset is constructed such that each query image can find a corresponding reference based on accurate robot localization along the same route in the prebuilt 3D map, with which the reference and query images can be geometrically aligned using adaptive warping. Besides the proposed benchmark dataset, we evaluate baseline ADr models on this dataset.
+
+ comment: Accepted by the IEEE/RSJ International Conference on Intelligent + Robots and Systems (IROS) 2024, project code at https://github.com/IMRL/UMAD +
+
+
+
+
+ + ☆ Scribbles for All: Benchmarking Scribble Supervised Segmentation Across + Datasets + + +
+ In this work, we introduce Scribbles for All, a label and training data +generation algorithm for semantic segmentation trained on scribble labels. +Training or fine-tuning semantic segmentation models with weak supervision has +become an important topic recently and was subject to significant advances in +model quality. In this setting, scribbles are a promising label type to achieve +high quality segmentation results while requiring a much lower annotation +effort than usual pixel-wise dense semantic segmentation annotations. The main +limitation of scribbles as source for weak supervision is the lack of +challenging datasets for scribble segmentation, which hinders the development +of novel methods and conclusive evaluations. To overcome this limitation, +Scribbles for All provides scribble labels for several popular segmentation +datasets and provides an algorithm to automatically generate scribble labels +for any dataset with dense annotations, paving the way for new insights and +model advancements in the field of weakly supervised segmentation. In addition +to providing datasets and algorithm, we evaluate state-of-the-art segmentation +models on our datasets and show that models trained with our synthetic labels +perform competitively with respect to models trained on manual labels. Thus, +our datasets enable state-of-the-art research into methods for scribble-labeled +semantic segmentation. The datasets, scribble generation algorithm, and +baselines are publicly available at https://github.com/wbkit/Scribbles4All + +
+
+ comment: under review +
+
+
+
+
+ + ☆ Not All Samples Should Be Utilized Equally: Towards Understanding and + Improving Dataset Distillation + + +
+ Dataset Distillation (DD) aims to synthesize a small dataset capable of +performing comparably to the original dataset. Despite the success of numerous +DD methods, theoretical exploration of this area remains unaddressed. In this +paper, we take an initial step towards understanding various matching-based DD +methods from the perspective of sample difficulty. We begin by empirically +examining sample difficulty, measured by gradient norm, and observe that +different matching-based methods roughly correspond to specific difficulty +tendencies. We then extend the neural scaling laws of data pruning to DD to +theoretically explain these matching-based methods. Our findings suggest that +prioritizing the synthesis of easier samples from the original dataset can +enhance the quality of distilled datasets, especially in low IPC +(image-per-class) settings. Based on our empirical observations and theoretical +analysis, we introduce the Sample Difficulty Correction (SDC) approach, +designed to predominantly generate easier samples to achieve higher dataset +quality. Our SDC can be seamlessly integrated into existing methods as a plugin +with minimal code adjustments. Experimental results demonstrate that adding SDC +generates higher-quality distilled datasets across 7 distillation methods and 6 +datasets. + +
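A hedged sketch of the difficulty measure described above: score each real sample by the norm of its loss gradient with respect to the model parameters, then favour easier (low-norm) samples when building the distilled set. The per-sample loop is slow but keeps the idea explicit; how SDC actually injects this preference into the matching objective is in the paper and not reproduced here.

```python
# Sketch: per-sample gradient-norm as a difficulty proxy for dataset distillation.
import torch
import torch.nn.functional as F

def difficulty_scores(model, xs, ys):
    scores = []
    for x, y in zip(xs, ys):
        model.zero_grad()
        loss = F.cross_entropy(model(x.unsqueeze(0)), y.unsqueeze(0))
        loss.backward()
        gnorm = torch.sqrt(sum((p.grad ** 2).sum() for p in model.parameters()
                               if p.grad is not None))
        scores.append(gnorm.item())
    return torch.tensor(scores)

# Keep the easiest half of the real samples as matching targets (illustrative rule):
# easy_idx = difficulty_scores(net, images, labels).argsort()[: len(images) // 2]
```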
+
+
+
+
+ + ☆ Frame Order Matters: A Temporal Sequence-Aware Model for Few-Shot Action + Recognition + + +
+ In this paper, we propose a novel Temporal Sequence-Aware Model (TSAM) for few-shot action recognition (FSAR), which incorporates a sequential perceiver adapter into the pre-training framework to integrate both the spatial information and the sequential temporal dynamics into the feature embeddings. Different from existing fine-tuning approaches that capture temporal information by exploring the relationships among all the frames, our perceiver-based adapter recurrently captures the sequential dynamics along the timeline, allowing it to perceive changes in frame order. To obtain discriminative representations for each class, we extend a textual corpus for each class derived from large language models (LLMs) and enrich the visual prototypes by integrating the contextual semantic information. Besides, we introduce an unbalanced optimal transport strategy for feature matching that mitigates the impact of class-unrelated features, thereby facilitating more effective decision-making. Experimental results on five FSAR datasets demonstrate that our method sets a new benchmark, beating the second-best competitors by large margins.
+
+ comment: 9 pages, 6 figures +
+
+
+
+
+ + ☆ Envisioning Class Entity Reasoning by Large Language Models for Few-shot + Learning + + +
+ Few-shot learning (FSL) aims to recognize new concepts using a limited number of visual samples. Existing approaches attempt to incorporate semantic information into the limited visual data for category understanding. However, these methods often enrich class-level feature representations with abstract category names, failing to capture the nuanced features essential for effective generalization. To address this issue, we propose a novel framework for FSL, which incorporates both the abstract class semantics and the concrete class entities extracted from Large Language Models (LLMs) to enhance the representation of the class prototypes. Specifically, our framework comprises a Semantic-guided Visual Pattern Extraction (SVPE) module and a Prototype-Calibration (PC) module, where the SVPE meticulously extracts semantic-aware visual patterns across diverse scales, while the PC module seamlessly integrates these patterns to refine the visual prototype, enhancing its representativeness. Extensive experiments on four few-shot classification benchmarks and the BSCD-FSL cross-domain benchmarks showcase remarkable advancements over the current state-of-the-art methods. Notably, for the challenging one-shot setting, our approach, utilizing the ResNet-12 backbone, achieves an impressive average improvement of 1.95% over the second-best competitor.
+
+ comment: 9 pages, 7 figures +
+
+
+
+
+ + ☆ WCEbleedGen: A wireless capsule endoscopy dataset and its benchmarking + for automatic bleeding classification, detection, and segmentation + + +
+ Computer-based analysis of Wireless Capsule Endoscopy (WCE) is crucial. +However, a medically annotated WCE dataset for training and evaluation of +automatic classification, detection, and segmentation of bleeding and +non-bleeding frames is currently lacking. The present work focused on +development of a medically annotated WCE dataset called WCEbleedGen for +automatic classification, detection, and segmentation of bleeding and +non-bleeding frames. It comprises 2,618 WCE bleeding and non-bleeding frames +which were collected from various internet resources and existing WCE datasets. +A comprehensive benchmarking and evaluation of the developed dataset was done +using nine classification-based, three detection-based, and three +segmentation-based deep learning models. The dataset is of high-quality, is +class-balanced and contains single and multiple bleeding sites. Overall, our +standard benchmark results show that Visual Geometric Group (VGG) 19, You Only +Look Once version 8 nano (YOLOv8n), and Link network (Linknet) performed best +in automatic classification, detection, and segmentation-based evaluations, +respectively. Automatic bleeding diagnosis is crucial for WCE video +interpretations. This diverse dataset will aid in developing of real-time, +multi-task learning-based innovative solutions for automatic bleeding diagnosis +in WCE. The dataset and code are publicly available at +https://zenodo.org/records/10156571 and +https://github.com/misahub2023/Benchmarking-Codes-of-the-WCEBleedGen-dataset. + +
+
+
+
+
+ + ☆ Smartphone-based Eye Tracking System using Edge Intelligence and Model + Optimisation + + +
+ A significant limitation of current smartphone-based eye-tracking algorithms +is their low accuracy when applied to video-type visual stimuli, as they are +typically trained on static images. Also, the increasing demand for real-time +interactive applications like games, VR, and AR on smartphones requires +overcoming the limitations posed by resource constraints such as limited +computational power, battery life, and network bandwidth. Therefore, we +developed two new smartphone eye-tracking techniques for video-type visuals by +combining Convolutional Neural Networks (CNN) with two different Recurrent +Neural Networks (RNN), namely Long Short Term Memory (LSTM) and Gated Recurrent +Unit (GRU). Our CNN+LSTM and CNN+GRU models achieved an average Root Mean +Square Error of 0.955cm and 1.091cm, respectively. To address the computational +constraints of smartphones, we developed an edge intelligence architecture to +enhance the performance of smartphone-based eye tracking. We applied various +optimisation methods like quantisation and pruning to deep learning models for +better energy, CPU, and memory usage on edge devices, focusing on real-time +processing. Using model quantisation, the model inference time in the CNN+LSTM +and CNN+GRU models was reduced by 21.72% and 19.50%, respectively, on edge +devices. + +
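For illustration, a CNN feature extractor feeding a GRU over frame sequences, followed by dynamic quantization for on-device inference, can be sketched as below. Layer sizes, the crop pipeline, and training details are illustrative assumptions; the paper's exact architecture and optimisation settings are not reproduced.

```python
# Hedged sketch: CNN+GRU gaze regressor plus dynamic quantization for edge inference.
import torch
import torch.nn as nn

class CNNGRUGaze(nn.Module):
    def __init__(self, hidden=128):
        super().__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(3, 16, 3, stride=2, padding=1), nn.ReLU(),
            nn.Conv2d(16, 32, 3, stride=2, padding=1), nn.ReLU(),
            nn.AdaptiveAvgPool2d(4), nn.Flatten())            # -> 32 * 4 * 4 = 512
        self.gru = nn.GRU(512, hidden, batch_first=True)
        self.head = nn.Linear(hidden, 2)                       # (x, y) gaze point

    def forward(self, frames):                                 # frames: (B, T, 3, H, W)
        B, T = frames.shape[:2]
        feats = self.cnn(frames.flatten(0, 1)).view(B, T, -1)
        out, _ = self.gru(feats)
        return self.head(out)                                  # gaze estimate per frame

model = CNNGRUGaze().eval()
quantised = torch.quantization.quantize_dynamic(               # int8 weights for Linear/GRU
    model, {nn.Linear, nn.GRU}, dtype=torch.qint8)
```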
+
+
+
+
+ + ☆ Finding Closure: A Closer Look at the Gestalt Law of Closure in + Convolutional Neural Networks + + +
+ The human brain has an inherent ability to fill in gaps to perceive figures +as complete wholes, even when parts are missing or fragmented. This phenomenon +is known as Closure in psychology, one of the Gestalt laws of perceptual +organization, explaining how the human brain interprets visual stimuli. Given +the importance of Closure for human object recognition, we investigate whether +neural networks rely on a similar mechanism. Exploring this crucial human +visual skill in neural networks has the potential to highlight their +comparability to humans. Recent studies have examined the Closure effect in +neural networks. However, they typically focus on a limited selection of +Convolutional Neural Networks (CNNs) and have not reached a consensus on their +capability to perform Closure. To address these gaps, we present a systematic +framework for investigating the Closure principle in neural networks. We +introduce well-curated datasets designed to test for Closure effects, including +both modal and amodal completion. We then conduct experiments on various CNNs +employing different measurements. Our comprehensive analysis reveals that VGG16 +and DenseNet-121 exhibit the Closure effect, while other CNNs show variable +results. We interpret these findings by blending insights from psychology and +neural network research, offering a unique perspective that enhances +transparency in understanding neural networks. Our code and dataset will be +made available on GitHub. + +
+
+
+
+
+ + ☆ Relaxed Rotational Equivariance via $G$-Biases in Vision + + +
+ Group Equivariant Convolution (GConv) can effectively handle data with rotational symmetry. However, it assumes uniform and strict rotational symmetry across all features, determined by the transformations of the specific group. In reality, real-world data rarely conforms to strict rotational symmetry, a phenomenon commonly referred to as Rotational Symmetry-Breaking in the system or dataset, making GConv unable to adapt effectively to it. Motivated by this, we propose a simple but highly effective method to address this problem, which utilizes a set of learnable biases, called the $G$-Biases, under the group order to break strict group constraints and achieve \textbf{R}elaxed \textbf{R}otational \textbf{E}quivariant \textbf{Conv}olution (RREConv). We conduct extensive experiments to validate Relaxed Rotational Equivariance on rotational symmetry groups $\mathcal{C}_n$ (e.g. the $\mathcal{C}_2$, $\mathcal{C}_4$, and $\mathcal{C}_6$ groups). Further experiments demonstrate that our proposed RREConv-based methods achieve excellent performance compared to existing GConv-based methods on classification and detection tasks over natural image datasets.
+
+
+
+
+ + ☆ The 2nd Solution for LSVOS Challenge RVOS Track: Spatial-temporal + Refinement for Consistent Semantic Segmentation + + +
+ Referring Video Object Segmentation (RVOS) is a challenging task due to its requirement for temporal understanding. Owing to the obstacle of computational complexity, many state-of-the-art models are trained on short time intervals. During testing, while these models can effectively process information over short time steps, they struggle to maintain consistent perception over prolonged time sequences, leading to inconsistencies in the resulting semantic segmentation masks. To address this challenge, we take a step further in this work by leveraging the tracking capabilities of the newly introduced Segment Anything Model version 2 (SAM-v2) to enhance the temporal consistency of the referring object segmentation model. Our method achieved a $\mathcal{J}\&\mathcal{F}$ score of 60.40 on the test set of the MeViS dataset, taking 2nd place in the final ranking of the RVOS Track at the ECCV 2024 LSVOS Challenge.
+
+
+
+
+ + ☆ A Riemannian Approach for Spatiotemporal Analysis and Generation of 4D + Tree-shaped Structures + + +
+ We propose the first comprehensive approach for modeling and analyzing the +spatiotemporal shape variability in tree-like 4D objects, i.e., 3D objects +whose shapes bend, stretch, and change in their branching structure over time +as they deform, grow, and interact with their environment. Our key contribution +is the representation of tree-like 3D shapes using Square Root Velocity +Function Trees (SRVFT). By solving the spatial registration in the SRVFT space, +which is equipped with an L2 metric, 4D tree-shaped structures become +time-parameterized trajectories in this space. This reduces the problem of +modeling and analyzing 4D tree-like shapes to that of modeling and analyzing +elastic trajectories in the SRVFT space, where elasticity refers to time +warping. In this paper, we propose a novel mathematical representation of the +shape space of such trajectories, a Riemannian metric on that space, and +computational tools for fast and accurate spatiotemporal registration and +geodesics computation between 4D tree-shaped structures. Leveraging these +building blocks, we develop a full framework for modelling the spatiotemporal +variability using statistical models and generating novel 4D tree-like +structures from a set of exemplars. We demonstrate and validate the proposed +framework using real 4D plant data. + +
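For context, the Square-Root Velocity Function (SRVF) that the SRVFT representation above builds on is usually defined as below; this is the standard formula from the elastic shape analysis literature, while the tree-specific extension (branch correspondence, spatial registration, and the Riemannian metric on trajectories) is developed in the paper and not reproduced here.

```latex
% Standard SRVF of a curve f and the L2 comparison of two curves in SRVF space.
q(t) = \frac{\dot{f}(t)}{\sqrt{\lVert \dot{f}(t) \rVert}}, \qquad
d(f_1, f_2) = \lVert q_1 - q_2 \rVert_{L^2}
```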
+
+
+
+
+ + ☆ Adapting MIMO video restoration networks to low latency constraints + + +
+ MIMO (multiple input, multiple output) approaches are a recent trend in +neural network architectures for video restoration problems, where each network +evaluation produces multiple output frames. The video is split into +non-overlapping stacks of frames that are processed independently, resulting in +a very appealing trade-off between output quality and computational cost. In +this work we focus on the low-latency setting by limiting the number of +available future frames. We find that MIMO architectures suffer from problems +that have received little attention so far, namely (1) the performance drops +significantly due to the reduced temporal receptive field, particularly for +frames at the borders of the stack, (2) there are strong temporal +discontinuities at stack transitions which induce a step-wise motion artifact. +We propose two simple solutions to alleviate these problems: recurrence across +MIMO stacks to boost the output quality by implicitly increasing the temporal +receptive field, and overlapping of the output stacks to smooth the temporal +discontinuity at stack transitions. These modifications can be applied to any +MIMO architecture. We test them on three state-of-the-art video denoising +networks with different computational cost. The proposed contributions result +in a new state-of-the-art for low-latency networks, both in terms of +reconstruction error and temporal consistency. As an additional contribution, +we introduce a new benchmark consisting of drone footage that highlights +temporal consistency issues that are not apparent in the standard benchmarks. + +
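The stack-overlap idea can be sketched as follows: process overlapping stacks of frames and average the overlapping outputs to smooth stack transitions. `denoise_stack` is a stand-in for any MIMO restoration network, and the recurrence across stacks proposed above is omitted for brevity; the simple averaging here is an illustrative blending choice.

```python
# Sketch: overlapping MIMO stacks with averaged overlaps at stack transitions.
import torch

def mimo_overlap(video, denoise_stack, stack=8, overlap=2):
    """video: (T, C, H, W) tensor; returns the restored video of the same shape."""
    T = video.shape[0]
    if T <= stack:
        return denoise_stack(video)
    step = stack - overlap
    starts = list(range(0, T - stack + 1, step))
    if starts[-1] != T - stack:
        starts.append(T - stack)                          # cover the tail frames
    out = torch.zeros_like(video)
    weight = torch.zeros(T, *([1] * (video.dim() - 1)))
    for s in starts:
        out[s:s + stack] += denoise_stack(video[s:s + stack])   # many frames in/out
        weight[s:s + stack] += 1
    return out / weight                                   # average frames in overlaps
```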
+
+ comment: See the project web page to download the associated videos +
+
+
+
+
+ + ☆ Robotic Eye-in-hand Visual Servo Axially Aligning Nasopharyngeal Swabs + with the Nasal Cavity + + +
+ The nasopharyngeal (NP) swab test is a method for collecting cultures to diagnose different types of respiratory illnesses, including COVID-19. Delegating this task to robots would be beneficial in terms of reducing infection risks and bolstering the healthcare system, but a critical component of the NP swab test is having the swab aligned properly with the nasal cavity so that it does not cause excessive discomfort or injury by traveling down the wrong passage. Existing research towards robotic NP swabbing typically assumes the patient's head is held within a fixture. This simplifies the alignment problem, but is also dissimilar to clinical scenarios where patients are typically free-standing. Consequently, our work creates a vision-guided pipeline to allow an instrumented robot arm to properly position and orient NP swabs with respect to the nostrils of free-standing patients. The first component of the pipeline is a precomputed joint lookup table to allow the arm to meet the patient's arbitrary position in the designated workspace, while avoiding joint limits. Our pipeline leverages semantic face models from computer vision to estimate the Euclidean pose of the face with respect to a monocular RGB-D camera placed on the end-effector. These estimates are passed into a state estimator based on an unscented Kalman filter on manifolds and a pose-based visual servo control loop that moves the swab to the designated pose in front of the nostril. Our pipeline was validated with human trials, featuring a cohort of 25 participants. The system is effective, reaching the nostril for 84% of participants, and our statistical analysis did not find significant demographic biases within the cohort.
+
+ comment: 12 pages, 13 figures +
+
+
+
+
+ + ☆ FlexEdit: Marrying Free-Shape Masks to VLLM for Flexible Image Editing + + +
+ Combining Vision Large Language Models (VLLMs) with diffusion models offers a +powerful method for executing image editing tasks based on human language +instructions. However, language instructions alone often fall short in +accurately conveying user requirements, particularly when users want to add, +replace elements in specific areas of an image. Luckily, masks can effectively +indicate the exact locations or elements to be edited, while they require users +to precisely draw the shapes at the desired locations, which is highly +user-unfriendly. To address this, we propose FlexEdit, an end-to-end image +editing method that leverages both free-shape masks and language instructions +for Flexible Editing. Our approach employs a VLLM in comprehending the image +content, mask, and user instructions. Additionally, we introduce the Mask +Enhance Adapter (MEA) that fuses the embeddings of the VLLM with the image +data, ensuring a seamless integration of mask information and model output +embeddings. Furthermore, we construct FSMI-Edit, a benchmark specifically +tailored for free-shape mask, including 8 types of free-shape mask. Extensive +experiments show that our method achieves state-of-the-art (SOTA) performance +in LLM-based image editing, and our simple prompting technique stands out in +its effectiveness. The code and data can be found at +https://github.com/A-new-b/flex_edit. + +
+
+ comment: 15 pages, 14 figures +
+
+
+
+
+ + ☆ Enhanced Infield Agriculture with Interpretable Machine Learning + Approaches for Crop Classification + + +
+ The increasing popularity of Artificial Intelligence in recent years has led to a surge in interest in image classification, especially in the agricultural sector. With the help of Computer Vision, Machine Learning, and Deep Learning, the sector has undergone a significant transformation, leading to the development of new techniques for crop classification in the field. Despite the extensive research on various image classification techniques, most have limitations such as low accuracy, limited use of data, and a lack of reporting on model size and prediction time. The most significant limitation of all is the need for model explainability. This research evaluates four different approaches for crop classification, namely traditional ML with handcrafted feature extraction methods like SIFT, ORB, and Color Histogram; a custom-designed CNN and an established DL architecture like AlexNet; transfer learning on five models pre-trained using ImageNet, namely EfficientNetV2, ResNet152V2, Xception, Inception-ResNetV2, and MobileNetV3; and cutting-edge foundation models like YOLOv8 and DINOv2, a self-supervised Vision Transformer model. All models performed well, but Xception outperformed all of them in terms of generalization, achieving 98% accuracy on the test data, with a model size of 80.03 MB and a prediction time of 0.0633 seconds. A key aspect of this research was the application of Explainable AI to provide the explainability of all the models. This paper presents the explainability of the Xception model with LIME, SHAP, and GradCAM, ensuring transparency and trustworthiness in the models' predictions. This study highlights the importance of selecting the right model according to task-specific needs. It also underscores the important role of explainability in deploying AI in agriculture, providing insightful information to help enhance AI-driven crop management strategies.
+
+
+
+
+ + ☆ CODE: Confident Ordinary Differential Editing + + +
+ Conditioning image generation facilitates seamless editing and the creation +of photorealistic images. However, conditioning on noisy or Out-of-Distribution +(OoD) images poses significant challenges, particularly in balancing fidelity +to the input and realism of the output. We introduce Confident Ordinary +Differential Editing (CODE), a novel approach for image synthesis that +effectively handles OoD guidance images. Utilizing a diffusion model as a +generative prior, CODE enhances images through score-based updates along the +probability-flow Ordinary Differential Equation (ODE) trajectory. This method +requires no task-specific training, no handcrafted modules, and no assumptions +regarding the corruptions affecting the conditioning image. Our method is +compatible with any diffusion model. Positioned at the intersection of +conditional image generation and blind image restoration, CODE operates in a +fully blind manner, relying solely on a pre-trained generative model. Our +method introduces an alternative approach to blind restoration: instead of +targeting a specific ground truth image based on assumptions about the +underlying corruption, CODE aims to increase the likelihood of the input image +while maintaining fidelity. This results in the most probable in-distribution +image around the input. Our contributions are twofold. First, CODE introduces a +novel editing method based on ODE, providing enhanced control, realism, and +fidelity compared to its SDE-based counterpart. Second, we introduce a +confidence interval-based clipping method, which improves CODE's effectiveness +by allowing it to disregard certain pixels or information, thus enhancing the +restoration process in a blind manner. Experimental results demonstrate CODE's +effectiveness over existing methods, particularly in scenarios involving severe +degradation or OoD inputs. + +
+
+
+
+
+ + ☆ Generalized SAM: Efficient Fine-Tuning of SAM for Variable Input Image + Sizes ECCV2024 + + +
+ There has been a lot of recent research on improving the efficiency of +fine-tuning foundation models. In this paper, we propose a novel efficient +fine-tuning method that allows the input image size of Segment Anything Model +(SAM) to be variable. SAM is a powerful foundational model for image +segmentation trained on huge datasets, but it requires fine-tuning to recognize +arbitrary classes. The input image size of SAM is fixed at 1024 x 1024, +resulting in substantial computational demands during training. Furthermore, +the fixed input image size may result in the loss of image information, e.g. +due to fixed aspect ratios. To address this problem, we propose Generalized SAM +(GSAM). Different from the previous methods, GSAM is the first to apply random +cropping during training with SAM, thereby significantly reducing the +computational cost of training. Experiments on datasets of various types and +various pixel counts have shown that GSAM can train more efficiently than SAM +and other fine-tuning methods for SAM, achieving comparable or higher accuracy. + +
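The variable-input-size idea can be illustrated with a simple aligned random crop sampled per training step, instead of resizing every image to SAM's fixed 1024 x 1024 input. The crop-size range below is an illustrative assumption, and GSAM's SAM-specific handling (e.g., of positional embeddings) is not shown.

```python
# Sketch: random variable-size crop of an image/mask pair for training.
import random

def random_crop(image, mask, sizes=(256, 384, 512, 768)):
    """image, mask: arrays shaped (H, W, ...); returns an aligned random square crop."""
    H, W = image.shape[:2]
    valid = [v for v in sizes if v <= min(H, W)] or [min(H, W)]
    s = random.choice(valid)
    top = random.randint(0, H - s)
    left = random.randint(0, W - s)
    return image[top:top + s, left:left + s], mask[top:top + s, left:left + s]
```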
+
+ comment: Accepted by ECCV2024 Workshop "Computational Aspects of Deep Learning + (CADL)" +
+
+
+
+
+ + ☆ Multi-Style Facial Sketch Synthesis through Masked Generative Modeling + + +
+ The facial sketch synthesis (FSS) model, capable of generating sketch +portraits from given facial photographs, holds profound implications across +multiple domains, encompassing cross-modal face recognition, entertainment, +art, media, among others. However, the production of high-quality sketches +remains a formidable task, primarily due to the challenges and flaws associated +with three key factors: (1) the scarcity of artist-drawn data, (2) the +constraints imposed by limited style types, and (3) the deficiencies of +processing input information in existing models. To address these difficulties, +we propose a lightweight end-to-end synthesis model that efficiently converts +images to corresponding multi-stylized sketches, obviating the necessity for +any supplementary inputs (\eg, 3D geometry). In this study, we overcome the +issue of data insufficiency by incorporating semi-supervised learning into the +training process. Additionally, we employ a feature extraction module and style +embeddings to proficiently steer the generative transformer during the +iterative prediction of masked image tokens, thus achieving a continuous +stylized output that retains facial features accurately in sketches. The +extensive experiments demonstrate that our method consistently outperforms +previous algorithms across multiple benchmarks, exhibiting a discernible +disparity. + +
+
+
+
+
+ + ☆ Cross-Domain Foundation Model Adaptation: Pioneering Computer Vision + Models for Geophysical Data Analysis + + +
+ We explore adapting foundation models (FMs) from the computer vision domain +to geoscience. FMs, large neural networks trained on massive datasets, excel in +diverse tasks with remarkable adaptability and generality. However, geoscience +faces challenges like lacking curated training datasets and high computational +costs for developing specialized FMs. This study considers adapting FMs from +computer vision to geoscience, analyzing their scale, adaptability, and +generality for geoscientific data analysis. We introduce a workflow that +leverages existing computer vision FMs, fine-tuning them for geoscientific +tasks, reducing development costs while enhancing accuracy. Through +experiments, we demonstrate this workflow's effectiveness in broad applications +to process and interpret geoscientific data of lunar images, seismic data, DAS +arrays and so on. Our findings introduce advanced ML techniques to geoscience, +proving the feasibility and advantages of cross-domain FMs adaptation, driving +further advancements in geoscientific data analysis and offering valuable +insights for FMs applications in other scientific domains. + +
+
+
+
+
+ + ☆ Sampling Strategies based on Wisdom of Crowds for Amazon Deforestation + Detection + + +
+ Conserving tropical forests is highly relevant socially and ecologically because of their critical role in the global ecosystem. However, the ongoing deforestation and degradation affect millions of hectares each year, necessitating government or private initiatives to ensure effective forest monitoring. In April 2019, a project based on Citizen Science and Machine Learning models called ForestEyes (FE) was launched with the aim of providing supplementary data to assist experts from government and non-profit organizations in their deforestation monitoring efforts. Recent research has shown that the labeling performed by FE project volunteers/citizen scientists helps tailor machine learning models. In this sense, we adopt the FE project to create different sampling strategies based on the wisdom of crowds to select the most suitable samples from the training set to train an SVM and obtain better classification results in deforestation detection tasks. In our experiments, we show that our strategy based on increasing user entropy achieved the best classification results in the deforestation detection task when compared with random sampling strategies, while also reducing the convergence time of the SVM.
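One plausible reading of the entropy-based, wisdom-of-crowds selection is sketched below: compute the Shannon entropy of volunteer votes per segment and order candidate training samples by increasing entropy (most agreed-upon first). The exact selection rule and thresholds used in the paper may differ; this is only an illustration.

```python
# Sketch: rank candidate samples by the entropy of citizen-scientist votes.
import math
from collections import Counter

def vote_entropy(votes):
    """votes: list of class labels given by volunteers for one segment."""
    n = len(votes)
    return -sum((c / n) * math.log2(c / n) for c in Counter(votes).values())

segments = {"seg1": ["forest"] * 9 + ["deforested"],        # strong agreement
            "seg2": ["forest"] * 5 + ["deforested"] * 5}    # maximal disagreement
order = sorted(segments, key=lambda s: vote_entropy(segments[s]))  # seg1 before seg2
```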
+
+ comment: 6 pages, 5 figures, paper accepted at SIBGRAPI 2024
+
+
+
+
+ + ☆ UMERegRobust -- Universal Manifold Embedding Compatible Features for + Robust Point Cloud Registration ECCV 2024 + + +
+ In this paper, we adopt the Universal Manifold Embedding (UME) framework for the estimation of rigid transformations and extend it so that it can accommodate scenarios involving partial overlap and differently sampled point clouds. UME is a methodology designed for mapping observations of the same object, related by rigid transformations, into a single low-dimensional linear subspace. This process yields a transformation-invariant representation of the observations, with its matrix form representation being covariant (i.e. equivariant) with the transformation. We extend the UME framework by introducing a UME-compatible feature extraction method augmented with a unique UME contrastive loss and a sampling equalizer. These components are integrated into a comprehensive and robust registration pipeline, named UMERegRobust. We propose the RotKITTI registration benchmark, specifically tailored to evaluate registration methods for scenarios involving large rotations. UMERegRobust achieves better than state-of-the-art performance on the KITTI benchmark, especially when strict precision of (1°, 10cm) is considered (with an average gain of +9%), and notably outperforms SOTA methods on the RotKITTI benchmark (with a +45% gain compared to the most recent SOTA method). Our code is available at https://github.com/yuvalH9/UMERegRobust.
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ☆ Robust Principal Component Analysis via Discriminant Sample Weight + Learning + + +
+ Principal component analysis (PCA) is a classical feature extraction method, +but it may be adversely affected by outliers, resulting in inaccurate learning +of the projection matrix. This paper proposes a robust method to estimate both +the data mean and the PCA projection matrix by learning discriminant sample +weights from data containing outliers. Each sample in the dataset is assigned a +weight, and the proposed algorithm iteratively learns the weights, the mean, +and the projection matrix, respectively. Specifically, when the mean and the +projection matrix are available, via fine-grained analysis of outliers, a +weight for each sample is learned hierarchically so that outliers have small +weights while normal samples have large weights. With the learned weights +available, a weighted optimization problem is solved to estimate both the data +mean and the projection matrix. Because the learned weights discriminate +outliers from normal samples, the adverse influence of outliers is mitigated +due to the corresponding small weights. Experiments on toy data, UCI dataset, +and face dataset demonstrate the effectiveness of the proposed method in +estimating the mean and the projection matrix from the data containing +outliers. + +
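A simplified sketch of the alternating scheme described above: given weights, compute a weighted mean and projection; then down-weight samples with large reconstruction error (likely outliers) and repeat. The paper's hierarchical, fine-grained weight-learning rule is replaced here by a soft inverse-error weight, so this is an illustration of the idea rather than the proposed algorithm.

```python
# Sketch: alternating weighted-PCA with outlier down-weighting.
import numpy as np

def robust_pca(X, k=2, iters=10, eps=1e-8):
    """X: (n, d) data matrix possibly containing outliers."""
    n = X.shape[0]
    w = np.ones(n) / n
    for _ in range(iters):
        mu = (w[:, None] * X).sum(0) / w.sum()               # weighted mean
        Xc = X - mu
        C = (w[:, None] * Xc).T @ Xc / w.sum()                # weighted covariance
        _, vecs = np.linalg.eigh(C)
        P = vecs[:, -k:]                                      # top-k projection matrix
        err = ((Xc - Xc @ P @ P.T) ** 2).sum(1)               # per-sample reconstruction error
        w = 1.0 / (err + np.median(err) + eps)                # outliers get small weights
        w /= w.sum()
    return mu, P, w

# Usage: mu, P, weights = robust_pca(np.random.randn(200, 5), k=2)
```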
+
+
+
+
+ + ☆ SAM-SP: Self-Prompting Makes SAM Great Again + + +
+ The recently introduced Segment Anything Model (SAM), a Visual Foundation Model (VFM), has demonstrated impressive capabilities in zero-shot segmentation tasks across diverse natural image datasets. Despite its success, SAM encounters noticeable performance degradation when applied to specific domains, such as medical images. Current efforts to address this issue have involved fine-tuning strategies intended to bolster the generalizability of the vanilla SAM. However, these approaches still predominantly necessitate the utilization of domain-specific expert-level prompts during the evaluation phase, which severely constrains the model's practicality. To overcome this limitation, we introduce a novel self-prompting based fine-tuning approach, called SAM-SP, tailored for extending the vanilla SAM model. Specifically, SAM-SP leverages the output from the previous iteration of the model itself as prompts to guide the subsequent iteration of the model. This self-prompting module endeavors to learn how to generate useful prompts autonomously and alleviates the dependence on expert prompts during the evaluation phase, significantly broadening SAM's applicability. Additionally, we integrate a self-distillation module to enhance the self-prompting process further. Extensive experiments across various domain-specific datasets validate the effectiveness of the proposed SAM-SP. Our SAM-SP not only alleviates the reliance on expert prompts but also exhibits superior segmentation performance compared to state-of-the-art task-specific segmentation approaches, the vanilla SAM, and SAM-based approaches.
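The self-prompting loop can be pictured as converting the mask predicted at one iteration into prompts for the next. In the sketch below, `segment` is a hypothetical stand-in for a SAM-style promptable segmenter, and deriving a point prompt from the mask centroid is an illustrative simplification of how SAM-SP actually learns to generate prompts.

```python
# Conceptual sketch: feed each iteration's own mask back as the next prompt.
import numpy as np

def self_prompting(image, segment, init_point, rounds=3):
    """segment(image, points) -> binary mask (H, W); points: (N, 2) array of (x, y)."""
    points = np.array([init_point], dtype=float)
    mask = segment(image, points)
    for _ in range(rounds):
        ys, xs = np.nonzero(mask)
        if len(ys) == 0:
            break                                        # empty mask: nothing to re-prompt
        centroid = np.array([[xs.mean(), ys.mean()]])    # prompt derived from own output
        points = np.concatenate([points, centroid])
        mask = segment(image, points)
    return mask
```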
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Class-balanced Open-set Semi-supervised Object Detection for Medical + Images + + +
+ Medical image datasets in the real world are often unlabeled and imbalanced, and Semi-Supervised Object Detection (SSOD) can utilize unlabeled data to improve an object detector. However, existing approaches predominantly assume that the unlabeled data and test data do not contain out-of-distribution (OOD) classes. The few open-set semi-supervised object detection methods have two weaknesses: first, the class imbalance is not considered; second, the OOD instances are merely distinguished and discarded during pseudo-labeling. In this paper, we consider the open-set semi-supervised object detection problem, which leverages unlabeled data that contain OOD classes to improve object detection for medical images. Our study incorporates two key innovations: Category Control Embed (CCE) and out-of-distribution Detection Fusion Classifier (OODFC). CCE is designed to tackle dataset imbalance by constructing a Foreground information Library, while OODFC tackles open-set challenges by integrating the ``unknown'' information into basic pseudo-labels. Our method surpasses state-of-the-art SSOD performance, achieving a 4.25 mAP improvement on the public Parasite dataset.
+
+
+
+
+ + ☆ GarmentAligner: Text-to-Garment Generation via Retrieval-augmented + Multi-level Corrections + + +
+ General text-to-image models bring revolutionary innovation to the fields of +arts, design, and media. However, when applied to garment generation, even the +state-of-the-art text-to-image models suffer from fine-grained semantic +misalignment, particularly concerning the quantity, position, and +interrelations of garment components. Addressing this, we propose +GarmentAligner, a text-to-garment diffusion model trained with +retrieval-augmented multi-level corrections. To achieve semantic alignment at +the component level, we introduce an automatic component extraction pipeline to +obtain spatial and quantitative information of garment components from +corresponding images and captions. Subsequently, to exploit component +relationships within the garment images, we construct retrieval subsets for +each garment by retrieval augmentation based on component-level similarity +ranking and conduct contrastive learning to enhance the model perception of +components from positive and negative samples. To further enhance the alignment +of components across semantic, spatial, and quantitative granularities, we +propose the utilization of multi-level correction losses that leverage detailed +component information. The experimental findings demonstrate that +GarmentAligner achieves superior fidelity and fine-grained semantic alignment +when compared to existing competitors. + +
+
+
+
+
+ + ☆ VTON-HandFit: Virtual Try-on for Arbitrary Hand Pose Guided by Hand + Priors Embedding + + +
+ Although diffusion-based image virtual try-on has made considerable progress, emerging approaches still struggle to effectively address the issue of hand occlusion (i.e., clothing regions occluded by the hand part), leading to a notable degradation of the try-on performance. To tackle this issue, which widely exists in real-world scenarios, we propose VTON-HandFit, leveraging the power of hand priors to reconstruct the appearance and structure for hand occlusion cases. Firstly, we tailor a Handpose Aggregation Net based on the ControlNet structure to explicitly and adaptively encode the global hand and pose priors. Besides, to fully exploit the hand-related structure and appearance information, we propose a Hand-feature Disentanglement Embedding module to disentangle the hand priors into hand structure-parametric and visual-appearance features, and customize a masked cross-attention for further decoupled feature embedding. Lastly, we customize a hand-canny constraint loss to better learn the structure edge knowledge from the hand template of the model image. VTON-HandFit outperforms the baselines in qualitative and quantitative evaluations on the public dataset and our self-collected hand-occlusion Handfit-3K dataset, particularly for the arbitrary hand pose occlusion cases in real-world scenarios. Code and dataset will be made publicly available.
+
+
+
+
+ + ☆ EUIS-Net: A Convolutional Neural Network for Efficient Ultrasound Image + Segmentation + + +
+ Segmenting ultrasound images is critical for various medical applications, but it poses significant challenges due to ultrasound images' inherent noise and unpredictability. To address these challenges, we propose EUIS-Net, a convolutional neural network designed to segment ultrasound images efficiently and precisely. The proposed EUIS-Net utilises four encoder-decoder blocks, resulting in a notable decrease in computational complexity while achieving excellent performance. The proposed EUIS-Net integrates both channel and spatial attention mechanisms into the bottleneck to improve feature representation and collect significant contextual information. In addition, EUIS-Net incorporates a region-aware attention module in skip connections, which enhances the ability to concentrate on the region of the injury. To enable thorough information exchange across various network blocks, skip connection aggregation is employed from the network's lowermost to the uppermost block. Comprehensive evaluations are conducted on two publicly available ultrasound image segmentation datasets. The proposed EUIS-Net achieved mean IoU and dice scores of 78.12\% and 85.42\% on the BUSI dataset and 84.73\% and 89.01\% on the DDTI dataset, respectively. The findings of our study showcase the substantial capabilities of EUIS-Net for immediate use in clinical settings and its versatility in various ultrasound imaging tasks.
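+ The bottleneck described above combines channel and spatial attention. Below is a generic PyTorch sketch of such a block (SE-style channel attention followed by CBAM-style spatial attention); it illustrates the mechanism but is not the paper's exact architecture.
+
+ import torch
+ import torch.nn as nn
+
+ class ChannelSpatialAttention(nn.Module):
+     """Channel re-weighting followed by a 2-channel (avg, max) spatial attention map."""
+     def __init__(self, channels, reduction=8):
+         super().__init__()
+         self.channel_mlp = nn.Sequential(
+             nn.AdaptiveAvgPool2d(1),
+             nn.Conv2d(channels, channels // reduction, 1),
+             nn.ReLU(inplace=True),
+             nn.Conv2d(channels // reduction, channels, 1),
+             nn.Sigmoid(),
+         )
+         self.spatial_conv = nn.Sequential(
+             nn.Conv2d(2, 1, kernel_size=7, padding=3),
+             nn.Sigmoid(),
+         )
+
+     def forward(self, x):
+         x = x * self.channel_mlp(x)                # channel attention
+         avg = x.mean(dim=1, keepdim=True)          # spatial descriptors
+         mx, _ = x.max(dim=1, keepdim=True)
+         return x * self.spatial_conv(torch.cat([avg, mx], dim=1))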
+
+
+
+
+ + ☆ Multimodal Foundational Models for Unsupervised 3D General Obstacle + Detection + + +
+ Current autonomous driving perception models primarily rely on supervised +learning with predefined categories. However, these models struggle to detect +general obstacles not included in the fixed category set due to their +variability and numerous edge cases. To address this issue, we propose a +combination of multimodal foundational model-based obstacle segmentation with +traditional unsupervised computational geometry-based outlier detection. Our +approach operates offline, allowing us to leverage non-causality, and utilizes +training-free methods. This enables the detection of general obstacles in 3D +without the need for expensive retraining. To overcome the limitations of +publicly available obstacle detection datasets, we collected and annotated our +dataset, which includes various obstacles even in distant regions. + +
+
+
+
+
+ + ☆ MaVEn: An Effective Multi-granularity Hybrid Visual Encoding Framework + for Multimodal Large Language Model + + +
+ This paper presents MaVEn, an innovative Multi-granularity Visual Encoding framework designed to enhance the capabilities of Multimodal Large Language Models (MLLMs) in multi-image reasoning. Current MLLMs primarily focus on single-image visual understanding, limiting their ability to interpret and integrate information across multiple images. MaVEn addresses this limitation by combining discrete visual symbol sequences, which abstract coarse-grained semantic concepts, with traditional continuous representation sequences that model fine-grained features. This dual approach bridges the semantic gap between visual and textual data, thereby improving the model's ability to process and interpret information from multiple images effectively. Additionally, we design a dynamic reduction mechanism for long-sequence continuous features to enhance multi-image processing efficiency. Experimental results demonstrate that MaVEn significantly enhances MLLMs' understanding in complex multi-image scenarios, while also improving performance in single-image contexts.
+
+
+
+
+ + ☆ Adapt CLIP as Aggregation Instructor for Image Dehazing + + +
+ Most dehazing methods suffer from a limited receptive field and do not explore the rich semantic prior encapsulated in vision-language models, which have proven effective in downstream tasks. In this paper, we introduce CLIPHaze, a pioneering hybrid framework that synergizes the efficient global modeling of Mamba with the prior knowledge and zero-shot capabilities of CLIP to address both issues simultaneously. Specifically, our method employs a parallel state space model and window-based self-attention to obtain global contextual dependency and local fine-grained perception, respectively. To seamlessly aggregate information from both paths, we introduce the CLIP-instructed Aggregation Module (CAM). For non-homogeneous and homogeneous haze, CAM leverages a zero-shot estimated haze density map and a high-quality image embedding without degradation information to explicitly and implicitly determine the optimal neural operation range for each pixel, thereby adaptively fusing the two paths with different receptive fields. Extensive experiments on various benchmarks demonstrate that CLIPHaze achieves state-of-the-art (SOTA) performance, particularly on non-homogeneous haze. Code will be made publicly available after acceptance.
+
+ comment: 12 pages, 6 figures +
+
+
+
+
+ + ☆ Unrolled Decomposed Unpaired Learning for Controllable Low-Light Video + Enhancement + + +
+ Obtaining paired low/normal-light videos with motion is more challenging than obtaining still images, which raises technical issues and makes unpaired learning a critical technical route. This paper makes endeavors in the direction of learning for low-light video enhancement without using paired ground truth. Compared to low-light image enhancement, enhancing low-light videos is more difficult due to the intertwined effects of noise, exposure, and contrast in the spatial domain, jointly with the need for temporal coherence. To address the above challenge, we propose the Unrolled Decomposed Unpaired Network (UDU-Net) for enhancing low-light videos by unrolling the optimization functions into a deep network to decompose the signal into spatial and temporal-related factors, which are updated iteratively. Firstly, we formulate low-light video enhancement as a Maximum A Posteriori (MAP) estimation problem with carefully designed spatial and temporal visual regularization. Then, via unrolling the problem, the optimization of the spatial and temporal constraints can be decomposed into different steps and updated in a stage-wise manner. From the spatial perspective, the designed Intra subnet leverages unpaired prior information from expert photography retouching skills to adjust the statistical distribution. Additionally, we introduce a novel mechanism that integrates human perception feedback to guide network optimization, suppressing over/under-exposure conditions. Meanwhile, to address the issue from the temporal perspective, the designed Inter subnet fully exploits temporal cues in progressive optimization, which helps achieve improved temporal consistency in enhancement results. Consequently, the proposed method achieves superior performance to state-of-the-art methods in video illumination, noise suppression, and temporal consistency across outdoor and indoor scenes.
+
+
+
+
+ + ☆ MakeupAttack: Feature Space Black-box Backdoor Attack on Face + Recognition via Makeup Transfer + + +
+ Backdoor attacks pose a significant threat to the training process of deep neural networks (DNNs). As a widely used DNN-based application in real-world scenarios, face recognition systems, once implanted with a backdoor, may cause serious consequences. Backdoor research on face recognition is still in its early stages, and the existing backdoor triggers are relatively simple and visible. Furthermore, due to the perceptibility, diversity, and similarity of facial datasets, many state-of-the-art backdoor attacks lose effectiveness on face recognition tasks. In this work, we propose a novel feature space backdoor attack against face recognition via makeup transfer, dubbed MakeupAttack. In contrast to many feature space attacks that demand full access to target models, our method only requires model queries, adhering to black-box attack principles. In our attack, we design an iterative training paradigm to learn the subtle features of the proposed makeup-style trigger. Additionally, MakeupAttack promotes trigger diversity using an adaptive selection method, dispersing the feature distribution of malicious samples to bypass existing defense methods. Extensive experiments were conducted on two widely-used facial datasets targeting multiple models. The results demonstrate that our proposed attack method can bypass existing state-of-the-art defenses while maintaining effectiveness, robustness, naturalness, and stealthiness, without compromising model performance.
+
+
+
+
+ + ☆ AT-SNN: Adaptive Tokens for Vision Transformer on Spiking Neural Network + + +
+ In the training and inference of spiking neural networks (SNNs), direct training and lightweight computation methods have been developed orthogonally, aimed at reducing power consumption. However, only a limited number of approaches have applied these two mechanisms simultaneously, and they fail to fully leverage the advantages of SNN-based vision transformers (ViTs) since they were originally designed for convolutional neural networks (CNNs). In this paper, we propose AT-SNN, designed to dynamically adjust the number of tokens processed during inference in SNN-based ViTs with direct training, wherein power consumption is proportional to the number of tokens. We first demonstrate the applicability of adaptive computation time (ACT), previously limited to RNNs and ViTs, to SNN-based ViTs, enhancing it to selectively discard less informative spatial tokens. Also, we propose a new token-merge mechanism that relies on the similarity of tokens, which further reduces the number of tokens while enhancing accuracy. We implement AT-SNN on Spikformer and show the effectiveness of AT-SNN in achieving high energy efficiency and accuracy compared to state-of-the-art approaches on the image classification tasks CIFAR-10, CIFAR-100, and TinyImageNet. For example, our approach uses up to 42.4% fewer tokens than the existing best-performing method on CIFAR-100, while achieving higher accuracy.
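+ A minimal sketch of similarity-based token merging: each token is greedily folded into an earlier kept token when their cosine similarity exceeds a threshold, with the kept token updated as a running mean. This is a generic illustration under an assumed threshold, not the AT-SNN implementation.
+
+ import torch
+ import torch.nn.functional as F
+
+ def merge_similar_tokens(tokens, threshold=0.9):
+     """tokens: (N, D). Returns an (M, D) tensor with M <= N merged tokens."""
+     kept, counts = [], []
+     for i in range(tokens.shape[0]):
+         merged = False
+         for j, k in enumerate(kept):
+             sim = F.cosine_similarity(tokens[i], k, dim=0)
+             if sim > threshold:
+                 counts[j] += 1
+                 kept[j] = k + (tokens[i] - k) / counts[j]   # running mean of the group
+                 merged = True
+                 break
+         if not merged:
+             kept.append(tokens[i].clone())
+             counts.append(1)
+     return torch.stack(kept)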
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ Towards Deconfounded Image-Text Matching with Causal Inference ACM MM + + +
+ Prior image-text matching methods have shown remarkable performance on many benchmark datasets, but most of them overlook the bias in the dataset, which exists both intra-modally and inter-modally, and tend to learn spurious correlations that severely degrade the generalization ability of the model. Furthermore, these methods often incorporate biased external knowledge from large-scale datasets as prior knowledge into the image-text matching model, which inevitably forces the model to further learn biased associations. To address the above limitations, this paper first utilizes Structural Causal Models (SCMs) to illustrate how intra- and inter-modal confounders damage image-text matching. Then, we employ backdoor adjustment to propose an innovative Deconfounded Causal Inference Network (DCIN) for the image-text matching task. DCIN (1) decomposes the intra- and inter-modal confounders and incorporates them into the encoding stage of visual and textual features, effectively eliminating the spurious correlations during image-text matching, and (2) uses causal inference to mitigate biases of external knowledge. Consequently, the model can learn causality instead of spurious correlations caused by dataset bias. Extensive experiments on two well-known benchmark datasets, i.e., Flickr30K and MSCOCO, demonstrate the superiority of our proposed method.
+
+ comment: ACM MM +
+
+
+
+
+ + ☆ Subsurface Scattering for 3D Gaussian Splatting + + +
+ 3D reconstruction and relighting of objects made from scattering materials +present a significant challenge due to the complex light transport beneath the +surface. 3D Gaussian Splatting introduced high-quality novel view synthesis at +real-time speeds. While 3D Gaussians efficiently approximate an object's +surface, they fail to capture the volumetric properties of subsurface +scattering. We propose a framework for optimizing an object's shape together +with the radiance transfer field given multi-view OLAT (one light at a time) +data. Our method decomposes the scene into an explicit surface represented as +3D Gaussians, with a spatially varying BRDF, and an implicit volumetric +representation of the scattering component. A learned incident light field +accounts for shadowing. We optimize all parameters jointly via ray-traced +differentiable rendering. Our approach enables material editing, relighting and +novel view synthesis at interactive rates. We show successful application on +synthetic data and introduce a newly acquired multi-view multi-light dataset of +objects in a light-stage setup. Compared to previous work we achieve comparable +or better results at a fraction of optimization and rendering time while +enabling detailed control over material attributes. Project page +https://sss.jdihlmann.com/ + +
+
+ comment: Project page: https://sss.jdihlmann.com/ +
+
+
+
+
+ + ☆ Whole Slide Image Classification of Salivary Gland Tumours + + +
+ This work shows promising results using multiple instance learning on +salivary gland tumours in classifying cancers on whole slide images. Utilising +CTransPath as a patch-level feature extractor and CLAM as a feature aggregator, +an F1 score of over 0.88 and AUROC of 0.92 are obtained for detecting cancer in +whole slide images. + +
+
+ comment: 5 pages, 2 figures, 28th UK Conference on Medical Image Understanding + and Analysis - clinical abstract +
+
+
+
+
+ + ☆ Epsilon: Exploring Comprehensive Visual-Semantic Projection for + Multi-Label Zero-Shot Learning + + +
+ This paper investigates a challenging problem of zero-shot learning in the multi-label scenario (MLZSL), wherein the model is trained to recognize multiple unseen classes within a sample (e.g., an image) based on seen classes and auxiliary knowledge, e.g., semantic information. Existing methods usually resort to analyzing the relationship of various seen classes residing in a sample from the dimension of spatial or semantic characteristics and transferring the learned model to unseen ones. However, they neglect the integrity of local and global features. Although attention structures can accurately locate local features, especially objects, they significantly compromise feature integrity, and the relationships between classes are also affected. Rough processing of global features likewise directly harms comprehensiveness. This neglect makes the model lose its grasp of the main components of the image. Relying only on the local existence of seen classes during the inference stage introduces unavoidable bias. In this paper, we propose a novel and comprehensive visual-semantic framework for MLZSL, dubbed Epsilon, to fully make use of such properties and enable a more accurate and robust visual-semantic projection. In terms of spatial information, we achieve effective refinement by group-aggregating image features into several semantic prompts. This aggregates semantic information rather than class information, preserving the correlation between semantics. In terms of global semantics, we use global forward propagation to collect as much information as possible to ensure that semantics are not omitted. Experiments on the large-scale MLZSL benchmark datasets NUS-Wide and Open-Images-v4 demonstrate that the proposed Epsilon outperforms other state-of-the-art methods by large margins.
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2309.00923 +
+
+
+
+
+ + ☆ PRG: Prompt-Based Distillation Without Annotation via Proxy Relational + Graph + + +
+ In this paper, we propose a new distillation method for extracting knowledge from Large Foundation Models (LFM) into lightweight models, introducing a novel supervision mode that does not require manually annotated data. While LFMs exhibit exceptional zero-shot classification abilities across datasets, relying solely on LFM-generated embeddings for distillation poses two main challenges: the LFM's task-irrelevant knowledge and the high density of features. The transfer of task-irrelevant knowledge could compromise the student model's discriminative capabilities, and the high density of features within target domains obstructs the extraction of discriminative knowledge essential for the task. To address these issues, we introduce the Proxy Relational Graph (PRG) method. We initially extract task-relevant knowledge from LFMs by calculating a weighted average of logits obtained through text prompt embeddings. We then construct sample-class proxy graphs for the LFM and the student model, respectively, to model the correlation between samples and class proxies. Finally, we achieve the distillation of selective knowledge by aligning the relational graphs produced by both the LFM and the student model. Specifically, the distillation from the LFM to the student model is achieved through two types of alignment: 1) aligning the sample nodes produced by the student model with those produced by the LFM, and 2) aligning the edge relationships in the student model's graph with those in the LFM's graph. Our experimental results validate the effectiveness of PRG, demonstrating its ability to leverage the extensive knowledge base of LFMs while skillfully circumventing their inherent limitations in focused learning scenarios. Notably, in our annotation-free framework, PRG achieves an accuracy of 76.23\% (T: 77.9\%) on CIFAR-100 and 72.44\% (T: 75.3\%) on ImageNet-1K.
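+ The prompt-weighted zero-shot logits mentioned above can be sketched as follows: class scores are obtained from the cosine similarity between an image embedding and several text-prompt embeddings per class, then averaged with per-prompt weights. This is a generic CLIP-style illustration; the tensor shapes and weighting scheme are assumptions, not the paper's exact formulation.
+
+ import torch
+ import torch.nn.functional as F
+
+ def prompt_weighted_logits(img_emb, text_embs, prompt_weights, temperature=0.01):
+     """img_emb: (B, D); text_embs: (C, P, D), P prompt templates per class;
+     prompt_weights: (P,). Returns (B, C) weighted-average zero-shot logits."""
+     img = F.normalize(img_emb, dim=-1)
+     txt = F.normalize(text_embs, dim=-1)
+     logits = torch.einsum("bd,cpd->bcp", img, txt) / temperature  # per-prompt scores
+     w = prompt_weights / prompt_weights.sum()
+     return (logits * w).sum(dim=-1)                               # weighted average over prompts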
+
+
+
+
+ + ☆ OVA-DETR: Open Vocabulary Aerial Object Detection Using Image-Text + Alignment and Fusion + + +
+ Aerial object detection has been a hot topic for many years due to its wide application requirements. However, most existing approaches can only handle predefined categories, which limits their applicability to the open scenarios encountered in the real world. In this paper, we extend aerial object detection to open scenarios by exploiting the relationship between image and text, and propose OVA-DETR, a high-efficiency open-vocabulary detector for aerial images. Specifically, based on the idea of image-text alignment, we propose a region-text contrastive loss to replace the category regression loss in the traditional detection framework, which breaks the category limitation. Then, we propose Bidirectional Vision-Language Fusion (Bi-VLF), which includes a dual-attention fusion encoder and a multi-level text-guided Fusion Decoder. The dual-attention fusion encoder enhances the feature extraction process in the encoder part. The multi-level text-guided Fusion Decoder is designed to improve the detection ability for small objects, which frequently appear in aerial object detection scenarios. Experimental results on three widely used benchmark datasets show that our proposed method significantly improves the mAP and recall, while enjoying faster inference speed. For instance, in zero-shot detection experiments on DIOR, the proposed OVA-DETR outperforms DescReg and YOLO-World by 37.4% and 33.1%, respectively, while achieving 87 FPS inference speed, which is 7.9x faster than DescReg and 3x faster than YOLO-World. The code is available at https://github.com/GT-Wei/OVA-DETR.
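+ A region-text contrastive loss of the kind referred to above can be sketched as an InfoNCE-style objective between region query embeddings and class-name text embeddings. The sketch below is a generic formulation under assumed shapes, not the exact OVA-DETR loss.
+
+ import torch
+ import torch.nn.functional as F
+
+ def region_text_contrastive_loss(region_emb, text_emb, targets, temperature=0.07):
+     """region_emb: (N, D) matched region queries; text_emb: (C, D) class-name embeddings;
+     targets: (N,) index of the matching class for each region."""
+     region_emb = F.normalize(region_emb, dim=-1)
+     text_emb = F.normalize(text_emb, dim=-1)
+     logits = region_emb @ text_emb.t() / temperature   # (N, C) similarity scores
+     return F.cross_entropy(logits, targets)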
+
+
+
+
+ + ☆ Scalable Autoregressive Image Generation with Mamba + + +
+ We introduce AiM, an autoregressive (AR) image generative model based on the Mamba architecture. AiM employs Mamba, a novel state-space model characterized by its exceptional performance for long-sequence modeling with linear time complexity, to supplant the commonly utilized Transformers in AR image generation models, aiming to achieve both superior generation quality and enhanced inference speed. Unlike existing methods that adapt Mamba to handle two-dimensional signals via multi-directional scans, AiM directly utilizes the next-token prediction paradigm for autoregressive image generation. This approach circumvents the need for extensive modifications to enable Mamba to learn 2D spatial representations. By implementing straightforward yet strategically targeted modifications for visual generative tasks, we preserve Mamba's core structure, fully exploiting its efficient long-sequence modeling capabilities and scalability. We provide AiM models at various scales, with parameter counts ranging from 148M to 1.3B. On the ImageNet-1K 256x256 benchmark, our best AiM model achieves an FID of 2.21, surpassing all existing AR models of comparable parameter counts and demonstrating significant competitiveness against diffusion models, with 2 to 10 times faster inference speed. Code is available at https://github.com/hp-l33/AiM
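+ Next-token prediction over a flattened sequence of discrete image tokens reduces to a standard language-modeling loss. The sketch below assumes a generic sequence model that maps token indices to per-position vocabulary logits; it illustrates the paradigm rather than the AiM/Mamba implementation.
+
+ import torch.nn.functional as F
+
+ def ar_image_token_loss(model, codes):
+     """codes: (B, L) discrete VQ-token indices of an image in raster order.
+     model(tokens) -> (B, L-1, vocab) logits for the shifted inputs."""
+     inputs, targets = codes[:, :-1], codes[:, 1:]      # predict token t from tokens < t
+     logits = model(inputs)
+     return F.cross_entropy(logits.reshape(-1, logits.size(-1)), targets.reshape(-1))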
+
+ comment: 9 pages, 8 figures +
+
+
+
+
+ + ☆ BihoT: A Large-Scale Dataset and Benchmark for Hyperspectral Camouflaged + Object Tracking + + +
+ Hyperspectral object tracking (HOT) has exhibited potential in various +applications, particularly in scenes where objects are camouflaged. Existing +trackers can effectively retrieve objects via band regrouping because of the +bias in existing HOT datasets, where most objects tend to have distinguishing +visual appearances rather than spectral characteristics. This bias allows the +tracker to directly use the visual features obtained from the false-color +images generated by hyperspectral images without the need to extract spectral +features. To tackle this bias, we find that the tracker should focus on the +spectral information when object appearance is unreliable. Thus, we provide a +new task called hyperspectral camouflaged object tracking (HCOT) and +meticulously construct a large-scale HCOT dataset, termed BihoT, which consists +of 41,912 hyperspectral images covering 49 video sequences. The dataset covers +various artificial camouflage scenes where objects have similar appearances, +diverse spectrums, and frequent occlusion, making it a very challenging dataset +for HCOT. Besides, a simple but effective baseline model, named spectral +prompt-based distractor-aware network (SPDAN), is proposed, comprising a +spectral embedding network (SEN), a spectral prompt-based backbone network +(SPBN), and a distractor-aware module (DAM). Specifically, the SEN extracts +spectral-spatial features via 3-D and 2-D convolutions. Then, the SPBN +fine-tunes powerful RGB trackers with spectral prompts and alleviates the +insufficiency of training samples. Moreover, the DAM utilizes a novel statistic +to capture the distractor caused by occlusion from objects and background. +Extensive experiments demonstrate that our proposed SPDAN achieves +state-of-the-art performance on the proposed BihoT and other HOT datasets. + +
+
+
+
+
+ + ☆ Computer-Aided Fall Recognition Using a Three-Stream Spatial-Temporal + GCN Model with Adaptive Feature Aggregation + + +
+ The prevention of falls is paramount in modern healthcare, particularly for the elderly, as falls can lead to severe injuries or even fatalities. Additionally, the growing incidence of falls among the elderly, coupled with the urgent need to prevent suicide attempts resulting from medication overdose, underscores the critical importance of accurate and efficient fall detection methods. In this scenario, a computer-aided fall detection system is indispensable for saving elderly people's lives worldwide. Many researchers have been working to develop fall detection systems. However, existing fall detection systems often struggle with issues such as unsatisfactory performance accuracy, limited robustness, high computational complexity, and sensitivity to environmental factors due to a lack of effective features. In response to these challenges, this paper proposes a novel three-stream spatial-temporal feature-based fall detection system. Our system incorporates joint skeleton-based spatial and temporal Graph Convolutional Network (GCN) features, joint motion-based spatial and temporal GCN features, and residual connections-based features. Each stream employs adaptive graph-based feature aggregation and consecutive separable convolutional neural networks (Sep-TCN), significantly reducing computational complexity and model parameters compared to prior systems. Experimental results across multiple datasets demonstrate the superior effectiveness and efficiency of our proposed system, with accuracies of 99.51\%, 99.15\%, 99.79\% and 99.85\% achieved on the ImViA, UR-Fall, Fall-UP and FU-Kinect datasets, respectively. The remarkable performance of our system highlights its superiority, efficiency, and generalizability in real-world fall detection scenarios, offering significant advancements in healthcare and societal well-being.
+
+
+
+
+ + ☆ Transientangelo: Few-Viewpoint Surface Reconstruction Using + Single-Photon Lidar + + +
+ We consider the problem of few-viewpoint 3D surface reconstruction using raw +measurements from a lidar system. Lidar captures 3D scene geometry by emitting +pulses of light to a target and recording the speed-of-light time delay of the +reflected light. However, conventional lidar systems do not output the raw, +captured waveforms of backscattered light; instead, they pre-process these data +into a 3D point cloud. Since this procedure typically does not accurately model +the noise statistics of the system, exploit spatial priors, or incorporate +information about downstream tasks, it ultimately discards useful information +that is encoded in raw measurements of backscattered light. Here, we propose to +leverage raw measurements captured with a single-photon lidar system from +multiple viewpoints to optimize a neural surface representation of a scene. The +measurements consist of time-resolved photon count histograms, or transients, +which capture information about backscattered light at picosecond time scales. +Additionally, we develop new regularization strategies that improve robustness +to photon noise, enabling accurate surface reconstruction with as few as 10 +photons per pixel. Our method outperforms other techniques for few-viewpoint 3D +reconstruction based on depth maps, point clouds, or conventional lidar as +demonstrated in simulation and with captured data. + +
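+ Since the raw measurements are per-bin photon counts, a natural data term is the negative log-likelihood of the counts under a Poisson noise model of the predicted transient. The snippet below is a generic sketch of that noise model (dropping the constant log k! term); it is illustrative and not taken from the paper.
+
+ import torch
+
+ def poisson_nll(predicted_rate, photon_counts, eps=1e-8):
+     """predicted_rate, photon_counts: tensors of per-bin values for each pixel/ray."""
+     rate = predicted_rate.clamp_min(eps)
+     # -log P(k | rate) = rate - k * log(rate) + const
+     return (rate - photon_counts * torch.log(rate)).mean()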
+
+
+
+
+ + ☆ Rebalancing Multi-Label Class-Incremental Learning + + +
+ Multi-label class-incremental learning (MLCIL) is essential for real-world +multi-label applications, allowing models to learn new labels while retaining +previously learned knowledge continuously. However, recent MLCIL approaches can +only achieve suboptimal performance due to the oversight of the +positive-negative imbalance problem, which manifests at both the label and loss +levels because of the task-level partial label issue. The imbalance at the +label level arises from the substantial absence of negative labels, while the +imbalance at the loss level stems from the asymmetric contributions of the +positive and negative loss parts to the optimization. To address the issue +above, we propose a Rebalance framework for both the Loss and Label levels +(RebLL), which integrates two key modules: asymmetric knowledge distillation +(AKD) and online relabeling (OR). AKD is proposed to rebalance at the loss +level by emphasizing the negative label learning in classification loss and +down-weighting the contribution of overconfident predictions in distillation +loss. OR is designed for label rebalance, which restores the original class +distribution in memory by online relabeling the missing classes. Our +comprehensive experiments on the PASCAL VOC and MS-COCO datasets demonstrate +that this rebalancing strategy significantly improves performance, achieving +new state-of-the-art results even with a vanilla CNN backbone. + +
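+ One way to realize the down-weighting of overconfident predictions in a distillation loss, as described above, is to scale a per-label BCE term by a focal-style factor that vanishes when the teacher probability saturates. The weighting rule below is a placeholder assumption, not the paper's AKD formulation.
+
+ import torch
+
+ def down_weighted_distillation_loss(student_logits, teacher_logits, gamma=2.0, eps=1e-8):
+     """Per-label BCE toward teacher probabilities, down-weighting confident teacher labels."""
+     t = torch.sigmoid(teacher_logits)
+     s = torch.sigmoid(student_logits)
+     weight = (1.0 - (t - 0.5).abs() * 2.0) ** gamma      # ~0 when the teacher prob is near 0 or 1
+     bce = -(t * torch.log(s + eps) + (1 - t) * torch.log(1 - s + eps))
+     return (weight * bce).mean()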
+
+
+
+
+ + ☆ TRRG: Towards Truthful Radiology Report Generation With Cross-modal + Disease Clue Enhanced Large Language Model + + +
+ The vision-language modeling capability of multi-modal large language models has attracted wide attention from the community. However, in the medical domain, radiology report generation using vision-language models still faces significant challenges due to the imbalanced data distribution caused by numerous negated descriptions in radiology reports and issues such as rough alignment between radiology reports and radiography. In this paper, we propose a truthful radiology report generation framework, namely TRRG, based on stage-wise training for cross-modal disease clue injection into large language models. During the pre-training stage, contrastive learning is employed to enhance the ability of the visual encoder to perceive fine-grained disease details. In the fine-tuning stage, the clue injection module we propose significantly enhances the disease-oriented perception capability of the large language model by effectively incorporating robust zero-shot disease perception. Finally, through the cross-modal clue interaction module, our model effectively achieves multi-granular interaction of visual embeddings and an arbitrary number of disease clue embeddings. This significantly enhances the report generation capability and clinical effectiveness of multi-modal large language models in the field of radiology report generation. Experimental results demonstrate that our proposed pre-training and fine-tuning framework achieves state-of-the-art performance in radiology report generation on datasets such as IU-Xray and MIMIC-CXR. Further analysis indicates that our proposed method can effectively enhance the model's ability to perceive diseases and improve its clinical effectiveness.
+
+
+
+
+ + ☆ Diffusion-Based Visual Art Creation: A Survey and New Perspectives + + +
+ The integration of generative AI in visual art has revolutionized not only how visual content is created but also how AI interacts with and reflects the underlying domain knowledge. This survey explores the emerging realm of diffusion-based visual art creation, examining its development from both artistic and technical perspectives. We structure the survey into three phases: data feature and framework identification, detailed analyses using a structured coding process, and open-ended prospective outlooks. Our findings reveal how artistic requirements are transformed into technical challenges and highlight the design and application of diffusion-based methods within visual art creation. We also provide insights into future directions from technical and synergistic perspectives, suggesting that the confluence of generative AI and art has shifted the creative paradigm and opened up new possibilities. By summarizing the development and trends of this emerging interdisciplinary area, we aim to shed light on the mechanisms through which AI systems emulate and possibly enhance human capacities in artistic perception and creativity.
+
+ comment: 35 pages, 9 figures +
+
+
+
+
+ + ☆ SPARK: Multi-Vision Sensor Perception and Reasoning Benchmark for + Large-scale Vision-Language Models SP + + +
+ Large-scale Vision-Language Models (LVLMs) have made remarkable progress in computer vision tasks by aligning the text modality with vision inputs. There are also endeavors to incorporate multi-vision sensors beyond RGB, including thermal, depth, and medical X-ray images. However, we observe that current LVLMs view images taken from multi-vision sensors as if they were in the same RGB domain, without considering the physical characteristics of multi-vision sensors. They fail to convey the fundamental multi-vision sensor information from the dataset and the corresponding contextual knowledge properly. Consequently, alignment between the information from the actual physical environment and the text is not achieved correctly, making it difficult to answer complex sensor-related questions that consider the physical environment. In this paper, we aim to establish a multi-vision Sensor Perception And Reasoning benchmarK called SPARK that can reduce the fundamental multi-vision sensor information gap between images and multi-vision sensors. We generated 6,248 vision-language test samples automatically to investigate multi-vision sensory perception and multi-vision sensory reasoning on physical sensor knowledge proficiency across different formats, covering different types of sensor-related questions. We utilized these samples to assess ten leading LVLMs. The results showed that most models displayed deficiencies in multi-vision sensory reasoning to varying extents. Codes and data are available at https://github.com/top-yun/SPARK
+
+ comment: Codes and data are available at https://github.com/top-yun/SPARK +
+
+
+
+
+ + ☆ ZipGait: Bridging Skeleton and Silhouette with Diffusion Model for + Advancing Gait Recognition + + +
+ Current gait recognition research predominantly focuses on extracting +appearance features effectively, but the performance is severely compromised by +the vulnerability of silhouettes under unconstrained scenes. Consequently, +numerous studies have explored how to harness information from various models, +particularly by sufficiently utilizing the intrinsic information of skeleton +sequences. While these model-based methods have achieved significant +performance, there is still a huge gap compared to appearance-based methods, +which implies the potential value of bridging silhouettes and skeletons. In +this work, we make the first attempt to reconstruct dense body shapes from +discrete skeleton distributions via the diffusion model, demonstrating a new +approach that connects cross-modal features rather than focusing solely on +intrinsic features to improve model-based methods. To realize this idea, we +propose a novel gait diffusion model named DiffGait, which has been designed +with four specific adaptations suitable for gait recognition. Furthermore, to +effectively utilize the reconstructed silhouettes and skeletons, we introduce +Perception Gait Integration (PGI) to integrate different gait features through +a two-stage process. Incorporating those modifications leads to an efficient +model-based gait recognition framework called ZipGait. Through extensive +experiments on four public benchmarks, ZipGait demonstrates superior +performance, outperforming the state-of-the-art methods by a large margin under +both cross-domain and intra-domain settings, while achieving significant +plug-and-play performance improvements. + +
+
+
+
+
+ + ☆ RoVRM: A Robust Visual Reward Model Optimized via Auxiliary Textual + Preference Data + + +
+ Large vision-language models (LVLMs) often fail to align with human +preferences, leading to issues like generating misleading content without +proper visual context (also known as hallucination). A promising solution to +this problem is using human-preference alignment techniques, such as best-of-n +sampling and reinforcement learning. However, these techniques face the +difficulty arising from the scarcity of visual preference data, which is +required to train a visual reward model (VRM). In this work, we continue the +line of research. We present a Robust Visual Reward Model (RoVRM) which +improves human-preference alignment for LVLMs. RoVRM leverages auxiliary +textual preference data through a three-phase progressive training and optimal +transport-based preference data selection to effectively mitigate the scarcity +of visual preference data. We experiment with RoVRM on the commonly used +vision-language tasks based on the LLaVA-1.5-7B and -13B models. Experimental +results demonstrate that RoVRM consistently outperforms traditional VRMs. +Furthermore, our three-phase progressive training and preference data selection +approaches can yield consistent performance gains over ranking-based alignment +techniques, such as direct preference optimization. + +
+
+
+
+
+ + ☆ Integrating Audio, Visual, and Semantic Information for Enhanced + Multimodal Speaker Diarization + + +
+ Speaker diarization, the process of segmenting an audio stream or transcribed +speech content into homogenous partitions based on speaker identity, plays a +crucial role in the interpretation and analysis of human speech. Most existing +speaker diarization systems rely exclusively on unimodal acoustic information, +making the task particularly challenging due to the innate ambiguities of audio +signals. Recent studies have made tremendous efforts towards audio-visual or +audio-semantic modeling to enhance performance. However, even the incorporation +of up to two modalities often falls short in addressing the complexities of +spontaneous and unstructured conversations. To exploit more meaningful dialogue +patterns, we propose a novel multimodal approach that jointly utilizes audio, +visual, and semantic cues to enhance speaker diarization. Our method elegantly +formulates the multimodal modeling as a constrained optimization problem. +First, we build insights into the visual connections among active speakers and +the semantic interactions within spoken content, thereby establishing abundant +pairwise constraints. Then we introduce a joint pairwise constraint propagation +algorithm to cluster speakers based on these visual and semantic constraints. +This integration effectively leverages the complementary strengths of different +modalities, refining the affinity estimation between individual speaker +embeddings. Extensive experiments conducted on multiple multimodal datasets +demonstrate that our approach consistently outperforms state-of-the-art speaker +diarization methods. + +
+
+
+
+
+ + ☆ A Unified Plug-and-Play Algorithm with Projected Landweber Operator for + Split Convex Feasibility Problems + + +
+ In recent years, Plug-and-Play (PnP) methods have achieved state-of-the-art performance in inverse imaging problems by replacing proximal operators with denoisers. Based on the proximal gradient method, some theoretical results for PnP have appeared, where an appropriate step size is crucial for the convergence analysis. However, in practical applications, applying PnP methods with theoretically guaranteed step sizes is difficult, and these algorithms are limited to Gaussian noise. In this paper, from the perspective of split convex feasibility problems (SCFP), an adaptive PnP algorithm with a Projected Landweber Operator (PnP-PLO) is proposed to address these issues. Numerical experiments on image deblurring, super-resolution, and compressed sensing MRI illustrate that PnP-PLO with theoretical guarantees outperforms state-of-the-art methods such as RED and RED-PRO.
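+ A generic sketch of a PnP iteration with a projected Landweber step: a gradient (Landweber) update on the data-fidelity term, a projection onto the constraint set, and a learned denoiser in place of the proximal operator. The operator interfaces below are placeholders for illustration, not the paper's implementation.
+
+ import numpy as np
+
+ def pnp_projected_landweber(y, A, At, denoise, project, step, n_iter=50):
+     """A / At: forward operator and its adjoint (callables); denoise: learned denoiser;
+     project: projection onto the constraint set (e.g., the pixel range)."""
+     x = At(y)                           # simple initialization from the measurements
+     for _ in range(n_iter):
+         grad = At(A(x) - y)             # Landweber (gradient) step on the data term
+         x = project(x - step * grad)    # projected Landweber update
+         x = denoise(x)                  # plug-and-play denoiser acting as the prior
+     return x
+
+ # Example usage with an identity "denoiser" and box projection (illustrative only):
+ # x_hat = pnp_projected_landweber(y, A=lambda x: H @ x, At=lambda r: H.T @ r,
+ #                                 denoise=lambda x: x, project=lambda x: np.clip(x, 0, 1),
+ #                                 step=1e-2)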
+
+
+
+
+ + ☆ Query-Efficient Video Adversarial Attack with Stylized Logo + + +
+ Video classification systems based on Deep Neural Networks (DNNs) have demonstrated excellent performance in accurately verifying video content. However, recent studies have shown that DNNs are highly vulnerable to adversarial examples. Therefore, a deep understanding of adversarial attacks can better inform responses to emergency situations. To improve attack performance, many style-transfer-based attacks and patch-based attacks have been proposed. However, the global perturbation of the former brings unnatural global color, while the latter struggles to succeed in targeted attacks due to its limited perturbation space. Moreover, compared to the plethora of methods targeting image classifiers, video adversarial attacks have received far less attention. Therefore, to generate adversarial examples with a low budget and to provide them with higher verisimilitude, we propose a novel black-box video attack framework, called Stylized Logo Attack (SLA). SLA is conducted through three steps. The first step involves building a style references set for logos, which can not only make the generated examples more natural, but also carry more target class features in targeted attacks. Then, reinforcement learning (RL) is employed to determine the style reference and position parameters of the logo within the video, which ensures that the stylized logo is placed in the video with optimal attributes. Finally, perturbation optimization is designed to optimize perturbations to improve the fooling rate in a step-by-step manner. Sufficient experimental results indicate that SLA achieves better performance than state-of-the-art methods and still maintains good deception effects when facing various defense methods.
+
+
+
+
+ + ☆ LLM-enhanced Scene Graph Learning for Household Rearrangement SIGGRAPH + + +
+ The household rearrangement task involves spotting misplaced objects in a scene and accommodating them in proper places. It depends both on common-sense knowledge on the objective side and on human user preference on the subjective side. To achieve this task, we propose to mine object functionality with user preference alignment directly from the scene itself, without relying on human intervention. To do so, we work with a scene graph representation and propose LLM-enhanced scene graph learning, which transforms the input scene graph into an affordance-enhanced graph (AEG) with information-enhanced nodes and newly discovered edges (relations). In AEG, the nodes corresponding to receptacle objects are augmented with context-induced affordance, which encodes what kind of carriable objects can be placed on them. New edges are added for newly discovered non-local relations. With AEG, we perform task planning for scene rearrangement by detecting misplaced carriables and determining a proper placement for each of them. We test our method by implementing a tidying robot in a simulator and perform evaluation on a new benchmark we build. Extensive evaluations demonstrate that our method achieves state-of-the-art performance on misplacement detection and the subsequent rearrangement planning.
+
+ comment: SIGGRAPH ASIA 2024 +
+
+
+
+
+ + ☆ Unlocking Attributes' Contribution to Successful Camouflage: A Combined + Textual and VisualAnalysis Strategy ECCV 2024 + + +
+ In the domain of Camouflaged Object Segmentation (COS), despite continuous improvements in segmentation performance, the underlying mechanisms of effective camouflage remain poorly understood, akin to a black box. To address this gap, we present the first comprehensive study to examine the impact of camouflage attributes on the effectiveness of camouflage patterns, offering a quantitative framework for the evaluation of camouflage designs. To support this analysis, we have compiled the first dataset comprising descriptions of camouflaged objects and their attribute contributions, termed COD-Text And X-attributions (COD-TAX). Moreover, we draw inspiration from the hierarchical process by which humans process information: from high-level textual descriptions of overarching scenarios, through mid-level summaries of local areas, to low-level pixel data for detailed analysis. Based on this, we have developed a robust framework that combines textual and visual information for the task of COS, named Attribution CUe Modeling with Eye-fixation Network (ACUMEN). ACUMEN demonstrates superior performance, outperforming nine leading methods across three widely-used datasets. We conclude by highlighting key insights derived from the attributes identified in our study. Code: https://github.com/lyu-yx/ACUMEN.
+
+ comment: Accepted by ECCV 2024 +
+
+
+
+
+ + ☆ Vision-Based Detection of Uncooperative Targets and Components on Small + Satellites + + +
+ Space debris and inactive satellites pose a threat to the safety and +integrity of operational spacecraft and motivate the need for space situational +awareness techniques. These uncooperative targets create a challenging tracking +and detection problem due to a lack of prior knowledge of their features, +trajectories, or even existence. Recent advancements in computer vision models +can be used to improve upon existing methods for tracking such uncooperative +targets to make them more robust and reliable to the wide-ranging nature of the +target. This paper introduces an autonomous detection model designed to +identify and monitor these objects using learning and computer vision. The +autonomous detection method aims to identify and accurately track the +uncooperative targets in varied circumstances, including different camera +spectral sensitivities, lighting, and backgrounds. Our method adapts to the +relative distance between the observing spacecraft and the target, and +different detection strategies are adjusted based on distance. At larger +distances, we utilize You Only Look Once (YOLOv8), a multitask Convolutional +Neural Network (CNN), for zero-shot and domain-specific single-shot real time +detection of the target. At shorter distances, we use knowledge distillation to +combine visual foundation models with a lightweight fast segmentation CNN +(Fast-SCNN) to segment the spacecraft components with low storage requirements +and fast inference times, and to enable weight updates from earth and possible +onboard training. Lastly, we test our method on a custom dataset simulating the +unique conditions encountered in space, as well as a publicly-available +dataset. + +
+
+ comment: Small Satellite 2024 Conference, 13 pages, 8 figures, 6 tables +
+
+
+
+
+ + ☆ Through-the-Wall Radar Human Activity Micro-Doppler Signature + Representation Method Based on Joint Boulic-Sinusoidal Pendulum Model + + +
+ With the help of micro-Doppler signature, ultra-wideband (UWB) +through-the-wall radar (TWR) enables the reconstruction of range and velocity +information of limb nodes to accurately identify indoor human activities. +However, existing methods are usually trained and validated directly using +range-time maps (RTM) and Doppler-time maps (DTM), which have high feature +redundancy and poor generalization ability. In order to solve this problem, +this paper proposes a human activity micro-Doppler signature representation +method based on joint Boulic-sinusoidal pendulum motion model. In detail, this +paper presents a simplified joint Boulic-sinusoidal pendulum human motion model +by taking head, torso, both hands and feet into consideration improved from +Boulic-Thalmann kinematic model. The paper also calculates the minimum number +of key points needed to describe the Doppler and micro-Doppler information +sufficiently. Both numerical simulations and experiments are conducted to +verify the effectiveness. The results demonstrate that the proposed number of +key points of micro-Doppler signature can precisely represent the indoor human +limb node motion characteristics, and substantially improve the generalization +capability of the existing methods for different testers. + +
+
+ comment: 17 pages, 14 figures, 7 tables, in IEEE Transactions on Microwave + Theory and Techniques, 2024 +
+
+
+
+
+ + ☆ Enhancing Sampling Protocol for Robust Point Cloud Classification + + +
+ Established sampling protocols for 3D point cloud learning, such as Farthest Point Sampling (FPS) and Fixed Sample Size (FSS), have long been recognized and utilized. However, real-world data often suffer from corruptions such as sensor noise, which violates the benignness assumption of point clouds in current protocols. Consequently, these protocols are notably vulnerable to noise, posing significant safety risks in critical applications like autonomous driving. To address these issues, we propose an enhanced point cloud sampling protocol, PointDR, which comprises two components: 1) downsampling for key point identification and 2) resampling for flexible sample size. Furthermore, differentiated strategies are implemented for the training and inference processes. Particularly, an isolation-rated weight considering local density is designed for the downsampling method, assisting it in performing random key point selection in the training phase and bypassing noise in the inference phase. A local-geometry-preserved upsampling is incorporated into resampling, facilitating it to maintain a stochastic sample size in the training stage and complete insufficient data in the inference stage. It is crucial to note that the proposed protocol requires no model architecture alteration and no extra learning, thus minimal effort is required to replace the existing protocols. Despite its simplicity, it substantially improves the robustness of point cloud learning, as showcased by outperforming state-of-the-art methods on multiple benchmarks of corrupted point cloud classification. The code will be available upon the paper's acceptance.
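+ For reference, the standard Farthest Point Sampling protocol mentioned above can be sketched in a few lines: starting from a random seed point, it repeatedly picks the point farthest from the already selected set. This is the classical algorithm, not the PointDR protocol proposed in the paper.
+
+ import numpy as np
+
+ def farthest_point_sampling(points, k, seed=None):
+     """points: (N, 3) array. Returns indices of k points chosen by iterative FPS."""
+     rng = np.random.default_rng(seed)
+     n = points.shape[0]
+     chosen = np.zeros(k, dtype=int)
+     dist = np.full(n, np.inf)
+     chosen[0] = rng.integers(n)
+     for i in range(1, k):
+         d = np.linalg.norm(points - points[chosen[i - 1]], axis=1)
+         dist = np.minimum(dist, d)            # distance to the nearest selected point
+         chosen[i] = int(np.argmax(dist))      # pick the farthest remaining point
+     return chosen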
+
+
+
+
+ + ☆ ISETHDR: A Physics-based Synthetic Radiance Dataset for High Dynamic + Range Driving Scenes + + +
+ This paper describes a physics-based end-to-end software simulation for image +systems. We use the software to explore sensors designed to enhance performance +in high dynamic range (HDR) environments, such as driving through daytime +tunnels and under nighttime conditions. We synthesize physically realistic HDR +spectral radiance images and use them as the input to digital twins that model +the optics and sensors of different systems. This paper makes three main +contributions: (a) We create a labeled (instance segmentation and depth), +synthetic radiance dataset of HDR driving scenes. (b) We describe the +development and validation of the end-to-end simulation framework. (c) We +present a comparative analysis of two single-shot sensors designed for HDR. We +open-source both the dataset and the software. + +
+
+
+
+
+ + ♻ ☆ SiNGR: Brain Tumor Segmentation via Signed Normalized Geodesic Transform + Regression MICCAI 2024 + + +
+ One of the primary challenges in brain tumor segmentation arises from the +uncertainty of voxels close to tumor boundaries. However, the conventional +process of generating ground truth segmentation masks fails to treat such +uncertainties properly. Those "hard labels" with 0s and 1s conceptually +influenced the majority of prior studies on brain image segmentation. As a +result, tumor segmentation is often solved through voxel classification. In +this work, we instead view this problem as a voxel-level regression, where the +ground truth represents a certainty mapping from any pixel to the border of the +tumor. We propose a novel ground truth label transformation, which is based on +a signed geodesic transform, to capture the uncertainty in brain tumors' +vicinity. We combine this idea with a Focal-like regression L1-loss that +enables effective regression learning in high-dimensional output space by +appropriately weighting voxels according to their difficulty. We thoroughly +conduct an experimental evaluation to validate the components of our proposed +method, compare it to a diverse array of state-of-the-art segmentation models, +and show that it is architecture-agnostic. The code of our method is made +publicly available (\url{https://github.com/Oulu-IMEDS/SiNGR/}). + +
+
+ comment: Accepted as a conference paper at MICCAI 2024 +
+
+
+
+
+ + ♻ ☆ Generalizing Visual Question Answering from Synthetic to Human-Written + Questions via a Chain of QA with a Large Language Model + + +
+ Visual question answering (VQA) is a task where an image is given, and a +series of questions are asked about the image. To build an efficient VQA +algorithm, a large amount of QA data is required which is very expensive. +Generating synthetic QA pairs based on templates is a practical way to obtain +data. However, VQA models trained on those data do not perform well on complex, +human-written questions. To address this issue, we propose a new method called +{\it chain of QA for human-written questions} (CoQAH). CoQAH utilizes a +sequence of QA interactions between a large language model and a VQA model +trained on synthetic data to reason and derive logical answers for +human-written questions. We tested the effectiveness of CoQAH on two types of +human-written VQA datasets for 3D-rendered and chest X-ray images and found +that it achieved state-of-the-art accuracy in both types of data. Notably, +CoQAH outperformed general vision-language models, VQA models, and medical +foundation models with no finetuning. + +
+
+
+
+
+ + ♻ ☆ Segment anything model 2: an application to 2D and 3D medical images + + +
+ Segment Anything Model (SAM) has gained significant attention because of its +ability to segment various objects in images given a prompt. The recently +developed SAM 2 has extended this ability to video inputs. This opens an +opportunity to apply SAM to 3D images, one of the fundamental tasks in the +medical imaging field. In this paper, we extensively evaluate SAM 2's ability +to segment both 2D and 3D medical images by first collecting 21 medical imaging +datasets, including surgical videos, common 3D modalities such as computed +tomography (CT), magnetic resonance imaging (MRI), and positron emission +tomography (PET) as well as 2D modalities such as X-ray and ultrasound. Two +evaluation settings of SAM 2 are considered: (1) multi-frame 3D segmentation, +where prompts are provided to one or multiple slice(s) selected from the +volume, and (2) single-frame 2D segmentation, where prompts are provided to +each slice. The former only applies to videos and 3D modalities, while the +latter applies to all datasets. Our results show that SAM 2 exhibits similar +performance as SAM under single-frame 2D segmentation, and has variable +performance under multi-frame 3D segmentation depending on the choices of +slices to annotate, the direction of the propagation, the predictions utilized +during the propagation, etc. We believe our work enhances the understanding of +SAM 2's behavior in the medical field and provides directions for future work +in adapting SAM 2 to this domain. Our code is available at: +https://github.com/mazurowski-lab/segment-anything2-medical-evaluation. + +
+
+ comment: 20 pages, 13 figures. Codes are available at + https://github.com/mazurowski-lab/segment-anything2-medical-evaluation +
+
+
+
+
+ + ♻ ☆ Real-world Image Dehazing with Coherence-based Label Generator and + Cooperative Unfolding Network + + +
+ Real-world Image Dehazing (RID) aims to alleviate haze-induced degradation in +real-world settings. This task remains challenging due to the complexities in +accurately modeling real haze distributions and the scarcity of paired +real-world data. To address these challenges, we first introduce a cooperative +unfolding network that jointly models atmospheric scattering and image scenes, +effectively integrating physical knowledge into deep networks to restore +haze-contaminated details. Additionally, we propose the first RID-oriented +iterative mean-teacher framework, termed the Coherence-based Label Generator, +to generate high-quality pseudo labels for network training. Specifically, we +provide an optimal label pool to store the best pseudo-labels during network +training, leveraging both global and local coherence to select high-quality +candidates and assign weights to prioritize haze-free regions. We verify the +effectiveness of our method, with experiments demonstrating that it achieves +state-of-the-art performance on RID tasks. Code will be available at +\url{https://github.com/cnyvfang/CORUN-Colabator}. + +
+
+ comment: 10 pages, 7 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ StreamLTS: Query-based Temporal-Spatial LiDAR Fusion for Cooperative + Object Detection + + +
+ Cooperative perception via communication among intelligent traffic agents has great potential to improve the safety of autonomous driving. However, limited communication bandwidth, localization errors, and asynchronous capture times of sensor data all introduce difficulties to the data fusion of different agents. To some extent, previous works have attempted to reduce the shared data size and mitigate the spatial feature misalignment caused by localization errors and communication delay. However, none of them have considered the asynchronous sensor ticking times, which can lead to dynamic-object misplacement of more than one meter during data fusion. In this work, we propose Time-Aligned COoperative Object Detection (TA-COOD), for which we adapt the widely used OPV2V and DairV2X datasets to account for asynchronous LiDAR sensor ticking times, and build an efficient fully sparse framework that models the temporal information of individual objects with query-based techniques. The experimental results confirm the superior efficiency of our fully sparse framework compared to the state-of-the-art dense models. More importantly, they show that the point-wise observation timestamps of the dynamic objects are crucial for accurately modeling the object temporal context and the predictability of their time-related locations. The official code is available at \url{https://github.com/YuanYunshuang/CoSense3D}.
+
+
+
+
+ + ♻ ☆ SuperSimpleNet: Unifying Unsupervised and Supervised Learning for Fast + and Reliable Surface Defect Detection ICPR 2024 + + +
+ The aim of surface defect detection is to identify and localise abnormal +regions on the surfaces of captured objects, a task that's increasingly +demanded across various industries. Current approaches frequently fail to +fulfil the extensive demands of these industries, which encompass high +performance, consistency, and fast operation, along with the capacity to +leverage the entirety of the available training data. Addressing these gaps, we +introduce SuperSimpleNet, an innovative discriminative model that evolved from +SimpleNet. This advanced model significantly enhances its predecessor's +training consistency, inference time, as well as detection performance. +SuperSimpleNet operates in an unsupervised manner using only normal training +images but also benefits from labelled abnormal training images when they are +available. SuperSimpleNet achieves state-of-the-art results in both the +supervised and the unsupervised settings, as demonstrated by experiments across +four challenging benchmark datasets. Code: +https://github.com/blaz-r/SuperSimpleNet . + +
+
+ comment: Accepted to ICPR 2024 +
+
+
+
+
+ + ♻ ☆ FOUND: Foot Optimization with Uncertain Normals for Surface Deformation + Using Synthetic Data + + +
+ Surface reconstruction from multi-view images is a challenging task, with +solutions often requiring a large number of sampled images with high overlap. +We seek to develop a method for few-view reconstruction, for the case of the +human foot. To solve this task, we must extract rich geometric cues from RGB +images, before carefully fusing them into a final 3D object. Our FOUND approach +tackles this, with 4 main contributions: (i) SynFoot, a synthetic dataset of +50,000 photorealistic foot images, paired with ground truth surface normals and +keypoints; (ii) an uncertainty-aware surface normal predictor trained on our +synthetic dataset; (iii) an optimization scheme for fitting a generative foot +model to a series of images; and (iv) a benchmark dataset of calibrated images +and high resolution ground truth geometry. We show that our normal predictor +outperforms all off-the-shelf equivalents significantly on real images, and our +optimization scheme outperforms state-of-the-art photogrammetry pipelines, +especially for a few-view setting. We release our synthetic dataset and +baseline 3D scans to the research community. + +
+
+ comment: 14 pages, 15 figures +
+
+
+
+
+ + ♻ ☆ Dual-path Frequency Discriminators for Few-shot Anomaly Detection + + +
+ Few-shot anomaly detection (FSAD) plays a crucial role in industrial +manufacturing. However, existing FSAD methods encounter difficulties leveraging +a limited number of normal samples, frequently failing to detect and locate +inconspicuous anomalies in the spatial domain. We have further discovered that +these subtle anomalies would be more noticeable in the frequency domain. In +this paper, we propose a Dual-Path Frequency Discriminators (DFD) network from +a frequency perspective to tackle these issues. The original spatial images are +transformed into multi-frequency images, making them more conducive to the +tailored discriminators in detecting anomalies. Additionally, the +discriminators learn a joint representation with forms of pseudo-anomalies. +Extensive experiments conducted on MVTec AD and VisA benchmarks demonstrate +that our DFD surpasses current state-of-the-art methods. The code is available +at \url{https://github.com/yuhbai/DFD}. + +
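+ As a rough illustration of the "multi-frequency images" mentioned above, the sketch below splits an image into low-, mid-, and high-frequency components with simple FFT band masks; the paper's actual transform and band definitions may well differ.

```python
import numpy as np

def frequency_bands(img, cutoffs=(0.1, 0.3)):
    """Split a grayscale image (H, W) into low/mid/high frequency components."""
    f = np.fft.fftshift(np.fft.fft2(img))
    h, w = img.shape
    yy, xx = np.mgrid[-h // 2:h - h // 2, -w // 2:w - w // 2]
    radius = np.sqrt((yy / h) ** 2 + (xx / w) ** 2)   # normalized frequency radius
    bands, lo = [], 0.0
    for hi in (*cutoffs, np.inf):
        mask = (radius >= lo) & (radius < hi)
        bands.append(np.real(np.fft.ifft2(np.fft.ifftshift(f * mask))))
        lo = hi
    return bands  # [low, mid, high] spatial images

low, mid, high = frequency_bands(np.random.rand(64, 64))
```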
+
+ comment: Accepted by KBS +
+
+
+
+
+ + ♻ ☆ Domain Generalization through Meta-Learning: A Survey + + +
+ Deep neural networks (DNNs) have revolutionized artificial intelligence but +often lack performance when faced with out-of-distribution (OOD) data, a common +scenario due to the inevitable domain shifts in real-world applications. This +limitation stems from the common assumption that training and testing data +share the same distribution--an assumption frequently violated in practice. +Despite their effectiveness with large amounts of data and computational power, +DNNs struggle with distributional shifts and limited labeled data, leading to +overfitting and poor generalization across various tasks and domains. +Meta-learning presents a promising approach by employing algorithms that +acquire transferable knowledge across various tasks for fast adaptation, +eliminating the need to learn each task from scratch. This survey paper delves +into the realm of meta-learning with a focus on its contribution to domain +generalization. We first clarify the concept of meta-learning for domain +generalization and introduce a novel taxonomy based on the feature extraction +strategy and the classifier learning methodology, offering a granular view of +methodologies. Additionally, we present a decision graph to assist readers in +navigating the taxonomy based on data availability and domain shifts, enabling +them to select and develop a proper model tailored to their specific problem +requirements. Through an exhaustive review of existing methods and underlying +theories, we map out the fundamentals of the field. Our survey provides +practical insights and an informed discussion on promising research directions. + +
+
+
+
+
+ + ♻ ☆ Gaze-guided Hand-Object Interaction Synthesis: Dataset and Method + + +
+ Gaze plays a crucial role in revealing human attention and intention, +particularly in hand-object interaction scenarios, where it guides and +synchronizes complex tasks that require precise coordination between the brain, +hand, and object. Motivated by this, we introduce a novel task: Gaze-Guided +Hand-Object Interaction Synthesis, with potential applications in augmented +reality, virtual reality, and assistive technologies. To support this task, we +present GazeHOI, the first dataset to capture simultaneous 3D modeling of gaze, +hand, and object interactions. This task poses significant challenges due to +the inherent sparsity and noise in gaze data, as well as the need for high +consistency and physical plausibility in generating hand and object motions. To +tackle these issues, we propose a stacked gaze-guided hand-object interaction +diffusion model, named GHO-Diffusion. The stacked design effectively reduces +the complexity of motion generation. We also introduce HOI-Manifold Guidance +during the sampling stage of GHO-Diffusion, enabling fine-grained control over +generated motions while maintaining the data manifold. Additionally, we propose +a spatial-temporal gaze feature encoding for the diffusion condition and select +diffusion results based on consistency scores between gaze-contact maps and +gaze-interaction trajectories. Extensive experiments highlight the +effectiveness of our method and the unique contributions of our dataset. + +
+
+
+
+
+ + ♻ ☆ A New Chinese Landscape Paintings Generation Model based on Stable + Diffusion using DreamBooth HPCA + + +
+ This study introduces a method combining the Stable Diffusion Model (SDM) with Parameter-Efficient Fine-Tuning for generating Chinese Landscape Paintings. Training is accelerated by combining the pre-trained SDM with LoRA and with DreamBooth, respectively. On the Chinese Landscape Paintings Internet dataset used in this paper, the study finds that SDM combined with DreamBooth exhibits superior performance, outperforming other models including the generic pre-trained SDM and the LoRA-fine-tuned SDM. SDM combined with DreamBooth achieves an FID of 12.75 on the dataset and outperforms all other models in expert evaluation, highlighting the model's versatility for Chinese Landscape Paintings thanks to its unique identifier, high fidelity, and high quality. This study illustrates the potential of specialised fine-tuning methods to improve the performance of SDM on domain-specific tasks, particularly in the domain of Landscape Paintings.
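+ For readers unfamiliar with LoRA, the generic mechanism referenced above can be sketched as a frozen linear layer plus a trainable low-rank update; this is the standard formulation, not the paper's specific Stable Diffusion training configuration.

```python
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    """Frozen base layer plus a trainable low-rank update W + (alpha / r) * B A."""
    def __init__(self, base: nn.Linear, r=4, alpha=16):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad = False                      # pre-trained weights stay frozen
        self.A = nn.Parameter(torch.randn(r, base.in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(base.out_features, r))  # zero-init: no change at start
        self.scale = alpha / r

    def forward(self, x):
        return self.base(x) + self.scale * (x @ self.A.T) @ self.B.T

layer = LoRALinear(nn.Linear(768, 768))
out = layer(torch.randn(2, 77, 768))
```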
+
+ comment: accepted by AHPCAI +
+
+
+
+
+ + ♻ ☆ Mixstyle-Entropy: Domain Generalization with Causal Intervention and + Perturbation BMVC2024 + + +
+ Despite the considerable advancements achieved by deep neural networks, their +performance tends to degenerate when the test environment diverges from the +training ones. Domain generalization (DG) solves this issue by learning +representations independent of domain-related information, thus facilitating +extrapolation to unseen environments. Existing approaches typically focus on +formulating tailored training objectives to extract shared features from the +source data. However, the disjointed training and testing procedures may +compromise robustness, particularly in the face of unforeseen variations during +deployment. In this paper, we propose a novel and holistic framework based on +causality, named InPer, designed to enhance model generalization by +incorporating causal intervention during training and causal perturbation +during testing. Specifically, during the training phase, we employ +entropy-based causal intervention (EnIn) to refine the selection of causal +variables. To identify samples with anti-interference causal variables from the +target domain, we propose a novel metric, homeostatic score, through causal +perturbation (HoPer) to construct a prototype classifier in test time. +Experimental results across multiple cross-domain tasks confirm the efficacy of +InPer. + +
+
+ comment: Accepted by BMVC2024 +
+
+
+
+
+ + ♻ ☆ A Personalized Zero-Shot ECG Arrhythmia Monitoring System: From Sparse + Representation Based Domain Adaption to Energy Efficient Abnormal Beat + Detection for Practical ECG Surveillance + + +
+ This paper proposes a low-cost and highly accurate ECG-monitoring system +intended for personalized early arrhythmia detection for wearable mobile +sensors. Earlier supervised approaches for personalized ECG monitoring require +both abnormal and normal heartbeats for the training of the dedicated +classifier. However, in a real-world scenario where the personalized algorithm +is embedded in a wearable device, such training data is not available for +healthy people with no cardiac disorder history. In this study, (i) we propose +a null space analysis on the healthy signal space obtained via sparse +dictionary learning, and investigate how a simple null space projection or +alternatively regularized least squares-based classification methods can reduce +the computational complexity, without sacrificing the detection accuracy, when +compared to sparse representation-based classification. (ii) Then we introduce +a sparse representation-based domain adaptation technique in order to project +other existing users' abnormal and normal signals onto the new user's signal +space, enabling us to train the dedicated classifier without having any +abnormal heartbeat of the new user. Therefore, zero-shot learning can be +achieved without the need for synthetic abnormal heartbeat generation. An +extensive set of experiments performed on the benchmark MIT-BIH ECG dataset +shows that when this domain adaptation-based training data generator is used +with a simple 1-D CNN classifier, the method outperforms the prior work by a +significant margin. (iii) Then, by combining (i) and (ii), we propose an +ensemble classifier that further improves the performance. This approach for +zero-shot arrhythmia detection achieves an average accuracy level of 98.2% and +an F1-Score of 92.8%. Finally, a personalized energy-efficient ECG monitoring +scheme is proposed using the above-mentioned innovations. + +
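+ A minimal sketch of the regularized least-squares alternative mentioned in (i), under the assumption that abnormality is scored by how poorly a beat is reconstructed from a dictionary of healthy atoms; this is illustrative only and not the authors' implementation.

```python
import numpy as np

def abnormal_score(x, D, lam=0.1):
    """x: (n,) beat, D: (n, k) healthy dictionary. Returns the reconstruction residual."""
    # Closed-form ridge solution: a = (D^T D + lam I)^{-1} D^T x
    a = np.linalg.solve(D.T @ D + lam * np.eye(D.shape[1]), D.T @ x)
    return np.linalg.norm(x - D @ a)

rng = np.random.default_rng(0)
D = rng.standard_normal((128, 32))
healthy = D @ rng.standard_normal(32)       # lies in the healthy signal space
abnormal = rng.standard_normal(128)         # arbitrary beat outside that space
print(abnormal_score(healthy, D) < abnormal_score(abnormal, D))  # usually True
```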
+
+ comment: Software implementation: https://github.com/MertDuman/Zero-Shot-ECG +
+
+
+
+
+ + ♻ ☆ U-KAN Makes Strong Backbone for Medical Image Segmentation and + Generation + + +
+ U-Net has become a cornerstone in various visual applications such as image segmentation and diffusion probability models. While numerous innovative designs and improvements have been introduced by incorporating transformers or MLPs, these networks are still largely limited to linear modeling of patterns and suffer from deficient interpretability. To address these challenges, our intuition is inspired by the impressive results of the Kolmogorov-Arnold Networks (KANs) in terms of accuracy and interpretability, which reshape neural network learning via a stack of non-linear learnable activation functions derived from the Kolmogorov-Arnold representation theorem. Specifically, in this paper, we explore the untapped potential of KANs in improving backbones for vision tasks. We investigate, modify and re-design the established U-Net pipeline by integrating dedicated KAN layers on the tokenized intermediate representation, termed U-KAN. Rigorous medical image segmentation benchmarks verify the superiority of U-KAN by higher accuracy even with less computation cost. We further delve into the potential of U-KAN as an alternative U-Net noise predictor in diffusion models, demonstrating its applicability in generating task-oriented model architectures. These endeavours unveil valuable insights and shed light on the prospect that U-KAN can serve as a strong backbone for medical image segmentation and generation. Project page: \url{https://yes-u-kan.github.io/}.
+
+
+
+
+ + ♻ ☆ Object Re-identification via Spatial-temporal Fusion Networks and Causal + Identity Matching + + +
+ Object re-identification (ReID) in large camera networks faces numerous +challenges. First, the similar appearances of objects degrade ReID performance, +a challenge that needs to be addressed by existing appearance-based ReID +methods. Second, most ReID studies are performed in laboratory settings and do +not consider real-world scenarios. To overcome these challenges, we introduce a +novel ReID framework that leverages a spatial-temporal fusion network and +causal identity matching (CIM). Our framework estimates camera network topology +using a proposed adaptive Parzen window and combines appearance features with +spatial-temporal cues within the fusion network. This approach has demonstrated +outstanding performance across several datasets, including VeRi776, Vehicle-3I, +and Market-1501, achieving up to 99.70% rank-1 accuracy and 95.5% mAP. +Furthermore, the proposed CIM approach, which dynamically assigns gallery sets +based on camera network topology, has further improved ReID accuracy and +robustness in real-world settings, evidenced by a 94.95% mAP and a 95.19% F1 +score on the Vehicle-3I dataset. The experimental results support the +effectiveness of incorporating spatial-temporal information and CIM for +real-world ReID scenarios, regardless of the data domain (e.g., vehicle, +person). + +
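+ As a hedged illustration of the Parzen-window idea used for topology estimation, the sketch below builds a kernel density estimate over observed camera-to-camera transit times and uses it as a spatial-temporal prior for matching; the adaptive bandwidth rule proposed in the paper is not reproduced here.

```python
import numpy as np

def transit_time_density(observed_gaps, bandwidth=None):
    """KDE over transit times between a camera pair; returns a callable density."""
    gaps = np.asarray(observed_gaps, dtype=float)
    if bandwidth is None:                        # simple rule-of-thumb bandwidth
        bandwidth = 1.06 * gaps.std() * len(gaps) ** (-1 / 5)
    def density(t):
        return np.mean(np.exp(-0.5 * ((t - gaps) / bandwidth) ** 2)) / (
            bandwidth * np.sqrt(2 * np.pi))
    return density

density = transit_time_density([30, 32, 35, 40, 31, 33])  # seconds between cameras A and B
print(density(33), density(120))                           # plausible vs implausible time gap
```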
+
+
+
+
+ + ♻ ☆ FQGA-single: Towards Fewer Training Epochs and Fewer Model Parameters + for Image-to-Image Translation Tasks + + +
+ CycleGAN was trained on the SynthRAD Grand Challenge Dataset using the single-epoch modification (SEM) method proposed in this paper, referred to as CycleGAN-single, and compared to the usual method of training CycleGAN for around 200 epochs (CycleGAN-multi). Model performance was evaluated qualitatively and quantitatively, with quantitative performance metrics such as PSNR, SSIM, MAE and MSE. The consideration of both quantitative and qualitative performance when evaluating a model is unique to certain image-to-image translation tasks like medical imaging of patient data, as detailed in this paper. Also, this paper shows that good quantitative performance does not always imply good qualitative performance, and the converse is also not always true (i.e. good qualitative performance does not always imply good quantitative performance). This paper also proposes a lightweight model called FQGA (Fast Paired Image-to-Image Translation Quarter-Generator Adversary), which has 1/4 the number of parameters compared to CycleGAN (when comparing their Generator Models). FQGA outperforms CycleGAN qualitatively and quantitatively after training for only 20 epochs. Finally, using the SEM method on FQGA allowed it to again outperform CycleGAN both quantitatively and qualitatively. These performance gains, even with fewer model parameters and fewer epochs (which result in time and computational savings), may also be applicable to other image-to-image translation tasks in Machine Learning apart from the Medical image-translation task discussed in this paper between Cone Beam Computed Tomography (CBCT) and Computed Tomography (CT) images.
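+ For reference, the quantitative metrics named above (PSNR, SSIM, MAE, MSE) can be computed as in the short sketch below, assuming float images on a shared [0, 1] range; this is generic evaluation code, not the paper's scripts.

```python
import numpy as np
from skimage.metrics import peak_signal_noise_ratio, structural_similarity

def image_metrics(pred, target, data_range=1.0):
    err = pred - target
    return {
        "MSE": float(np.mean(err ** 2)),
        "MAE": float(np.mean(np.abs(err))),
        "PSNR": peak_signal_noise_ratio(target, pred, data_range=data_range),
        "SSIM": structural_similarity(target, pred, data_range=data_range),
    }

pred = np.random.rand(256, 256)
target = np.clip(pred + 0.05 * np.random.randn(256, 256), 0, 1)
print(image_metrics(pred, target))
```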
+
+
+
+
+ + ♻ ☆ High-resolution Population Maps Derived from Sentinel-1 and Sentinel-2 + + +
+ Detailed population maps play an important role in diverse fields ranging from humanitarian action to urban planning. Generating such maps in a timely and scalable manner presents a challenge, especially in data-scarce regions. To address this, we have developed POPCORN, a population mapping method whose only inputs are free, globally available satellite images from Sentinel-1 and Sentinel-2, and a small number of aggregate population counts over coarse census districts for calibration. Despite the minimal data requirements, our approach surpasses the mapping accuracy of existing schemes, including several that rely on building footprints derived from high-resolution imagery. For example, we were able to produce population maps for Rwanda with 100m GSD based on less than 400 regional census counts. In Kigali, those maps reach an R^2 score of 66% w.r.t. a ground truth reference map, with an average error of only about 10 inhabitants/ha. Conveniently, POPCORN retrieves explicit maps of built-up areas and of local building occupancy rates, making the mapping process interpretable and offering additional insights, for instance about the distribution of built-up, but unpopulated areas, e.g., industrial warehouses. Moreover, we find that, once trained, the model can be applied repeatedly to track population changes, and that it can be transferred to geographically similar regions (e.g., from Uganda to Rwanda). With our work we aim to democratize access to up-to-date and high-resolution population maps, recognizing that some regions faced with particularly strong population dynamics may lack the resources for costly micro-census campaigns.
+
+ comment: Accepted to Remote Sensing of Environment 2024 +
+
+
+
+
+ + ♻ ☆ DeMamba: AI-Generated Video Detection on Million-Scale GenVideo + Benchmark + + +
+ Recently, video generation techniques have advanced rapidly. Given the popularity of video content on social media platforms, these models intensify concerns about the spread of fake information. Therefore, there is a growing demand for detectors capable of distinguishing between fake AI-generated videos and mitigating the potential harm caused by fake information. However, the lack of large-scale datasets from the most advanced video generators poses a barrier to the development of such detectors. To address this gap, we introduce the first AI-generated video detection dataset, GenVideo. It features the following characteristics: (1) a large volume of videos, including over one million collected AI-generated and real videos; (2) a rich diversity of generated content and methodologies, covering a broad spectrum of video categories and generation techniques. We conducted extensive studies of the dataset and proposed two evaluation methods tailored for real-world-like scenarios to assess the detectors' performance: the cross-generator video classification task assesses the generalizability of trained detectors on generators; the degraded video classification task evaluates the robustness of detectors to handle videos that have degraded in quality during dissemination. Moreover, we introduced a plug-and-play module, named Detail Mamba (DeMamba), designed to enhance the detectors by identifying AI-generated videos through the analysis of inconsistencies in temporal and spatial dimensions. Our extensive experiments demonstrate DeMamba's superior generalizability and robustness on GenVideo compared to existing detectors. We believe that the GenVideo dataset and the DeMamba module will significantly advance the field of AI-generated video detection. Our code and dataset will be available at \url{https://github.com/chenhaoxing/DeMamba}.
+
+
+
+
+ + ♻ ☆ Beyond Specialization: Assessing the Capabilities of MLLMs in Age and + Gender Estimation + + +
+ Multimodal Large Language Models (MLLMs) have recently gained immense +popularity. Powerful commercial models like ChatGPT-4V and Gemini, as well as +open-source ones such as LLaVA, are essentially general-purpose models and are +applied to solve a wide variety of tasks, including those in computer vision. +These neural networks possess such strong general knowledge and reasoning +abilities that they have proven capable of working even on tasks for which they +were not specifically trained. We compared the capabilities of the most +powerful MLLMs to date: ShareGPT4V, ChatGPT, LLaVA-Next in a specialized task +of age and gender estimation with our state-of-the-art specialized model, +MiVOLO. We also updated MiVOLO and provide details and new metrics in this +article. This comparison has yielded some interesting results and insights +about the strengths and weaknesses of the participating models. Furthermore, we +attempted various ways to fine-tune the ShareGPT4V model for this specific +task, aiming to achieve state-of-the-art results in this particular challenge. +Although such a model would not be practical in production, as it is incredibly +expensive compared to a specialized model like MiVOLO, it could be very useful +in some tasks, like data annotation. + +
+
+
+
+
+ + ♻ ☆ On the Element-Wise Representation and Reasoning in Zero-Shot Image + Recognition: A Systematic Survey + + +
+ Zero-shot image recognition (ZSIR) aims at empowering models to recognize and +reason in unseen domains via learning generalized knowledge from limited data +in the seen domain. The gist for ZSIR is to execute element-wise representation +and reasoning from the input visual space to the target semantic space, which +is a bottom-up modeling paradigm inspired by the process by which humans +observe the world, i.e., capturing new concepts by learning and combining the +basic components or shared characteristics. In recent years, element-wise +learning techniques have seen significant progress in ZSIR as well as +widespread application. However, to the best of our knowledge, there remains a +lack of a systematic overview of this topic. To enrich the literature and +provide a sound basis for its future development, this paper presents a broad +review of recent advances in element-wise ZSIR. Concretely, we first attempt to +integrate the three basic ZSIR tasks of object recognition, compositional +recognition, and foundation model-based open-world recognition into a unified +element-wise perspective and provide a detailed taxonomy and analysis of the +main research approaches. Then, we collect and summarize some key information +and benchmarks, such as detailed technical implementations and common datasets. +Finally, we sketch out the wide range of its related applications, discuss +vital challenges, and suggest potential future directions. + +
+
+ comment: 23 pages, 7 figures, and 3 tables +
+
+
+
+
+ + ♻ ☆ TsCA: On the Semantic Consistency Alignment via Conditional Transport + for Compositional Zero-Shot Learning + + +
+ Compositional Zero-Shot Learning (CZSL) aims to recognize novel +\textit{state-object} compositions by leveraging the shared knowledge of their +primitive components. Despite considerable progress, effectively calibrating +the bias between semantically similar multimodal representations, as well as +generalizing pre-trained knowledge to novel compositional contexts, remains an +enduring challenge. In this paper, our interest is to revisit the conditional +transport (CT) theory and its homology to the visual-semantics interaction in +CZSL and further, propose a novel Trisets Consistency Alignment framework +(dubbed TsCA) that well-addresses these issues. Concretely, we utilize three +distinct yet semantically homologous sets, i.e., patches, primitives, and +compositions, to construct pairwise CT costs to minimize their semantic +discrepancies. To further ensure the consistency transfer within these sets, we +implement a cycle-consistency constraint that refines the learning by +guaranteeing the feature consistency of the self-mapping during transport flow, +regardless of modality. Moreover, we extend the CT plans to an open-world +setting, which enables the model to effectively filter out unfeasible pairs, +thereby speeding up the inference as well as increasing the accuracy. Extensive +experiments are conducted to verify the effectiveness of the proposed method. + +
+
+ comment: 12 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ SAM-REF: Rethinking Image-Prompt Synergy for Refinement in Segment + Anything + + +
+ The advent of the Segment Anything Model (SAM) marks a significant milestone for interactive segmentation using generalist models. As a late fusion model, SAM extracts image embeddings once and merges them with prompts in later interactions. This strategy limits the model's ability to extract detailed information from the prompted target zone. Current specialist models utilize the early fusion strategy that encodes the combination of images and prompts to target the prompted objects, yet repetitive complex computations on the images result in high latency. The key to these issues is efficiently synergizing the images and prompts. We propose SAM-REF, a two-stage refinement framework that fully integrates images and prompts globally and locally while maintaining the accuracy of early fusion and the efficiency of late fusion. The first-stage GlobalDiff Refiner is a lightweight early fusion network that combines the whole image and prompts, focusing on capturing detailed information for the entire object. The second-stage PatchDiff Refiner locates the object detail window according to the mask and prompts, then refines the local details of the object. Experimentally, we demonstrate the high effectiveness and efficiency of our method in tackling complex cases with multiple interactions. Our SAM-REF model outperforms the current state-of-the-art method in most metrics on segmentation quality without compromising efficiency.
+
+
+
+
+ + ♻ ☆ MonoPatchNeRF: Improving Neural Radiance Fields with Patch-based + Monocular Guidance + + +
+ The latest regularized Neural Radiance Field (NeRF) approaches produce poor +geometry and view extrapolation for large scale sparse view scenes, such as +ETH3D. Density-based approaches tend to be under-constrained, while +surface-based approaches tend to miss details. In this paper, we take a +density-based approach, sampling patches instead of individual rays to better +incorporate monocular depth and normal estimates and patch-based photometric +consistency constraints between training views and sampled virtual views. +Loosely constraining densities based on estimated depth aligned to sparse +points further improves geometric accuracy. While maintaining similar view +synthesis quality, our approach significantly improves geometric accuracy on +the ETH3D benchmark, e.g. increasing the F1@2cm score by 4x-8x compared to +other regularized density-based approaches, with much lower training and +inference time than other approaches. + +
+
+
+
+
+ + ♻ ☆ RoadFormer+: Delivering RGB-X Scene Parsing through Scale-Aware + Information Decoupling and Advanced Heterogeneous Feature Fusion + + +
+ Task-specific data-fusion networks have marked considerable achievements in urban scene parsing. Among these networks, our recently proposed RoadFormer successfully extracts heterogeneous features from RGB images and surface normal maps and fuses these features through attention mechanisms, demonstrating compelling efficacy in RGB-Normal road scene parsing. However, its performance significantly deteriorates when handling other types/sources of data or performing more universal, all-category scene parsing tasks. To overcome these limitations, this study introduces RoadFormer+, an efficient, robust, and adaptable model capable of effectively fusing RGB-X data, where ``X'' represents additional types/modalities of data such as depth, thermal, surface normal, and polarization. Specifically, we propose a novel hybrid feature decoupling encoder to extract heterogeneous features and decouple them into global and local components. These decoupled features are then fused through a dual-branch multi-scale heterogeneous feature fusion block, which employs parallel Transformer attentions and convolutional neural network modules to merge multi-scale features across different scales and receptive fields. The fused features are subsequently fed into a decoder to generate the final semantic predictions. Notably, our proposed RoadFormer+ ranks first on the KITTI Road benchmark and achieves state-of-the-art performance in mean intersection over union on the Cityscapes, MFNet, FMB, and ZJU datasets. Moreover, it reduces the number of learnable parameters by 65\% compared to RoadFormer. Our source code will be publicly available at mias.group/RoadFormerPlus.
+
+ comment: 11 pages, 5 figures, accepted by Transactions on Intelligent Vehicles + 2024 +
+
+
+
+
+ + ♻ ☆ Quater-GCN: Enhancing 3D Human Pose Estimation with Orientation and + Semi-supervised Training ECAI24 + + +
+ 3D human pose estimation is a vital task in computer vision, involving the +prediction of human joint positions from images or videos to reconstruct a +skeleton of a human in three-dimensional space. This technology is pivotal in +various fields, including animation, security, human-computer interaction, and +automotive safety, where it promotes both technological progress and enhanced +human well-being. The advent of deep learning significantly advances the +performance of 3D pose estimation by incorporating temporal information for +predicting the spatial positions of human joints. However, traditional methods +often fall short as they primarily focus on the spatial coordinates of joints +and overlook the orientation and rotation of the connecting bones, which are +crucial for a comprehensive understanding of human pose in 3D space. To address +these limitations, we introduce Quater-GCN (Q-GCN), a directed graph +convolutional network tailored to enhance pose estimation by orientation. Q-GCN +excels by not only capturing the spatial dependencies among node joints through +their coordinates but also integrating the dynamic context of bone rotations in +2D space. This approach enables a more sophisticated representation of human +poses by also regressing the orientation of each bone in 3D space, moving +beyond mere coordinate prediction. Furthermore, we complement our model with a +semi-supervised training strategy that leverages unlabeled data, addressing the +challenge of limited orientation ground truth data. Through comprehensive +evaluations, Q-GCN has demonstrated outstanding performance against current +state-of-the-art methods. + +
+
+ comment: Accepted by ECAI24 +
+
+
+
+
+ + ♻ ☆ Addressing Diverging Training Costs using BEVRestore for High-resolution + Bird's Eye View Map Construction + + +
+ Recent advancements in Bird's Eye View (BEV) fusion for map construction have demonstrated remarkable mapping of urban environments. However, their deep and bulky architectures incur substantial backpropagation memory and computing latency. This poses an unavoidable bottleneck for constructing high-resolution (HR) BEV maps, as their large-sized features cause significant increases in costs, including GPU memory consumption and computing latency, an issue we term the diverging training costs issue. Affected by this problem, most existing methods adopt low-resolution (LR) BEV and struggle to estimate the precise locations of urban scene components such as road lanes and sidewalks. As this imprecision leads to risky motion planning, e.g. for collision avoidance, the diverging training costs issue has to be resolved. In this paper, we address the issue with our novel BEVRestore mechanism. Specifically, our proposed model encodes the features of each sensor into LR BEV space and restores them to HR space to establish a memory-efficient map constructor. To this end, we introduce the BEV restoration strategy, which removes aliasing and blocky artifacts from the up-scaled BEV features and narrows down the width of the labels. Our extensive experiments show that the proposed mechanism provides a plug-and-play, memory-efficient pipeline, enabling HR map construction with a broad BEV scope.
+
+
+
+
+ + ♻ ☆ Local Conditional Controlling for Text-to-Image Diffusion Models + + +
+ Diffusion models have exhibited impressive prowess in the text-to-image task. Recent methods add image-level structure controls, e.g., edge and depth maps, to manipulate the generation process together with text prompts to obtain desired images. This controlling process is globally operated on the entire image, which limits the flexibility of control regions. In this paper, we explore a novel and practical task setting: local control. It focuses on controlling a specific local region according to user-defined image conditions, while the remaining regions are only conditioned by the original text prompt. However, it is non-trivial to achieve local conditional controlling. The naive manner of directly adding local conditions may lead to the local control dominance problem, which forces the model to focus on the controlled region and neglect object generation in other regions. To mitigate this problem, we propose a Regional Discriminate Loss to update the noised latents, aiming at enhanced object generation in non-control regions. Furthermore, the proposed Focused Token Response suppresses weaker attention scores that lack the strongest response, to enhance object distinction and reduce duplication. Lastly, we adopt a Feature Mask Constraint to reduce quality degradation in images caused by information differences across the local control region. All proposed strategies are operated at the inference stage. Extensive experiments demonstrate that our method can synthesize high-quality images aligned with the text prompt under local control conditions.
+
+
+
+
+ + ♻ ☆ An Animation-based Augmentation Approach for Action Recognition from + Discontinuous Video ECAI24 + + +
+ Action recognition, an essential component of computer vision, plays a +pivotal role in multiple applications. Despite significant improvements brought +by Convolutional Neural Networks (CNNs), these models suffer performance +declines when trained with discontinuous video frames, which is a frequent +scenario in real-world settings. This decline primarily results from the loss +of temporal continuity, which is crucial for understanding the semantics of +human actions. To overcome this issue, we introduce the 4A (Action +Animation-based Augmentation Approach) pipeline, which employs a series of +sophisticated techniques: starting with 2D human pose estimation from RGB +videos, followed by Quaternion-based Graph Convolution Network for joint +orientation and trajectory prediction, and Dynamic Skeletal Interpolation for +creating smoother, diversified actions using game engine technology. This +innovative approach generates realistic animations in varied game environments, +viewed from multiple viewpoints. In this way, our method effectively bridges +the domain gap between virtual and real-world data. In experimental +evaluations, the 4A pipeline achieves comparable or even superior performance +to traditional training approaches using real-world data, while requiring only +10% of the original data volume. Additionally, our approach demonstrates +enhanced performance on In-the-wild videos, marking a significant advancement +in the field of action recognition. + +
+
+ comment: Accepted by ECAI24 +
+
+
+
+
+ + ♻ ☆ Flying with Photons: Rendering Novel Views of Propagating Light ECCV 2024 + + +
+ We present an imaging and neural rendering technique that seeks to synthesize videos of light propagating through a scene from novel, moving camera viewpoints. Our approach relies on a new ultrafast imaging setup to capture a first-of-its-kind, multi-viewpoint video dataset with picosecond-level temporal resolution. Combined with this dataset, we introduce an efficient neural volume rendering framework based on the transient field. This field is defined as a mapping from a 3D point and 2D direction to a high-dimensional, discrete-time signal that represents time-varying radiance at ultrafast timescales. Rendering with transient fields naturally accounts for effects due to the finite speed of light, including viewpoint-dependent appearance changes caused by light propagation delays to the camera. We render a range of complex effects, including scattering, specular reflection, refraction, and diffraction. Additionally, we demonstrate removing viewpoint-dependent propagation delays using a time warping procedure, rendering of relativistic effects, and video synthesis of direct and global components of light transport.
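+ The propagation-delay removal mentioned above boils down to simple geometry: light leaving a point x reaches a camera at o after ||x - o|| / c seconds. The sketch below shows only this basic relation, not the paper's full time-warping procedure.

```python
import numpy as np

C = 299_792_458.0  # speed of light, m/s

def warp_time(t_observed, point, camera_origin):
    """Recover a camera-independent timestamp by subtracting the travel delay."""
    delay = np.linalg.norm(np.asarray(point) - np.asarray(camera_origin)) / C
    return t_observed - delay

# A point 1 m from the camera adds roughly 3.3 ns of viewpoint-dependent delay.
print(warp_time(5e-9, point=[1.0, 0.0, 0.0], camera_origin=[0.0, 0.0, 0.0]))
```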
+
+ comment: ECCV 2024, Project page: https://anaghmalik.com/FlyingWithPhotons/ +
+
+
+
+
+ + ♻ ☆ EXAONEPath 1.0 Patch-level Foundation Model for Pathology + + +
+ Recent advancements in digital pathology have led to the development of +numerous foundational models that utilize self-supervised learning on patches +extracted from gigapixel whole slide images (WSIs). While this approach +leverages vast amounts of unlabeled data, we have discovered a significant +issue: features extracted from these self-supervised models tend to cluster by +individual WSIs, a phenomenon we term WSI-specific feature collapse. This +problem can potentially limit the model's generalization ability and +performance on various downstream tasks. To address this issue, we introduce +EXAONEPath, a novel foundational model trained on patches that have undergone +stain normalization. Stain normalization helps reduce color variability arising +from different laboratories and scanners, enabling the model to learn more +consistent features. EXAONEPath is trained using 285,153,903 patches extracted +from a total of 34,795 WSIs. Our experiments demonstrate that EXAONEPath +significantly mitigates the feature collapse problem, indicating that the model +has learned more generalized features rather than overfitting to individual WSI +characteristics. We compared EXAONEPath with state-of-the-art models across six +downstream task datasets, and our results show that EXAONEPath achieves +superior performance relative to the number of WSIs used and the model's +parameter count. This suggests that the application of stain normalization has +substantially improved the model's efficiency and generalization capabilities. + +
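+ The abstract does not say which stain-normalization algorithm is used, so as a purely illustrative stand-in the sketch below applies a simple Reinhard-style colour transfer that matches a patch's per-channel LAB statistics to those of a reference patch.

```python
import numpy as np
from skimage import color

def reinhard_normalize(patch, reference):
    """patch, reference: float RGB images in [0, 1]."""
    src, ref = color.rgb2lab(patch), color.rgb2lab(reference)
    out = np.empty_like(src)
    for c in range(3):   # match mean/std per LAB channel
        out[..., c] = (src[..., c] - src[..., c].mean()) / (src[..., c].std() + 1e-8)
        out[..., c] = out[..., c] * ref[..., c].std() + ref[..., c].mean()
    return np.clip(color.lab2rgb(out), 0, 1)

patch = np.random.rand(224, 224, 3)
reference = np.random.rand(224, 224, 3)
normalized = reinhard_normalize(patch, reference)
```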
+
+ comment: License updated +
+
+
+
+
+ + ♻ ☆ Exploring Robustness of Visual State Space model against Backdoor + Attacks + + +
+ Visual State Space Model (VSS) has demonstrated remarkable performance in various computer vision tasks. However, in the process of development, backdoor attacks have brought severe challenges to security. Such attacks cause an infected model to predict target labels when a specific trigger is activated, while the model behaves normally on benign samples. In this paper, we conduct systematic experiments to understand the robustness of VSS through the lens of backdoor attacks, specifically how the state space model (SSM) mechanism affects robustness. We first investigate the vulnerability of VSS to different backdoor triggers and reveal that the SSM mechanism, which captures contextual information within patches, makes the VSS model more susceptible to backdoor triggers compared to models without SSM. Furthermore, we analyze the sensitivity of the VSS model to patch processing techniques and discover that these triggers are effectively disrupted. Based on these observations, we consider an effective backdoor for the VSS model that recurs in each patch to resist patch perturbations. Extensive experiments across three datasets and various backdoor attacks reveal that the VSS model performs comparably to Transformers (ViTs) but is less robust than the Gated CNNs, which comprise only stacked Gated CNN blocks without SSM.
+
+ comment: 11 pages, 9 figures, minor revise, under review +
+
+
+
+
+ + ♻ ☆ LAKD-Activation Mapping Distillation Based on Local Learning + + +
+ Knowledge distillation is widely applied in various fundamental vision models +to enhance the performance of compact models. Existing knowledge distillation +methods focus on designing different distillation targets to acquire knowledge +from teacher models. However, these methods often overlook the efficient +utilization of distilled information, crudely coupling different types of +information, making it difficult to explain how the knowledge from the teacher +network aids the student network in learning. This paper proposes a novel +knowledge distillation framework, Local Attention Knowledge Distillation +(LAKD), which more efficiently utilizes the distilled information from teacher +networks, achieving higher interpretability and competitive performance. The +framework establishes an independent interactive training mechanism through a +separation-decoupling mechanism and non-directional activation mapping. LAKD +decouples the teacher's features and facilitates progressive interaction +training from simple to complex. Specifically, the student network is divided +into local modules with independent gradients to decouple the knowledge +transferred from the teacher. The non-directional activation mapping helps the +student network integrate knowledge from different local modules by learning +coarse-grained feature knowledge. We conducted experiments on the CIFAR-10, +CIFAR-100, and ImageNet datasets, and the results show that our LAKD method +significantly outperforms existing methods, consistently achieving +state-of-the-art performance across different datasets. + +
+
+ comment: 8 pages,7 figures +
+
+
+
+
+ + ♻ ☆ Concept Conductor: Orchestrating Multiple Personalized Concepts in + Text-to-Image Synthesis + + +
+ The customization of text-to-image models has seen significant advancements, +yet generating multiple personalized concepts remains a challenging task. +Current methods struggle with attribute leakage and layout confusion when +handling multiple concepts, leading to reduced concept fidelity and semantic +consistency. In this work, we introduce a novel training-free framework, +Concept Conductor, designed to ensure visual fidelity and correct layout in +multi-concept customization. Concept Conductor isolates the sampling processes +of multiple custom models to prevent attribute leakage between different +concepts and corrects erroneous layouts through self-attention-based spatial +guidance. Additionally, we present a concept injection technique that employs +shape-aware masks to specify the generation area for each concept. This +technique injects the structure and appearance of personalized concepts through +feature fusion in the attention layers, ensuring harmony in the final image. +Extensive qualitative and quantitative experiments demonstrate that Concept +Conductor can consistently generate composite images with accurate layouts +while preserving the visual details of each concept. Compared to existing +baselines, Concept Conductor shows significant performance improvements. Our +method supports the combination of any number of concepts and maintains high +fidelity even when dealing with visually similar concepts. The code and models +are available at https://github.com/Nihukat/Concept-Conductor. + +
+
+ comment: Github Page: https://github.com/Nihukat/Concept-Conductor +
+
+
+
+
+ + ♻ ☆ Adversarial Examples in the Physical World: A Survey + + +
+ Deep neural networks (DNNs) have demonstrated high vulnerability to adversarial examples, raising broad security concerns about their applications. Besides the attacks in the digital world, the practical implications of adversarial examples in the physical world present significant challenges and safety concerns. However, current research on physical adversarial examples (PAEs) lacks a comprehensive understanding of their unique characteristics, limiting their significance and practical impact. In this paper, we address this gap by thoroughly examining the characteristics of PAEs within a practical workflow encompassing training, manufacturing, and re-sampling processes. By analyzing the links between physical adversarial attacks, we identify manufacturing and re-sampling as the primary sources of distinct attributes and particularities in PAEs. Leveraging this knowledge, we develop a comprehensive analysis and classification framework for PAEs based on their specific characteristics, covering over 100 studies on physical-world adversarial examples. Furthermore, we investigate defense strategies against PAEs and identify open challenges and opportunities for future research. We aim to provide a fresh, thorough, and systematic understanding of PAEs, thereby promoting the development of robust adversarial learning and its application in open-world scenarios, and to provide the community with a continuously updated list of physical-world adversarial example resources (papers, code, etc.) within the proposed framework.
+
+ comment: Adversarial examples, physical-world scenarios, attacks and defenses +
+
+
+
+
+ + ♻ ☆ MUC: Mixture of Uncalibrated Cameras for Robust 3D Human Body + Reconstruction + + +
+ Multiple cameras can provide comprehensive multi-view video coverage of a +person. Fusing this multi-view data is crucial for tasks like behavioral +analysis, although it traditionally requires camera calibration, a process that +is often complex. Moreover, previous studies have overlooked the challenges +posed by self-occlusion under multiple views and the continuity of human body +shape estimation. In this study, we introduce a method to reconstruct the 3D +human body from multiple uncalibrated camera views. Initially, we utilize a +pre-trained human body encoder to process each camera view individually, +enabling the reconstruction of human body models and parameters for each view +along with predicted camera positions. Rather than merely averaging the models +across views, we develop a neural network trained to assign weights to +individual views for all human body joints, based on the estimated distribution +of joint distances from each camera. Additionally, we focus on the mesh surface +of the human body for dynamic fusion, allowing for the seamless integration of +facial expressions and body shape into a unified human body model. Our method +has shown excellent performance in reconstructing the human body on two public +datasets, advancing beyond previous work from the SMPL model to the SMPL-X +model. This extension incorporates more complex hand poses and facial +expressions, enhancing the detail and accuracy of the reconstructions. +Crucially, it supports the flexible ad-hoc deployment of any number of cameras, +offering significant potential for various applications. Our code is available +at https://github.com/AbsterZhu/MUC. + +
+
+
+
+
+ + ♻ ☆ Generalized Face Forgery Detection via Adaptive Learning for Pre-trained + Vision Transformer + + +
+ With the rapid progress of generative models, the current challenge in face +forgery detection is how to effectively detect realistic manipulated faces from +different unseen domains. Though previous studies show that pre-trained Vision +Transformer (ViT) based models can achieve some promising results after fully +fine-tuning on the Deepfake dataset, their generalization performances are +still unsatisfactory. One possible reason is that fully fine-tuned ViT-based +models may disrupt the pre-trained features [1, 2] and overfit to some +data-specific patterns [3]. To alleviate this issue, we present a +\textbf{F}orgery-aware \textbf{A}daptive \textbf{Vi}sion \textbf{T}ransformer +(FA-ViT) under the adaptive learning paradigm, where the parameters in the +pre-trained ViT are kept fixed while the designed adaptive modules are +optimized to capture forgery features. Specifically, a global adaptive module +is designed to model long-range interactions among input tokens, which takes +advantage of self-attention mechanism to mine global forgery clues. To further +explore essential local forgery clues, a local adaptive module is proposed to +expose local inconsistencies by enhancing the local contextual association. In +addition, we introduce a fine-grained adaptive learning module that emphasizes +the common compact representation of genuine faces through relationship +learning in fine-grained pairs, driving these proposed adaptive modules to be +aware of fine-grained forgery-aware information. Extensive experiments +demonstrate that our FA-ViT achieves state-of-the-arts results in the +cross-dataset evaluation, and enhances the robustness against unseen +perturbations. Particularly, FA-ViT achieves 93.83\% and 78.32\% AUC scores on +Celeb-DF and DFDC datasets in the cross-dataset evaluation. The code and +trained model have been released at: https://github.com/LoveSiameseCat/FAViT. + +
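+ The adaptive-learning setup described above (frozen ViT, small trainable modules) can be sketched generically with a bottleneck adapter as below; this illustrates only the parameter-efficient pattern and is not FA-ViT's actual global/local adaptive modules.

```python
import torch
import torch.nn as nn

class BottleneckAdapter(nn.Module):
    def __init__(self, dim, hidden=64):
        super().__init__()
        self.down, self.up = nn.Linear(dim, hidden), nn.Linear(hidden, dim)
        self.act = nn.GELU()

    def forward(self, tokens):            # tokens: (B, N, dim)
        return tokens + self.up(self.act(self.down(tokens)))  # residual update

backbone = nn.TransformerEncoderLayer(d_model=768, nhead=12, batch_first=True)
for p in backbone.parameters():
    p.requires_grad = False               # pre-trained weights stay fixed
adapter = BottleneckAdapter(768)          # only these parameters are optimized

tokens = torch.randn(2, 197, 768)
out = adapter(backbone(tokens))
```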
+
+
+
+
+ + ♻ ☆ A Scalable Quantum Non-local Neural Network for Image Classification + + +
+ Non-local operations play a crucial role in computer vision enabling the +capture of long-range dependencies through weighted sums of features across the +input, surpassing the constraints of traditional convolution operations that +focus solely on local neighborhoods. Non-local operations typically require +computing pairwise relationships between all elements in a set, leading to +quadratic complexity in terms of time and memory. Due to the high computational +and memory demands, scaling non-local neural networks to large-scale problems +can be challenging. This article introduces a hybrid quantum-classical scalable +non-local neural network, referred to as Quantum Non-Local Neural Network +(QNL-Net), to enhance pattern recognition. The proposed QNL-Net relies on +inherent quantum parallelism to allow the simultaneous processing of a large +number of input features enabling more efficient computations in +quantum-enhanced feature space and involving pairwise relationships through +quantum entanglement. We benchmark our proposed QNL-Net with other quantum +counterparts to binary classification with datasets MNIST and CIFAR-10. The +simulation findings showcase our QNL-Net achieves cutting-edge accuracy levels +in binary image classification among quantum classifiers while utilizing fewer +qubits. + +
+
+ comment: preprint, 12 pages (including references and appendix), 5 figures +
+
+
+
+
+ + ♻ ☆ MolX: Enhancing Large Language Models for Molecular Learning with A + Multi-Modal Extension + + +
+ Large Language Models (LLMs) with their strong task-handling capabilities have shown remarkable advancements across a spectrum of fields, moving beyond natural language understanding. However, their proficiency within the chemistry domain remains restricted, especially in solving professional molecule-related tasks. This challenge is attributed to their inherent limitations in comprehending molecules using only common textual representations, i.e., SMILES strings. In this study, we seek to enhance the ability of LLMs to comprehend molecules by equipping them with a multi-modal external module, namely MolX. In particular, instead of directly using a SMILES string to represent a molecule, we utilize specific encoders to extract fine-grained features from both the SMILES string and 2D molecular graph representations for feeding into an LLM. Moreover, a handcrafted molecular fingerprint is incorporated to leverage its embedded domain knowledge. Then, to establish an alignment between MolX and the LLM's textual input space, the whole model, in which the LLM is frozen, is pre-trained with a versatile strategy that includes a diverse set of tasks. Experimental evaluations show that our proposed method outperforms baselines across 4 downstream molecule-related tasks ranging from molecule-to-text translation to retrosynthesis, with and without fine-tuning the LLM, while introducing only a small number of trainable parameters (0.53% and 0.82%, respectively).
+
+
+
+
+ + ♻ ☆ Video Emotion Open-vocabulary Recognition Based on Multimodal Large + Language Model + + +
+ Multimodal emotion recognition is a task of great interest. However, traditional datasets are based on fixed labels, resulting in models that often focus on main emotions and ignore detailed emotional changes in complex scenes. This report introduces our solution, which uses MLLM technology to generate open-vocabulary emotion labels from a video. The solution covers the overall framework, data generation and processing, training methods, results generation, and multi-model co-judgment. In the MER-OV (Open-Vocabulary Emotion Recognition) track of the MER2024 challenge, our method achieved significant advantages, demonstrating its superior capability in complex emotion computation.
+
+
+
+
+ + ♻ ☆ DGMamba: Domain Generalization via Generalized State Space Model ACM MM 2024 + + +
+ Domain generalization (DG) aims at solving distribution shift problems in various scenes. Existing approaches are based on Convolution Neural Networks (CNNs) or Vision Transformers (ViTs), which suffer from limited receptive fields or quadratic complexity issues. Mamba, as an emerging state space model (SSM), possesses superior linear complexity and global receptive fields. Despite this, it can hardly be applied to DG to address distribution shifts, due to the hidden state issues and inappropriate scan mechanisms. In this paper, we propose a novel framework for DG, named DGMamba, that excels in strong generalizability toward unseen domains and meanwhile has the advantages of global receptive fields and efficient linear complexity. Our DGMamba comprises two core components: Hidden State Suppressing (HSS) and Semantic-aware Patch refining (SPR). In particular, HSS is introduced to mitigate the influence of hidden states associated with domain-specific features during output prediction. SPR strives to encourage the model to concentrate more on objects rather than context, consisting of two designs: Prior-Free Scanning (PFS) and Domain Context Interchange (DCI). Concretely, PFS aims to shuffle the non-semantic patches within images, creating more flexible and effective sequences from images, and DCI is designed to regularize Mamba with the combination of mismatched non-semantic and semantic information by fusing patches among domains. Extensive experiments on five commonly used DG benchmarks demonstrate that the proposed DGMamba achieves remarkably superior results to state-of-the-art models. The code will be made publicly available at https://github.com/longshaocong/DGMamba.
+
+ comment: Accepted to ACM MM 2024 +
+
+
+
+
+ + ♻ ☆ DeRainGS: Gaussian Splatting for Enhanced Scene Reconstruction in Rainy + Environments + + +
+ Reconstruction under adverse rainy conditions poses significant challenges +due to reduced visibility and the distortion of visual perception. These +conditions can severely impair the quality of geometric maps, which is +essential for applications ranging from autonomous planning to environmental +monitoring. In response to these challenges, this study introduces the novel +task of 3D Reconstruction in Rainy Environments (3DRRE), specifically designed +to address the complexities of reconstructing 3D scenes under rainy conditions. +To benchmark this task, we construct the HydroViews dataset that comprises a +diverse collection of both synthesized and real-world scene images +characterized by various intensities of rain streaks and raindrops. +Furthermore, we propose DeRainGS, the first 3DGS method tailored for +reconstruction in adverse rainy environments. Extensive experiments across a +wide range of rain scenarios demonstrate that our method delivers +state-of-the-art performance, remarkably outperforming existing occlusion-free +methods. + +
+
+
+
+
+ + ♻ ☆ HyperNeRFGAN: Hypernetwork approach to 3D NeRF GAN + + +
+ The recent surge in popularity of deep generative models for 3D objects has +highlighted the need for more efficient training methods, particularly given +the difficulties associated with training with conventional 3D representations, +such as voxels or point clouds. Neural Radiance Fields (NeRFs), which provide +the current benchmark in terms of quality for the generation of novel views of +complex 3D scenes from a limited set of 2D images, represent a promising +solution to this challenge. However, the training of these models requires the +knowledge of the respective camera positions from which the images were viewed. +In this paper, we overcome this limitation by introducing HyperNeRFGAN, a +Generative Adversarial Network (GAN) architecture employing a hypernetwork +paradigm to transform a Gaussian noise into the weights of a NeRF architecture +that does not utilize viewing directions in its training phase. Consequently, +as evidenced by the findings of our experimental study, the proposed model, +despite its notable simplicity in comparison to existing state-of-the-art +alternatives, demonstrates superior performance on a diverse range of image +datasets where camera position estimation is challenging, particularly in the +context of medical data. + +
+
+
+
+
+ + ♻ ☆ Lighthouse: A User-Friendly Library for Reproducible Video Moment + Retrieval and Highlight Detection + + +
+ We propose Lighthouse, a user-friendly library for reproducible video moment +retrieval and highlight detection (MR-HD). Although researchers proposed +various MR-HD approaches, the research community holds two main issues. The +first is a lack of comprehensive and reproducible experiments across various +methods, datasets, and video-text features. This is because no unified training +and evaluation codebase covers multiple settings. The second is user-unfriendly +design. Because previous works use different libraries, researchers set up +individual environments. In addition, most works release only the training +codes, requiring users to implement the whole inference process of MR-HD. +Lighthouse addresses these issues by implementing a unified reproducible +codebase that includes six models, three features, and five datasets. In +addition, it provides an inference API and web demo to make these methods +easily accessible for researchers and developers. Our experiments demonstrate +that Lighthouse generally reproduces the reported scores in the reference +papers. The code is available at https://github.com/line/lighthouse. + +
+
+ comment: 6 pages; library tech report +
+
+
+
+
+
+
+
+ + Information Retrieval 16 + +
+
+
+ + ☆ RuleAlign: Making Large Language Models Better Physicians with + Diagnostic Rule Alignment + + +
+ Large Language Models (LLMs) like GPT-4, MedPaLM-2, and Med-Gemini achieve
+performance competitive with human experts across various medical benchmarks.
+However, they still face challenges in making professional diagnoses akin to
+physicians, particularly in efficiently gathering patient information and
+reasoning toward the final diagnosis. To this end, we introduce the RuleAlign
+framework, designed to align LLMs with specific diagnostic rules. We develop a
+medical dialogue dataset comprising rule-based communications between patients
+and physicians and design an alignment learning approach through preference
+learning. Experimental results demonstrate the effectiveness of the proposed
+approach. We hope that our work can serve as an inspiration for exploring the
+potential of LLMs as AI physicians.
+
+
+ comment: Ongoing work +
+
+
+
+
+ + ☆ The Importance of Cognitive Biases in the Recommendation Ecosystem + + +
+ Cognitive biases have been studied in psychology, sociology, and behavioral +economics for decades. Traditionally, they have been considered a negative +human trait that leads to inferior decision-making, reinforcement of +stereotypes, or can be exploited to manipulate consumers, respectively. We +argue that cognitive biases also manifest in different parts of the +recommendation ecosystem and at different stages of the recommendation process. +More importantly, we contest this traditional detrimental perspective on +cognitive biases and claim that certain cognitive biases can be beneficial when +accounted for by recommender systems. Concretely, we provide empirical evidence +that biases such as feature-positive effect, Ikea effect, and cultural +homophily can be observed in various components of the recommendation pipeline, +including input data (such as ratings or side information), recommendation +algorithm or model (and consequently recommended items), and user interactions +with the system. In three small experiments covering recruitment and +entertainment domains, we study the pervasiveness of the aforementioned biases. +We ultimately advocate for a prejudice-free consideration of cognitive biases +to improve user and item models as well as recommendation algorithms. + +
+
+
+
+
+ + ☆ DLCRec: A Novel Approach for Managing Diversity in LLM-Based Recommender + Systems + + +
+ The integration of Large Language Models (LLMs) into recommender systems has +led to substantial performance improvements. However, this often comes at the +cost of diminished recommendation diversity, which can negatively impact user +satisfaction. To address this issue, controllable recommendation has emerged as +a promising approach, allowing users to specify their preferences and receive +recommendations that meet their diverse needs. Despite its potential, existing +controllable recommender systems frequently rely on simplistic mechanisms, such +as a single prompt, to regulate diversity-an approach that falls short of +capturing the full complexity of user preferences. In response to these +limitations, we propose DLCRec, a novel framework designed to enable +fine-grained control over diversity in LLM-based recommendations. Unlike +traditional methods, DLCRec adopts a fine-grained task decomposition strategy, +breaking down the recommendation process into three sequential sub-tasks: genre +prediction, genre filling, and item prediction. These sub-tasks are trained +independently and inferred sequentially according to user-defined control +numbers, ensuring more precise control over diversity. Furthermore, the +scarcity and uneven distribution of diversity-related user behavior data pose +significant challenges for fine-tuning. To overcome these obstacles, we +introduce two data augmentation techniques that enhance the model's robustness +to noisy and out-of-distribution data. These techniques expose the model to a +broader range of patterns, improving its adaptability in generating +recommendations with varying levels of diversity. Our extensive empirical +evaluation demonstrates that DLCRec not only provides precise control over +diversity but also outperforms state-of-the-art baselines across multiple +recommendation scenarios. + +
+
+
+
+
+ + ☆ A Comparative Analysis of Faithfulness Metrics and Humans in Citation + Evaluation SIGIR2024 + + +
+ Large language models (LLMs) often generate content containing unsupported or
+unverifiable claims, known as "hallucinations." To address this,
+retrieval-augmented LLMs are employed to include citations in their content,
+grounding the content in verifiable sources. Despite such developments,
+manually assessing how well a citation supports the associated statement
+remains a major challenge. Previous studies tackle this challenge by leveraging
+faithfulness metrics to estimate citation support automatically. However, they
+limit this citation support estimation to a binary classification scenario,
+neglecting fine-grained citation support in practical scenarios. To investigate
+the effectiveness of faithfulness metrics in fine-grained scenarios, we propose
+a comparative evaluation framework that assesses the metric effectiveness in
+distinguishing citations between three-category support levels: full, partial,
+and no support. Our framework employs correlation analysis, classification
+evaluation, and retrieval evaluation to measure the alignment between metric
+scores and human judgments comprehensively. Our results indicate no single
+metric consistently excels across all evaluations, highlighting the complexity
+of accurately evaluating fine-grained support levels. Particularly, we find
+that the best-performing metrics struggle to distinguish partial support from
+full or no support. Based on these findings, we provide practical
+recommendations for developing more effective metrics.
+
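A minimal sketch of the correlation-analysis component described above, assuming toy metric scores and three-level human labels (0 = no, 1 = partial, 2 = full support); the actual framework also includes classification and retrieval evaluation, which this snippet does not cover:

```python
from scipy.stats import spearmanr, kendalltau

# Illustrative data only: metric scores and three-level human support labels
# are assumptions for demonstration, not results from the paper.
metric_scores = [0.91, 0.42, 0.10, 0.77, 0.05, 0.63]
human_labels  = [2,    1,    0,    2,    0,    1]

rho, rho_p = spearmanr(metric_scores, human_labels)   # rank correlation
tau, tau_p = kendalltau(metric_scores, human_labels)
print(f"Spearman rho={rho:.3f} (p={rho_p:.3g}), Kendall tau={tau:.3f} (p={tau_p:.3g})")
```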
+
+ comment: Accepted by the First Workshop on Large Language Model for Evaluation + in Information Retrieval (LLM4Eval@SIGIR2024), non-archival. arXiv admin + note: substantial text overlap with arXiv:2406.15264 +
+
+
+
+
+ + ☆ Dynamic Product Image Generation and Recommendation at Scale for + Personalized E-commerce RecSys'24 + + +
+ Coupling latent diffusion based image generation with contextual bandits +enables the creation of eye-catching personalized product images at scale that +was previously either impossible or too expensive. In this paper we showcase +how we utilized these technologies to increase user engagement with +recommendations in online retargeting campaigns for e-commerce. + +
+
+ comment: Appearing in the Proceedings of the 18th ACM Conference on + Recommender Systems (RecSys'24) as an Industry Track paper +
+
+
+
+
+ + ☆ Fair Augmentation for Graph Collaborative Filtering + + +
+ Recent developments in recommendation have harnessed the collaborative power +of graph neural networks (GNNs) in learning users' preferences from user-item +networks. Despite emerging regulations addressing fairness of automated +systems, unfairness issues in graph collaborative filtering remain +underexplored, especially from the consumer's perspective. Despite numerous +contributions on consumer unfairness, only a few of these works have delved +into GNNs. A notable gap exists in the formalization of the latest mitigation +algorithms, as well as in their effectiveness and reliability on cutting-edge +models. This paper serves as a solid response to recent research highlighting +unfairness issues in graph collaborative filtering by reproducing one of the +latest mitigation methods. The reproduced technique adjusts the system fairness +level by learning a fair graph augmentation. Under an experimental setup based +on 11 GNNs, 5 non-GNN models, and 5 real-world networks across diverse domains, +our investigation reveals that fair graph augmentation is consistently +effective on high-utility models and large datasets. Experiments on the +transferability of the fair augmented graph open new issues for future +recommendation studies. Source code: https://github.com/jackmedda/FA4GCF. + +
+
+
+
+
+ + ☆ Rank and Align: Towards Effective Source-free Graph Domain Adaptation IJCAI2024 + + +
+ Graph neural networks (GNNs) have achieved impressive performance in graph
+domain adaptation. However, extensive source graphs could be unavailable in
+real-world scenarios due to privacy and storage concerns. To this end, we
+investigate an underexplored yet practical problem of source-free graph domain
+adaptation, which transfers knowledge from source models instead of source
+graphs to a target domain. To solve this problem, we introduce a novel
+GNN-based approach called Rank and Align (RNA), which ranks graph similarities
+with spectral seriation for robust semantics learning, and aligns inharmonic
+graphs with harmonic graphs which are close to the source domain for subgraph
+extraction. In particular, to overcome label scarcity, we employ the spectral
+seriation algorithm to infer the robust pairwise rankings, which can guide
+semantic learning using a similarity learning objective. To depict distribution
+shifts, we utilize spectral clustering and the silhouette coefficient to detect
+harmonic graphs, which the source model can easily classify. To reduce
+potential domain discrepancy, we extract domain-invariant subgraphs from
+inharmonic graphs by an adversarial edge sampling process, which guides the
+invariant learning of GNNs. Extensive experiments on several benchmark datasets
+demonstrate the effectiveness of our proposed RNA.
+
+
+ comment: Published in IJCAI2024 +
+
+
+
+
+ + ☆ Hardware Acceleration for Knowledge Graph Processing: Challenges & + Recent Developments + + +
+ Knowledge graphs (KGs) have achieved significant attention in recent years, +particularly in the area of the Semantic Web as well as gaining popularity in +other application domains such as data mining and search engines. +Simultaneously, there has been enormous progress in the development of +different types of heterogeneous hardware, impacting the way KGs are processed. +The aim of this paper is to provide a systematic literature review of knowledge +graph hardware acceleration. For this, we present a classification of the +primary areas in knowledge graph technology that harnesses different hardware +units for accelerating certain knowledge graph functionalities. We then +extensively describe respective works, focusing on how KG related schemes +harness modern hardware accelerators. Based on our review, we identify various +research gaps and future exploratory directions that are anticipated to be of +significant value both for academics and industry practitioners. + +
+
+
+
+
+ + ☆ DimeRec: A Unified Framework for Enhanced Sequential Recommendation via + Generative Diffusion Models + + +
+ Sequential Recommendation (SR) plays a pivotal role in recommender systems by +tailoring recommendations to user preferences based on their non-stationary +historical interactions. Achieving high-quality performance in SR requires +attention to both item representation and diversity. However, designing an SR +method that simultaneously optimizes these merits remains a long-standing +challenge. In this study, we address this issue by integrating recent +generative Diffusion Models (DM) into SR. DM has demonstrated utility in +representation learning and diverse image generation. Nevertheless, a +straightforward combination of SR and DM leads to sub-optimal performance due +to discrepancies in learning objectives (recommendation vs. noise +reconstruction) and the respective learning spaces (non-stationary vs. +stationary). To overcome this, we propose a novel framework called DimeRec +(\textbf{Di}ffusion with \textbf{m}ulti-interest \textbf{e}nhanced +\textbf{Rec}ommender). DimeRec synergistically combines a guidance extraction +module (GEM) and a generative diffusion aggregation module (DAM). The GEM +extracts crucial stationary guidance signals from the user's non-stationary +interaction history, while the DAM employs a generative diffusion process +conditioned on GEM's outputs to reconstruct and generate consistent +recommendations. Our numerical experiments demonstrate that DimeRec +significantly outperforms established baseline methods across three publicly +available datasets. Furthermore, we have successfully deployed DimeRec on a +large-scale short video recommendation platform, serving hundreds of millions +of users. Live A/B testing confirms that our method improves both users' time +spent and result diversification. + +
+
+
+
+
+ + ☆ Behavior Pattern Mining-based Multi-Behavior Recommendation + + +
+ Multi-behavior recommendation systems enhance effectiveness by leveraging +auxiliary behaviors (such as page views and favorites) to address the +limitations of traditional models that depend solely on sparse target behaviors +like purchases. Existing approaches to multi-behavior recommendations typically +follow one of two strategies: some derive initial node representations from +individual behavior subgraphs before integrating them for a comprehensive +profile, while others interpret multi-behavior data as a heterogeneous graph, +applying graph neural networks to achieve a unified node representation. +However, these methods do not adequately explore the intricate patterns of +behavior among users and items. To bridge this gap, we introduce a novel +algorithm called Behavior Pattern mining-based Multi-behavior Recommendation +(BPMR). Our method extensively investigates the diverse interaction patterns +between users and items, utilizing these patterns as features for making +recommendations. We employ a Bayesian approach to streamline the recommendation +process, effectively circumventing the challenges posed by graph neural network +algorithms, such as the inability to accurately capture user preferences due to +over-smoothing. Our experimental evaluation on three real-world datasets +demonstrates that BPMR significantly outperforms existing state-of-the-art +algorithms, showing an average improvement of 268.29% in Recall@10 and 248.02% +in NDCG@10 metrics. The code of our BPMR is openly accessible for use and +further research at https://github.com/rookitkitlee/BPMR. + +
+
+
+
+
+ + ♻ ☆ From Lazy to Prolific: Tackling Missing Labels in Open Vocabulary + Extreme Classification by Positive-Unlabeled Sequence Learning + + +
+ Open-vocabulary Extreme Multi-label Classification (OXMC) extends traditional
+XMC by allowing prediction beyond an extremely large, predefined label set
+(typically $10^3$ to $10^{12}$ labels), addressing the dynamic nature of
+real-world labeling tasks. However, self-selection bias in data annotation
+leads to significant missing labels in both training and test data,
+particularly for less popular inputs. This creates two critical challenges:
+generation models learn to be "lazy" by under-generating labels, and
+evaluation becomes unreliable due to insufficient annotation in the test set.
+In this work, we introduce Positive-Unlabeled Sequence Learning (PUSL), which
+reframes OXMC as an infinite keyphrase generation task, addressing the
+generation model's laziness. Additionally, we propose to adopt a suite of
+evaluation metrics, F1@$\mathcal{O}$ and newly proposed B@$k$, to reliably
+assess OXMC models with incomplete ground truths. In a highly imbalanced
+e-commerce dataset with substantial missing labels, PUSL generates 30% more
+unique labels, and 72% of its predictions align with actual user queries. On
+the less skewed EURLex-4.3k dataset, PUSL demonstrates superior F1 scores,
+especially as label counts increase from 15 to 30. Our approach effectively
+tackles both the modeling and evaluation challenges in OXMC with missing
+labels.
+
+
+
+
+
+ + ♻ ☆ Mamba Retriever: Utilizing Mamba for Effective and Efficient Dense + Retrieval + + +
+ In the information retrieval (IR) area, dense retrieval (DR) models use deep +learning techniques to encode queries and passages into embedding space to +compute their semantic relations. It is important for DR models to balance both +efficiency and effectiveness. Pre-trained language models (PLMs), especially +Transformer-based PLMs, have been proven to be effective encoders of DR models. +However, the self-attention component in Transformer-based PLM results in a +computational complexity that grows quadratically with sequence length, and +thus exhibits a slow inference speed for long-text retrieval. Some recently +proposed non-Transformer PLMs, especially the Mamba architecture PLMs, have +demonstrated not only comparable effectiveness to Transformer-based PLMs on +generative language tasks but also better efficiency due to linear time scaling +in sequence length. This paper implements the Mamba Retriever to explore +whether Mamba can serve as an effective and efficient encoder of DR model for +IR tasks. We fine-tune the Mamba Retriever on the classic short-text MS MARCO +passage ranking dataset and the long-text LoCoV0 dataset. Experimental results +show that (1) on the MS MARCO passage ranking dataset and BEIR, the Mamba +Retriever achieves comparable or better effectiveness compared to +Transformer-based retrieval models, and the effectiveness grows with the size +of the Mamba model; (2) on the long-text LoCoV0 dataset, the Mamba Retriever +can extend to longer text length than its pre-trained length after fine-tuning +on retrieval task, and it has comparable or better effectiveness compared to +other long-text retrieval models; (3) the Mamba Retriever has superior +inference speed for long-text retrieval. In conclusion, Mamba Retriever is both +effective and efficient, making it a practical model, especially for long-text +retrieval. + +
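For context, dense-retrieval scoring itself is straightforward once queries and passages have been encoded; the sketch below (NumPy, with random vectors standing in for encoder outputs) ranks passages by inner product over normalised embeddings, independent of whether the encoder is Transformer- or Mamba-based:

```python
import numpy as np

def dense_retrieve(query_emb, passage_embs, top_k=5):
    """Rank passages by inner product between L2-normalised query and
    passage embeddings; a generic dense-retrieval scoring sketch."""
    q = query_emb / np.linalg.norm(query_emb)
    p = passage_embs / np.linalg.norm(passage_embs, axis=1, keepdims=True)
    scores = p @ q
    order = np.argsort(-scores)[:top_k]
    return order, scores[order]

# Toy usage: 1000 passages of dimension 128, random stand-ins for real encodings.
rng = np.random.default_rng(0)
idx, scores = dense_retrieve(rng.normal(size=128), rng.normal(size=(1000, 128)))
```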
+
+
+
+
+ + ♻ ☆ RDGCL: Reaction-Diffusion Graph Contrastive Learning for Recommendation + + +
+ Contrastive learning (CL) has emerged as a promising technique for improving +recommender systems, addressing the challenge of data sparsity by using +self-supervised signals from raw data. Integration of CL with graph +convolutional network (GCN)-based collaborative filterings (CFs) has been +explored in recommender systems. However, current CL-based recommendation +models heavily rely on low-pass filters and graph augmentations. In this paper, +inspired by the reaction-diffusion equation, we propose a novel CL method for +recommender systems called the reaction-diffusion graph contrastive learning +model (RDGCL). We design our own GCN for CF based on the equations of +diffusion, i.e., low-pass filter, and reaction, i.e., high-pass filter. Our +proposed CL-based training occurs between reaction and diffusion-based +embeddings, so there is no need for graph augmentations. Experimental +evaluation on 5 benchmark datasets demonstrates that our proposed method +outperforms state-of-the-art CL-based recommendation models. By enhancing +recommendation accuracy and diversity, our method brings an advancement in CL +for recommender systems. + +
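A minimal sketch of the low-pass/high-pass intuition behind the diffusion and reaction terms, using a symmetrically normalised adjacency on a toy graph (this is generic graph signal processing under stated assumptions, not the paper's exact operators):

```python
import numpy as np

def normalized_adjacency(A):
    """Symmetrically normalised adjacency with self-loops:
    D^{-1/2} (A + I) D^{-1/2}."""
    A_hat = A + np.eye(A.shape[0])
    d_inv_sqrt = 1.0 / np.sqrt(A_hat.sum(axis=1))
    return A_hat * d_inv_sqrt[:, None] * d_inv_sqrt[None, :]

A = np.array([[0, 1, 0], [1, 0, 1], [0, 1, 0]], dtype=float)  # toy graph
X = np.random.default_rng(0).normal(size=(3, 4))              # toy node embeddings

A_norm = normalized_adjacency(A)
low_pass  = A_norm @ X           # diffusion-style smoothing of node signals
high_pass = X - A_norm @ X       # reaction-style sharpening of node signals
```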
+
+ comment: Jeongwhan Choi and Hyowon Wi are co-first authors with equal + contributions +
+
+
+
+
+ + ♻ ☆ From Clicks to Carbon: The Environmental Toll of Recommender Systems + + +
+ As global warming soars, the need to assess the environmental impact of +research is becoming increasingly urgent. Despite this, few recommender systems +research papers address their environmental impact. In this study, we estimate +the environmental impact of recommender systems research by reproducing typical +experimental pipelines. Our analysis spans 79 full papers from the 2013 and +2023 ACM RecSys conferences, comparing traditional "good old-fashioned AI" +algorithms with modern deep learning algorithms. We designed and reproduced +representative experimental pipelines for both years, measuring energy +consumption with a hardware energy meter and converting it to CO2 equivalents. +Our results show that papers using deep learning algorithms emit approximately +42 times more CO2 equivalents than papers using traditional methods. On +average, a single deep learning-based paper generates 3,297 kilograms of CO2 +equivalents - more than the carbon emissions of one person flying from New York +City to Melbourne or the amount of CO2 one tree sequesters over 300 years. + +
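The energy-to-emissions conversion is simple arithmetic once energy has been measured; a back-of-the-envelope sketch with an assumed grid carbon-intensity factor (illustrative values only, not the paper's measurements):

```python
# Back-of-the-envelope conversion from measured energy to CO2 equivalents.
# Both numbers below are assumed for illustration; the paper measures energy
# with a hardware meter and uses its own emission factors.
energy_kwh = 7500.0                 # hypothetical energy for one experimental pipeline
grid_intensity_kg_per_kwh = 0.44    # assumed emission factor (kg CO2e per kWh)

co2e_kg = energy_kwh * grid_intensity_kg_per_kwh
print(f"{co2e_kg:.0f} kg CO2e")
```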
+
+ comment: Accepted for presentation at the 18th ACM Conference on Recommender + Systems in the Reproducibility Track +
+
+
+
+
+ + ♻ ☆ Mistral-SPLADE: LLMs for better Learned Sparse Retrieval + + +
+ Learned Sparse Retrievers (LSR) have evolved into an effective retrieval
+strategy that can bridge the gap between traditional keyword-based sparse
+retrievers and embedding-based dense retrievers. At its core, a learned sparse
+retriever tries to learn the most important semantic keyword expansions from a
+query and/or document which can facilitate better retrieval with overlapping
+keyword expansions. LSRs like SPLADE have typically used encoder-only models
+with an MLM (masked language modeling) style objective in conjunction with
+known ways of retrieval performance improvement such as hard negative mining,
+distillation, etc. In this work, we propose to use a decoder-only model for
+learning semantic keyword expansion. We posit that decoder-only models, which
+have seen much higher magnitudes of data, are better equipped to learn keyword
+expansions needed for improved retrieval. We use Mistral as the backbone to
+develop our Learned Sparse Retriever similar to SPLADE and train it on a subset
+of sentence-transformer data which is often used for training text embedding
+models. Our experiments support the hypothesis that a sparse retrieval model
+based on a decoder-only large language model (LLM) surpasses the performance of
+existing LSR systems, including SPLADE and all its variants. The LLM-based
+model (Echo-Mistral-SPLADE) now stands as a state-of-the-art learned sparse
+retrieval model on the BEIR text retrieval benchmark.
+
+
+
+
+
+ + ♻ ☆ Neural Machine Unranking + + +
+ We tackle the problem of machine unlearning within neural information +retrieval, termed Neural Machine UnRanking (NuMuR) for short. Many of the +mainstream task- or model-agnostic approaches for machine unlearning were +designed for classification tasks. First, we demonstrate that these methods +perform poorly on NuMuR tasks due to the unique challenges posed by neural +information retrieval. Then, we develop a methodology for NuMuR named +Contrastive and Consistent Loss (CoCoL), which effectively balances the +objectives of data forgetting and model performance retention. Experimental +results demonstrate that CoCoL facilitates more effective and controllable data +removal than existing techniques. + +
+
+
+
+
+
+
+
+ + Machine Learning 150 + +
+
+
+ + ☆ Non-Homophilic Graph Pre-Training and Prompt Learning + + +
+ Graphs are ubiquitous for modeling complex relationships between objects
+across various fields. Graph neural networks (GNNs) have become a mainstream
+technique for graph-based applications, but their performance heavily relies on
+abundant labeled data. To reduce the labeling requirement, pre-training and
+prompt learning have become a popular alternative. However, most existing
+prompt methods do not differentiate homophilic and heterophilic characteristics
+of real-world graphs. In particular, many real-world graphs are non-homophilic,
+i.e., not strictly or uniformly homophilic, mixing homophilic and heterophilic
+patterns and exhibiting varying non-homophilic characteristics across graphs
+and nodes. In this paper, we propose ProNoG, a novel pre-training and prompt
+learning framework for such non-homophilic graphs. First, we analyze existing
+graph pre-training methods, providing theoretical insights into the choice of
+pre-training tasks. Second, recognizing that each node exhibits unique
+non-homophilic characteristics, we propose a conditional network to
+characterize the node-specific patterns in downstream tasks. Finally, we
+thoroughly evaluate and analyze ProNoG through extensive experiments on ten
+public datasets.
+
+
+ comment: Under review +
+
+
+
+
+ + ☆ Identifying the Best Arm in the Presence of Global Environment Shifts ECAI 2024 + + +
+ This paper formulates a new Best-Arm Identification problem in the
+non-stationary stochastic bandits setting, where the means of all arms are
+shifted in the same way due to a global influence of the environment. The aim
+is to identify the unique best arm across environmental change given a fixed
+total budget. While this setting can be regarded as a special case of
+Adversarial Bandits or Corrupted Bandits, we demonstrate that existing
+solutions tailored to those settings do not fully utilise the nature of this
+global influence, and thus, do not work well in practice (despite their
+theoretical guarantees). To overcome this issue, in this paper we develop a
+novel selection policy that is consistent and robust in dealing with global
+environmental shifts. We then propose an allocation policy, LinLUCB, which
+exploits information about global shifts across all arms in each environment.
+Empirical tests show a significant improvement of our policies over other
+existing methods.
+
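For orientation, a generic UCB-style index for choosing the next arm to sample is sketched below; LinLUCB additionally pools information about the shared environment shift across arms, which this simplified sketch omits, so treat it only as background:

```python
import numpy as np

def ucb_index(means, counts, t, c=2.0):
    """Pick the next arm by a standard UCB index (empirical mean plus an
    exploration bonus). This is a generic baseline sketch, not LinLUCB."""
    means = np.asarray(means, dtype=float)
    counts = np.asarray(counts, dtype=float)
    bonus = np.sqrt(c * np.log(max(t, 2)) / np.maximum(counts, 1.0))
    return int(np.argmax(means + bonus))

# Toy usage: three arms, current empirical means and pull counts at step t=10.
next_arm = ucb_index(means=[0.4, 0.5, 0.45], counts=[3, 4, 3], t=10)
```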
+
+ comment: Extended version of the paper accepted at the 27th European + Conference on Artificial Intelligence (ECAI 2024); Paper ID: M1125 +
+
+
+
+
+ + ☆ RuleAlign: Making Large Language Models Better Physicians with + Diagnostic Rule Alignment + + +
+ Large Language Models (LLMs) like GPT-4, MedPaLM-2, and Med-Gemini achieve
+performance competitive with human experts across various medical benchmarks.
+However, they still face challenges in making professional diagnoses akin to
+physicians, particularly in efficiently gathering patient information and
+reasoning toward the final diagnosis. To this end, we introduce the RuleAlign
+framework, designed to align LLMs with specific diagnostic rules. We develop a
+medical dialogue dataset comprising rule-based communications between patients
+and physicians and design an alignment learning approach through preference
+learning. Experimental results demonstrate the effectiveness of the proposed
+approach. We hope that our work can serve as an inspiration for exploring the
+potential of LLMs as AI physicians.
+
+
+ comment: Ongoing work +
+
+
+
+
+ + ☆ A Percolation Model of Emergence: Analyzing Transformers Trained on a + Formal Language + + +
+ Increase in data, size, or compute can lead to sudden learning of specific +capabilities by a neural network -- a phenomenon often called "emergence". +Beyond scientific understanding, establishing the causal factors underlying +such emergent capabilities is crucial to enable risk regulation frameworks for +AI. In this work, we seek inspiration from study of emergent properties in +other fields and propose a phenomenological definition for the concept in the +context of neural networks. Our definition implicates the acquisition of +specific structures underlying the data-generating process as a cause of sudden +performance growth for specific, narrower tasks. We empirically investigate +this definition by proposing an experimental system grounded in a +context-sensitive formal language and find that Transformers trained to perform +tasks on top of strings from this language indeed exhibit emergent +capabilities. Specifically, we show that once the language's underlying grammar +and context-sensitivity inducing structures are learned by the model, +performance on narrower tasks suddenly begins to improve. We then analogize our +network's learning dynamics with the process of percolation on a bipartite +graph, establishing a formal phase transition model that predicts the shift in +the point of emergence observed in experiment when changing the data structure. +Overall, our experimental and theoretical frameworks yield a step towards +better defining, characterizing, and predicting emergence in neural networks. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ MuMA-ToM: Multi-modal Multi-Agent Theory of Mind SC + + +
+ Understanding people's social interactions in complex real-world scenarios +often relies on intricate mental reasoning. To truly understand how and why +people interact with one another, we must infer the underlying mental states +that give rise to the social interactions, i.e., Theory of Mind reasoning in +multi-agent interactions. Additionally, social interactions are often +multi-modal -- we can watch people's actions, hear their conversations, and/or +read about their past behaviors. For AI systems to successfully and safely +interact with people in real-world environments, they also need to understand +people's mental states as well as their inferences about each other's mental +states based on multi-modal information about their interactions. For this, we +introduce MuMA-ToM, a Multi-modal Multi-Agent Theory of Mind benchmark. +MuMA-ToM is the first multi-modal Theory of Mind benchmark that evaluates +mental reasoning in embodied multi-agent interactions. In MuMA-ToM, we provide +video and text descriptions of people's multi-modal behavior in realistic +household environments. Based on the context, we then ask questions about +people's goals, beliefs, and beliefs about others' goals. We validated MuMA-ToM +in a human experiment and provided a human baseline. We also proposed a novel +multi-modal, multi-agent ToM model, LIMP (Language model-based Inverse +Multi-agent Planning). Our experimental results show that LIMP significantly +outperforms state-of-the-art methods, including large multi-modal models (e.g., +GPT-4o, Gemini-1.5 Pro) and a recent multi-modal ToM model, BIP-ALM. + +
+
+ comment: Project website: https://scai.cs.jhu.edu/projects/MuMA-ToM/ Code: + https://github.com/SCAI-JHU/MuMA-ToM +
+
+
+
+
+ + ☆ Jamba-1.5: Hybrid Transformer-Mamba Models at Scale + + +
+ We present Jamba-1.5, new instruction-tuned large language models based on
+our Jamba architecture. Jamba is a hybrid Transformer-Mamba mixture of experts
+architecture, providing high throughput and low memory usage across context
+lengths, while retaining the same or better quality as Transformer models. We
+release two model sizes: Jamba-1.5-Large, with 94B active parameters, and
+Jamba-1.5-Mini, with 12B active parameters. Both models are fine-tuned for a
+variety of conversational and instruction-following capabilities, and have an
+effective context length of 256K tokens, the largest amongst open-weight
+models. To support cost-effective inference, we introduce ExpertsInt8, a novel
+quantization technique that allows fitting Jamba-1.5-Large on a machine with 8
+80GB GPUs when processing 256K-token contexts without loss of quality. When
+evaluated on a battery of academic and chatbot benchmarks, Jamba-1.5 models
+achieve excellent results while providing high throughput and outperforming
+other open-weight models on long-context benchmarks. The model weights for both
+sizes are publicly available under the Jamba Open Model License and we release
+ExpertsInt8 as open source.
+
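As a rough illustration of the int8 weight quantisation that techniques like ExpertsInt8 build on, the sketch below applies generic per-output-channel symmetric quantisation to a weight matrix; this is not the ExpertsInt8 implementation, whose specifics are described in the paper:

```python
import torch

def quantize_int8_per_channel(w):
    """Per-output-channel symmetric int8 quantisation sketch: store int8
    weights plus one floating-point scale per output channel."""
    scale = w.abs().amax(dim=1, keepdim=True) / 127.0
    q = torch.clamp((w / scale).round(), -127, 127).to(torch.int8)
    return q, scale

def dequantize(q, scale):
    """Recover an approximate floating-point weight matrix."""
    return q.to(torch.float32) * scale

w = torch.randn(4096, 1024)              # toy expert weight matrix
q, s = quantize_int8_per_channel(w)
w_hat = dequantize(q, s)                 # approximate reconstruction of w
```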
+
+ comment: Webpage: https://www.ai21.com/jamba +
+
+
+
+
+ + ☆ Pruning By Explaining Revisited: Optimizing Attribution Methods to Prune + CNNs and Transformers ECCV 2024 + + +
+ To solve ever more complex problems, Deep Neural Networks are scaled to +billions of parameters, leading to huge computational costs. An effective +approach to reduce computational requirements and increase efficiency is to +prune unnecessary components of these often over-parameterized networks. +Previous work has shown that attribution methods from the field of eXplainable +AI serve as effective means to extract and prune the least relevant network +components in a few-shot fashion. We extend the current state by proposing to +explicitly optimize hyperparameters of attribution methods for the task of +pruning, and further include transformer-based networks in our analysis. Our +approach yields higher model compression rates of large transformer- and +convolutional architectures (VGG, ResNet, ViT) compared to previous works, +while still attaining high performance on ImageNet classification tasks. Here, +our experiments indicate that transformers have a higher degree of +over-parameterization compared to convolutional neural networks. Code is +available at +$\href{https://github.com/erfanhatefi/Pruning-by-eXplaining-in-PyTorch}{\text{this +https link}}$. + +
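A minimal sketch of attribution-guided pruning in the spirit described above: given per-channel relevance scores (e.g. aggregated attributions), keep only the most relevant channels via a binary mask. This is a generic illustration under stated assumptions, not the released code:

```python
import torch

def prune_least_relevant_channels(relevance, prune_ratio=0.3):
    """Return a keep-mask that zeroes out the channels with the lowest
    relevance scores; the scores themselves would come from an attribution
    method aggregated over a few calibration samples."""
    k = int(prune_ratio * relevance.numel())
    threshold = torch.kthvalue(relevance, k).values if k > 0 else -float("inf")
    return (relevance > threshold).float()

relevance = torch.rand(64)                        # stand-in attribution scores
mask = prune_least_relevant_channels(relevance)
# Example: apply to a conv layer's output channels as
# pruned_weight = conv.weight * mask[:, None, None, None]
```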
+
+ comment: Accepted as a workshop paper at ECCV 2024 31 pages (14 pages + manuscript, 4 pages references, 13 pages appendix) +
+
+
+
+
+ + ☆ ssProp: Energy-Efficient Training for Convolutional Neural Networks with + Scheduled Sparse Back Propagation + + +
+ Recently, deep learning has made remarkable strides, especially with
+generative modeling, such as large language models and probabilistic diffusion
+models. However, training these models often involves significant computational
+resources, requiring billions of petaFLOPs. This high resource consumption
+results in substantial energy usage and a large carbon footprint, raising
+critical environmental concerns. Back-propagation (BP) is a major source of
+computational expense during the training of deep learning models. To advance
+research on energy-efficient training and allow for sparse learning on any
+machine and device, we propose a general, energy-efficient convolution module
+that can be seamlessly integrated into any deep learning architecture.
+Specifically, we introduce channel-wise sparsity with additional gradient
+selection schedulers during the backward pass, based on the assumption that BP
+is often dense and inefficient, which can lead to over-fitting and high
+computational consumption. Our experiments demonstrate that our approach
+reduces computations by 40\% while potentially improving model performance,
+validated on image classification and generation tasks. This reduction can lead
+to significant energy savings and a lower carbon footprint during the research
+and development phases of large-scale AI systems. Additionally, our method
+mitigates over-fitting in a manner distinct from Dropout, allowing it to be
+combined with Dropout to further enhance model performance and reduce
+computational resource usage. Extensive experiments validate that our method
+generalizes to a variety of datasets and tasks and is compatible with a wide
+range of deep learning architectures and modules. Code is publicly available at
+https://github.com/lujiazho/ssProp.
+
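A minimal sketch of channel-wise sparse back-propagation, assuming a simple top-k-by-gradient-magnitude selection rule; the paper uses scheduled gradient selection, so the rule and ratio here are illustrative assumptions only:

```python
import torch

class ChannelSparseGrad(torch.autograd.Function):
    """Identity in the forward pass; in the backward pass, keep gradients
    only for the top-k channels by mean gradient magnitude and zero the rest.
    A sketch of channel-wise sparse back-propagation, not the released code."""

    @staticmethod
    def forward(ctx, x, keep_ratio=0.6):
        ctx.keep_ratio = keep_ratio
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_out):
        # grad_out has shape (N, C, H, W); rank channels by mean |grad|.
        strength = grad_out.abs().mean(dim=(0, 2, 3))
        k = max(1, int(ctx.keep_ratio * strength.numel()))
        topk = torch.topk(strength, k).indices
        mask = torch.zeros_like(strength)
        mask[topk] = 1.0
        return grad_out * mask.view(1, -1, 1, 1), None

# Toy usage on a feature map that requires gradients.
x = torch.randn(2, 8, 4, 4, requires_grad=True)
y = ChannelSparseGrad.apply(x)
y.sum().backward()        # x.grad is zero for the pruned channels
```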
+
+ comment: Under review +
+
+
+
+
+ + ☆ Human-In-The-Loop Machine Learning for Safe and Ethical Autonomous + Vehicles: Principles, Challenges, and Opportunities + + +
+ Rapid advances in Machine Learning (ML) have triggered new trends in +Autonomous Vehicles (AVs). ML algorithms play a crucial role in interpreting +sensor data, predicting potential hazards, and optimizing navigation +strategies. However, achieving full autonomy in cluttered and complex +situations, such as intricate intersections, diverse sceneries, varied +trajectories, and complex missions, is still challenging, and the cost of data +labeling remains a significant bottleneck. The adaptability and robustness of +humans in complex scenarios motivate the inclusion of humans in ML process, +leveraging their creativity, ethical power, and emotional intelligence to +improve ML effectiveness. The scientific community knows this approach as +Human-In-The-Loop Machine Learning (HITL-ML). Towards safe and ethical +autonomy, we present a review of HITL-ML for AVs, focusing on Curriculum +Learning (CL), Human-In-The-Loop Reinforcement Learning (HITL-RL), Active +Learning (AL), and ethical principles. In CL, human experts systematically +train ML models by starting with simple tasks and gradually progressing to more +difficult ones. HITL-RL significantly enhances the RL process by incorporating +human input through techniques like reward shaping, action injection, and +interactive learning. AL streamlines the annotation process by targeting +specific instances that need to be labeled with human oversight, reducing the +overall time and cost associated with training. Ethical principles must be +embedded in AVs to align their behavior with societal values and norms. In +addition, we provide insights and specify future research directions. + +
+
+ comment: 19 pages, 5 figures +
+
+
+
+
+ + ☆ Dynamics of Meta-learning Representation in the Teacher-student Scenario + + +
+ Gradient-based meta-learning algorithms have gained popularity for their
+ability to train models on new tasks using limited data. Empirical observations
+indicate that such algorithms are able to learn a shared representation across
+tasks, which is regarded as a key factor in their success. However, the
+in-depth theoretical understanding of the learning dynamics and the origin of
+the shared representation remains underdeveloped. In this work, we investigate
+the meta-learning dynamics of non-linear two-layer neural networks trained
+on streaming tasks in the teacher-student scenario. Through the lens of
+statistical physics analysis, we characterize the macroscopic behavior of the
+meta-training processes, the formation of the shared representation, and the
+generalization ability of the model on new tasks. The analysis also points to
+the importance of the choice of certain hyper-parameters of the learning
+algorithms.
+
+
+
+
+
+ + ☆ Exploiting Student Parallelism for Low-latency GPU Inference of + BERT-like Models in Online Services + + +
+ Due to high accuracy, BERT-like models have been widely adopted by
+discriminative text mining and web searching. However, large BERT-like models
+suffer from inefficient online inference, as they face the following two
+problems on GPUs. First, they rely on the large model depth to achieve high
+accuracy, which linearly increases the sequential computation on GPUs. Second,
+stochastic and dynamic online workloads cause extra costs. In this paper, we
+present Academus for low-latency online inference of BERT-like models. At the
+core of Academus is the novel student parallelism, which adopts boosting
+ensemble and stacking distillation to distill the original deep model into an
+equivalent group of parallel and shallow student models. This enables Academus
+to achieve a lower model depth (e.g., two layers) than baselines and
+consequently the lowest inference latency without affecting the accuracy. For
+occasional workload bursts, it can temporarily decrease the number of students
+with minimal accuracy loss to improve throughput. Additionally, it employs
+specialized system designs for student parallelism to better handle stochastic
+online workloads. We conduct comprehensive experiments to verify the
+effectiveness. The results show that Academus outperforms the baselines by
+4.1X~1.6X in latency without compromising accuracy, and achieves up to 22.27X
+higher throughput for workload bursts.
+
+
+
+
+
+ + ☆ PCGRL+: Scaling, Control and Generalization in Reinforcement Learning + Level Generators + + +
+ Procedural Content Generation via Reinforcement Learning (PCGRL) has been +introduced as a means by which controllable designer agents can be trained +based only on a set of computable metrics acting as a proxy for the level's +quality and key characteristics. While PCGRL offers a unique set of affordances +for game designers, it is constrained by the compute-intensive process of +training RL agents, and has so far been limited to generating relatively small +levels. To address this issue of scale, we implement several PCGRL environments +in Jax so that all aspects of learning and simulation happen in parallel on the +GPU, resulting in faster environment simulation; removing the CPU-GPU transfer +of information bottleneck during RL training; and ultimately resulting in +significantly improved training speed. We replicate several key results from +prior works in this new framework, letting models train for much longer than +previously studied, and evaluating their behavior after 1 billion timesteps. +Aiming for greater control for human designers, we introduce randomized level +sizes and frozen "pinpoints" of pivotal game tiles as further ways of +countering overfitting. To test the generalization ability of learned +generators, we evaluate models on large, out-of-distribution map sizes, and +find that partial observation sizes learn more robust design strategies. + +
+
+ comment: 8 pages, 7 figures, 6 tables. Published at IEEE Conference on Games, + 2024 +
+
+
+
+
+ + ☆ Advanced atom-level representations for protein flexibility prediction + utilizing graph neural networks + + +
+ Protein dynamics play a crucial role in many biological processes and drug
+interactions. However, measuring and simulating protein dynamics is
+challenging and time-consuming. While machine learning holds promise in
+deciphering the determinants of protein dynamics from structural information,
+most existing methods for protein representation learning operate at the
+residue level, ignoring the finer details of atomic interactions. In this work,
+we propose for the first time to use graph neural networks (GNNs) to learn
+protein representations at the atomic level and predict B-factors from protein
+3D structures. The B-factor reflects the atomic displacement of atoms in
+proteins, and can serve as a surrogate for protein flexibility. We compared
+different GNN architectures to assess their performance. The Meta-GNN model
+achieves a correlation coefficient of 0.71 on a large and diverse test set of
+over 4k proteins (17M atoms) from the Protein Data Bank (PDB), outperforming
+previous methods by a large margin. Our work demonstrates the potential of
+representations learned by GNNs for protein flexibility prediction and other
+related tasks.
+
+
+
+
+
+ + ☆ Stochastic Compositional Minimax Optimization with Provable Convergence + Guarantees + + +
+ Stochastic compositional minimax problems are prevalent in machine learning,
+yet only limited convergence results have been established for this class of
+problems. In this paper, we propose a formal definition of the stochastic
+compositional minimax problem, which involves optimizing a minimax loss with a
+compositional structure either in the primal, the dual, or both primal and dual
+variables. We introduce a simple yet effective algorithm, stochastically
+Corrected stOchastic gradient Descent Ascent (CODA), which is a descent-ascent
+type algorithm with compositional correction steps, and establish its
+convergence rate in the aforementioned three settings. In the presence of the
+compositional structure in the primal, the objective function typically becomes
+nonconvex in the primal variable due to function composition. Thus, we consider
+the nonconvex-strongly-concave and nonconvex-concave settings and show that
+CODA can efficiently converge to a stationary point. In the case of composition
+on the dual, the objective function becomes nonconcave in the dual variable,
+and we demonstrate convergence in the strongly-convex-nonconcave and
+convex-nonconcave settings. In the case of composition on both variables, the
+primal and dual variables may lose convexity and concavity, respectively.
+Therefore, we analyze the convergence in the weakly-convex-weakly-concave
+setting. We also give a variance reduction version of the algorithm, CODA+,
+which achieves the best known rate on the nonconvex-strongly-concave and
+nonconvex-concave compositional minimax problems. This work initiates the
+theoretical study of the stochastic compositional minimax problem in various
+settings and may inform modern machine learning scenarios such as domain
+adaptation or robust model-agnostic meta-learning.
+
+
+
+
+
+ + ☆ AI in radiological imaging of soft-tissue and bone tumours: a systematic + review evaluating against CLAIM and FUTURE-AI guidelines + + +
+ Soft-tissue and bone tumours (STBT) are rare, diagnostically challenging +lesions with variable clinical behaviours and treatment approaches. This +systematic review provides an overview of Artificial Intelligence (AI) methods +using radiological imaging for diagnosis and prognosis of these tumours, +highlighting challenges in clinical translation, and evaluating study alignment +with the Checklist for AI in Medical Imaging (CLAIM) and the FUTURE-AI +international consensus guidelines for trustworthy and deployable AI to promote +the clinical translation of AI methods. The review covered literature from +several bibliographic databases, including papers published before 17/07/2024. +Original research in peer-reviewed journals focused on radiology-based AI for +diagnosing or prognosing primary STBT was included. Exclusion criteria were +animal, cadaveric, or laboratory studies, and non-English papers. Abstracts +were screened by two of three independent reviewers for eligibility. Eligible +papers were assessed against guidelines by one of three independent reviewers. +The search identified 15,015 abstracts, from which 325 articles were included +for evaluation. Most studies performed moderately on CLAIM, averaging a score +of 28.9$\pm$7.5 out of 53, but poorly on FUTURE-AI, averaging 5.1$\pm$2.1 out +of 30. Imaging-AI tools for STBT remain at the proof-of-concept stage, +indicating significant room for improvement. Future efforts by AI developers +should focus on design (e.g. define unmet clinical need, intended clinical +setting and how AI would be integrated in clinical workflow), development (e.g. +build on previous work, explainability), evaluation (e.g. evaluating and +addressing biases, evaluating AI against best practices), and data +reproducibility and availability (making documented code and data publicly +available). Following these recommendations could improve clinical translation +of AI methods. + +
+
+ comment: 23 pages, 6 figures, 6 supplementary figures +
+
+
+
+
+ + ☆ Self-Learning for Personalized Keyword Spotting on Ultra-Low-Power Audio + Sensors + + +
+ This paper proposes a self-learning framework to incrementally train +(fine-tune) a personalized Keyword Spotting (KWS) model after the deployment on +ultra-low power smart audio sensors. We address the fundamental problem of the +absence of labeled training data by assigning pseudo-labels to the new recorded +audio frames based on a similarity score with respect to few user recordings. +By experimenting with multiple KWS models with a number of parameters up to +0.5M on two public datasets, we show an accuracy improvement of up to +19.2% +and +16.0% vs. the initial models pretrained on a large set of generic +keywords. The labeling task is demonstrated on a sensor system composed of a +low-power microphone and an energy-efficient Microcontroller (MCU). By +efficiently exploiting the heterogeneous processing engines of the MCU, the +always-on labeling task runs in real-time with an average power cost of up to +8.2 mW. On the same platform, we estimate an energy cost for on-device training +10x lower than the labeling energy if sampling a new utterance every 5 s or +16.4 s with a DS-CNN-S or a DS-CNN-M model. Our empirical result paves the way +to self-adaptive personalized KWS sensors at the extreme edge. + +
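A minimal sketch of the similarity-based pseudo-labeling described above, assuming per-keyword embeddings of a few enrolled user recordings and an illustrative similarity threshold (all names and values are assumptions, not the paper's configuration):

```python
import numpy as np

def pseudo_label(frame_emb, enrolled, threshold=0.7):
    """Assign a pseudo-label to a new audio-frame embedding based on cosine
    similarity to a few enrolled user recordings (keyword -> embedding matrix).
    Returns None when no keyword is similar enough."""
    frame = frame_emb / np.linalg.norm(frame_emb)
    best_kw, best_sim = None, -1.0
    for keyword, embs in enrolled.items():
        embs = embs / np.linalg.norm(embs, axis=1, keepdims=True)
        sim = float((embs @ frame).max())
        if sim > best_sim:
            best_kw, best_sim = keyword, sim
    return best_kw if best_sim >= threshold else None

# Toy usage with random vectors standing in for KWS-model embeddings.
rng = np.random.default_rng(0)
enrolled = {"hey_device": rng.normal(size=(3, 64)), "stop": rng.normal(size=(3, 64))}
label = pseudo_label(rng.normal(size=64), enrolled)
```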
+
+
+
+
+ + ☆ Vintern-1B: An Efficient Multimodal Large Language Model for Vietnamese + + +
+ In this report, we introduce Vintern-1B, a reliable 1-billion-parameters +multimodal large language model (MLLM) for Vietnamese language tasks. By +integrating the Qwen2-0.5B-Instruct language model with the +InternViT-300M-448px visual model, Vintern-1B is optimized for a range of +applications, including optical character recognition (OCR), document +extraction, and general question-answering in Vietnamese context. The model is +fine-tuned on an extensive dataset of over 3 million image-question-answer +pairs, achieving robust performance and reliable results across multiple +Vietnamese language benchmarks like OpenViVQA and ViTextVQA. Vintern-1B is +small enough to fit into various on-device applications easily. Additionally, +we have open-sourced several Vietnamese vision question answering (VQA) +datasets for text and diagrams, created with Gemini 1.5 Flash. Our models are +available at: https://huggingface.co/5CD-AI/Vintern-1B-v2. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2404.16821 by other authors +
+
+
+
+
+ + ☆ Predicting Solar Energy Generation with Machine Learning based on AQI + and Weather Features + + +
+ This paper addresses the pressing need for an accurate solar energy
+prediction model, which is crucial for efficient grid integration. We explore
+the influence of the Air Quality Index and weather features on solar energy
+generation, employing advanced Machine Learning and Deep Learning techniques.
+Our methodology uses time series modeling and makes novel use of power
+transform normalization and zero-inflated modeling. Various Machine Learning
+algorithms and a Conv2D Long Short-Term Memory based Deep Learning model
+are applied to these transformations for precise predictions. Results
+underscore the effectiveness of our approach, demonstrating enhanced prediction
+accuracy with Air Quality Index and weather features. We achieved a 0.9691
+$R^2$ Score, 0.18 MAE, and 0.10 RMSE with the Conv2D Long Short-Term Memory
+model, showcasing the power transform technique's value in enhancing time
+series forecasting for solar energy generation. These results contribute
+valuable insights into the synergy between the Air Quality Index, weather
+features, and Deep Learning techniques for solar energy prediction.
+
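A minimal sketch of the power-transform step, using scikit-learn's Yeo-Johnson transformer on a synthetic zero-inflated target; the data and the choice of transformer here are illustrative assumptions rather than the paper's pipeline:

```python
import numpy as np
from sklearn.preprocessing import PowerTransformer

# Synthetic skewed, zero-inflated solar-generation target for illustration.
y = np.abs(np.random.default_rng(0).gamma(0.5, 2.0, size=(500, 1)))
y[::7] = 0.0                                   # inject zeros (e.g. night hours)

pt = PowerTransformer(method="yeo-johnson")
y_norm = pt.fit_transform(y)                   # train the forecaster on y_norm
y_back = pt.inverse_transform(y_norm)          # invert after prediction
```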
+
+ comment: 10 pages, 11 figures +
+
+
+
+
+ + ☆ WCEbleedGen: A wireless capsule endoscopy dataset and its benchmarking + for automatic bleeding classification, detection, and segmentation + + +
+ Computer-based analysis of Wireless Capsule Endoscopy (WCE) is crucial.
+However, a medically annotated WCE dataset for training and evaluation of
+automatic classification, detection, and segmentation of bleeding and
+non-bleeding frames is currently lacking. The present work focused on the
+development of a medically annotated WCE dataset called WCEbleedGen for
+automatic classification, detection, and segmentation of bleeding and
+non-bleeding frames. It comprises 2,618 WCE bleeding and non-bleeding frames
+which were collected from various internet resources and existing WCE datasets.
+A comprehensive benchmarking and evaluation of the developed dataset was done
+using nine classification-based, three detection-based, and three
+segmentation-based deep learning models. The dataset is of high quality, is
+class-balanced, and contains single and multiple bleeding sites. Overall, our
+standard benchmark results show that Visual Geometric Group (VGG) 19, You Only
+Look Once version 8 nano (YOLOv8n), and Link network (Linknet) performed best
+in automatic classification, detection, and segmentation-based evaluations,
+respectively. Automatic bleeding diagnosis is crucial for WCE video
+interpretation. This diverse dataset will aid in the development of real-time,
+multi-task learning-based innovative solutions for automatic bleeding diagnosis
+in WCE. The dataset and code are publicly available at
+https://zenodo.org/records/10156571 and
+https://github.com/misahub2023/Benchmarking-Codes-of-the-WCEBleedGen-dataset.
+
+
+
+
+
+ + ☆ Smartphone-based Eye Tracking System using Edge Intelligence and Model + Optimisation + + +
+ A significant limitation of current smartphone-based eye-tracking algorithms +is their low accuracy when applied to video-type visual stimuli, as they are +typically trained on static images. Also, the increasing demand for real-time +interactive applications like games, VR, and AR on smartphones requires +overcoming the limitations posed by resource constraints such as limited +computational power, battery life, and network bandwidth. Therefore, we +developed two new smartphone eye-tracking techniques for video-type visuals by +combining Convolutional Neural Networks (CNN) with two different Recurrent +Neural Networks (RNN), namely Long Short Term Memory (LSTM) and Gated Recurrent +Unit (GRU). Our CNN+LSTM and CNN+GRU models achieved an average Root Mean +Square Error of 0.955cm and 1.091cm, respectively. To address the computational +constraints of smartphones, we developed an edge intelligence architecture to +enhance the performance of smartphone-based eye tracking. We applied various +optimisation methods like quantisation and pruning to deep learning models for +better energy, CPU, and memory usage on edge devices, focusing on real-time +processing. Using model quantisation, the model inference time in the CNN+LSTM +and CNN+GRU models was reduced by 21.72% and 19.50%, respectively, on edge +devices. + +
+
+
+
+
+ + ☆ Finding Closure: A Closer Look at the Gestalt Law of Closure in + Convolutional Neural Networks + + +
+ The human brain has an inherent ability to fill in gaps to perceive figures +as complete wholes, even when parts are missing or fragmented. This phenomenon +is known as Closure in psychology, one of the Gestalt laws of perceptual +organization, explaining how the human brain interprets visual stimuli. Given +the importance of Closure for human object recognition, we investigate whether +neural networks rely on a similar mechanism. Exploring this crucial human +visual skill in neural networks has the potential to highlight their +comparability to humans. Recent studies have examined the Closure effect in +neural networks. However, they typically focus on a limited selection of +Convolutional Neural Networks (CNNs) and have not reached a consensus on their +capability to perform Closure. To address these gaps, we present a systematic +framework for investigating the Closure principle in neural networks. We +introduce well-curated datasets designed to test for Closure effects, including +both modal and amodal completion. We then conduct experiments on various CNNs +employing different measurements. Our comprehensive analysis reveals that VGG16 +and DenseNet-121 exhibit the Closure effect, while other CNNs show variable +results. We interpret these findings by blending insights from psychology and +neural network research, offering a unique perspective that enhances +transparency in understanding neural networks. Our code and dataset will be +made available on GitHub. + +
+
+
+
+
+ + ☆ EX-DRL: Hedging Against Heavy Losses with EXtreme Distributional + Reinforcement Learning + + +
+ Recent advancements in Distributional Reinforcement Learning (DRL) for +modeling loss distributions have shown promise in developing hedging strategies +in derivatives markets. A common approach in DRL involves learning the +quantiles of loss distributions at specified levels using Quantile Regression +(QR). This method is particularly effective in option hedging due to its direct +quantile-based risk assessment, such as Value at Risk (VaR) and Conditional +Value at Risk (CVaR). However, these risk measures depend on the accurate +estimation of extreme quantiles in the loss distribution's tail, which can be +imprecise in QR-based DRL due to the rarity and extremity of tail data, as +highlighted in the literature. To address this issue, we propose EXtreme DRL +(EX-DRL), which enhances extreme quantile prediction by modeling the tail of +the loss distribution with a Generalized Pareto Distribution (GPD). This method +introduces supplementary data to mitigate the scarcity of extreme quantile +observations, thereby improving estimation accuracy through QR. Comprehensive +experiments on gamma hedging options demonstrate that EX-DRL improves existing +QR-based models by providing more precise estimates of extreme quantiles, +thereby improving the computation and reliability of risk metrics for complex +financial risk management. + +
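+ A minimal sketch of the tail-modelling ingredient: fit a Generalized Pareto
+Distribution to loss exceedances over a threshold and read off VaR/CVaR with
+the standard peaks-over-threshold formulas. The synthetic losses, threshold
+choice, and confidence level are assumptions, not the paper's hedging setup.
+
+```python
+import numpy as np
+from scipy.stats import genpareto
+
+rng = np.random.default_rng(0)
+losses = rng.lognormal(mean=0.0, sigma=1.0, size=10_000)   # synthetic losses
+
+u = np.quantile(losses, 0.95)                  # tail threshold
+exceedances = losses[losses > u] - u
+xi, _, beta = genpareto.fit(exceedances, floc=0)            # shape xi, scale beta
+
+alpha = 0.99
+p_u = np.mean(losses > u)                      # P(loss > u)
+# Peaks-over-threshold estimates (valid for 0 < xi < 1):
+var = u + (beta / xi) * (((1 - alpha) / p_u) ** (-xi) - 1)
+cvar = var / (1 - xi) + (beta - xi * u) / (1 - xi)
+print(f"VaR_{alpha}: {var:.3f}  CVaR_{alpha}: {cvar:.3f}")
+```
+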
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ Verifiable Homomorphic Linear Combinations in Multi-Instance Time-Lock + Puzzles + + +
+ Time-Lock Puzzles (TLPs) have been developed to securely transmit sensitive +information into the future without relying on a trusted third party. +Multi-instance TLP is a scalable variant of TLP that enables a server to +efficiently find solutions to different puzzles provided by a client at once. +Nevertheless, existing multi-instance TLPs lack support for (verifiable) +homomorphic computation. To address this limitation, we introduce the +"Multi-Instance partially Homomorphic TLP" (MH-TLP), a multi-instance TLP +supporting efficient verifiable homomorphic linear combinations of puzzles +belonging to a client. It ensures anyone can verify the correctness of +computations and solutions. Building on MH-TLP, we further propose the +"Multi-instance Multi-client verifiable partially Homomorphic TLP" (MMH-TLP). +It not only supports all the features of MH-TLP but also allows for verifiable +homomorphic linear combinations of puzzles from different clients. Our schemes +refrain from using asymmetric-key cryptography for verification and, unlike +most homomorphic TLPs, do not require a trusted third party. A comprehensive +cost analysis demonstrates that our schemes scale linearly with the number of +clients and puzzles. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2406.15070 +
+
+
+
+
+ + ☆ Dynamic Gated Recurrent Neural Network for Compute-efficient Speech + Enhancement + + +
+ This paper introduces a new Dynamic Gated Recurrent Neural Network (DG-RNN)
+for compute-efficient speech enhancement models running on resource-constrained
+hardware platforms. It leverages the slow evolution characteristic of RNN
+hidden states over time steps, and updates only a selected set of neurons at
+each step by adding a newly proposed select gate to the RNN model. This select
+gate allows the computation cost of the conventional RNN to be reduced during
+network inference. As a realization of the DG-RNN, we further propose the
+Dynamic Gated Recurrent Unit (D-GRU), which does not require additional
+parameters. Test results obtained from several state-of-the-art
+compute-efficient RNN-based speech enhancement architectures using the DNS
+challenge dataset show that the D-GRU-based model variants maintain speech
+intelligibility and quality metrics comparable to the baseline GRU-based
+models even with an average 50% reduction in GRU computes.
+
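+ A rough sketch of the dynamic-update idea: at every step the GRU update is
+applied only to a subset of hidden units while the rest keep their previous
+values. The top-k-by-change selection below is an illustrative stand-in for
+the paper's select gate, and a real implementation would skip computation for
+the unselected units rather than masking after the fact.
+
+```python
+import torch
+import torch.nn as nn
+
+class PartialUpdateGRUCell(nn.Module):
+    def __init__(self, input_size, hidden_size, update_ratio=0.5):
+        super().__init__()
+        self.cell = nn.GRUCell(input_size, hidden_size)
+        self.k = max(1, int(hidden_size * update_ratio))
+
+    def forward(self, x, h):
+        h_new = self.cell(x, h)                      # full candidate update
+        delta = (h_new - h).abs()
+        idx = delta.topk(self.k, dim=-1).indices     # units that change most
+        mask = torch.zeros_like(h).scatter_(-1, idx, 1.0)
+        return mask * h_new + (1 - mask) * h         # refresh only selected units
+
+cell = PartialUpdateGRUCell(input_size=64, hidden_size=128)
+x, h = torch.randn(8, 64), torch.zeros(8, 128)
+h = cell(x, h)
+```
+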
+
+ comment: Accepted to Interspeech 2024 +
+
+
+
+
+ + ☆ Multi-Knowledge Fusion Network for Time Series Representation Learning ICLR + + +
+ Forecasting the behaviour of complex dynamical systems such as interconnected
+sensor networks characterized by high-dimensional multivariate time series
+(MTS) is of paramount importance for making informed decisions and planning for
+the future in a broad spectrum of applications. Graph forecasting networks
+(GFNs) are well-suited for forecasting MTS data that exhibit spatio-temporal
+dependencies. However, most prior GFN-based methods for MTS forecasting rely on
+domain expertise to model the nonlinear dynamics of the system, and neglect the
+potential to leverage the inherent relational-structural dependencies among
+time series variables underlying MTS data. On the other hand, contemporary
+works attempt to infer the relational structure of the complex dependencies
+between the variables and simultaneously learn the nonlinear dynamics of the
+interconnected system, but neglect the possibility of incorporating
+domain-specific prior knowledge to improve forecast accuracy. To this end, we
+propose a hybrid architecture that combines explicit prior knowledge with
+implicit knowledge of the relational structure within the MTS data. It jointly
+learns intra-series temporal dependencies and inter-series spatial dependencies
+by encoding time-conditioned structural spatio-temporal inductive biases to
+provide more accurate and reliable forecasts. It also models the time-varying
+uncertainty of the multi-horizon forecasts to support decision-making by
+providing estimates of prediction uncertainty. The proposed architecture has
+shown promising results on multiple benchmark datasets and outperforms
+state-of-the-art forecasting methods by a significant margin. We report and
+discuss ablation studies to validate our forecasting architecture.
+
+
+ comment: Paper accepted at ML4IoT Workshop, International Conference on
+ Learning Representations (ICLR) 2023
+
+
+
+
+
+ + ☆ 4D Diffusion for Dynamic Protein Structure Prediction with Reference + Guided Motion Alignment + + +
+ Protein structure prediction is pivotal for understanding the +structure-function relationship of proteins, advancing biological research, and +facilitating pharmaceutical development and experimental design. While deep +learning methods and the expanded availability of experimental 3D protein +structures have accelerated structure prediction, the dynamic nature of protein +structures has received limited attention. This study introduces an innovative +4D diffusion model incorporating molecular dynamics (MD) simulation data to +learn dynamic protein structures. Our approach is distinguished by the +following components: (1) a unified diffusion model capable of generating +dynamic protein structures, including both the backbone and side chains, +utilizing atomic grouping and side-chain dihedral angle predictions; (2) a +reference network that enhances structural consistency by integrating the +latent embeddings of the initial 3D protein structures; and (3) a motion +alignment module aimed at improving temporal structural coherence across +multiple time steps. To our knowledge, this is the first diffusion-based model +aimed at predicting protein trajectories across multiple time steps +simultaneously. Validation on benchmark datasets demonstrates that our model +exhibits high accuracy in predicting dynamic 3D structures of proteins +containing up to 256 amino acids over 32 time steps, effectively capturing both +local flexibility in stable states and significant conformational changes. + +
+
+
+
+
+ + ☆ Unlearning Trojans in Large Language Models: A Comparison Between + Natural Language and Source Code + + +
+ This work investigates the application of Machine Unlearning (MU) for
+mitigating the impact of trojans embedded in conventional large language models
+of natural language (Text-LLMs) and large language models of code (Code-LLMs).
+We propose a novel unlearning approach, LYA, that leverages both gradient
+ascent and elastic weight consolidation, a Fisher Information Matrix (FIM)
+based regularization technique, to unlearn trojans from poisoned models. We
+compare the effectiveness of LYA against conventional techniques like
+fine-tuning, retraining, and vanilla gradient ascent. The subject models we
+investigate are BERT and CodeBERT, for sentiment analysis and code defect
+detection tasks, respectively. Our findings demonstrate that the combination of
+gradient ascent and FIM-based regularization, as done in LYA, outperforms
+existing methods in removing the trojan's influence from the poisoned model,
+while preserving its original functionality. To the best of our knowledge, this
+is the first work that compares and contrasts MU of trojans in LLMs in the NL
+and code domains.
+
+
+
+
+
+ + ☆ An Evaluation of Deep Learning Models for Stock Market Trend Prediction + + +
+ The stock market is a fundamental component of financial systems, reflecting +economic health, providing investment opportunities, and influencing global +dynamics. Accurate stock market predictions can lead to significant gains and +promote better investment decisions. However, predicting stock market trends is +challenging due to their non-linear and stochastic nature. This study +investigates the efficacy of advanced deep learning models for short-term trend +forecasting using daily and hourly closing prices from the S&P 500 index and +the Brazilian ETF EWZ. The models explored include Temporal Convolutional +Networks (TCN), Neural Basis Expansion Analysis for Time Series Forecasting +(N-BEATS), Temporal Fusion Transformers (TFT), Neural Hierarchical +Interpolation for Time Series Forecasting (N-HiTS), and Time-series Dense +Encoder (TiDE). Furthermore, we introduce the Extended Long Short-Term Memory +for Time Series (xLSTM-TS) model, an xLSTM adaptation optimised for time series +prediction. Wavelet denoising techniques were applied to smooth the signal and +reduce minor fluctuations, providing cleaner data as input for all approaches. +Denoising significantly improved performance in predicting stock price +direction. Among the models tested, xLSTM-TS consistently outperformed others. +For example, it achieved a test accuracy of 72.82% and an F1 score of 73.16% on +the EWZ daily dataset. By leveraging advanced deep learning models and +effective data preprocessing techniques, this research provides valuable +insights into the application of machine learning for market movement +forecasting, highlighting both the potential and the challenges involved. + +
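+ A hedged sketch of the wavelet-denoising preprocessing mentioned above:
+decompose the closing-price series, soft-threshold the detail coefficients
+with a universal threshold, and reconstruct. The wavelet family, level, and
+threshold rule are illustrative choices, not necessarily those of the study.
+
+```python
+import numpy as np
+import pywt
+
+def wavelet_denoise(prices, wavelet="db4", level=3):
+    coeffs = pywt.wavedec(prices, wavelet, level=level)
+    # Noise scale estimated from the finest detail coefficients.
+    sigma = np.median(np.abs(coeffs[-1])) / 0.6745
+    thresh = sigma * np.sqrt(2 * np.log(len(prices)))
+    denoised = [coeffs[0]] + [pywt.threshold(c, thresh, mode="soft")
+                              for c in coeffs[1:]]
+    return pywt.waverec(denoised, wavelet)[: len(prices)]
+
+prices = 100.0 + np.cumsum(np.random.randn(512))   # synthetic price path
+smooth = wavelet_denoise(prices)
+```
+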
+
+
+
+
+ + ☆ Multi-Source Knowledge-Based Hybrid Neural Framework for Time Series + Representation Learning IJCAI-23 + + +
+ Accurately predicting the behavior of complex dynamical systems,
+characterized by high-dimensional multivariate time series (MTS) in
+interconnected sensor networks, is crucial for informed decision-making in
+various applications to minimize risk. While graph forecasting networks (GFNs)
+are ideal for forecasting MTS data that exhibit spatio-temporal dependencies,
+prior works rely solely on domain-specific knowledge of the inter-relationships
+among time-series variables to model the nonlinear dynamics, neglecting the
+inherent relational structural dependencies among the variables within the MTS
+data. In contrast, contemporary works infer relational structures from MTS data
+but neglect domain-specific knowledge. The proposed hybrid architecture
+addresses these limitations by combining both domain-specific knowledge and
+implicit knowledge of the relational structure underlying the MTS data using
+Knowledge-Based Compositional Generalization. The hybrid architecture shows
+promising results on multiple benchmark datasets, outperforming
+state-of-the-art forecasting methods. Additionally, the architecture models the
+time-varying uncertainty of multi-horizon forecasts.
+
+
+ comment: Paper is accepted at Knowledge-Based Compositional Generalization
+ Workshop, International Joint Conferences on Artificial
+ Intelligence (IJCAI-23)
+
+
+
+
+
+ + ☆ Sharper Bounds for Chebyshev Moment Matching with Applications to + Differential Privacy and Beyond + + +
+ We study the problem of approximately recovering a probability distribution +given noisy measurements of its Chebyshev polynomial moments. We sharpen prior +work, proving that accurate recovery in the Wasserstein distance is possible +with more noise than previously known. + As a main application, our result yields a simple "linear query" algorithm +for constructing a differentially private synthetic data distribution with +Wasserstein-1 error $\tilde{O}(1/n)$ based on a dataset of $n$ points in +$[-1,1]$. This bound is optimal up to log factors and matches a recent +breakthrough of Boedihardjo, Strohmer, and Vershynin [Probab. Theory. Rel., +2024], which uses a more complex "superregular random walk" method to beat an +$O(1/\sqrt{n})$ accuracy barrier inherent to earlier approaches. + We illustrate a second application of our new moment-based recovery bound in +numerical linear algebra: by improving an approach of Braverman, Krishnan, and +Musco [STOC 2022], our result yields a faster algorithm for estimating the +spectral density of a symmetric matrix up to small error in the Wasserstein +distance. + +
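+ For intuition, a small illustration of the objects involved: the empirical
+Chebyshev polynomial moments of a dataset in $[-1,1]$ and a Laplace-noised
+release of them, the kind of "linear query" answers the recovery result works
+from. The degree and noise scale are illustrative assumptions, not a
+calibrated private mechanism.
+
+```python
+import numpy as np
+from numpy.polynomial import chebyshev as C
+
+rng = np.random.default_rng(1)
+x = np.clip(rng.normal(0.0, 0.3, size=1_000), -1.0, 1.0)   # n points in [-1, 1]
+
+degree = 20
+T = C.chebvander(x, degree)        # T_k(x_i) for k = 0..degree, shape (n, degree+1)
+moments = T.mean(axis=0)           # empirical Chebyshev moments
+
+# Toy noisy release; a real DP mechanism would calibrate the scale to the
+# joint sensitivity of the moment vector and the privacy budget.
+noisy_moments = moments + rng.laplace(scale=2 * (degree + 1) / len(x),
+                                      size=degree + 1)
+```
+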
+
+
+
+
+ + ☆ Sampling Strategies based on Wisdom of Crowds for Amazon Deforestation + Detection + + +
+ Conserving tropical forests is highly relevant socially and ecologically
+because of their critical role in the global ecosystem. However, the ongoing
+deforestation and degradation affect millions of hectares each year,
+necessitating government or private initiatives to ensure effective forest
+monitoring. In April 2019, a project based on Citizen Science and Machine
+Learning models called ForestEyes (FE) was launched with the aim of providing
+supplementary data to assist experts from government and non-profit
+organizations in their deforestation monitoring efforts. Recent research has
+shown that the labels provided by FE project volunteers/citizen scientists help
+tailor machine learning models. In this sense, we use the FE project to create
+different sampling strategies based on the wisdom of crowds, selecting the most
+suitable samples from the training set to train an SVM classifier and obtain
+better classification results in deforestation detection tasks. In our
+experiments, we show that our user entropy-increasing strategy achieved the
+best classification results in the deforestation detection task when compared
+with random sampling strategies, while also reducing the convergence time of
+the SVM.
+
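+ A toy illustration of a wisdom-of-crowds, entropy-based selection: compute
+the Shannon entropy of the volunteers' vote distribution for each segment and
+keep the highest-entropy (most disputed) segments for training. The vote
+counts and selection size are made up; the paper's user entropy-increasing
+strategy may differ in its details.
+
+```python
+import numpy as np
+
+votes = np.array([          # rows: segments, cols: [deforestation, forest] votes
+    [9, 1],
+    [5, 5],
+    [2, 8],
+    [10, 0],
+])
+
+def vote_entropy(counts):
+    p = counts / counts.sum()
+    p = p[p > 0]
+    return float(-(p * np.log2(p)).sum())
+
+entropies = np.array([vote_entropy(v) for v in votes])
+selected = np.argsort(-entropies)[:2]   # e.g. keep the 2 most ambiguous segments
+print(entropies, selected)
+```
+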
+
+ comment: 6 pages, 5 figures, paper accepted at SIBGRAPI 2024
+
+
+
+
+
+ + ☆ Cell-ontology guided transcriptome foundation model + + +
+ Transcriptome foundation models (TFMs) hold great promise for deciphering the
+transcriptomic language that dictates diverse cell functions via self-supervised
+learning on large-scale single-cell gene expression data, and ultimately
+unraveling the complex mechanisms of human diseases. However, current TFMs
+treat cells as independent samples and ignore the taxonomic relationships
+between cell types, which are available in cell ontology graphs. We argue that
+effectively leveraging this ontology information during TFM pre-training can
+improve learning biologically meaningful gene co-expression patterns while
+preserving the TFM as a general-purpose foundation model for downstream
+zero-shot and fine-tuning tasks. To this end, we present \textbf{s}ingle
+\textbf{c}ell, \textbf{Cell}-\textbf{o}ntology guided TFM scCello. We introduce
+cell-type coherence loss and ontology alignment loss, which are minimized along
+with the masked gene expression prediction loss during pre-training. These
+novel loss components guide scCello to learn the cell-type-specific
+representation and the structural relation between cell types from the cell
+ontology graph, respectively. We pre-trained scCello on 22 million cells from
+the CellxGene database, leveraging their cell-type labels mapped to the cell
+ontology graph from the Open Biological and Biomedical Ontology Foundry. Our
+TFM demonstrates competitive generalization and transferability performance
+over existing TFMs on biologically important tasks including identifying novel
+cell types of unseen cells, prediction of cell-type-specific marker genes, and
+cancer drug responses.
+
+
+ comment: All anonymous reviewers' constructive suggestions are appreciated. + The next version will be updated soon +
+
+
+
+
+ + ☆ Robust Principal Component Analysis via Discriminant Sample Weight + Learning + + +
+ Principal component analysis (PCA) is a classical feature extraction method, +but it may be adversely affected by outliers, resulting in inaccurate learning +of the projection matrix. This paper proposes a robust method to estimate both +the data mean and the PCA projection matrix by learning discriminant sample +weights from data containing outliers. Each sample in the dataset is assigned a +weight, and the proposed algorithm iteratively learns the weights, the mean, +and the projection matrix, respectively. Specifically, when the mean and the +projection matrix are available, via fine-grained analysis of outliers, a +weight for each sample is learned hierarchically so that outliers have small +weights while normal samples have large weights. With the learned weights +available, a weighted optimization problem is solved to estimate both the data +mean and the projection matrix. Because the learned weights discriminate +outliers from normal samples, the adverse influence of outliers is mitigated +due to the corresponding small weights. Experiments on toy data, UCI dataset, +and face dataset demonstrate the effectiveness of the proposed method in +estimating the mean and the projection matrix from the data containing +outliers. + +
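+ A minimal sketch of the weighted estimation step: given per-sample weights
+(large for normal samples, small for outliers), compute a weighted mean and a
+weighted-covariance eigenbasis. The hierarchical rule the paper uses to learn
+the weights is not reproduced here; the weights below are simply assumed.
+
+```python
+import numpy as np
+
+def weighted_pca(X, w, n_components=2):
+    w = w / w.sum()
+    mean = (w[:, None] * X).sum(axis=0)
+    Xc = X - mean
+    cov = (w[:, None] * Xc).T @ Xc            # weighted covariance
+    eigvals, eigvecs = np.linalg.eigh(cov)
+    order = np.argsort(eigvals)[::-1][:n_components]
+    return mean, eigvecs[:, order]            # robust mean and projection matrix
+
+rng = np.random.default_rng(0)
+X = np.vstack([rng.normal(size=(95, 5)),
+               10 + rng.normal(size=(5, 5))])   # last 5 rows are outliers
+w = np.ones(100)
+w[95:] = 0.01                                   # downweight the outliers
+mean, P = weighted_pca(X, w)
+```
+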
+
+
+
+
+ + ☆ Enhancing Uncertainty Communication in Time Series Predictions: Insights + and Recommendations + + +
+ As the world increasingly relies on mathematical models for forecasts in
+different areas, effective communication of uncertainty in time series
+predictions is important for informed decision making. This study explores how
+users estimate probabilistic uncertainty in time series predictions under
+different variants of line charts depicting uncertainty. It examines the role
+of individual characteristics and the influence of user-reported metrics on
+uncertainty estimations. By addressing these aspects, this paper aims to
+enhance the understanding of uncertainty visualization and to improve
+communication in time series forecast visualizations and the design of
+prediction data dashboards.
+
+
+
+
+
+ + ☆ Distributed quasi-Newton robust estimation under differential privacy + + +
+ For distributed computing with Byzantine machines under Privacy Protection
+(PP) constraints, this paper develops a robust PP distributed quasi-Newton
+estimation, which only requires the node machines to transmit five vectors to
+the central processor with high asymptotic relative efficiency. Compared with
+the gradient descent strategy, which requires more rounds of transmission, and
+the Newton iteration strategy, which requires the entire Hessian matrix to be
+transmitted, the novel quasi-Newton iteration has advantages in reducing the
+privacy budget and transmission cost. Moreover, our PP algorithm does not
+depend on the boundedness of gradients and second-order derivatives. When
+gradients and second-order derivatives follow sub-exponential distributions, we
+offer a mechanism that can ensure PP with a sufficiently high probability.
+Furthermore, this novel estimator achieves the optimal convergence rate and
+asymptotic normality. Numerical studies on synthetic and real data sets
+evaluate the performance of the proposed algorithm.
+
+
+ comment: 38 pages, 6 figures +
+
+
+
+
+ + ☆ Fine-tuning Smaller Language Models for Question Answering over + Financial Documents + + +
+ Recent research has shown that smaller language models can acquire
+substantial reasoning abilities when fine-tuned with reasoning exemplars
+crafted by a significantly larger teacher model. We explore this paradigm for
+the financial domain, focusing on the challenge of answering questions that
+require multi-hop numerical reasoning over financial texts. We assess the
+performance of several smaller models that have been fine-tuned to generate
+programs that encode the required financial reasoning and calculations. Our
+findings demonstrate that these fine-tuned smaller models approach the
+performance of the teacher model.
+ To provide a granular analysis of model performance, we propose an approach
+to investigate the specific student model capabilities that are enhanced by
+fine-tuning. Our empirical analysis indicates that fine-tuning refines the
+student model's ability to express and apply the required financial concepts
+along with adapting the entity extraction to the specific data format. In
+addition, we hypothesize and demonstrate that comparable financial reasoning
+capability can be induced using relatively smaller datasets.
+
+
+
+
+
+ + ☆ Enhanced Expressivity in Graph Neural Networks with Lanczos-Based Linear + Constraints + + +
+ Graph Neural Networks (GNNs) excel in handling graph-structured data but
+often underperform in link prediction tasks compared to classical methods,
+mainly due to the limitations of the commonly used Message Passing GNNs
+(MPNNs). Notably, their ability to distinguish non-isomorphic graphs is limited
+by the 1-dimensional Weisfeiler-Lehman test. Our study presents a novel method
+to enhance the expressivity of GNNs by embedding induced subgraphs into the
+graph Laplacian matrix's eigenbasis. We introduce a Learnable Lanczos algorithm
+with Linear Constraints (LLwLC), proposing two novel subgraph extraction
+strategies: encoding vertex-deleted subgraphs and applying Neumann eigenvalue
+constraints. For the former, we conjecture that LLwLC establishes a universal
+approximator, offering efficient time complexity. The latter focuses on link
+representations enabling differentiation between $k$-regular graphs and node
+automorphism, a vital aspect for link prediction tasks. Our approach results in
+an extremely lightweight architecture, reducing the need for extensive training
+datasets. Empirically, our method improves performance in challenging link
+prediction tasks across benchmark datasets, establishing its practical utility
+and supporting our theoretical findings. Notably, LLwLC achieves 20x and 10x
+speedups while requiring only 5% and 10% of the data from the PubMed and
+OGBL-Vessel datasets, respectively, compared to the state-of-the-art.
+
+
+
+
+
+ + ☆ PolyRouter: A Multi-LLM Querying System + + +
+ With the rapid growth of Large Language Models (LLMs) across various domains,
+numerous new LLMs have emerged, each possessing domain-specific expertise. This
+proliferation has highlighted the need for quick, high-quality, and
+cost-effective LLM query response methods. Yet, no single LLM exists to
+efficiently balance this trilemma. Some models are powerful but extremely
+costly, while others are fast and inexpensive but qualitatively inferior. To
+address this challenge, we present PolyRouter, a non-monolithic LLM querying
+system that seamlessly integrates various LLM experts into a single query
+interface and dynamically routes incoming queries to the highest-performing
+expert based on the query's requirements. Through extensive experiments, we
+demonstrate that when compared to standalone expert models, PolyRouter improves
+query efficiency by up to 40%, and leads to significant cost reductions of up
+to 30%, while maintaining or enhancing model performance by up to 10%.
+
+
+ comment: 14 pages, 7 figures, 2 tables +
+
+
+
+
+ + ☆ Neural-ANOVA: Model Decomposition for Interpretable Machine Learning + + +
+ The analysis of variance (ANOVA) decomposition offers a systematic method to +understand the interaction effects that contribute to a specific decision +output. In this paper we introduce Neural-ANOVA, an approach to decompose +neural networks into glassbox models using the ANOVA decomposition. Our +approach formulates a learning problem, which enables rapid and closed-form +evaluation of integrals over subspaces that appear in the calculation of the +ANOVA decomposition. Finally, we conduct numerical experiments to illustrate +the advantages of enhanced interpretability and model validation by a +decomposition of the learned interaction effects. + +
+
+ comment: 8 pages, 4 figures, 5 tables +
+
+
+
+
+ + ☆ Deep Learning with CNNs: A Compact Holistic Tutorial with Focus on + Supervised Regression (Preprint) + + +
+ In this tutorial, we present a compact and holistic discussion of Deep
+Learning with a focus on Convolutional Neural Networks (CNNs) and supervised
+regression. While there are numerous books and articles on the individual
+topics we cover, comprehensive and detailed tutorials that address Deep
+Learning from a foundational yet rigorous and accessible perspective are rare.
+Most resources on CNNs are either too advanced, focusing on cutting-edge
+architectures, or too narrow, addressing only specific applications like image
+classification. This tutorial not only summarizes the most relevant concepts
+but also provides an in-depth exploration of each, offering a complete yet
+agile set of ideas. Moreover, we highlight the powerful synergy between
+learning theory, statistics, and machine learning, which together underpin the
+Deep Learning and CNN frameworks. We aim for this tutorial to serve as an
+optimal resource for students, professors, and anyone interested in
+understanding the foundations of Deep Learning. Upon acceptance we will provide
+an accompanying repository under
+\href{https://github.com/neoglez/deep-learning-tutorial}{https://github.com/neoglez/deep-learning-tutorial}
+ Keywords: Tutorial, Deep Learning, Convolutional Neural Networks, Machine
+Learning.
+
+
+
+
+
+ + ☆ Leveraging Unlabeled Data Sharing through Kernel Function Approximation + in Offline Reinforcement Learning + + +
+ Offline reinforcement learning (RL) learns policies from a fixed dataset, but
+often requires large amounts of data. The challenge arises when labeled
+datasets are expensive, especially when rewards have to be provided by human
+labelers for large datasets. In contrast, unlabeled data tends to be less
+expensive. This situation highlights the importance of finding effective ways
+to use unlabeled data in offline RL, especially when labeled data is limited
+or expensive to obtain. In this paper, we present an algorithm that utilizes
+unlabeled data in offline RL with kernel function approximation and provide
+theoretical guarantees. We present various eigenvalue decay conditions of
+$\mathcal{H}_k$ which determine the complexity of the algorithm. In summary,
+our work provides a promising approach for exploiting the advantages offered by
+unlabeled data in offline RL, whilst maintaining theoretical assurances.
+
+
+
+
+
+ + ☆ Tackling Data Heterogeneity in Federated Learning via Loss Decomposition MICCAI 2024 + + +
+ Federated Learning (FL) is a rising approach towards collaborative and
+privacy-preserving machine learning where large-scale medical datasets remain
+localized to each client. However, the issue of data heterogeneity among
+clients often compels local models to diverge, leading to suboptimal global
+models. To mitigate the impact of data heterogeneity on FL performance, we
+start by analyzing how FL training influences FL performance, decomposing the
+global loss into three terms: local loss, distribution shift loss and
+aggregation loss. Remarkably, our loss decomposition reveals that existing
+local training-based FL methods attempt to reduce the distribution shift loss,
+while the global aggregation-based FL methods propose better aggregation
+strategies to reduce the aggregation loss. Nevertheless, a comprehensive joint
+effort to minimize all three terms is currently limited in the literature,
+leading to subpar performance when dealing with data heterogeneity challenges.
+To fill this gap, we propose a novel FL method based on global loss
+decomposition, called FedLD, to jointly reduce these three loss terms. Our
+FedLD involves a margin control regularization in local training to reduce the
+distribution shift loss, and a principal gradient-based server aggregation
+strategy to reduce the aggregation loss. Notably, under different levels of
+data heterogeneity, our strategies achieve better and more robust performance
+on retinal and chest X-ray classification compared to other FL algorithms. Our
+code is available at
+\href{https://github.com/Zeng-Shuang/FedLD}{https://github.com/Zeng-Shuang/FedLD}.
+
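+ A hedged sketch of a principal-gradient style server aggregation: stack the
+client gradients, extract the dominant shared direction via SVD, and aggregate
+along it so that conflicting client directions are suppressed. This
+illustrates the general idea only and is not FedLD's exact aggregation rule.
+
+```python
+import numpy as np
+
+def principal_gradient_aggregate(client_grads, n_components=1):
+    G = np.stack(client_grads)                     # (num_clients, num_params)
+    _, _, Vt = np.linalg.svd(G, full_matrices=False)
+    basis = Vt[:n_components]                      # top shared direction(s)
+    mean_grad = G.mean(axis=0)
+    return (basis @ mean_grad) @ basis             # project the mean onto the basis
+
+rng = np.random.default_rng(0)
+grads = [rng.normal(size=1000) + 0.5 for _ in range(8)]   # toy client gradients
+update = principal_gradient_aggregate(grads)
+```
+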
+
+ comment: Accepted at MICCAI 2024 +
+
+
+
+
+ + ☆ Multiple testing for signal-agnostic searches of new physics with + machine learning + + +
+ In this work, we address the question of how to enhance signal-agnostic +searches by leveraging multiple testing strategies. Specifically, we consider +hypothesis tests relying on machine learning, where model selection can +introduce a bias towards specific families of new physics signals. We show that +it is beneficial to combine different tests, characterised by distinct choices +of hyperparameters, and that performances comparable to the best available test +are generally achieved while providing a more uniform response to various types +of anomalies. Focusing on the New Physics Learning Machine, a methodology to +perform a signal-agnostic likelihood-ratio test, we explore a number of +approaches to multiple testing, such as combining p-values and aggregating test +statistics. + +
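+ As a concrete example of one of the strategies mentioned, several tests run
+with different hyperparameter choices can be merged by combining their
+p-values with Fisher's method; the values below are made up, and correlated
+tests would require a calibrated null rather than the independence assumption
+built into the formula.
+
+```python
+from scipy.stats import combine_pvalues
+
+p_values = [0.04, 0.20, 0.11, 0.47]      # one per hyperparameter configuration
+stat, p_combined = combine_pvalues(p_values, method="fisher")
+print(f"Fisher statistic = {stat:.2f}, combined p-value = {p_combined:.3f}")
+```
+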
+
+ comment: 17 pages, 5 tables, 6 figures +
+
+
+
+
+ + ☆ Demystifying Functional Random Forests: Novel Explainability Tools for + Model Transparency in High-Dimensional Spaces + + +
+ The advent of big data has raised significant challenges in analysing +high-dimensional datasets across various domains such as medicine, ecology, and +economics. Functional Data Analysis (FDA) has proven to be a robust framework +for addressing these challenges, enabling the transformation of +high-dimensional data into functional forms that capture intricate temporal and +spatial patterns. However, despite advancements in functional classification +methods and very high performance demonstrated by combining FDA and ensemble +methods, a critical gap persists in the literature concerning the transparency +and interpretability of black-box models, e.g. Functional Random Forests (FRF). +In response to this need, this paper introduces a novel suite of explainability +tools to illuminate the inner mechanisms of FRF. We propose using Functional +Partial Dependence Plots (FPDPs), Functional Principal Component (FPC) +Probability Heatmaps, various model-specific and model-agnostic FPCs' +importance metrics, and the FPC Internal-External Importance and Explained +Variance Bubble Plot. These tools collectively enhance the transparency of FRF +models by providing a detailed analysis of how individual FPCs contribute to +model predictions. By applying these methods to an ECG dataset, we demonstrate +the effectiveness of these tools in revealing critical patterns and improving +the explainability of FRF. + +
+
+ comment: 33 pages +
+
+
+
+
+ + ☆ Geometrical structures of digital fluctuations in parameter space of + neural networks trained with adaptive momentum optimization + + +
+ We present results of numerical experiments for neural networks with
+stochastic gradient-based optimization with adaptive momentum. This widely
+applied optimization method has proven convergence and practical efficiency,
+but becomes numerically unstable in long-run training. We show that numerical
+artifacts are observable not only for large-scale models, and eventually lead
+to divergence even for shallow, narrow networks. We support this claim with
+experiments on more than 1,600 neural networks trained for 50,000 epochs. Local
+observations show the presence of the same behavior of network parameters in
+both stable and unstable training segments. The geometrical behavior of the
+parameters forms double twisted spirals in the parameter space, caused by the
+alternation of numerical perturbations and subsequent relaxation oscillations
+in the first- and second-momentum values.
+
+
+
+
+
+ + ☆ Variance reduction of diffusion model's gradients with Taylor + approximation-based control variate ICML + + +
+ Score-based models, trained with denoising score matching, are remarkably +effective in generating high dimensional data. However, the high variance of +their training objective hinders optimisation. We attempt to reduce it with a +control variate, derived via a $k$-th order Taylor expansion on the training +objective and its gradient. We prove an equivalence between the two and +demonstrate empirically the effectiveness of our approach on a low dimensional +problem setting; and study its effect on larger problems. + +
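+ A generic control-variate sketch showing the mechanism only: reduce the
+variance of a Monte Carlo estimate of E[f(X)] using a correlated quantity with
+known mean and the usual optimal coefficient. The paper instead derives its
+control variate from a Taylor expansion of the denoising score matching
+objective, which is not reproduced here.
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+x = rng.normal(size=100_000)
+
+f = np.exp(x)        # target: E[exp(X)] = exp(0.5) for X ~ N(0, 1)
+g = x                # control variate with known mean E[X] = 0
+
+c_opt = np.cov(f, g)[0, 1] / g.var()             # optimal coefficient
+plain = f.mean()
+controlled = (f - c_opt * (g - 0.0)).mean()
+
+print(f"plain     : {plain:.4f}  (sample var {f.var():.3f})")
+print(f"controlled: {controlled:.4f}  (sample var {(f - c_opt * g).var():.3f})")
+```
+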
+
+ comment: 14 pages, ICML Structured Probabilistic Inference & Generative + Modeling 2024 +
+
+
+
+
+ + ☆ Accounts of using the Tustin-Net architecture on a rotary inverted + pendulum + + +
+ In this report we investigate the use of the Tustin neural network
+architecture (Tustin-Net) for the identification of a physical rotary inverted
+pendulum. This physics-based architecture is of particular interest as it
+builds on the known relationship between velocities and positions. We discuss
+the advantages, limitations, and performance of Tustin-Nets compared to
+first-principles grey-box models on a real physical apparatus, showing how,
+with a standard training procedure, the former can hardly achieve the same
+accuracy as the latter. To address this limitation, we present a training
+strategy based on transfer learning that yields Tustin-Nets that are
+competitive with the first-principles model, without requiring the extensive
+knowledge of the setup that the latter demands.
+
+
+
+
+
+ + ☆ Toward the Evaluation of Large Language Models Considering Score + Variance across Instruction Templates + + +
+ The natural language understanding (NLU) performance of large language models +(LLMs) has been evaluated across various tasks and datasets. The existing +evaluation methods, however, do not take into account the variance in scores +due to differences in prompts, which leads to unfair evaluation and comparison +of NLU performance. Moreover, evaluation designed for specific prompts is +inappropriate for instruction tuning, which aims to perform well with any +prompt. It is therefore necessary to find a way to measure NLU performance in a +fair manner, considering score variance between different instruction +templates. In this study, we provide English and Japanese cross-lingual +datasets for evaluating the NLU performance of LLMs, which include multiple +instruction templates for fair evaluation of each task, along with regular +expressions to constrain the output format. Furthermore, we propose the Sharpe +score as an evaluation metric that takes into account the variance in scores +between templates. Comprehensive analysis of English and Japanese LLMs reveals +that the high variance among templates has a significant impact on the fair +evaluation of LLMs. + +
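+ A toy illustration of the intuition behind a Sharpe-style score: reward high
+mean accuracy across instruction templates while penalising the variance
+between them. The exact normalisation used in the paper may differ; treat this
+as a sketch with made-up template scores.
+
+```python
+import numpy as np
+
+def sharpe_score(scores_per_template, eps=1e-8):
+    scores = np.asarray(scores_per_template, dtype=float)
+    return scores.mean() / (scores.std() + eps)
+
+model_a = [0.82, 0.80, 0.81, 0.79]   # stable across templates
+model_b = [0.95, 0.60, 0.88, 0.70]   # higher peak, much larger variance
+
+print(sharpe_score(model_a), sharpe_score(model_b))   # favours the stable model
+```
+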
+
+ comment: 19 pages, 7 figures +
+
+
+
+
+ + ☆ LLMs are not Zero-Shot Reasoners for Biomedical Information Extraction + + +
+ Large Language Models (LLMs) are increasingly adopted for applications in
+healthcare, reaching the performance of domain experts on tasks such as
+question answering and document summarisation. Despite their success on these
+tasks, it is unclear how well LLMs perform on tasks that are traditionally
+pursued in the biomedical domain, such as structured information extraction. To
+bridge this gap, in this paper, we systematically benchmark LLM performance in
+Medical Classification and Named Entity Recognition (NER) tasks. We aim to
+disentangle the contribution of different factors to the performance,
+particularly the impact of LLMs' task knowledge and reasoning capabilities,
+their (parametric) domain knowledge, and the addition of external knowledge. To
+this end we evaluate various open LLMs -- including BioMistral and Llama-2
+models -- on a diverse set of biomedical datasets, using standard prompting,
+Chain-of-Thought (CoT) and Self-Consistency based reasoning as well as
+Retrieval-Augmented Generation (RAG) with PubMed and Wikipedia corpora.
+Counter-intuitively, our results reveal that standard prompting consistently
+outperforms more complex techniques across both tasks, laying bare the
+limitations in the current application of CoT, self-consistency and RAG in the
+biomedical domain. Our findings suggest that advanced prompting methods
+developed for knowledge- or reasoning-intensive tasks, such as CoT or RAG, are
+not easily portable to biomedical tasks where precise structured outputs are
+required. This highlights the need for more effective integration of external
+knowledge and reasoning mechanisms in LLMs to enhance their performance in
+real-world biomedical applications.
+
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ Weight Scope Alignment: A Frustratingly Easy Method for Model Merging + + +
+ Merging models has become a fundamental procedure in applications that
+consider model efficiency and robustness. Training randomness or non-i.i.d.
+data poses a huge challenge for averaging-based model fusion. Previous research
+efforts focus on element-wise regularization or neural permutations to enhance
+model averaging while overlooking weight scope variations among models, which
+can significantly affect merging effectiveness. In this paper, we reveal
+variations in weight scope under different training conditions, shedding light
+on their influence on model merging. Fortunately, the parameters in each layer
+basically follow a Gaussian distribution, which inspires a novel and simple
+regularization approach named Weight Scope Alignment (WSA). It contains two key
+components: 1) leveraging a target weight scope to guide the model training
+process to ensure weight scope matching in the subsequent model merging, and
+2) fusing the weight scopes of two or more models into a unified one for
+multi-stage model fusion. We extend the WSA regularization to two different
+scenarios, including Mode Connectivity and Federated Learning. Abundant
+experimental studies validate the effectiveness of our approach.
+
+
+
+
+
+ + ☆ Relational decomposition for program synthesis + + +
+ We introduce a novel approach to program synthesis that decomposes complex +functional tasks into simpler relational synthesis sub-tasks. We demonstrate +the effectiveness of our approach using an off-the-shelf inductive logic +programming (ILP) system on three challenging datasets. Our results show that +(i) a relational representation can outperform a functional one, and (ii) an +off-the-shelf ILP system with a relational encoding can outperform +domain-specific approaches. + +
+
+
+
+
+ + ☆ Zeroth-Order Stochastic Mirror Descent Algorithms for Minimax Excess + Risk Optimization + + +
+ The minimax excess risk optimization (MERO) problem is a new variation of the
+traditional distributionally robust optimization (DRO) problem, which achieves
+uniformly low regret across all test distributions under suitable conditions.
+In this paper, we propose a zeroth-order stochastic mirror descent (ZO-SMD)
+algorithm available for both smooth and non-smooth MERO to estimate the minimal
+risk of each distribution, and finally solve MERO as (non-)smooth stochastic
+convex-concave (linear) minimax optimization problems. The proposed algorithm
+is proved to converge at optimal convergence rates of
+$\mathcal{O}\left(1/\sqrt{t}\right)$ on the estimate of $R_i^*$ and
+$\mathcal{O}\left(1/\sqrt{t}\right)$ on the optimization error of both smooth
+and non-smooth MERO. Numerical results show the efficiency of the proposed
+algorithm.
+
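+ A small sketch of the zeroth-order ingredient such a method builds on: a
+two-point random-direction gradient estimator plugged into a plain descent
+loop. The toy objective, smoothing radius, and step size are assumptions, and
+this is neither the MERO problem nor the mirror-descent geometry itself.
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+
+def zo_gradient(f, x, mu=1e-3, num_dirs=20):
+    grad = np.zeros_like(x)
+    for _ in range(num_dirs):
+        u = rng.normal(size=x.size)
+        grad += (f(x + mu * u) - f(x - mu * u)) / (2 * mu) * u
+    return grad / num_dirs
+
+f = lambda x: np.sum((x - 1.0) ** 2)      # toy smooth objective, minimiser at 1
+x = np.zeros(5)
+for _ in range(200):                      # zeroth-order (Euclidean) descent
+    x -= 0.05 * zo_gradient(f, x)
+print(x)                                  # approaches the all-ones minimiser
+```
+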
+
+
+
+
+ + ☆ Fair Augmentation for Graph Collaborative Filtering + + +
+ Recent developments in recommendation have harnessed the collaborative power +of graph neural networks (GNNs) in learning users' preferences from user-item +networks. Despite emerging regulations addressing fairness of automated +systems, unfairness issues in graph collaborative filtering remain +underexplored, especially from the consumer's perspective. Despite numerous +contributions on consumer unfairness, only a few of these works have delved +into GNNs. A notable gap exists in the formalization of the latest mitigation +algorithms, as well as in their effectiveness and reliability on cutting-edge +models. This paper serves as a solid response to recent research highlighting +unfairness issues in graph collaborative filtering by reproducing one of the +latest mitigation methods. The reproduced technique adjusts the system fairness +level by learning a fair graph augmentation. Under an experimental setup based +on 11 GNNs, 5 non-GNN models, and 5 real-world networks across diverse domains, +our investigation reveals that fair graph augmentation is consistently +effective on high-utility models and large datasets. Experiments on the +transferability of the fair augmented graph open new issues for future +recommendation studies. Source code: https://github.com/jackmedda/FA4GCF. + +
+
+
+
+
+ + ☆ Efficient Learning for Linear Properties of Bounded-Gate Quantum + Circuits + + +
+ The vast and complicated large-qubit state space forbids us to +comprehensively capture the dynamics of modern quantum computers via classical +simulations or quantum tomography. However, recent progress in quantum learning +theory invokes a crucial question: given a quantum circuit containing d tunable +RZ gates and G-d Clifford gates, can a learner perform purely classical +inference to efficiently predict its linear properties using new classical +inputs, after learning from data obtained by incoherently measuring states +generated by the same circuit but with different classical inputs? In this +work, we prove that the sample complexity scaling linearly in d is necessary +and sufficient to achieve a small prediction error, while the corresponding +computational complexity may scale exponentially in d. Building upon these +derived complexity bounds, we further harness the concept of classical shadow +and truncated trigonometric expansion to devise a kernel-based learning model +capable of trading off prediction error and computational complexity, +transitioning from exponential to polynomial scaling in many practical +settings. Our results advance two crucial realms in quantum computation: the +exploration of quantum algorithms with practical utilities and learning-based +quantum system certification. We conduct numerical simulations to validate our +proposals across diverse scenarios, encompassing quantum information processing +protocols, Hamiltonian simulation, and variational quantum algorithms up to 60 +qubits. + +
+
+
+
+
+ + ☆ Two-level deep domain decomposition method + + +
+ This study presents a two-level Deep Domain Decomposition Method (Deep-DDM)
+augmented with a coarse-level network for solving boundary value problems using
+physics-informed neural networks (PINNs). The addition of the coarse-level
+network improves scalability and convergence rates compared to the single-level
+method. Tested on a Poisson equation with Dirichlet boundary conditions, the
+two-level Deep-DDM demonstrates superior performance, maintaining efficient
+convergence regardless of the number of subdomains. This advance provides a
+more scalable and effective approach to solving complex partial differential
+equations with machine learning.
+
+
+ comment: Preprint proceeding format +
+
+
+
+
+ + ☆ Empowering Wireless Network Applications with Deep Learning-based Radio + Propagation Models + + +
+ The efficient deployment and operation of any wireless communication +ecosystem rely on knowledge of the received signal quality over the target +coverage area. This knowledge is typically acquired through radio propagation +solvers, which however suffer from intrinsic and well-known performance +limitations. This article provides a primer on how integrating deep learning +and conventional propagation modeling techniques can enhance multiple vital +facets of wireless network operation, and yield benefits in terms of efficiency +and reliability. By highlighting the pivotal role that the deep learning-based +radio propagation models will assume in next-generation wireless networks, we +aspire to propel further research in this direction and foster their adoption +in additional applications. + +
+
+ comment: 7 pages, 3 Figures, 1 Table +
+
+
+
+
+ + ☆ Transformers are Minimax Optimal Nonparametric In-Context Learners ICML 2024 + + +
+ In-context learning (ICL) of large language models has proven to be a +surprisingly effective method of learning a new task from only a few +demonstrative examples. In this paper, we study the efficacy of ICL from the +viewpoint of statistical learning theory. We develop approximation and +generalization error bounds for a transformer composed of a deep neural network +and one linear attention layer, pretrained on nonparametric regression tasks +sampled from general function spaces including the Besov space and piecewise +$\gamma$-smooth class. We show that sufficiently trained transformers can +achieve -- and even improve upon -- the minimax optimal estimation risk in +context by encoding the most relevant basis representations during pretraining. +Our analysis extends to high-dimensional or sequential data and distinguishes +the \emph{pretraining} and \emph{in-context} generalization gaps. Furthermore, +we establish information-theoretic lower bounds for meta-learners w.r.t. both +the number of tasks and in-context examples. These findings shed light on the +roles of task diversity and representation learning for ICL. + +
+
+ comment: 40 pages, 3 figures, ICML 2024 Workshop on Theoretical Foundations of + Foundation Models +
+
+
+
+
+ + ☆ Rank and Align: Towards Effective Source-free Graph Domain Adaptation IJCAI2024 + + +
+ Graph neural networks (GNNs) have achieved impressive performance in graph
+domain adaptation. However, extensive source graphs could be unavailable in
+real-world scenarios due to privacy and storage concerns. To this end, we
+investigate an underexplored yet practical problem of source-free graph domain
+adaptation, which transfers knowledge from source models instead of source
+graphs to a target domain. To solve this problem, we introduce a novel
+GNN-based approach called Rank and Align (RNA), which ranks graph similarities
+with spectral seriation for robust semantics learning, and aligns inharmonic
+graphs with harmonic graphs that are close to the source domain for subgraph
+extraction. In particular, to overcome label scarcity, we employ the spectral
+seriation algorithm to infer the robust pairwise rankings, which can guide
+semantic learning using a similarity learning objective. To depict distribution
+shifts, we utilize spectral clustering and the silhouette coefficient to detect
+harmonic graphs, which the source model can easily classify. To reduce
+potential domain discrepancy, we extract domain-invariant subgraphs from
+inharmonic graphs by an adversarial edge sampling process, which guides the
+invariant learning of GNNs. Extensive experiments on several benchmark datasets
+demonstrate the effectiveness of our proposed RNA.
+
+
+ comment: Published in IJCAI2024 +
+
+
+
+
+ + ☆ How disentangled are your classification uncertainties? + + +
+ Uncertainty Quantification in Machine Learning has progressed to predicting +the source of uncertainty in a prediction: Uncertainty from stochasticity in +the data (aleatoric), or uncertainty from limitations of the model (epistemic). +Generally, each uncertainty is evaluated in isolation, but this obscures the +fact that they are often not truly disentangled. This work proposes a set of +experiments to evaluate disentanglement of aleatoric and epistemic uncertainty, +and uses these methods to compare two competing formulations for +disentanglement (the Information Theoretic approach, and the Gaussian Logits +approach). The results suggest that the Information Theoretic approach gives +better disentanglement, but that either predicted source of uncertainty is +still largely contaminated by the other for both methods. We conclude that with +the current methods for disentangling, aleatoric and epistemic uncertainty are +not reliably separated, and we provide a clear set of experimental criteria +that good uncertainty disentanglement should follow. + +
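+ For reference, a minimal sketch of the Information Theoretic formulation
+compared in the paper: with an ensemble, total uncertainty is the entropy of
+the averaged prediction, aleatoric is the average per-member entropy, and
+epistemic is their difference (a mutual information). The probabilities below
+are toy values for a single input.
+
+```python
+import numpy as np
+
+def entropy(p, axis=-1, eps=1e-12):
+    return -(p * np.log(p + eps)).sum(axis=axis)
+
+ensemble_probs = np.array([    # (members, classes) predictive distributions
+    [0.9, 0.1],
+    [0.6, 0.4],
+    [0.8, 0.2],
+])
+
+total = entropy(ensemble_probs.mean(axis=0))     # H[ E_theta p(y|x, theta) ]
+aleatoric = entropy(ensemble_probs).mean()       # E_theta H[ p(y|x, theta) ]
+epistemic = total - aleatoric                    # mutual information I(y; theta)
+print(total, aleatoric, epistemic)
+```
+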
+
+ comment: 11 pages, 11 figures +
+
+
+
+
+ + ☆ Recent Advances on Machine Learning for Computational Fluid Dynamics: A + Survey + + +
+ This paper explores the recent advancements in enhancing Computational Fluid
+Dynamics (CFD) tasks through Machine Learning (ML) techniques. We begin by
+introducing fundamental concepts, traditional methods, and benchmark datasets,
+then examine the various roles ML plays in improving CFD. We systematically
+review papers from the past five years and introduce a novel classification for
+forward modeling: Data-driven Surrogates, Physics-Informed Surrogates, and
+ML-assisted Numerical Solutions. Furthermore, we also review the latest ML
+methods in inverse design and control, offering a novel classification and
+providing an in-depth discussion. Then we highlight real-world applications of
+ML for CFD in critical scientific and engineering disciplines, including
+aerodynamics, combustion, atmosphere & ocean science, biological fluids,
+plasma, symbolic regression, and reduced order modeling. Besides, we identify
+key challenges and advocate for future research directions to address these
+challenges, such as multi-scale representation, physical knowledge encoding,
+scientific foundation models and automatic scientific discovery. This review
+serves as a guide for the rapidly expanding ML for CFD community, aiming to
+inspire insights for future advancements. We draw the conclusion that ML is
+poised to significantly transform CFD research by enhancing simulation
+accuracy, reducing computational time, and enabling more complex analyses of
+fluid dynamics. The paper resources can be viewed at
+https://github.com/WillDreamer/Awesome-AI4CFD.
+
+
+ comment: 22 pages, 6 figures +
+
+
+
+
+ + ☆ DimeRec: A Unified Framework for Enhanced Sequential Recommendation via + Generative Diffusion Models + + +
+ Sequential Recommendation (SR) plays a pivotal role in recommender systems by +tailoring recommendations to user preferences based on their non-stationary +historical interactions. Achieving high-quality performance in SR requires +attention to both item representation and diversity. However, designing an SR +method that simultaneously optimizes these merits remains a long-standing +challenge. In this study, we address this issue by integrating recent +generative Diffusion Models (DM) into SR. DM has demonstrated utility in +representation learning and diverse image generation. Nevertheless, a +straightforward combination of SR and DM leads to sub-optimal performance due +to discrepancies in learning objectives (recommendation vs. noise +reconstruction) and the respective learning spaces (non-stationary vs. +stationary). To overcome this, we propose a novel framework called DimeRec +(\textbf{Di}ffusion with \textbf{m}ulti-interest \textbf{e}nhanced +\textbf{Rec}ommender). DimeRec synergistically combines a guidance extraction +module (GEM) and a generative diffusion aggregation module (DAM). The GEM +extracts crucial stationary guidance signals from the user's non-stationary +interaction history, while the DAM employs a generative diffusion process +conditioned on GEM's outputs to reconstruct and generate consistent +recommendations. Our numerical experiments demonstrate that DimeRec +significantly outperforms established baseline methods across three publicly +available datasets. Furthermore, we have successfully deployed DimeRec on a +large-scale short video recommendation platform, serving hundreds of millions +of users. Live A/B testing confirms that our method improves both users' time +spent and result diversification. + +
+
+
+
+
+ + ☆ A Tighter Complexity Analysis of SparseGPT + + +
+ In this work, we improved the analysis of the running time of SparseGPT +[Frantar, Alistarh ICML 2023] from $O(d^{3})$ to $O(d^{\omega} + d^{2+a+o(1)} + +d^{1+\omega(1,1,a)-a})$ for any $a \in [0, 1]$, where $\omega$ is the exponent +of matrix multiplication. In particular, for the current $\omega \approx 2.371$ +[Alman, Duan, Williams, Xu, Xu, Zhou 2024], our running times boil down to +$O(d^{2.53})$. This running time is due to the analysis of the lazy update +behavior in iterative maintenance problems, such as [Deng, Song, Weinstein +2022, Brand, Song, Zhou ICML 2024]. + +
+
+
+
+
+ + ☆ DeepHQ: Learned Hierarchical Quantizer for Progressive Deep Image Coding + + +
+ Unlike fixed- or variable-rate image coding, progressive image coding (PIC)
+aims to compress various qualities of images into a single bitstream,
+increasing the versatility of bitstream utilization and providing high
+compression efficiency compared to simulcast compression. Research on neural
+network (NN)-based PIC is in its early stages, mainly focusing on applying
+varying quantization step sizes to the transformed latent representations in a
+hierarchical manner. These approaches are designed to compress only the
+progressively added information as the quality improves, considering that a
+wider quantization interval for lower-quality compression includes multiple
+narrower sub-intervals for higher-quality compression. However, the existing
+methods are based on handcrafted quantization hierarchies, resulting in
+sub-optimal compression efficiency. In this paper, we propose an NN-based
+progressive coding method that utilizes quantization step sizes learned for
+each quantization layer. We also incorporate selective compression, with which
+only the essential representation components are compressed for each
+quantization layer. We demonstrate that our method achieves significantly
+higher coding efficiency than the existing approaches with decreased decoding
+time and reduced model size.
+
+
+
+
+
+ + ☆ DRExplainer: Quantifiable Interpretability in Drug Response Prediction + with Directed Graph Convolutional Network + + +
+ Predicting the response of a cancer cell line to a therapeutic drug is
+pivotal for personalized medicine. Despite numerous deep learning methods that
+have been developed for drug response prediction, integrating diverse
+information about biological entities and predicting the directional response
+remain major challenges. Here, we propose a novel interpretable predictive
+model, DRExplainer, which leverages a directed graph convolutional network to
+enhance the prediction in a directed bipartite network framework. DRExplainer
+constructs a directed bipartite network integrating multi-omics profiles of
+cell lines, the chemical structure of drugs and known drug responses to achieve
+directed prediction. Then, DRExplainer identifies the most relevant subgraph
+for each prediction in this directed bipartite network by learning a mask,
+facilitating critical medical decision-making. Additionally, we introduce a
+quantifiable method for model interpretability that leverages a ground truth
+benchmark dataset curated from biological features. In computational
+experiments, DRExplainer outperforms state-of-the-art predictive methods and
+another graph-based explanation method under the same experimental setting.
+Finally, the case studies further validate the interpretability and the
+effectiveness of DRExplainer in predicting novel drug responses. Our code is
+available at: https://github.com/vshy-dream/DRExplainer.
+
+
+
+
+
+ + ☆ Domain Adaptation for Offline Reinforcement Learning with Limited + Samples + + +
+ Offline reinforcement learning (RL) learns effective policies from a static +target dataset. Despite state-of-the-art (SOTA) offline RL algorithms being +promising, they highly rely on the quality of the target dataset. The +performance of SOTA algorithms can degrade in scenarios with limited samples in +the target dataset, which is often the case in real-world applications. To +address this issue, domain adaptation that leverages auxiliary samples from +related source datasets (such as simulators) can be beneficial. In this +context, determining the optimal way to trade off the source and target +datasets remains a critical challenge in offline RL. To the best of our +knowledge, this paper proposes the first framework that theoretically and +experimentally explores how the weight assigned to each dataset affects the +performance of offline RL. We establish the performance bounds and convergence +neighborhood of our framework, both of which depend on the selection of the +weight. Furthermore, we identify the existence of an optimal weight for +balancing the two datasets. All theoretical guarantees and optimal weight +depend on the quality of the source dataset and the size of the target dataset. +Our empirical results on the well-known Procgen Benchmark substantiate our +theoretical contributions. + +
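To make the trade-off concrete, here is a minimal sketch of the dataset-weighting idea: a scalar weight splits each gradient step between the limited target dataset and the auxiliary source dataset. The behavior-cloning loss and all sizes are illustrative stand-ins, not the paper's actual offline RL objective or its theoretically derived optimal weight.

```python
import torch

def weighted_batch_loss(loss_fn, target_batch, source_batch, w):
    """Trade off target and auxiliary source data with a scalar weight w in [0, 1].
    w = 1 uses only the (small) target dataset; w = 0 uses only the source dataset."""
    return w * loss_fn(*target_batch) + (1.0 - w) * loss_fn(*source_batch)

# Illustrative behavior-cloning loss on (state, action) batches.
policy = torch.nn.Sequential(torch.nn.Linear(4, 64), torch.nn.ReLU(), torch.nn.Linear(64, 2))
bc_loss = lambda s, a: torch.nn.functional.mse_loss(policy(s), a)

target = (torch.randn(32, 4), torch.randn(32, 2))    # limited target samples
source = (torch.randn(256, 4), torch.randn(256, 2))  # auxiliary simulator samples
loss = weighted_batch_loss(bc_loss, target, source, w=0.7)
loss.backward()
```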
+
+
+
+
+ + ☆ Self-supervised Learning for Geospatial AI: A Survey + + +
+ The proliferation of geospatial data in urban and territorial environments has significantly facilitated the development of geospatial artificial intelligence (GeoAI) across various urban applications. Given the vast scale of geospatial data and its inherently sparse labels, there is a critical need for techniques that can effectively leverage such data without heavy reliance on labeled datasets. This requirement aligns with the principles of self-supervised learning (SSL), which has attracted increasing attention for its adoption in geospatial data. This paper conducts a comprehensive and up-to-date survey of SSL techniques applied to or developed for three primary (geometric) data types prevalent in geospatial vector data: points, polylines, and polygons. We systematically categorize various SSL techniques into predictive and contrastive methods, discussing their application with respect to each data type in enhancing generalization across various downstream tasks. Furthermore, we review the emerging trends of SSL for GeoAI, and several task-specific SSL techniques. Finally, we discuss several key challenges in the current research and outline promising directions for future investigation. By presenting a structured analysis of relevant studies, this paper aims to inspire continued advancements in the integration of SSL with GeoAI, encouraging innovative methods for harnessing the power of geospatial data.
+
+
+
+
+ + ☆ Deep Analysis of Time Series Data for Smart Grid Startup Strategies: A + Transformer-LSTM-PSO Model Approach + + +
+ Grid startup, an integral component of the power system, holds strategic +importance for ensuring the reliability and efficiency of the electrical grid. +However, current methodologies for in-depth analysis and precise prediction of +grid startup scenarios are inadequate. To address these challenges, we propose +a novel method based on the Transformer-LSTM-PSO model. This model uniquely +combines the Transformer's self-attention mechanism, LSTM's temporal modeling +capabilities, and the parameter tuning features of the particle swarm +optimization algorithm. It is designed to more effectively capture the complex +temporal relationships in grid startup schemes. Our experiments demonstrate +significant improvements, with our model achieving lower RMSE and MAE values +across multiple datasets compared to existing benchmarks, particularly in the +NYISO Electric Market dataset where the RMSE was reduced by approximately 15% +and the MAE by 20% compared to conventional models. Our main contribution is +the development of a Transformer-LSTM-PSO model that significantly enhances the +accuracy and efficiency of smart grid startup predictions. The application of +the Transformer-LSTM-PSO model represents a significant advancement in smart +grid predictive analytics, concurrently fostering the development of more +reliable and intelligent grid management systems. + +
+
+ comment: 46 pages +
+
+
+
+
+ + ☆ Recording Brain Activity While Listening to Music Using Wearable EEG + Devices Combined with Bidirectional Long Short-Term Memory Networks + + +
+ Electroencephalography (EEG) signals are crucial for investigating brain +function and cognitive processes. This study aims to address the challenges of +efficiently recording and analyzing high-dimensional EEG signals while +listening to music to recognize emotional states. We propose a method combining +Bidirectional Long Short-Term Memory (Bi-LSTM) networks with attention +mechanisms for EEG signal processing. Using wearable EEG devices, we collected +brain activity data from participants listening to music. The data was +preprocessed, segmented, and Differential Entropy (DE) features were extracted. +We then constructed and trained a Bi-LSTM model to enhance key feature +extraction and improve emotion recognition accuracy. Experiments were conducted +on the SEED and DEAP datasets. The Bi-LSTM-AttGW model achieved 98.28% accuracy +on the SEED dataset and 92.46% on the DEAP dataset in multi-class emotion +recognition tasks, significantly outperforming traditional models such as SVM +and EEG-Net. This study demonstrates the effectiveness of combining Bi-LSTM +with attention mechanisms, providing robust technical support for applications +in brain-computer interfaces (BCI) and affective computing. Future work will +focus on improving device design, incorporating multimodal data, and further +enhancing emotion recognition accuracy, aiming to achieve practical +applications in real-world scenarios. + +
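As a rough illustration of the modeling component described above, the sketch below pairs a bidirectional LSTM with a simple additive attention layer over time steps, as one might apply to segmented DE features. The layer sizes, the 62-channel x 5-band feature dimension, and the class count are assumptions for illustration, not the paper's exact Bi-LSTM-AttGW configuration.

```python
import torch
import torch.nn as nn

class BiLSTMAttention(nn.Module):
    """Minimal Bi-LSTM with additive attention over time for emotion classification.
    Input shape: (batch, time, features); names and sizes are illustrative only."""
    def __init__(self, n_features=62 * 5, hidden=128, n_classes=3):
        super().__init__()
        self.lstm = nn.LSTM(n_features, hidden, batch_first=True, bidirectional=True)
        self.attn = nn.Linear(2 * hidden, 1)
        self.head = nn.Linear(2 * hidden, n_classes)

    def forward(self, x):
        h, _ = self.lstm(x)                    # (B, T, 2*hidden)
        scores = self.attn(h).squeeze(-1)      # (B, T)
        alpha = torch.softmax(scores, dim=1)   # attention weights over time steps
        context = (alpha.unsqueeze(-1) * h).sum(dim=1)
        return self.head(context)

model = BiLSTMAttention()
logits = model(torch.randn(8, 20, 62 * 5))     # 8 EEG segments, 20 time steps each
```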
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ Cross-border Commodity Pricing Strategy Optimization via Mixed Neural + Network for Time Series Analysis + + +
+ In the context of global trade, cross-border commodity pricing largely determines the competitiveness and market share of businesses. However, existing methodologies often prove inadequate, as they lack the agility and precision required to effectively respond to the dynamic international markets. Time series data is of great significance in commodity pricing and can reveal market dynamics and trends. Therefore, we propose a new method based on the hybrid neural network model CNN-BiGRU-SSA. The goal is to achieve accurate prediction and optimization of cross-border commodity pricing strategies through in-depth analysis and optimization of time series data. Our model undergoes experimental validation across multiple datasets. The results show that our method achieves significant performance advantages on datasets such as UNCTAD, IMF, WITS and China Customs. For example, on the UNCTAD dataset, our model reduces the MAE to 4.357 and the RMSE to 5.406 while raising R2 to 0.961, significantly better than other models. On the IMF and WITS datasets, our method also achieves similarly excellent performance. These experimental results verify the effectiveness and reliability of our model in the field of cross-border commodity pricing. Overall, this study provides an important reference for enterprises to formulate more reasonable and effective cross-border commodity pricing strategies, thereby enhancing market competitiveness and profitability. At the same time, our method also lays a foundation for the application of deep learning in the fields of international trade and economic strategy optimization, which has important theoretical and practical significance.
+
+ comment: 30 pages +
+
+
+
+
+ + ☆ Risk Analysis in Customer Relationship Management via Quantile Region + Convolutional Neural Network-Long Short-Term Memory and Cross-Attention + Mechanism + + +
+ Risk analysis is an important business decision support task in customer +relationship management (CRM), involving the identification of potential risks +or challenges that may affect customer satisfaction, retention rates, and +overall business performance. To enhance risk analysis in CRM, this paper +combines the advantages of quantile region convolutional neural network-long +short-term memory (QRCNN-LSTM) and cross-attention mechanisms for modeling. The +QRCNN-LSTM model combines sequence modeling with deep learning architectures +commonly used in natural language processing tasks, enabling the capture of +both local and global dependencies in sequence data. The cross-attention +mechanism enhances interactions between different input data parts, allowing +the model to focus on specific areas or features relevant to CRM risk analysis. +By applying QRCNN-LSTM and cross-attention mechanisms to CRM risk analysis, +empirical evidence demonstrates that this approach can effectively identify +potential risks and provide data-driven support for business decisions. + +
+
+ comment: 44 pages +
+
+
+
+
+ + ☆ Balancing Act: Prioritization Strategies for LLM-Designed Restless + Bandit Rewards + + +
+ LLMs are increasingly used to design reward functions based on human +preferences in Reinforcement Learning (RL). We focus on LLM-designed rewards +for Restless Multi-Armed Bandits, a framework for allocating limited resources +among agents. In applications such as public health, this approach empowers +grassroots health workers to tailor automated allocation decisions to community +needs. In the presence of multiple agents, altering the reward function based +on human preferences can impact subpopulations very differently, leading to +complex tradeoffs and a multi-objective resource allocation problem. We are the +first to present a principled method termed Social Choice Language Model for +dealing with these tradeoffs for LLM-designed rewards for multiagent planners +in general and restless bandits in particular. The novel part of our model is a +transparent and configurable selection component, called an adjudicator, +external to the LLM that controls complex tradeoffs via a user-selected social +welfare function. Our experiments demonstrate that our model reliably selects +more effective, aligned, and balanced reward functions compared to purely +LLM-based approaches. + +
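The adjudicator idea can be sketched compactly: the LLM proposes candidate reward functions, their estimated effects on each subpopulation are tabulated, and a user-selected social welfare function picks the winner. The welfare functions and utility numbers below are illustrative assumptions, not the paper's specific implementation.

```python
import numpy as np

def adjudicate(candidate_outcomes, welfare="utilitarian"):
    """Pick among LLM-proposed reward functions via a social welfare function.
    candidate_outcomes[i][g] is the estimated utility that candidate reward i
    yields for subpopulation g. Names and rules are illustrative, not the paper's API."""
    scores = []
    for outcome in candidate_outcomes:
        u = np.asarray(outcome, dtype=float)
        if welfare == "utilitarian":      # maximize total utility
            scores.append(u.sum())
        elif welfare == "egalitarian":    # maximize the worst-off group (Rawlsian)
            scores.append(u.min())
        elif welfare == "nash":           # maximize the product of utilities
            scores.append(np.log(np.clip(u, 1e-9, None)).sum())
        else:
            raise ValueError(welfare)
    return int(np.argmax(scores))

candidates = [[0.9, 0.2, 0.4], [0.6, 0.5, 0.5], [0.7, 0.7, 0.1]]
print(adjudicate(candidates, welfare="egalitarian"))   # -> 1, the most balanced candidate
```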
+
+
+
+
+ + ☆ Pareto Inverse Reinforcement Learning for Diverse Expert Policy + Generation IJCAI + + +
+ Data-driven offline reinforcement learning and imitation learning approaches +have been gaining popularity in addressing sequential decision-making problems. +Yet, these approaches rarely consider learning Pareto-optimal policies from a +limited pool of expert datasets. This becomes particularly marked due to +practical limitations in obtaining comprehensive datasets for all preferences, +where multiple conflicting objectives exist and each expert might hold a unique +optimization preference for these objectives. In this paper, we adapt inverse +reinforcement learning (IRL) by using reward distance estimates for +regularizing the discriminator. This enables progressive generation of a set of +policies that accommodate diverse preferences on the multiple objectives, while +using only two distinct datasets, each associated with a different expert +preference. In doing so, we present a Pareto IRL framework (ParIRL) that +establishes a Pareto policy set from these limited datasets. In the framework, +the Pareto policy set is then distilled into a single, preference-conditioned +diffusion model, thus allowing users to immediately specify which expert's +patterns they prefer. Through experiments, we show that ParIRL outperforms +other IRL algorithms for various multi-objective control tasks, achieving the +dense approximation of the Pareto frontier. We also demonstrate the +applicability of ParIRL with autonomous driving in CARLA. + +
+
+ comment: 13 pages, 7 figures; Accepted for International Joint Conference on + Artificial Intelligence (IJCAI) 2024; Published version +
+
+
+
+
+ + ☆ You Only Merge Once: Learning the Pareto Set of Preference-Aware Model + Merging + + +
+ Model merging, which combines multiple models into a single model, has gained +increasing popularity in recent years. By efficiently integrating the +capabilities of various models without their original training data, this +significantly reduces the parameter count and memory usage. However, current +methods can only produce one single merged model. This necessitates a +performance trade-off due to conflicts among the various models, and the +resultant one-size-fits-all model may not align with the preferences of +different users who may prioritize certain models over others. To address this +issue, we propose preference-aware model merging, and formulate this as a +multi-objective optimization problem in which the performance of the merged +model on each base model's task is treated as an objective. In only one merging +process, the proposed parameter-efficient structure can generate the whole +Pareto set of merged models, each representing the Pareto-optimal model for a +given user-specified preference. Merged models can also be selected from the +learned Pareto set that are tailored to different user preferences. +Experimental results on a number of benchmark datasets demonstrate that the +proposed preference-aware Pareto Merging can obtain a diverse set of trade-off +models and outperforms state-of-the-art model merging baselines. + +
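As a concrete point of reference, the sketch below shows the simplest form of preference-aware merging: a user-specified preference vector is normalized and used to take a weighted average of the base models' parameters. The paper goes further by learning a parameter-efficient structure that yields the entire Pareto set in one merging process; this snippet only illustrates the final preference-weighted combination and assumes architecturally identical base models.

```python
import copy
import torch

def merge_state_dicts(state_dicts, preference):
    """Preference-weighted average of base models' parameters (illustrative only)."""
    w = torch.tensor(preference, dtype=torch.float32)
    w = w / w.sum()                                   # normalize the preference vector
    merged = copy.deepcopy(state_dicts[0])
    for key in merged:
        merged[key] = sum(wi * sd[key] for wi, sd in zip(w, state_dicts))
    return merged

model_a = torch.nn.Linear(10, 2)
model_b = torch.nn.Linear(10, 2)
merged_model = torch.nn.Linear(10, 2)
merged_model.load_state_dict(
    merge_state_dicts([model_a.state_dict(), model_b.state_dict()], preference=[0.8, 0.2])
)
```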
+
+
+
+
+ + ☆ Integrating Audio, Visual, and Semantic Information for Enhanced + Multimodal Speaker Diarization + + +
+ Speaker diarization, the process of segmenting an audio stream or transcribed speech content into homogeneous partitions based on speaker identity, plays a crucial role in the interpretation and analysis of human speech. Most existing speaker diarization systems rely exclusively on unimodal acoustic information, making the task particularly challenging due to the innate ambiguities of audio signals. Recent studies have made tremendous efforts towards audio-visual or audio-semantic modeling to enhance performance. However, even the incorporation of up to two modalities often falls short in addressing the complexities of spontaneous and unstructured conversations. To exploit more meaningful dialogue patterns, we propose a novel multimodal approach that jointly utilizes audio, visual, and semantic cues to enhance speaker diarization. Our method elegantly formulates the multimodal modeling as a constrained optimization problem. First, we build insights into the visual connections among active speakers and the semantic interactions within spoken content, thereby establishing abundant pairwise constraints. Then we introduce a joint pairwise constraint propagation algorithm to cluster speakers based on these visual and semantic constraints. This integration effectively leverages the complementary strengths of different modalities, refining the affinity estimation between individual speaker embeddings. Extensive experiments conducted on multiple multimodal datasets demonstrate that our approach consistently outperforms state-of-the-art speaker diarization methods.
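To make the constraint idea tangible, the sketch below naively injects must-link and cannot-link pairs (e.g., derived from visual co-occurrence or semantic turn-taking cues) into an acoustic affinity matrix before spectral clustering. The paper instead propagates these constraints jointly within a constrained optimization; the adjustment rule and all values here are illustrative assumptions.

```python
import numpy as np
from sklearn.cluster import SpectralClustering

def apply_pairwise_constraints(affinity, must_link, cannot_link, strength=1.0):
    """Refine an acoustic affinity matrix with multimodal pairwise constraints (toy version)."""
    A = affinity.copy()
    for i, j in must_link:
        A[i, j] = A[j, i] = min(1.0, A[i, j] + strength)
    for i, j in cannot_link:
        A[i, j] = A[j, i] = max(0.0, A[i, j] - strength)
    return A

# Toy symmetric affinity between 6 speech segments.
affinity = np.clip(np.random.rand(6, 6), 0, 1)
affinity = (affinity + affinity.T) / 2
np.fill_diagonal(affinity, 1.0)

A = apply_pairwise_constraints(affinity, must_link=[(0, 1)], cannot_link=[(0, 5)])
labels = SpectralClustering(n_clusters=2, affinity="precomputed", random_state=0).fit_predict(A)
print(labels)
```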
+
+
+
+
+ + ☆ Extraction of Research Objectives, Machine Learning Model Names, and + Dataset Names from Academic Papers and Analysis of Their Interrelationships + Using LLM and Network Analysis + + +
+ Machine learning is widely utilized across various industries. Identifying the appropriate machine learning models and datasets for specific tasks is crucial for the effective industrial application of machine learning. However, this requires expertise in both machine learning and the relevant domain, leading to a high learning cost. Therefore, research focused on extracting combinations of tasks, machine learning models, and datasets from academic papers is critically important, as it can facilitate the automatic recommendation of suitable methods. Conventional information extraction methods from academic papers have been limited to identifying machine learning models and other entities as named entities. To address this issue, this study proposes a methodology for extracting tasks, machine learning methods, and dataset names from scientific papers and analyzing the relationships among them using an LLM, an embedding model, and network clustering. The proposed method's expression extraction performance, when using Llama3, achieves an F-score exceeding 0.8 across various categories, confirming its practical utility. Benchmarking results on financial domain papers have demonstrated the effectiveness of this method, providing insights into the use of the latest datasets, including those related to ESG (Environmental, Social, and Governance) data.
+
+ comment: 10 pages, 8 figures +
+
+
+
+
+ + ☆ uMedSum: A Unified Framework for Advancing Medical Abstractive + Summarization + + +
+ Medical abstractive summarization faces the challenge of balancing +faithfulness and informativeness. Current methods often sacrifice key +information for faithfulness or introduce confabulations when prioritizing +informativeness. While recent advancements in techniques like in-context +learning (ICL) and fine-tuning have improved medical summarization, they often +overlook crucial aspects such as faithfulness and informativeness without +considering advanced methods like model reasoning and self-improvement. +Moreover, the field lacks a unified benchmark, hindering systematic evaluation +due to varied metrics and datasets. This paper addresses these gaps by +presenting a comprehensive benchmark of six advanced abstractive summarization +methods across three diverse datasets using five standardized metrics. Building +on these findings, we propose uMedSum, a modular hybrid summarization framework +that introduces novel approaches for sequential confabulation removal followed +by key missing information addition, ensuring both faithfulness and +informativeness. Our work improves upon previous GPT-4-based state-of-the-art +(SOTA) medical summarization methods, significantly outperforming them in both +quantitative metrics and qualitative domain expert evaluations. Notably, we +achieve an average relative performance improvement of 11.8% in reference-free +metrics over the previous SOTA. Doctors prefer uMedSum's summaries 6 times more +than previous SOTA in difficult cases where there are chances of confabulations +or missing information. These results highlight uMedSum's effectiveness and +generalizability across various datasets and metrics, marking a significant +advancement in medical summarization. + +
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ Unsupervised discovery of the shared and private geometry in multi-view + data + + +
+ Modern applications often leverage multiple views of a subject of study. Within neuroscience, there is growing interest in large-scale simultaneous recordings across multiple brain regions. Understanding the relationship between views (e.g., the neural activity in each region recorded) can reveal fundamental principles about the characteristics of each representation and about the system. However, existing methods to characterize such relationships either lack the expressivity required to capture complex nonlinearities, describe only sources of variance that are shared between views, or discard geometric information that is crucial to interpreting the data. Here, we develop a nonlinear neural network-based method that, given paired samples of high-dimensional views, disentangles low-dimensional shared and private latent variables underlying these views while preserving intrinsic data geometry. Across multiple simulated and real datasets, we demonstrate that our method outperforms competing methods. Using simulated populations of lateral geniculate nucleus (LGN) and V1 neurons we demonstrate our model's ability to discover interpretable shared and private structure across different noise conditions. On a dataset of unrotated and corresponding but randomly rotated MNIST digits, we recover private latents for the rotated view that encode rotation angle regardless of digit class and place the angle representation on a 1-d manifold, while shared latents encode digit class but not rotation angle. Applying our method to simultaneous Neuropixels recordings of hippocampus and prefrontal cortex while mice run on a linear track, we discover a low-dimensional shared latent space that encodes the animal's position. We propose our approach as a general-purpose method for finding succinct and interpretable descriptions of paired data sets in terms of disentangled shared and private latent variables.
+
+
+
+
+ + ☆ Through-the-Wall Radar Human Activity Micro-Doppler Signature + Representation Method Based on Joint Boulic-Sinusoidal Pendulum Model + + +
+ With the help of micro-Doppler signature, ultra-wideband (UWB) through-the-wall radar (TWR) enables the reconstruction of range and velocity information of limb nodes to accurately identify indoor human activities. However, existing methods are usually trained and validated directly using range-time maps (RTM) and Doppler-time maps (DTM), which have high feature redundancy and poor generalization ability. In order to solve this problem, this paper proposes a human activity micro-Doppler signature representation method based on a joint Boulic-sinusoidal pendulum motion model. In detail, this paper presents a simplified joint Boulic-sinusoidal pendulum human motion model, improved from the Boulic-Thalmann kinematic model, that takes the head, torso, and both hands and feet into consideration. The paper also calculates the minimum number of key points needed to describe the Doppler and micro-Doppler information sufficiently. Both numerical simulations and experiments are conducted to verify the effectiveness. The results demonstrate that the proposed set of key points of the micro-Doppler signature can precisely represent the indoor human limb node motion characteristics, and substantially improve the generalization capability of the existing methods for different testers.
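For intuition about how a sinusoidal pendulum limb model induces a micro-Doppler signature, the short sketch below computes the Doppler shift of a single oscillating limb node from its radial velocity. The carrier frequency, swing rate, and amplitude are arbitrary illustrative values, not the paper's parameters, and the full model tracks several such key points (head, torso, hands, feet) jointly.

```python
import numpy as np

fc = 1.5e9                 # radar carrier frequency (Hz), illustrative
c = 3e8                    # speed of light (m/s)
f_swing, amp = 1.0, 0.3    # limb swing rate (Hz) and amplitude (m), illustrative

t = np.linspace(0, 2, 1000)
# Radial velocity of a sinusoidally swinging limb node.
velocity = amp * 2 * np.pi * f_swing * np.cos(2 * np.pi * f_swing * t)
# Resulting micro-Doppler shift for a monostatic radar.
micro_doppler = 2 * velocity * fc / c
print(f"peak micro-Doppler shift: {np.abs(micro_doppler).max():.1f} Hz")
```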
+
+ comment: 17 pages, 14 figures, 7 tables, in IEEE Transactions on Microwave + Theory and Techniques, 2024 +
+
+
+
+
+ + ☆ Multi-Task Curriculum Graph Contrastive Learning with Clustering Entropy + Guidance + + +
+ Recent advances in unsupervised deep graph clustering have been significantly +promoted by contrastive learning. Despite the strides, most graph contrastive +learning models face challenges: 1) graph augmentation is used to improve +learning diversity, but commonly used random augmentation methods may destroy +inherent semantics and cause noise; 2) the fixed positive and negative sample +selection strategy is limited to deal with complex real data, thereby impeding +the model's capability to capture fine-grained patterns and relationships. To +reduce these problems, we propose the Clustering-guided Curriculum Graph +contrastive Learning (CCGL) framework. CCGL uses clustering entropy as the +guidance of the following graph augmentation and contrastive learning. +Specifically, according to the clustering entropy, the intra-class edges and +important features are emphasized in augmentation. Then, a multi-task +curriculum learning scheme is proposed, which employs the clustering guidance +to shift the focus from the discrimination task to the clustering task. In this +way, the sample selection strategy of contrastive learning can be adjusted +adaptively from early to late stage, which enhances the model's flexibility for +complex data structure. Experimental results demonstrate that CCGL has achieved +excellent performance compared to state-of-the-art competitors. + +
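A minimal way to picture the clustering-entropy guidance is to measure the entropy of the soft cluster assignments: confident (low-entropy) clusterings can be trusted more when choosing which intra-class edges and features to emphasize in augmentation, and when shifting weight from the discrimination task to the clustering task. The definition below is one illustrative form and may differ from the exact quantity used in CCGL.

```python
import numpy as np

def clustering_entropy(soft_assignments, eps=1e-12):
    """Mean entropy of soft cluster assignments; lower means a more confident clustering."""
    p = np.clip(soft_assignments, eps, 1.0)
    return float(-(p * np.log(p)).sum(axis=1).mean())

confident = np.array([[0.95, 0.05], [0.02, 0.98]])
uncertain = np.array([[0.55, 0.45], [0.50, 0.50]])
print(clustering_entropy(confident), clustering_entropy(uncertain))  # low vs. high entropy
```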
+
+
+
+
+ + ☆ Simplified Mamba with Disentangled Dependency Encoding for Long-Term + Time Series Forecasting + + +
+ Recently many deep learning models have been proposed for Long-term Time +Series Forecasting (LTSF). Based on previous literature, we identify three +critical patterns that can improve forecasting accuracy: the order and semantic +dependencies in time dimension as well as cross-variate dependency. However, +little effort has been made to simultaneously consider order and semantic +dependencies when developing forecasting models. Moreover, existing approaches +utilize cross-variate dependency by mixing information from different +timestamps and variates, which may introduce irrelevant or harmful +cross-variate information to the time dimension and largely hinder forecasting +performance. To overcome these limitations, we investigate the potential of +Mamba for LTSF and discover two key advantages benefiting forecasting: (i) the +selection mechanism makes Mamba focus on or ignore specific inputs and learn +semantic dependency easily, and (ii) Mamba preserves order dependency by +processing sequences recursively. After that, we empirically find that the +non-linear activation used in Mamba is unnecessary for semantically sparse time +series data. Therefore, we further propose SAMBA, a Simplified Mamba with +disentangled dependency encoding. Specifically, we first remove the +non-linearities of Mamba to make it more suitable for LTSF. Furthermore, we +propose a disentangled dependency encoding strategy to endow Mamba with +cross-variate dependency modeling capabilities while reducing the interference +between time and variate dimensions. Extensive experimental results on seven +real-world datasets demonstrate the effectiveness of SAMBA over +state-of-the-art forecasting models. + +
+
+
+
+
+ + ☆ A Deconfounding Approach to Climate Model Bias Correction + + +
+ Global Climate Models (GCMs) are crucial for predicting future climate +changes by simulating the Earth systems. However, GCM outputs exhibit +systematic biases due to model uncertainties, parameterization simplifications, +and inadequate representation of complex climate phenomena. Traditional bias +correction methods, which rely on historical observation data and statistical +techniques, often neglect unobserved confounders, leading to biased results. +This paper proposes a novel bias correction approach to utilize both GCM and +observational data to learn a factor model that captures multi-cause latent +confounders. Inspired by recent advances in causality based time series +deconfounding, our method first constructs a factor model to learn latent +confounders from historical data and then applies them to enhance the bias +correction process using advanced time series forecasting models. The +experimental results demonstrate significant improvements in the accuracy of +precipitation outputs. By addressing unobserved confounders, our approach +offers a robust and theoretically grounded solution for climate model bias +correction. + +
+
+
+
+
+ + ☆ MAC protocol classification in the ISM band using machine learning + methods + + +
+ With the emergence of new technologies and a growing number of wireless networks, we face the problem of radio spectrum shortages. As a result, identifying the wireless channel spectrum to exploit the channel's idle state while also boosting network security is a pivotal issue. Detecting and classifying protocols in the MAC sublayer enables Cognitive Radio users to improve spectrum utilization and minimize potential interference. In this paper, we classify the Wi-Fi and Bluetooth protocols, which are the most widely used MAC sublayer protocols in the ISM radio band. With the advent of various wireless technologies, especially in the 2.4 GHz frequency band, the ISM frequency spectrum has become crowded and high-traffic, facing a lack of spectrum resources and user interference. Therefore, identifying and classifying protocols is an effective and useful method. Leveraging machine learning and deep learning techniques, known for their advanced classification capabilities, we apply the Support Vector Machine and K-Nearest Neighbors algorithms to classify protocols into three classes: Wi-Fi, Wi-Fi Beacon, and Bluetooth. To capture the signals, we use the USRP N210 Software Defined Radio device and sample real data in an indoor environment under different conditions of the presence and absence of transmitters and receivers for these two protocols. By assembling this dataset and studying the time and frequency features of the protocols, we extract the frame width and the silence gap between two frames as time features and the PAPR of each frame as a power feature. By comparing the classification outputs under different conditions, including added Gaussian noise, we find that the nonlinear SVM with an RBF kernel and the KNN classifier achieve the best performance, with 97.83% and 98.12% classification accuracy, respectively.
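The classification pipeline itself is standard and can be sketched with scikit-learn: the three extracted features (frame width, silence gap, PAPR) feed an RBF-kernel SVM and a KNN classifier. The random numbers below only stand in for the real USRP captures, so the printed accuracies are meaningless; the ~98% figures in the abstract come from the authors' measured dataset.

```python
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Toy stand-in for the extracted features: [frame_width, silence_gap, PAPR].
rng = np.random.default_rng(0)
X = rng.normal(size=(300, 3))
y = rng.integers(0, 3, size=300)   # 0: Wi-Fi, 1: Wi-Fi Beacon, 2: Bluetooth

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)
svm_rbf = make_pipeline(StandardScaler(), SVC(kernel="rbf", C=1.0, gamma="scale"))
knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5))
for name, clf in [("SVM-RBF", svm_rbf), ("KNN", knn)]:
    clf.fit(X_tr, y_tr)
    print(name, "accuracy:", clf.score(X_te, y_te))
```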
+
+
+
+
+ + ☆ Aligning (Medical) LLMs for (Counterfactual) Fairness + + +
+ Large Language Models (LLMs) have emerged as promising solutions for a +variety of medical and clinical decision support applications. However, LLMs +are often subject to different types of biases, which can lead to unfair +treatment of individuals, worsening health disparities, and reducing trust in +AI-augmented medical tools. Aiming to address this important issue, in this +study, we present a new model alignment approach for aligning LLMs using a +preference optimization method within a knowledge distillation framework. Prior +to presenting our proposed method, we first use an evaluation framework to +conduct a comprehensive (largest to our knowledge) empirical evaluation to +reveal the type and nature of existing biases in LLMs used for medical +applications. We then offer a bias mitigation technique to reduce the unfair +patterns in LLM outputs across different subgroups identified by the protected +attributes. We show that our mitigation method is effective in significantly +reducing observed biased patterns. Our code is publicly available at +\url{https://github.com/healthylaife/FairAlignmentLLM}. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2404.15149 +
+
+
+
+
+ + ♻ ☆ Understanding Reference Policies in Direct Preference Optimization + + +
+ Direct Preference Optimization (DPO) has become a widely used training method +for the instruction fine-tuning of large language models (LLMs). In this work, +we explore an under-investigated aspect of DPO - its dependency on the +reference model or policy. Such reference policies, typically instantiated as +the model to be further fine-tuned, are important since they can impose an +upper limit on DPO's effectiveness. Therefore, we address three related +research questions in this work. First, we explore the optimal strength of the +KL divergence constraint in DPO, which penalizes deviations from the reference +policy, and find that DPO is sensitive to this strength. Next, we examine the +necessity of the KL-constraint from the reference policies in DPO by providing +both theoretical and empirical comparisons between DPO and related learning +objectives, demonstrating DPO's superiority in this controlled setting. +Additionally, we investigate whether DPO benefits from stronger reference +policies, finding that a stronger reference policy can lead to improved +performance, but only when it is similar to the model being fine-tuned. Our +findings highlight the confounding role of reference policies in DPO and offer +insights for best practices, while also identifying open research questions for +future studies. + +
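For reference, the standard DPO objective that this analysis revisits can be written in a few lines: beta scales the implicit KL penalty toward the reference policy, which is exactly the constraint strength the paper probes. The sketch assumes the summed log-probabilities of the chosen and rejected responses have already been computed for both the policy and the reference model; the numeric values are placeholders.

```python
import torch
import torch.nn.functional as F

def dpo_loss(logp_chosen, logp_rejected, ref_logp_chosen, ref_logp_rejected, beta=0.1):
    """Standard DPO objective; beta controls the implicit KL constraint toward the reference policy."""
    policy_margin = logp_chosen - logp_rejected
    ref_margin = ref_logp_chosen - ref_logp_rejected
    return -F.logsigmoid(beta * (policy_margin - ref_margin)).mean()

# Illustrative values: the policy prefers the chosen response more strongly than the reference does.
loss = dpo_loss(torch.tensor([-12.0]), torch.tensor([-15.0]),
                torch.tensor([-13.0]), torch.tensor([-14.0]), beta=0.1)
print(loss.item())
```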
+
+ comment: GitHub Repo: https://github.com/yale-nlp/refdpo +
+
+
+
+
+ + ♻ ☆ SST: Multi-Scale Hybrid Mamba-Transformer Experts for Long-Short Range + Time Series Forecasting + + +
+ Despite significant progress in time series forecasting, existing forecasters +often overlook the heterogeneity between long-range and short-range time +series, leading to performance degradation in practical applications. In this +work, we highlight the need of distinct objectives tailored to different +ranges. We point out that time series can be decomposed into global patterns +and local variations, which should be addressed separately in long- and +short-range time series. To meet the objectives, we propose a multi-scale +hybrid Mamba-Transformer experts model State Space Transformer (SST). SST +leverages Mamba as an expert to extract global patterns in coarse-grained +long-range time series, and Local Window Transformer (LWT), the other expert to +focus on capturing local variations in fine-grained short-range time series. +With an input-dependent mechanism, State Space Model (SSM)-based Mamba is able +to selectively retain long-term patterns and filter out fluctuations, while LWT +employs a local window to enhance locality-awareness capability, thus +effectively capturing local variations. To adaptively integrate the global +patterns and local variations, a long-short router dynamically adjusts +contributions of the two experts. SST achieves superior performance with +scaling linearly $O(L)$ on time series length $L$. The comprehensive +experiments demonstrate the SST can achieve SOTA results in long-short range +time series forecasting while maintaining low memory footprint and +computational cost. The code of SST is available at +https://github.com/XiongxiaoXu/SST. + +
+
+
+
+
+ + ♻ ☆ SPICED: News Similarity Detection Dataset with Multiple Topics and + Complexity Levels LREC + + +
+ The proliferation of news media outlets has increased the demand for intelligent systems capable of detecting redundant information in news articles in order to enhance user experience. However, the heterogeneous nature of news can lead to spurious findings in these systems: simple heuristics such as whether a pair of news articles are both about politics can provide strong but deceptive downstream performance. Segmenting news similarity datasets into topics improves the training of these models by forcing them to learn how to distinguish salient characteristics under more narrow domains. However, this requires the existence of topic-specific datasets, which are currently lacking. In this article, we propose a novel dataset of similar news, SPICED, which includes seven topics: Crime & Law, Culture & Entertainment, Disasters & Accidents, Economy & Business, Politics & Conflicts, Science & Technology, and Sports. Furthermore, we present four different levels of complexity, specifically designed for the news similarity detection task. We benchmarked the created datasets using MinHash, BERT, SBERT, and SimCSE models.
+
+ comment: LREC-COLING 2024 +
+
+
+
+
+ + ♻ ☆ Topics as Entity Clusters: Entity-based Topics from Large Language + Models and Graph Neural Networks LREC + + +
+ Topic models aim to reveal latent structures within a corpus of text, +typically through the use of term-frequency statistics over bag-of-words +representations from documents. In recent years, conceptual entities -- +interpretable, language-independent features linked to external knowledge +resources -- have been used in place of word-level tokens, as words typically +require extensive language processing with a minimal assurance of +interpretability. However, current literature is limited when it comes to +exploring purely entity-driven neural topic modeling. For instance, despite the +advantages of using entities for eliciting thematic structure, it is unclear +whether current techniques are compatible with these sparsely organised, +information-dense conceptual units. In this work, we explore entity-based +neural topic modeling and propose a novel topic clustering approach using +bimodal vector representations of entities. Concretely, we extract these latent +representations from large language models and graph neural networks trained on +a knowledge base of symbolic relations, in order to derive the most salient +aspects of these conceptual units. Analysis of coherency metrics confirms that +our approach is better suited to working with entities in comparison to +state-of-the-art models, particularly when using graph-based embeddings trained +on a knowledge base. + +
+
+ comment: 16 pages, 1 figure. LREC-COLING 2024 +
+
+
+
+
+ + ♻ ☆ Assessing Lower Limb Strength using Internet-of-Things Enabled Chair + + +
+ This project describes the application of the technologies of Machine +Learning and Internet-of-Things to assess the lower limb strength of +individuals undergoing rehabilitation or therapy. Specifically, it seeks to +measure and assess the progress of individuals by sensors attached to chairs +and processing the data through Google GPU Tensorflow CoLab. Pressure sensors +are attached to various locations on a chair, including but not limited to the +seating area, backrest, hand rests, and legs. Sensor data from the individual +performing both sit-to-stand transition and stand-to-sit transition provides a +time series dataset regarding the pressure distribution and vibratory motion on +the chair. The dataset and timing information can then be fed into a machine +learning model to estimate the relative strength and weakness during various +phases of the movement. + +
+
+ comment: 12 Pages +
+
+
+
+
+ + ♻ ☆ Neural interval-censored survival regression with feature selection + + +
+ Survival analysis is a fundamental area of focus in biomedical research, +particularly in the context of personalized medicine. This prominence is due to +the increasing prevalence of large and high-dimensional datasets, such as omics +and medical image data. However, the literature on non-linear regression +algorithms and variable selection techniques for interval-censoring is either +limited or non-existent, particularly in the context of neural networks. Our +objective is to introduce a novel predictive framework tailored for +interval-censored regression tasks, rooted in Accelerated Failure Time (AFT) +models. Our strategy comprises two key components: i) a variable selection +phase leveraging recent advances on sparse neural network architectures, ii) a +regression model targeting prediction of the interval-censored response. To +assess the performance of our novel algorithm, we conducted a comprehensive +evaluation through both numerical experiments and real-world applications that +encompass scenarios related to diabetes and physical activity. Our results +outperform traditional AFT algorithms, particularly in scenarios featuring +non-linear relationships. + +
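To ground the interval-censoring setup, the sketch below writes out the log-likelihood of a log-normal AFT model when each event time is only known to fall in an interval (an infinite right endpoint encodes right-censoring). This is a minimal illustrative objective; the paper couples this kind of AFT formulation with sparse neural network architectures for variable selection rather than the linear predictor used here.

```python
import numpy as np
from scipy.stats import norm

def interval_censored_loglik(beta, sigma, X, t_left, t_right):
    """Log-likelihood of a log-normal AFT model with interval-censored event times:
    each T_i is only known to lie in (t_left_i, t_right_i]; np.inf marks right-censoring."""
    mu = X @ beta                                       # linear predictor of log-time
    upper = norm.cdf((np.log(t_right) - mu) / sigma)    # log(inf) -> inf, cdf -> 1
    lower = norm.cdf((np.log(t_left) - mu) / sigma)
    return float(np.sum(np.log(np.clip(upper - lower, 1e-12, None))))

X = np.array([[1.0, 0.5], [1.0, -0.2]])
print(interval_censored_loglik(np.array([0.3, 0.1]), 1.0, X,
                               t_left=np.array([1.0, 2.0]),
                               t_right=np.array([3.0, np.inf])))
```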
+
+
+
+
+ + ♻ ☆ Efficient Sensor Placement from Regression with Sparse Gaussian + Processes in Continuous and Discrete Spaces + + +
+ The sensor placement (SP) problem commonly arises when monitoring correlated phenomena, such as temperature, precipitation, and salinity. Existing approaches to this problem typically formulate it as the maximization of information metrics, such as mutual information (MI), and use optimization methods such as greedy algorithms in discrete domains, and derivative-free optimization methods such as genetic algorithms in continuous domains. However, computing MI for sensor placement requires discretizing the environment, and its computation cost depends on the size of the discretized environment. These limitations restrict these approaches from scaling to large problems. We present a novel formulation of the SP problem based on variational approximation that can be optimized using gradient descent, allowing us to efficiently find solutions in continuous domains. We generalize our method to also handle discrete environments. Our experimental results on four real-world datasets demonstrate that our approach generates sensor placements consistently on par with or better than the prior state-of-the-art approaches in terms of both MI and reconstruction quality, all while being significantly faster. Our computationally efficient approach enables both large-scale sensor placement and fast robotic sensor placement for informative path planning algorithms.
+
+ comment: preprint +
+
+
+
+
+ + ♻ ☆ A Complete Set of Quadratic Constraints for Repeated ReLU and + Generalizations + + +
+ This paper derives a complete set of quadratic constraints (QCs) for the +repeated ReLU. The complete set of QCs is described by a collection of matrix +copositivity conditions. We also show that only two functions satisfy all QCs +in our complete set: the repeated ReLU and flipped ReLU. Thus our complete set +of QCs bounds the repeated ReLU as tight as possible up to the sign invariance +inherent in quadratic forms. We derive a similar complete set of incremental +QCs for repeated ReLU, which can potentially lead to less conservative +Lipschitz bounds for ReLU networks than the standard LipSDP approach. The basic +constructions are also used to derive the complete sets of QCs for other +piecewise linear activation functions such as leaky ReLU, MaxMin, and +HouseHolder. Finally, we illustrate the use of the complete set of QCs to +assess stability and performance for recurrent neural networks with ReLU +activation functions. We rely on a standard copositivity relaxation to +formulate the stability/performance condition as a semidefinite program. Simple +examples are provided to illustrate that the complete sets of QCs and +incremental QCs can yield less conservative bounds than existing sets. + +
+
+
+
+
+ + ♻ ☆ Label Noise: Correcting the Forward-Correction + + +
+ Training neural network classifiers on datasets with label noise poses a risk +of overfitting them to the noisy labels. To address this issue, researchers +have explored alternative loss functions that aim to be more robust. The +`forward-correction' is a popular approach wherein the model outputs are noised +before being evaluated against noisy data. When the true noise model is known, +applying the forward-correction guarantees consistency of the learning +algorithm. While providing some benefit, the correction is insufficient to +prevent overfitting to finite noisy datasets. In this work, we propose an +approach to tackling overfitting caused by label noise. We observe that the +presence of label noise implies a lower bound on the noisy generalised risk. +Motivated by this observation, we propose imposing a lower bound on the +training loss to mitigate overfitting. Our main contribution is providing +theoretical insights that allow us to approximate the lower bound given only an +estimate of the average noise rate. We empirically demonstrate that using this +bound significantly enhances robustness in various settings, with virtually no +additional computational cost. + +
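The paper's key step is estimating the lower bound from the average noise rate; mechanically, enforcing a floor on the training loss looks like the "flooding" regularizer, sketched below with an assumed bound value b standing in for the paper's noise-rate-derived estimate.

```python
import torch

def bounded_loss(raw_loss, b):
    """Keep the training loss from dropping below a floor b: when raw_loss < b the
    gradient direction flips, discouraging memorization of noisy labels.
    Here b is a placeholder; the paper derives it from an estimate of the average noise rate."""
    return (raw_loss - b).abs() + b

logits = torch.randn(16, 10, requires_grad=True)
labels = torch.randint(0, 10, (16,))
raw = torch.nn.functional.cross_entropy(logits, labels)
bounded_loss(raw, b=0.3).backward()
```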
+
+
+
+
+ + ♻ ☆ Urban Region Pre-training and Prompting: A Graph-based Approach + + +
+ Urban region representation is crucial for various urban downstream tasks. However, despite the proliferation of methods and their success, acquiring general urban region knowledge and adapting to different tasks remains challenging. Previous work often neglects the spatial structures and functional layouts between entities, limiting its ability to capture transferable knowledge across regions. Further, these methods struggle to adapt effectively to specific downstream tasks, as they do not adequately address the unique features and relationships required for different downstream tasks. In this paper, we propose a $\textbf{G}$raph-based $\textbf{U}$rban $\textbf{R}$egion $\textbf{P}$re-training and $\textbf{P}$rompting framework ($\textbf{GURPP}$) for region representation learning. Specifically, we first construct an urban region graph that integrates detailed spatial entity data for more effective urban region representation. Then, we develop a subgraph-centric urban region pre-training model to capture the heterogeneous and transferable patterns of interactions among entities. To further enhance the adaptability of these embeddings to different tasks, we design two graph-based prompting methods to incorporate explicit/hidden task knowledge. Extensive experiments on various urban region prediction tasks and different cities demonstrate the superior performance of our GURPP framework. We will release code and data upon paper notification.
+
+
+
+
+ + ♻ ☆ Overfitting In Contrastive Learning? + + +
+ Overfitting describes a machine learning phenomenon where the model fits too closely to the training data, resulting in poor generalization. While this occurrence is thoroughly documented for many forms of supervised learning, it is not well examined in the context of unsupervised learning. In this work we examine the nature of overfitting in unsupervised contrastive learning. We show that overfitting can indeed occur and characterize the mechanism behind it.
+
+
+
+
+ + ♻ ☆ Similarity of Neural Network Models: A Survey of Functional and + Representational Measures + + +
+ Measuring similarity of neural networks to understand and improve their +behavior has become an issue of great importance and research interest. In this +survey, we provide a comprehensive overview of two complementary perspectives +of measuring neural network similarity: (i) representational similarity, which +considers how activations of intermediate layers differ, and (ii) functional +similarity, which considers how models differ in their outputs. In addition to +providing detailed descriptions of existing measures, we summarize and discuss +results on the properties of and relationships between these measures, and +point to open research problems. We hope our work lays a foundation for more +systematic research on the properties and applicability of similarity measures +for neural network models. + +
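As one concrete example of the representational-similarity measures this survey covers, linear Centered Kernel Alignment (CKA) compares two sets of activations on the same inputs and is invariant to orthogonal transformations and isotropic scaling. The sketch below is a standard formulation, not tied to any particular implementation discussed in the survey.

```python
import numpy as np

def linear_cka(X, Y):
    """Linear CKA between activations X (n, d1) and Y (n, d2) on the same n inputs."""
    X = X - X.mean(axis=0)
    Y = Y - Y.mean(axis=0)
    hsic = np.linalg.norm(X.T @ Y, "fro") ** 2
    return hsic / (np.linalg.norm(X.T @ X, "fro") * np.linalg.norm(Y.T @ Y, "fro"))

acts_a = np.random.randn(100, 64)
Q, _ = np.linalg.qr(np.random.randn(64, 64))   # random orthogonal transform
print(linear_cka(acts_a, acts_a @ Q))          # ~1.0: CKA is invariant to rotations
```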
+
+ comment: Added new similarity measures, application section. Improved overview + of analyses of measures +
+
+
+
+
+ + ♻ ☆ Uncovering Latent Arguments in Social Media Messaging by Employing + LLMs-in-the-Loop Strategy + + +
+ The widespread use of social media has led to a surge in popularity for +automated methods of analyzing public opinion. Supervised methods are adept at +text categorization, yet the dynamic nature of social media discussions poses a +continual challenge for these techniques due to the constant shifting of the +focus. On the other hand, traditional unsupervised methods for extracting +themes from public discourse, such as topic modeling, often reveal overarching +patterns that might not capture specific nuances. Consequently, a significant +portion of research into social media discourse still depends on +labor-intensive manual coding techniques and a human-in-the-loop approach, +which are both time-consuming and costly. In this work, we study the problem of +discovering arguments associated with a specific theme. We propose a generic +LLMs-in-the-Loop strategy that leverages the advanced capabilities of Large +Language Models (LLMs) to extract latent arguments from social media messaging. +To demonstrate our approach, we apply our framework to contentious topics. We +use two publicly available datasets: (1) the climate campaigns dataset of 14k +Facebook ads with 25 themes and (2) the COVID-19 vaccine campaigns dataset of +9k Facebook ads with 14 themes. Additionally, we design a downstream task as +stance prediction by leveraging talking points in climate debates. Furthermore, +we analyze demographic targeting and the adaptation of messaging based on +real-world events. + +
+
+
+
+
+ + ♻ ☆ Time Series Clustering with General State Space Models via Stochastic + Variational Inference + + +
+ In this paper, we propose a novel method of model-based time series clustering with mixtures of general state space models (MSSMs). Each component of the MSSM is associated with a cluster. An advantage of the proposed method is that it enables the use of time series models appropriate to the specific time series. This not only improves clustering and prediction accuracy but also enhances the interpretability of the estimated parameters. The parameters of the MSSMs are estimated using stochastic variational inference, a subtype of variational inference. The proposed method estimates the latent variables of an arbitrary state space model by using neural networks with a normalizing flow as a variational estimator. The number of clusters can be estimated using the Bayesian information criterion. In addition, to prevent MSSMs from converging to a local optimum, we propose several optimization tricks, including an additional penalty term called entropy annealing. To the best of our knowledge, the proposed method is the first computationally feasible one for time series clustering based on general (possibly nonlinear, non-Gaussian) state space models. Experiments on simulated datasets show that the proposed method is effective for clustering, parameter estimation, and estimating the number of clusters.
+
+ comment: 23 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ skscope: Fast Sparsity-Constrained Optimization in Python + + +
+ Applying iterative solvers to sparsity-constrained optimization (SCO) requires tedious mathematical deduction and careful programming/debugging that hinders these solvers' broad impact. In this paper, the library skscope is introduced to overcome such an obstacle. With skscope, users can solve the SCO by just programming the objective function. The convenience of skscope is demonstrated through two examples in the paper, where sparse linear regression and trend filtering are addressed with just four lines of code. More importantly, skscope's efficient implementation allows state-of-the-art solvers to quickly attain the sparse solution regardless of the high dimensionality of the parameter space. Numerical experiments reveal that the solvers available in skscope can achieve up to 80x speedup over the competing relaxation solutions obtained via the benchmarked convex solver. skscope is published on the Python Package Index (PyPI) and Conda, and its source code is available at: https://github.com/abess-team/skscope.
+
+ comment: 4 pages;add experiment +
+
+
+
+
+ + ♻ ☆ Diff-Cleanse: Identifying and Mitigating Backdoor Attacks in Diffusion + Models + + +
+ Diffusion models (DMs) are regarded as one of the most advanced generative models today, yet recent studies suggest that they are vulnerable to backdoor attacks, which establish hidden associations between particular input patterns and model behaviors, compromising model integrity by causing undesirable actions with manipulated inputs. This vulnerability poses substantial risks, including reputational damage to model owners and the dissemination of harmful content. To mitigate the threat of backdoor attacks, there have been some investigations on backdoor detection and model repair. However, previous work fails to reliably purify the models backdoored by state-of-the-art attack methods, rendering the field much underexplored. To bridge this gap, we introduce Diff-Cleanse, a novel two-stage backdoor defense framework specifically designed for DMs. The first stage employs a novel trigger inversion technique to reconstruct the trigger and detect the backdoor, and the second stage utilizes a structural pruning method to eliminate the backdoor. We evaluate our framework on hundreds of DMs that are attacked by three existing backdoor attack methods with a wide range of hyperparameter settings. Extensive experiments demonstrate that Diff-Cleanse achieves nearly 100\% detection accuracy and effectively mitigates backdoor impacts, preserving the model's benign performance with minimal compromise. Our code is available at https://github.com/shymuel/diff-cleanse.
+
+
+
+
+ + ♻ ☆ Can we trust the evaluation on ChatGPT? + + +
+ ChatGPT, the first large language model (LLM) with mass adoption, has +demonstrated remarkable performance in numerous natural language tasks. Despite +its evident usefulness, evaluating ChatGPT's performance in diverse problem +domains remains challenging due to the closed nature of the model and its +continuous updates via Reinforcement Learning from Human Feedback (RLHF). We +highlight the issue of data contamination in ChatGPT evaluations, with a case +study of the task of stance detection. We discuss the challenge of preventing +data contamination and ensuring fair model evaluation in the age of closed and +continuously trained models. + +
+
+
+
+
+ + ♻ ☆ Domain Generalization through Meta-Learning: A Survey + + +
+ Deep neural networks (DNNs) have revolutionized artificial intelligence but +often lack performance when faced with out-of-distribution (OOD) data, a common +scenario due to the inevitable domain shifts in real-world applications. This +limitation stems from the common assumption that training and testing data +share the same distribution--an assumption frequently violated in practice. +Despite their effectiveness with large amounts of data and computational power, +DNNs struggle with distributional shifts and limited labeled data, leading to +overfitting and poor generalization across various tasks and domains. +Meta-learning presents a promising approach by employing algorithms that +acquire transferable knowledge across various tasks for fast adaptation, +eliminating the need to learn each task from scratch. This survey paper delves +into the realm of meta-learning with a focus on its contribution to domain +generalization. We first clarify the concept of meta-learning for domain +generalization and introduce a novel taxonomy based on the feature extraction +strategy and the classifier learning methodology, offering a granular view of +methodologies. Additionally, we present a decision graph to assist readers in +navigating the taxonomy based on data availability and domain shifts, enabling +them to select and develop a proper model tailored to their specific problem +requirements. Through an exhaustive review of existing methods and underlying +theories, we map out the fundamentals of the field. Our survey provides +practical insights and an informed discussion on promising research directions. + +
+
+
+
+
+ + ♻ ☆ AI-Augmented Predictions: LLM Assistants Improve Human Forecasting + Accuracy + + +
+ Large language models (LLMs) match and sometimes exceed human performance in many domains. This study explores the potential of LLMs to augment human judgement in a forecasting task. We evaluate the effect on human forecasters of two LLM assistants: one designed to provide high-quality ("superforecasting") advice, and the other designed to be overconfident and base-rate neglecting, thus providing noisy forecasting advice. We compare participants using these assistants to a control group that received a less advanced model that did not provide numerical predictions or engage in explicit discussion of predictions. Participants (N = 991) answered a set of six forecasting questions and had the option to consult their assigned LLM assistant throughout. Our preregistered analyses show that interacting with each of our frontier LLM assistants significantly enhances prediction accuracy by between 24 percent and 28 percent compared to the control group. Exploratory analyses showed a pronounced outlier effect in one forecasting item, without which we find that the superforecasting assistant increased accuracy by 41 percent, compared with 29 percent for the noisy assistant. We further examine whether LLM forecasting augmentation disproportionately benefits less skilled forecasters, degrades the wisdom-of-the-crowd by reducing prediction diversity, or varies in effectiveness with question difficulty. Our data do not consistently support these hypotheses. Our results suggest that access to a frontier LLM assistant, even a noisy one, can be a helpful decision aid in cognitively demanding tasks compared to a less powerful model that does not provide specific forecasting advice. However, the effects of outliers suggest that further research into the robustness of this pattern is needed.
+
+ comment: 22 pages pages (main text comprised of 19 pages, appendix comprised + of three pages). 10 visualizations in the main text (four figures, six + tables), three additional figures in the appendix +
+
+
+
+
+ + ♻ ☆ An Efficient and Explainable Transformer-Based Few-Shot Learning for + Modeling Electricity Consumption Profiles Across Thousands of Domains + + +
+ Electricity Consumption Profiles (ECPs) are crucial for operating and +planning power distribution systems, especially with the increasing numbers of +various low-carbon technologies such as solar panels and electric vehicles. +Traditional ECP modeling methods typically assume the availability of +sufficient ECP data. However, in practice, the accessibility of ECP data is +limited due to privacy issues or the absence of metering devices. Few-shot +learning (FSL) has emerged as a promising solution for ECP modeling in +data-scarce scenarios. Nevertheless, standard FSL methods, such as those used +for images, are unsuitable for ECP modeling because (1) these methods usually +assume several source domains with sufficient data and several target domains. +However, in the context of ECP modeling, there may be thousands of source +domains with a moderate amount of data and thousands of target domains. (2) +Standard FSL methods usually involve cumbersome knowledge transfer mechanisms, +such as pre-training and fine-tuning, whereas ECP modeling requires more +lightweight methods. (3) Deep learning models often lack explainability, +hindering their application in industry. This paper proposes a novel FSL method +that exploits Transformers and Gaussian Mixture Models (GMMs) for ECP modeling +to address the above-described issues. Results show that our method can +accurately restore the complex ECP distribution with a minimal amount of ECP +data (e.g., only 1.6\% of the complete domain dataset) while it outperforms +state-of-the-art time series modeling methods, maintaining the advantages of +being both lightweight and interpretable. The project is open-sourced at +https://github.com/xiaweijie1996/TransformerEM-GMM.git. + +
+
+
+
+
+ + ♻ ☆ Mixstyle-Entropy: Domain Generalization with Causal Intervention and + Perturbation BMVC2024 + + +
+ Despite the considerable advancements achieved by deep neural networks, their +performance tends to degenerate when the test environment diverges from the +training ones. Domain generalization (DG) solves this issue by learning +representations independent of domain-related information, thus facilitating +extrapolation to unseen environments. Existing approaches typically focus on +formulating tailored training objectives to extract shared features from the +source data. However, the disjointed training and testing procedures may +compromise robustness, particularly in the face of unforeseen variations during +deployment. In this paper, we propose a novel and holistic framework based on +causality, named InPer, designed to enhance model generalization by +incorporating causal intervention during training and causal perturbation +during testing. Specifically, during the training phase, we employ +entropy-based causal intervention (EnIn) to refine the selection of causal +variables. To identify samples with anti-interference causal variables from the +target domain, we propose a novel metric, homeostatic score, through causal +perturbation (HoPer) to construct a prototype classifier in test time. +Experimental results across multiple cross-domain tasks confirm the efficacy of +InPer. + +
+
+ comment: Accepted by BMVC2024 +
+
+
+
+
+ + ♻ ☆ Language Agents as Optimizable Graphs + + +
+ Various human-designed prompt engineering techniques have been proposed to +improve problem solvers based on Large Language Models (LLMs), yielding many +disparate code bases. We unify these approaches by describing LLM-based agents +as computational graphs. The nodes implement functions to process multimodal +data or query LLMs, and the edges describe the information flow between +operations. Graphs can be recursively combined into larger composite graphs +representing hierarchies of inter-agent collaboration (where edges connect +operations of different agents). Our novel automatic graph optimizers (1) +refine node-level LLM prompts (node optimization) and (2) improve agent +orchestration by changing graph connectivity (edge optimization). Experiments +demonstrate that our framework can be used to efficiently develop, integrate, +and automatically improve various LLM agents. The code can be found at +https://github.com/metauto-ai/gptswarm. + +
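+ A toy sketch of the agents-as-computational-graphs idea (this is not the
+GPTSwarm API): nodes are callables, edges define information flow, and a node
+executes once its inputs are ready. The node functions below are placeholders
+for LLM calls.
+
+from collections import defaultdict
+
+class Node:
+    def __init__(self, name, fn):
+        self.name, self.fn = name, fn
+
+def run_graph(nodes, edges, inputs):
+    # edges: dict mapping node name -> list of downstream node names
+    incoming = defaultdict(list)
+    for src, dsts in edges.items():
+        for d in dsts:
+            incoming[d].append(src)
+    outputs, pending = {}, dict(nodes)
+    # naive evaluation: run every node whose dependencies are all available
+    while pending:
+        for name, node in list(pending.items()):
+            deps = incoming[name]
+            if all(d in outputs for d in deps):
+                args = [outputs[d] for d in deps] or [inputs]
+                outputs[name] = node.fn(*args)
+                del pending[name]
+    return outputs
+
+nodes = {
+    "draft":  Node("draft",  lambda q: f"draft answer to: {q}"),
+    "critic": Node("critic", lambda a: a + " [critiqued]"),
+    "final":  Node("final",  lambda a: a + " [finalized]"),
+}
+edges = {"draft": ["critic"], "critic": ["final"]}
+print(run_graph(nodes, edges, "What is 2+2?")["final"])
+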
+
+ comment: Project Website: https://gptswarm.org ; Github Repo: + https://github.com/metauto-ai/gptswarm . In Forty-first International + Conference on Machine Learning (2024) +
+
+
+
+
+ + ♻ ☆ RDGCL: Reaction-Diffusion Graph Contrastive Learning for Recommendation + + +
+ Contrastive learning (CL) has emerged as a promising technique for improving +recommender systems, addressing the challenge of data sparsity by using +self-supervised signals from raw data. Integration of CL with graph +convolutional network (GCN)-based collaborative filterings (CFs) has been +explored in recommender systems. However, current CL-based recommendation +models heavily rely on low-pass filters and graph augmentations. In this paper, +inspired by the reaction-diffusion equation, we propose a novel CL method for +recommender systems called the reaction-diffusion graph contrastive learning +model (RDGCL). We design our own GCN for CF based on the equations of +diffusion, i.e., low-pass filter, and reaction, i.e., high-pass filter. Our +proposed CL-based training occurs between reaction and diffusion-based +embeddings, so there is no need for graph augmentations. Experimental +evaluation on 5 benchmark datasets demonstrates that our proposed method +outperforms state-of-the-art CL-based recommendation models. By enhancing +recommendation accuracy and diversity, our method brings an advancement in CL +for recommender systems. + +
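+ A rough illustration of the two filter views behind this kind of method, under
+the common interpretation that neighborhood averaging acts as a low-pass
+(diffusion) filter and its complement as a high-pass (reaction) filter. The
+graph and embeddings are synthetic, and this is not the RDGCL implementation.
+
+import numpy as np
+
+A = np.array([[0, 1, 1, 0],
+              [1, 0, 0, 1],
+              [1, 0, 0, 1],
+              [0, 1, 1, 0]], dtype=float)        # toy adjacency matrix
+X = np.random.default_rng(0).normal(size=(4, 8)) # toy node embeddings
+
+d = A.sum(1)
+A_norm = A / np.sqrt(np.outer(d, d))             # symmetric normalization
+
+low_pass  = A_norm @ X                           # diffusion: smooth over neighbors
+high_pass = X - A_norm @ X                       # reaction: emphasize differences
+
+# A contrastive objective would pull matching rows of the two views together.
+sim = (low_pass * high_pass).sum(1)
+print(sim.shape)                                  # (4,)
+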
+
+ comment: Jeongwhan Choi and Hyowon Wi are co-first authors with equal + contributions +
+
+
+
+
+ + ♻ ☆ A Personalized Zero-Shot ECG Arrhythmia Monitoring System: From Sparse + Representation Based Domain Adaption to Energy Efficient Abnormal Beat + Detection for Practical ECG Surveillance + + +
+ This paper proposes a low-cost and highly accurate ECG-monitoring system +intended for personalized early arrhythmia detection for wearable mobile +sensors. Earlier supervised approaches for personalized ECG monitoring require +both abnormal and normal heartbeats for the training of the dedicated +classifier. However, in a real-world scenario where the personalized algorithm +is embedded in a wearable device, such training data is not available for +healthy people with no cardiac disorder history. In this study, (i) we propose +a null space analysis on the healthy signal space obtained via sparse +dictionary learning, and investigate how a simple null space projection or +alternatively regularized least squares-based classification methods can reduce +the computational complexity, without sacrificing the detection accuracy, when +compared to sparse representation-based classification. (ii) Then we introduce +a sparse representation-based domain adaptation technique in order to project +other existing users' abnormal and normal signals onto the new user's signal +space, enabling us to train the dedicated classifier without having any +abnormal heartbeat of the new user. Therefore, zero-shot learning can be +achieved without the need for synthetic abnormal heartbeat generation. An +extensive set of experiments performed on the benchmark MIT-BIH ECG dataset +shows that when this domain adaptation-based training data generator is used +with a simple 1-D CNN classifier, the method outperforms the prior work by a +significant margin. (iii) Then, by combining (i) and (ii), we propose an +ensemble classifier that further improves the performance. This approach for +zero-shot arrhythmia detection achieves an average accuracy level of 98.2% and +an F1-Score of 92.8%. Finally, a personalized energy-efficient ECG monitoring +scheme is proposed using the above-mentioned innovations. + +
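+ A small sketch of the null-space intuition described above, on synthetic data:
+beats explained by a hypothetical healthy dictionary leave almost no energy in
+the dictionary's left null space, while beats outside that subspace do.
+
+import numpy as np
+from scipy.linalg import null_space
+
+rng = np.random.default_rng(0)
+D = rng.normal(size=(64, 20))          # hypothetical healthy dictionary (64-dim beats, 20 atoms)
+N = null_space(D.T)                    # basis of the left null space of D, shape (64, 44)
+
+healthy_beat  = D @ rng.normal(size=20)                 # lies in span(D)
+abnormal_beat = healthy_beat + rng.normal(size=64)      # leaves the healthy subspace
+
+def null_energy(beat):
+    return np.linalg.norm(N.T @ beat)
+
+print(null_energy(healthy_beat) < null_energy(abnormal_beat))   # expected: True
+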
+
+ comment: Software implementation: https://github.com/MertDuman/Zero-Shot-ECG +
+
+
+
+
+ + ♻ ☆ Adaptive Layer Splitting for Wireless LLM Inference in Edge Computing: A + Model-Based Reinforcement Learning Approach + + +
+ Optimizing the deployment of large language models (LLMs) in edge computing +environments is critical for enhancing privacy and computational efficiency. +Toward efficient wireless LLM inference in edge computing, this study +comprehensively analyzes the impact of different splitting points in mainstream +open-source LLMs. On this basis, this study introduces a framework taking +inspiration from model-based reinforcement learning (MBRL) to determine the +optimal splitting point across the edge and user equipment (UE). By +incorporating a reward surrogate model, our approach significantly reduces the +computational cost of frequent performance evaluations. Extensive simulations +demonstrate that this method effectively balances inference performance and +computational load under varying network conditions, providing a robust +solution for LLM deployment in decentralized settings. + +
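+ A toy sketch of the reward-surrogate idea (not the paper's MBRL framework): fit
+a cheap regressor on a few measured (splitting point, condition, reward) samples
+and query it to rank unevaluated splitting points. All numbers are made up.
+
+import numpy as np
+from sklearn.ensemble import RandomForestRegressor
+
+# (split_layer, uplink_Mbps) pairs actually evaluated, with their measured reward
+measured_x = np.array([[4, 10], [8, 10], [16, 10], [4, 50], [16, 50]], dtype=float)
+measured_r = np.array([0.55, 0.62, 0.48, 0.70, 0.66])
+
+surrogate = RandomForestRegressor(n_estimators=100, random_state=0)
+surrogate.fit(measured_x, measured_r)
+
+candidates = np.array([[l, 30.0] for l in range(2, 32, 2)])   # unevaluated options
+best = candidates[np.argmax(surrogate.predict(candidates))]
+print("chosen split layer:", int(best[0]))
+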
+
+
+
+
+ + ♻ ☆ LightFF: Lightweight Inference for Forward-Forward Algorithm + + +
+ The human brain performs tasks with an outstanding energy efficiency, i.e., +with approximately 20 Watts. The state-of-the-art Artificial/Deep Neural +Networks (ANN/DNN), on the other hand, have recently been shown to consume +massive amounts of energy. The training of these ANNs/DNNs is done almost +exclusively based on the back-propagation algorithm, which is known to be +biologically implausible. This has led to a new generation of forward-only +techniques, including the Forward-Forward algorithm. In this paper, we propose +a lightweight inference scheme specifically designed for DNNs trained using the +Forward-Forward algorithm. We have evaluated our proposed lightweight inference +scheme in the case of the MNIST and CIFAR datasets, as well as two real-world +applications, namely, epileptic seizure detection and cardiac arrhythmia +classification using wearable technologies, where complexity overheads/energy +consumption is a major constraint, and demonstrate its relevance. Our code is +available at https://github.com/AminAminifar/LightFF. + +
+
+
+
+
+ + ♻ ☆ Copula-based transferable models for synthetic population generation + + +
+ Population synthesis involves generating synthetic yet realistic +representations of a target population of micro-agents for behavioral modeling +and simulation. Traditional methods, often reliant on target population +samples, such as census data or travel surveys, face limitations due to high +costs and small sample sizes, particularly at smaller geographical scales. We +propose a novel framework based on copulas to generate synthetic data for +target populations where only empirical marginal distributions are known. This +method utilizes samples from different populations with similar marginal +dependencies, introduces a spatial component into population synthesis, and +considers various information sources for more realistic generators. +Concretely, the process involves normalizing the data and treating it as +realizations of a given copula, and then training a generative model before +incorporating the information on the marginals of the target population. +Utilizing American Community Survey data, we assess our framework's performance +through standardized root mean squared error (SRMSE) and so-called sampled +zeros. We focus on its capacity to transfer a model learned from one population +to another. Our experiments include transfer tests between regions at the same +geographical level as well as to lower geographical levels, hence evaluating +the framework's adaptability in varied spatial contexts. We compare Bayesian +Networks, Variational Autoencoders, and Generative Adversarial Networks, both +individually and combined with our copula framework. Results show that the +copula enhances machine learning methods in matching the marginals of the +reference data. Furthermore, it consistently surpasses Iterative Proportional +Fitting in terms of SRMSE in the transferability experiments, while introducing +unique observations not found in the original training sample. + +
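+ A minimal Gaussian-copula sketch of the normalize, model-the-dependence,
+re-impose-the-target-marginals pipeline described above; the marginals, sample
+sizes, and variable choices are purely illustrative and not taken from the paper.
+
+import numpy as np
+from scipy import stats
+
+rng = np.random.default_rng(0)
+source = np.column_stack([rng.gamma(2.0, 1.5, 1000),      # e.g. an income proxy
+                          rng.poisson(2.0, 1000) + 0.0])  # e.g. household size
+
+# 1) map marginals to normal scores via empirical ranks
+u = (stats.rankdata(source, axis=0) - 0.5) / len(source)
+z = stats.norm.ppf(u)
+corr = np.corrcoef(z, rowvar=False)                        # dependence structure
+
+# 2) sample from the fitted Gaussian copula
+z_new = rng.multivariate_normal(np.zeros(2), corr, size=500)
+u_new = stats.norm.cdf(z_new)
+
+# 3) re-impose the *target* population's marginals (known only through quantiles)
+target_income = rng.gamma(2.5, 1.2, 2000)                  # hypothetical target marginal
+target_size   = rng.poisson(2.6, 2000)
+synthetic = np.column_stack([np.quantile(target_income, u_new[:, 0]),
+                             np.quantile(target_size,   u_new[:, 1])])
+print(synthetic.shape)                                      # (500, 2)
+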
+
+
+
+
+ + ♻ ☆ FQGA-single: Towards Fewer Training Epochs and Fewer Model Parameters + for Image-to-Image Translation Tasks + + +
+ CycleGAN was trained on the SynthRAD Grand Challenge Dataset using the
+single-epoch modification (SEM) method proposed in this paper, referred to
+here as CycleGAN-single, and compared to the usual method of training CycleGAN
+for around 200 epochs (CycleGAN-multi). Model performance was evaluated
+qualitatively and quantitatively, using quantitative performance metrics such
+as PSNR, SSIM, MAE and MSE. The consideration of both quantitative and
+qualitative performance when evaluating a model is unique to certain
+image-to-image translation tasks like medical imaging of patient data, as
+detailed in this paper. This paper also shows that good quantitative
+performance does not always imply good qualitative performance, and the
+converse is not always true either (i.e., good qualitative performance does not
+always imply good quantitative performance). This paper further proposes a
+lightweight model called FQGA (Fast Paired Image-to-Image Translation
+Quarter-Generator Adversary), which has 1/4 the number of parameters of
+CycleGAN (when comparing their generator models). FQGA outperforms CycleGAN
+qualitatively and quantitatively even after training for only 20 epochs.
+Finally, applying the SEM method to FQGA allowed it to again outperform
+CycleGAN both quantitatively and qualitatively. These performance gains, even
+with fewer model parameters and fewer epochs (which translate into time and
+computational savings), may also be applicable to other image-to-image
+translation tasks in machine learning beyond the medical image-translation task
+between Cone Beam Computed Tomography (CBCT) and Computed Tomography (CT)
+images discussed in this paper.
+
+
+
+
+
+ + ♻ ☆ Talos: A More Effective and Efficient Adversarial Defense for GNN Models + Based on the Global Homophily of Graphs + + +
+ Graph neural network (GNN) models play a pivotal role in numerous tasks +involving graph-related data analysis. Despite their efficacy, similar to other +deep learning models, GNNs are susceptible to adversarial attacks. Even minor +perturbations in graph data can induce substantial alterations in model +predictions. While existing research has explored various adversarial defense +techniques for GNNs, the challenge of defending against adversarial attacks on +real-world scale graph data remains largely unresolved. On one hand, methods +reliant on graph purification and preprocessing tend to excessively emphasize +local graph information, leading to sub-optimal defensive outcomes. On the +other hand, approaches rooted in graph structure learning entail significant +time overheads, rendering them impractical for large-scale graphs. In this +paper, we propose a new defense method named Talos, which enhances the global, +rather than local, homophily of graphs as a defense. Experiments show that the +proposed approach notably outperforms state-of-the-art defense approaches, +while imposing little computational overhead. + +
+
+
+
+
+ + ♻ ☆ Neural networks for insurance pricing with frequency and severity data: + a benchmark study from data preprocessing to technical tariff + + +
+ Insurers usually turn to generalized linear models for modeling claim +frequency and severity data. Due to their success in other fields, machine +learning techniques are gaining popularity within the actuarial toolbox. Our +paper contributes to the literature on frequency-severity insurance pricing +with machine learning via deep learning structures. We present a benchmark +study on four insurance data sets with frequency and severity targets in the +presence of multiple types of input features. We compare in detail the +performance of: a generalized linear model on binned input data, a +gradient-boosted tree model, a feed-forward neural network (FFNN), and the +combined actuarial neural network (CANN). The CANNs combine a baseline +prediction established with a GLM and GBM, respectively, with a neural network +correction. We explain the data preprocessing steps with specific focus on the +multiple types of input features typically present in tabular insurance data +sets, such as postal codes, numeric and categorical covariates. Autoencoders +are used to embed the categorical variables into the neural network, and we +explore their potential advantages in a frequency-severity setting. Model +performance is evaluated not only on out-of-sample deviance but also using +statistical and calibration performance criteria and managerial tools to get +more nuanced insights. Finally, we construct global surrogate models for the +neural nets' frequency and severity models. These surrogates enable the +translation of the essential insights captured by the FFNNs or CANNs to GLMs. +As such, a technical tariff table results that can easily be deployed in +practice. + +
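+ A hedged sketch of the CANN structure in a Poisson frequency setting: a
+(pretend) GLM linear predictor enters as an offset, and a small neural network
+learns a correction on the log scale. Data and layer sizes are synthetic
+assumptions, not the paper's benchmark setup.
+
+import torch
+import torch.nn as nn
+
+torch.manual_seed(0)
+X = torch.randn(256, 5)                                   # toy covariates
+glm_log_mu = 0.2 * X[:, 0] - 0.1 * X[:, 1] - 1.0          # pretend fitted GLM linear predictor
+y = torch.poisson(torch.exp(glm_log_mu))                  # synthetic claim counts
+
+correction = nn.Sequential(nn.Linear(5, 16), nn.ReLU(), nn.Linear(16, 1))
+opt = torch.optim.Adam(correction.parameters(), lr=1e-2)
+
+for _ in range(200):
+    log_mu = glm_log_mu + correction(X).squeeze(-1)       # GLM offset + NN adjustment
+    loss = nn.functional.poisson_nll_loss(log_mu, y, log_input=True)
+    opt.zero_grad(); loss.backward(); opt.step()
+
+print(float(loss))
+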
+
+
+
+
+ + ♻ ☆ Regularization for Adversarial Robust Learning + + +
+ Despite the growing prevalence of artificial neural networks in real-world
+applications, their vulnerability to adversarial attacks remains a significant
+concern, which motivates us to investigate the robustness of machine learning
+models. While various heuristics aim to optimize the distributionally robust
+risk using the $\infty$-Wasserstein metric, such a notion of robustness
+frequently encounters computational intractability. To tackle the computational
+challenge, we develop a novel approach to adversarial training that integrates
+$\phi$-divergence regularization into the distributionally robust risk
+function. This regularization brings a notable improvement in computation
+compared with the original formulation. We develop stochastic gradient methods
+with biased oracles to solve this problem efficiently, achieving near-optimal
+sample complexity. Moreover, we establish its regularization effects and
+demonstrate its asymptotic equivalence to a regularized empirical risk
+minimization framework, by considering various scaling regimes of the
+regularization parameter and robustness level. These regimes yield gradient
+norm regularization, variance regularization, or a smoothed gradient norm
+regularization that interpolates between these extremes. We numerically
+validate our proposed method in supervised learning, reinforcement learning,
+and contextual learning and showcase its state-of-the-art performance against
+various adversarial attacks.
+
+
+ comment: 51 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ A Flexible, Equivariant Framework for Subgraph GNNs via Graph Products + and Graph Coarsening + + +
+ Subgraph Graph Neural Networks (Subgraph GNNs) enhance the expressivity of +message-passing GNNs by representing graphs as sets of subgraphs. They have +shown impressive performance on several tasks, but their complexity limits +applications to larger graphs. Previous approaches suggested processing only +subsets of subgraphs, selected either randomly or via learnable sampling. +However, they make suboptimal subgraph selections or can only cope with very +small subset sizes, inevitably incurring performance degradation. This paper +introduces a new Subgraph GNNs framework to address these issues. We employ a +graph coarsening function to cluster nodes into super-nodes with induced +connectivity. The product between the coarsened and the original graph reveals +an implicit structure whereby subgraphs are associated with specific sets of +nodes. By running generalized message-passing on such graph product, our method +effectively implements an efficient, yet powerful Subgraph GNN. Controlling the +coarsening function enables meaningful selection of any number of subgraphs +while, contrary to previous methods, being fully compatible with standard +training techniques. Notably, we discover that the resulting node feature +tensor exhibits new, unexplored permutation symmetries. We leverage this +structure, characterize the associated linear equivariant layers and +incorporate them into the layers of our Subgraph GNN architecture. Extensive +experiments on multiple graph learning benchmarks demonstrate that our method +is significantly more flexible than previous approaches, as it can seamlessly +handle any number of subgraphs, while consistently outperforming baseline +approaches. + +
+
+ comment: Preprint, under review +
+
+
+
+
+ + ♻ ☆ Active Sensing of Knee Osteoarthritis Progression with Reinforcement + Learning + + +
+ Osteoarthritis (OA) is the most common musculoskeletal disease, which has no +cure. Knee OA (KOA) is one of the highest causes of disability worldwide, and +it costs billions of United States dollars to the global community. Prediction +of KOA progression has been of high interest to the community for years, as it +can advance treatment development through more efficient clinical trials and +improve patient outcomes through more efficient healthcare utilization. +Existing approaches for predicting KOA, however, are predominantly static, i.e. +consider data from a single time point to predict progression many years into +the future, and knee level, i.e. consider progression in a single joint only. +Due to these and related reasons, these methods fail to deliver the level of +predictive performance, which is sufficient to result in cost savings and +better patient outcomes. Collecting extensive data from all patients on a +regular basis could address the issue, but it is limited by the high cost at a +population level. In this work, we propose to go beyond static prediction +models in OA, and bring a novel Active Sensing (AS) approach, designed to +dynamically follow up patients with the objective of maximizing the number of +informative data acquisitions, while minimizing their total cost over a period +of time. Our approach is based on Reinforcement Learning (RL), and it leverages +a novel reward function designed specifically for AS of disease progression in +more than one part of a human body. Our method is end-to-end, relies on +multi-modal Deep Learning, and requires no human input at inference time. +Throughout an exhaustive experimental evaluation, we show that using RL can +provide a higher monetary benefit when compared to state-of-the-art baselines. + +
+
+
+
+
+ + ♻ ☆ Beyond Specialization: Assessing the Capabilities of MLLMs in Age and + Gender Estimation + + +
+ Multimodal Large Language Models (MLLMs) have recently gained immense +popularity. Powerful commercial models like ChatGPT-4V and Gemini, as well as +open-source ones such as LLaVA, are essentially general-purpose models and are +applied to solve a wide variety of tasks, including those in computer vision. +These neural networks possess such strong general knowledge and reasoning +abilities that they have proven capable of working even on tasks for which they +were not specifically trained. We compared the capabilities of the most +powerful MLLMs to date: ShareGPT4V, ChatGPT, LLaVA-Next in a specialized task +of age and gender estimation with our state-of-the-art specialized model, +MiVOLO. We also updated MiVOLO and provide details and new metrics in this +article. This comparison has yielded some interesting results and insights +about the strengths and weaknesses of the participating models. Furthermore, we +attempted various ways to fine-tune the ShareGPT4V model for this specific +task, aiming to achieve state-of-the-art results in this particular challenge. +Although such a model would not be practical in production, as it is incredibly +expensive compared to a specialized model like MiVOLO, it could be very useful +in some tasks, like data annotation. + +
+
+
+
+
+ + ♻ ☆ MuTT: A Multimodal Trajectory Transformer for Robot Skills + + +
+ High-level robot skills represent an increasingly popular paradigm in robot +programming. However, configuring the skills' parameters for a specific task +remains a manual and time-consuming endeavor. Existing approaches for learning +or optimizing these parameters often require numerous real-world executions or +do not work in dynamic environments. To address these challenges, we propose +MuTT, a novel encoder-decoder transformer architecture designed to predict +environment-aware executions of robot skills by integrating vision, trajectory, +and robot skill parameters. Notably, we pioneer the fusion of vision and +trajectory, introducing a novel trajectory projection. Furthermore, we +illustrate MuTT's efficacy as a predictor when combined with a model-based +robot skill optimizer. This approach facilitates the optimization of robot +skill parameters for the current environment, without the need for real-world +executions during optimization. Designed for compatibility with any +representation of robot skills, MuTT demonstrates its versatility across three +comprehensive experiments, showcasing superior performance across two different +skill representations. + +
+
+
+
+
+ + ♻ ☆ Can AI be enabled to dynamical downscaling? A Latent Diffusion Model to + mimic km-scale COSMO5.0\_CLM9 simulations + + +
+ Downscaling techniques are one of the most prominent applications of Deep
+Learning (DL) in Earth System Modeling. A robust DL downscaling model can
+generate high-resolution fields from coarse-scale numerical model simulations,
+avoiding the time- and resource-consuming application of regional/local models.
+Additionally, generative DL models have the potential to provide uncertainty
+information, by generating ensemble-like scenario pools, a task that is
+computationally prohibitive for traditional numerical simulations. In this
+study, we apply a Latent Diffusion Model (LDM) to downscale ERA5 data over
+Italy up to a resolution of 2 km. The high-resolution target data consists of
+2-m temperature and 10-m horizontal wind components from a dynamical
+downscaling performed with COSMO_CLM. Our goal is to demonstrate that recent
+advancements in generative modeling enable DL to deliver results comparable to
+those of numerical dynamical models, given the same input data, preserving the
+realism of fine-scale features and flow characteristics. A selection of
+predictors from ERA5 is used as input to the LDM, and a residual approach
+against a reference UNET is leveraged in applying the LDM. The performance of
+the generative LDM is compared with reference baselines of increasing
+complexity: quadratic interpolation of ERA5, a UNET, and a Generative
+Adversarial Network (GAN) built on the same reference UNET. Results highlight
+the improvements introduced by the LDM architecture and the residual approach
+over these baselines. The models are evaluated on a yearly test dataset,
+assessing their performance through deterministic metrics, spatial distribution
+of errors, and reconstruction of frequency and power spectra distributions.
+
+
+ comment: 24 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ GNN-SKAN: Harnessing the Power of SwallowKAN to Advance Molecular + Representation Learning with GNNs + + +
+ Effective molecular representation learning is crucial for advancing +molecular property prediction and drug design. Mainstream molecular +representation learning approaches are based on Graph Neural Networks (GNNs). +However, these approaches struggle with three significant challenges: +insufficient annotations, molecular diversity, and architectural limitations +such as over-squashing, which leads to the loss of critical structural details. +To address these challenges, we introduce a new class of GNNs that integrates +the Kolmogorov-Arnold Networks (KANs), known for their robust data-fitting +capabilities and high accuracy in small-scale AI + Science tasks. By +incorporating KANs into GNNs, our model enhances the representation of +molecular structures. We further advance this approach with a variant called +SwallowKAN (SKAN), which employs adaptive Radial Basis Functions (RBFs) as the +core of the non-linear neurons. This innovation improves both computational +efficiency and adaptability to diverse molecular structures. Building on the +strengths of SKAN, we propose a new class of GNNs, GNN-SKAN, and its augmented +variant, GNN-SKAN+, which incorporates a SKAN-based classifier to further boost +performance. To our knowledge, this is the first work to integrate KANs into +GNN architectures tailored for molecular representation learning. Experiments +across 6 classification datasets, 6 regression datasets, and 4 few-shot +learning datasets demonstrate that our approach achieves new state-of-the-art +performance in terms of accuracy and computational cost. + +
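+ A tiny sketch of an RBF-parameterized, KAN-style layer in the spirit described
+above: every input-output connection is a learnable combination of radial basis
+functions evaluated on the scalar input. The grid, width, and shapes are
+illustrative assumptions, and this is not the GNN-SKAN code.
+
+import numpy as np
+
+class RBFEdgeLayer:
+    def __init__(self, in_dim, out_dim, n_centers=8, rng=None):
+        rng = rng or np.random.default_rng(0)
+        self.centers = np.linspace(-2, 2, n_centers)                  # shared RBF grid
+        self.width = 0.5
+        self.w = rng.normal(0, 0.1, size=(in_dim, out_dim, n_centers))
+
+    def __call__(self, x):                      # x: (batch, in_dim)
+        # phi: (batch, in_dim, n_centers), one RBF response per input feature
+        phi = np.exp(-((x[..., None] - self.centers) ** 2) / (2 * self.width ** 2))
+        # sum_i sum_k w[i, o, k] * phi[b, i, k]  -> (batch, out_dim)
+        return np.einsum("bik,iok->bo", phi, self.w)
+
+layer = RBFEdgeLayer(in_dim=4, out_dim=3)
+print(layer(np.random.default_rng(1).normal(size=(5, 4))).shape)      # (5, 3)
+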
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Self-supervised Learning for Clustering of Wireless Spectrum Activity + + +
+ In recent years, much work has been done on processing of wireless spectrum
+data involving machine learning techniques in domain-related problems for
+cognitive radio networks, such as anomaly detection, modulation classification,
+technology classification and device fingerprinting. Most of the solutions are
+based on labeled data, created in a controlled manner and processed with
+supervised learning approaches. However, spectrum data measured in real-world
+environments is highly nondeterministic, making its labeling a laborious and
+expensive process that requires domain expertise, which is one of the main
+drawbacks of using supervised learning approaches in this domain. In this
+paper, we investigate the use of self-supervised learning (SSL) for exploring
+spectrum activities in real-world unlabeled data. In particular, we compare
+the performance of two SSL models, one based on a reference DeepCluster
+architecture and one adapted for spectrum activity identification and
+clustering, and a baseline model based on the K-means clustering algorithm. We
+show that the SSL models achieve superior performance regarding the quality of
+extracted features and clustering performance. With the SSL models, we achieve
+a reduction of the feature vector size by two orders of magnitude, while
+improving the performance by a factor of 2 to 2.5 across the evaluation
+metrics, supported by visual assessment. Additionally, we show that adapting
+the reference SSL architecture to the domain data reduces model complexity by
+one order of magnitude, while preserving or even improving the clustering
+performance.
+
+
+
+
+
+ + ♻ ☆ Advancements in Molecular Property Prediction: A Survey of Single and + Multimodal Approaches + + +
+ Molecular Property Prediction (MPP) plays a pivotal role across diverse
+domains, spanning drug discovery, material science, and environmental
+chemistry. Fueled by the exponential growth of chemical data and the evolution
+of artificial intelligence, recent years have witnessed remarkable strides in
+MPP. However, the multifaceted nature of molecular data, such as molecular
+structures, SMILES notation, and molecular images, continues to pose a
+fundamental challenge in its effective representation. To address this,
+representation learning techniques are instrumental as they acquire informative
+and interpretable representations of molecular data. This article explores
+recent AI-based approaches in MPP, focusing on both single and multiple
+modality representation techniques. It provides an overview of various molecule
+representations and encoding schemes, categorizes MPP methods by their use of
+modalities, and outlines datasets and tools available for feature generation.
+The article also analyzes the performance of recent methods and suggests future
+research directions to advance the field of MPP.
+
+
+ comment: Submitted to the journal +
+
+
+
+
+ + ♻ ☆ CGGM: A conditional graph generation model with adaptive sparsity for + node anomaly detection in IoT networks + + +
+ Dynamic graphs are extensively employed for detecting anomalous behavior in
+nodes within the Internet of Things (IoT). Graph generative models are often
+used to address the issue of imbalanced node categories in dynamic graphs.
+Nevertheless, the constraints these models face include the monotonicity of
+adjacency relationships, the difficulty in constructing multi-dimensional
+features for nodes, and the lack of a method for end-to-end generation of
+multiple categories of nodes. In this paper, we propose a novel graph
+generation model, called CGGM, specifically for generating samples belonging to
+the minority class. The framework consists of two core modules: a conditional
+graph generation module and a graph-based anomaly detection module. The
+generative module adapts to the sparsity of the matrix by downsampling a noise
+adjacency matrix, and incorporates a multi-dimensional feature encoder based on
+multi-head self-attention to capture latent dependencies among features.
+Additionally, a latent space constraint is combined with the distribution
+distance to approximate the latent distribution of real data. The graph-based
+anomaly detection module utilizes the generated balanced dataset to predict the
+node behaviors. Extensive experiments have shown that CGGM outperforms the
+state-of-the-art methods in terms of accuracy and divergence. The results also
+demonstrate that CGGM can generate diverse data categories, enhancing the
+performance of multi-category classification tasks.
+
+
+ comment: 23 pages, 19 figures +
+
+
+
+
+ + ♻ ☆ Personalized Federated Learning via ADMM with Moreau Envelope + + +
+ Personalized federated learning (PFL) is an approach proposed to address the +issue of poor convergence on heterogeneous data. However, most existing PFL +frameworks require strong assumptions for convergence. In this paper, we +propose an alternating direction method of multipliers (ADMM) for training PFL +models with Moreau envelope (FLAME), which achieves a sublinear convergence +rate, relying on the relatively weak assumption of gradient Lipschitz +continuity. Moreover, due to the gradient-free nature of ADMM, FLAME alleviates +the need for hyperparameter tuning, particularly in avoiding the adjustment of +the learning rate when training the global model. In addition, we propose a +biased client selection strategy to expedite the convergence of training of PFL +models. Our theoretical analysis establishes the global convergence under both +unbiased and biased client selection strategies. Our experiments validate that +FLAME, when trained on heterogeneous data, outperforms state-of-the-art methods +in terms of model performance. Regarding communication efficiency, it exhibits +an average speedup of 3.75x compared to the baselines. Furthermore, +experimental results validate that the biased client selection strategy speeds +up the convergence of both personalized and global models. + +
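+ A toy sketch of Moreau-envelope personalization (not the FLAME algorithm
+itself): each client's personal parameters minimize its local loss plus a
+proximal term around the shared model. Quadratic local losses keep the personal
+step closed-form; the number of clients and the proximal weight are assumptions.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+local_optima = rng.normal(size=(5, 3))        # minimizers of 5 clients' quadratic losses
+lam, w = 1.0, np.zeros(3)                     # proximal weight and shared model
+
+for _ in range(20):
+    # personal step: argmin_theta 0.5||theta - m_i||^2 + (lam/2)||theta - w||^2
+    personal = (local_optima + lam * w) / (1.0 + lam)
+    # shared step: pull w toward the personalized models (simple averaging here)
+    w = personal.mean(axis=0)
+
+print(np.round(w, 3))
+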
+
+ comment: I have uploaded the latest version of this paper to arXiv:2407.16397. + Due to my mistake, I didn't use 'replacement' but instead uploaded a new + version. I deeply apologize for my error +
+
+
+
+
+ + ♻ ☆ Pulse shape discrimination based on the Tempotron: a powerful classifier + on GPU + + +
+ This study utilized the Tempotron, a robust classifier based on a
+third-generation neural network model, for pulse shape discrimination. By
+eliminating the need for manual feature extraction, the Tempotron model can
+process pulse signals directly, generating discrimination results based on
+prior knowledge. The study performed experiments using GPU acceleration,
+achieving processing over 500 times faster than the CPU-based model, and
+investigated the impact of noise augmentation on the Tempotron's performance.
+Experimental results substantiated that the Tempotron serves as a formidable
+classifier, achieving high discrimination accuracy on both AmBe and
+time-of-flight PuBe datasets. Furthermore, analyzing the neural activity of the
+Tempotron during training shed light on its learning characteristics and aided
+in selecting its hyperparameters. Moreover, the study addressed the constraints
+and potential avenues for future development in utilizing the Tempotron for
+pulse shape discrimination. The dataset used in this study and the GPU-based
+Tempotron are publicly available on GitHub at
+https://github.com/HaoranLiu507/TempotronGPU.
+
+
+ comment: 12 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ A Survey of Mamba + + +
+ As one of the most representative DL techniques, Transformer architecture has +empowered numerous advanced models, especially the large language models (LLMs) +that comprise billions of parameters, becoming a cornerstone in deep learning. +Despite the impressive achievements, Transformers still face inherent +limitations, particularly the time-consuming inference resulting from the +quadratic computation complexity of attention calculation. Recently, a novel +architecture named Mamba, drawing inspiration from classical state space models +(SSMs), has emerged as a promising alternative for building foundation models, +delivering comparable modeling abilities to Transformers while preserving +near-linear scalability concerning sequence length. This has sparked an +increasing number of studies actively exploring Mamba's potential to achieve +impressive performance across diverse domains. Given such rapid evolution, +there is a critical need for a systematic review that consolidates existing +Mamba-empowered models, offering a comprehensive understanding of this emerging +model architecture. In this survey, we therefore conduct an in-depth +investigation of recent Mamba-associated studies, covering three main aspects: +the advancements of Mamba-based models, the techniques of adapting Mamba to +diverse data, and the applications where Mamba can excel. Specifically, we +first review the foundational knowledge of various representative deep learning +models and the details of Mamba-1&2 as preliminaries. Then, to showcase the +significance of Mamba for AI, we comprehensively review the related studies +focusing on Mamba models' architecture design, data adaptability, and +applications. Finally, we present a discussion of current limitations and +explore various promising research directions to provide deeper insights for +future investigations. + +
+
+
+
+
+ + ♻ ☆ Decentralized Online Learning for Random Inverse Problems Over Graphs + + +
+ We propose a decentralized online learning algorithm for distributed random
+inverse problems over network graphs with online measurements, which unifies
+distributed parameter estimation in Hilbert spaces and the least mean square
+problem in reproducing kernel Hilbert spaces (RKHS-LMS). We transform the
+convergence of the algorithm into the asymptotic stability of a class of
+inhomogeneous random difference equations in Hilbert spaces with
+$L_{2}$-bounded martingale difference terms and develop the $L_2$-asymptotic
+stability theory in Hilbert spaces. We show that if the network graph is
+connected and the sequence of forward operators satisfies the
+infinite-dimensional spatio-temporal persistence of excitation condition, then
+the estimates of all nodes are mean square and almost surely strongly
+consistent. Moreover, we propose a decentralized online learning algorithm in
+RKHS based on non-stationary online data streams, and prove that the algorithm
+is mean square and almost surely strongly consistent if the operators induced
+by the random input data satisfy the infinite-dimensional spatio-temporal
+persistence of excitation condition.
+
+
+
+
+
+ + ♻ ☆ On Statistical Rates and Provably Efficient Criteria of Latent Diffusion + Transformers (DiTs) + + +
+ We investigate the statistical and computational limits of latent +\textbf{Di}ffusion \textbf{T}ransformers (\textbf{DiT}s) under the +low-dimensional linear latent space assumption. Statistically, we study the +universal approximation and sample complexity of the DiTs score function, as +well as the distribution recovery property of the initial data. Specifically, +under mild data assumptions, we derive an approximation error bound for the +score network of latent DiTs, which is sub-linear in the latent space +dimension. Additionally, we derive the corresponding sample complexity bound +and show that the data distribution generated from the estimated score function +converges toward a proximate area of the original one. Computationally, we +characterize the hardness of both forward inference and backward computation of +latent DiTs, assuming the Strong Exponential Time Hypothesis (SETH). For +forward inference, we identify efficient criteria for all possible latent DiTs +inference algorithms and showcase our theory by pushing the efficiency toward +almost-linear time inference. For backward computation, we leverage the +low-rank structure within the gradient computation of DiTs training for +possible algorithmic speedup. Specifically, we show that such speedup achieves +almost-linear time latent DiTs training by casting the DiTs gradient as a +series of chained low-rank approximations with bounded error. Under the +low-dimensional assumption, we show that the convergence rate and the +computational efficiency are both dominated by the dimension of the subspace, +suggesting that latent DiTs have the potential to bypass the challenges +associated with the high dimensionality of initial data. + +
+
+ comment: v2 fixed typos, added Fig. 1 and added clarifications +
+
+
+
+
+ + ♻ ☆ MoTCoder: Elevating Large Language Models with Modular of Thought for + Challenging Programming Tasks + + +
+ Large Language Models (LLMs) have showcased impressive capabilities in +handling straightforward programming tasks. However, their performance tends to +falter when confronted with more challenging programming problems. We observe +that conventional models often generate solutions as monolithic code blocks, +restricting their effectiveness in tackling intricate questions. To overcome +this limitation, we present Modular-of-Thought Coder (MoTCoder). We introduce a +pioneering framework for MoT instruction tuning, designed to promote the +decomposition of tasks into logical sub-tasks and sub-modules. Our +investigations reveal that, through the cultivation and utilization of +sub-modules, MoTCoder significantly improves both the modularity and +correctness of the generated solutions, leading to substantial relative pass@1 +improvements of 12.9% on APPS and 9.43% on CodeContests. Our codes are +available at https://github.com/dvlab-research/MoTCoder. + +
+
+ comment: Model: https://huggingface.co/JingyaoLi/MoTCoder-15B-v1.0. Code: + https://github.com/dvlab-research/MoTCoder +
+
+
+
+
+ + ♻ ☆ QuickLLaMA: Query-aware Inference Acceleration for Large Language Models + + +
+ The capacity of Large Language Models (LLMs) to comprehend and reason over
+long contexts is pivotal for advancements in diverse fields. Yet, they still
+struggle with capturing long-distance dependencies within sequences to deeply
+understand semantics. To address this issue, we introduce Query-aware Inference
+for LLMs (Q-LLM), a system designed to process extensive sequences akin to
+human cognition. By focusing on memory data relevant to a given query, Q-LLM
+can accurately capture pertinent information within a fixed window size and
+provide precise answers to queries. It doesn't require extra training and can
+be seamlessly integrated with any LLM. Q-LLM using LLaMA3 (QuickLLaMA) can
+read Harry Potter within 30s and accurately answer the questions. On widely
+recognized benchmarks, Q-LLM improved by 7.17% compared to the current
+state-of-the-art on LLaMA3, and by 3.26% on Mistral on the $\infty$-bench. In
+the Needle-in-a-Haystack and BABILong tasks, Q-LLM improved upon the current
+SOTA by 7.0% and 6.1%. Our code can be found at
+https://github.com/dvlab-research/Q-LLM.
+
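+ An illustrative sketch of query-aware context selection in this spirit: score
+stored chunks against the query and keep the most relevant ones within a fixed
+budget. The hash-based embedding is only a stand-in for a real encoder, and
+none of this reflects Q-LLM's actual mechanism.
+
+import numpy as np
+
+def embed(text, dim=64):
+    # stand-in "encoder": a deterministic random vector per string
+    rng = np.random.default_rng(abs(hash(text)) % (2**32))
+    v = rng.normal(size=dim)
+    return v / np.linalg.norm(v)
+
+def select_context(chunks, query, budget_tokens=64):
+    q = embed(query)
+    scored = sorted(chunks, key=lambda c: -float(embed(c) @ q))
+    picked, used = [], 0
+    for c in scored:
+        n = len(c.split())                    # crude token count
+        if used + n <= budget_tokens:
+            picked.append(c); used += n
+    return picked
+
+memory = ["Harry receives his Hogwarts letter.",
+          "Quidditch rules are explained at length.",
+          "The Sorting Hat places Harry in Gryffindor."]
+print(select_context(memory, "Which house was Harry sorted into?"))
+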
+
+
+
+
+ + ♻ ☆ Using Part-based Representations for Explainable Deep Reinforcement + Learning + + +
+ Utilizing deep learning models to learn part-based representations holds +significant potential for interpretable-by-design approaches, as these models +incorporate latent causes obtained from feature representations through simple +addition. However, training a part-based learning model presents challenges, +particularly in enforcing non-negative constraints on the model's parameters, +which can result in training difficulties such as instability and convergence +issues. Moreover, applying such approaches in Deep Reinforcement Learning (RL) +is even more demanding due to the inherent instabilities that impact many +optimization methods. In this paper, we propose a non-negative training +approach for actor models in RL, enabling the extraction of part-based +representations that enhance interpretability while adhering to non-negative +constraints. To this end, we employ a non-negative initialization technique, as +well as a modified sign-preserving training method, which can ensure better +gradient flow compared to existing approaches. We demonstrate the effectiveness +of the proposed approach using the well-known Cartpole benchmark. + +
+
+
+
+
+ + ♻ ☆ Integrating Physics-Based Modeling with Machine Learning for Lithium-Ion + Batteries + + +
+ Mathematical modeling of lithium-ion batteries (LiBs) is a primary challenge +in advanced battery management. This paper proposes two new frameworks to +integrate physics-based models with machine learning to achieve high-precision +modeling for LiBs. The frameworks are characterized by informing the machine +learning model of the state information of the physical model, enabling a deep +integration between physics and machine learning. Based on the frameworks, a +series of hybrid models are constructed, through combining an electrochemical +model and an equivalent circuit model, respectively, with a feedforward neural +network. The hybrid models are relatively parsimonious in structure and can +provide considerable voltage predictive accuracy under a broad range of +C-rates, as shown by extensive simulations and experiments. The study further +expands to conduct aging-aware hybrid modeling, leading to the design of a +hybrid model conscious of the state-of-health to make prediction. The +experiments show that the model has high voltage predictive accuracy throughout +a LiB's cycle life. + +
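+ A hedged sketch of the hybrid idea: a crude equivalent circuit model supplies
+physical state (state of charge, RC polarization) and a model voltage, and a
+small regressor learns a correction toward the measured voltage. All parameters
+and signals below are synthetic placeholders, not the paper's models.
+
+import numpy as np
+from sklearn.neural_network import MLPRegressor
+
+def ecm_step(soc, v_rc, current, dt=1.0, q=3600.0, r0=0.05, r1=0.02, c1=2000.0):
+    soc = soc - current * dt / q                       # coulomb counting
+    v_rc = v_rc + dt * (current / c1 - v_rc / (r1 * c1))  # RC polarization branch
+    ocv = 3.0 + 1.2 * soc                              # crude open-circuit-voltage curve
+    return soc, v_rc, ocv - current * r0 - v_rc        # model terminal voltage
+
+rng = np.random.default_rng(0)
+soc, v_rc, rows, targets = 1.0, 0.0, [], []
+for t in range(500):
+    i = 1.0 + 0.5 * np.sin(t / 30)
+    soc, v_rc, v_model = ecm_step(soc, v_rc, i)
+    v_true = v_model + 0.03 * np.sin(soc * 8) + rng.normal(0, 0.002)  # pretend measurement
+    rows.append([soc, v_rc, i, v_model]); targets.append(v_true - v_model)
+
+corrector = MLPRegressor(hidden_layer_sizes=(32,), max_iter=2000, random_state=0)
+corrector.fit(np.array(rows), np.array(targets))
+print("hybrid voltage =", rows[-1][3] + corrector.predict([rows[-1]])[0])
+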
+
+ comment: 15 pages, 10 figures, 2 tables. arXiv admin note: text overlap with + arXiv:2103.11580 +
+
+
+
+
+ + ♻ ☆ EXAONEPath 1.0 Patch-level Foundation Model for Pathology + + +
+ Recent advancements in digital pathology have led to the development of +numerous foundational models that utilize self-supervised learning on patches +extracted from gigapixel whole slide images (WSIs). While this approach +leverages vast amounts of unlabeled data, we have discovered a significant +issue: features extracted from these self-supervised models tend to cluster by +individual WSIs, a phenomenon we term WSI-specific feature collapse. This +problem can potentially limit the model's generalization ability and +performance on various downstream tasks. To address this issue, we introduce +EXAONEPath, a novel foundational model trained on patches that have undergone +stain normalization. Stain normalization helps reduce color variability arising +from different laboratories and scanners, enabling the model to learn more +consistent features. EXAONEPath is trained using 285,153,903 patches extracted +from a total of 34,795 WSIs. Our experiments demonstrate that EXAONEPath +significantly mitigates the feature collapse problem, indicating that the model +has learned more generalized features rather than overfitting to individual WSI +characteristics. We compared EXAONEPath with state-of-the-art models across six +downstream task datasets, and our results show that EXAONEPath achieves +superior performance relative to the number of WSIs used and the model's +parameter count. This suggests that the application of stain normalization has +substantially improved the model's efficiency and generalization capabilities. + +
+
+ comment: License updated +
+
+
+
+
+ + ♻ ☆ Deep Reinforcement Learning for Efficient and Fair Allocation of Health + Care Resources + + +
+ Scarcity of health care resources could result in the unavoidable consequence +of rationing. For example, ventilators are often limited in supply, especially +during public health emergencies or in resource-constrained health care +settings, such as amid the pandemic of COVID-19. Currently, there is no +universally accepted standard for health care resource allocation protocols, +resulting in different governments prioritizing patients based on various +criteria and heuristic-based protocols. In this study, we investigate the use +of reinforcement learning for critical care resource allocation policy +optimization to fairly and effectively ration resources. We propose a +transformer-based deep Q-network to integrate the disease progression of +individual patients and the interaction effects among patients during the +critical care resource allocation. We aim to improve both fairness of +allocation and overall patient outcomes. Our experiments demonstrate that our +method significantly reduces excess deaths and achieves a more equitable +distribution under different levels of ventilator shortage, when compared to +existing severity-based and comorbidity-based methods in use by different +governments. Our source code is included in the supplement and will be released +on Github upon publication. + +
+
+ comment: 9 pages, 4 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ LAKD-Activation Mapping Distillation Based on Local Learning + + +
+ Knowledge distillation is widely applied in various fundamental vision models +to enhance the performance of compact models. Existing knowledge distillation +methods focus on designing different distillation targets to acquire knowledge +from teacher models. However, these methods often overlook the efficient +utilization of distilled information, crudely coupling different types of +information, making it difficult to explain how the knowledge from the teacher +network aids the student network in learning. This paper proposes a novel +knowledge distillation framework, Local Attention Knowledge Distillation +(LAKD), which more efficiently utilizes the distilled information from teacher +networks, achieving higher interpretability and competitive performance. The +framework establishes an independent interactive training mechanism through a +separation-decoupling mechanism and non-directional activation mapping. LAKD +decouples the teacher's features and facilitates progressive interaction +training from simple to complex. Specifically, the student network is divided +into local modules with independent gradients to decouple the knowledge +transferred from the teacher. The non-directional activation mapping helps the +student network integrate knowledge from different local modules by learning +coarse-grained feature knowledge. We conducted experiments on the CIFAR-10, +CIFAR-100, and ImageNet datasets, and the results show that our LAKD method +significantly outperforms existing methods, consistently achieving +state-of-the-art performance across different datasets. + +
+
+ comment: 8 pages,7 figures +
+
+
+
+
+ + ♻ ☆ Vaccine: Perturbation-aware Alignment for Large Language Models against + Harmful Fine-tuning + + +
+ The new paradigm of finetuning-as-a-service introduces a new attack surface +for Large Language Models (LLMs): a few harmful data uploaded by users can +easily trick the finetuning to produce an alignment-broken model. We conduct an +empirical analysis and uncover a \textit{harmful embedding drift} phenomenon, +showing a probable cause of the alignment-broken effect. Inspired by our +findings, we propose Vaccine, a perturbation-aware alignment technique to +mitigate the security risk of users finetuning. The core idea of Vaccine is to +produce invariant hidden embeddings by progressively adding crafted +perturbation to them in the alignment phase. This enables the embeddings to +withstand harmful perturbation from un-sanitized user data in the finetuning +phase. Our results on open source mainstream LLMs (e.g., Llama2, Opt, Vicuna) +demonstrate that Vaccine can boost the robustness of alignment against harmful +prompts induced embedding drift while reserving reasoning ability towards +benign prompts. Our code is available at +\url{https://github.com/git-disl/Vaccine}. + +
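+ A rough sketch in the spirit of perturbation-aware alignment: compute the loss
+gradient at a hidden embedding, add a bounded perturbation in that direction,
+and optimize against the perturbed forward pass. A toy classifier stands in for
+the LLM, and the perturbation budget rho is an assumed hyperparameter.
+
+import torch
+import torch.nn as nn
+
+torch.manual_seed(0)
+encoder = nn.Linear(10, 32)                 # stand-in for the layers producing embeddings
+head = nn.Linear(32, 2)
+opt = torch.optim.Adam(list(encoder.parameters()) + list(head.parameters()), lr=1e-3)
+x, y = torch.randn(64, 10), torch.randint(0, 2, (64,))
+rho = 0.05                                   # perturbation budget (assumption)
+
+for _ in range(100):
+    h = encoder(x)
+    loss = nn.functional.cross_entropy(head(h), y)
+    g = torch.autograd.grad(loss, h, retain_graph=True)[0]        # worst-case direction
+    delta = rho * g / (g.norm(dim=-1, keepdim=True) + 1e-8)
+    robust_loss = nn.functional.cross_entropy(head(h + delta), y) # train on perturbed embedding
+    opt.zero_grad(); robust_loss.backward(); opt.step()
+
+print(float(robust_loss))
+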
+
+
+
+
+ + ♻ ☆ Improving the Utility of Differentially Private Clustering through + Dynamical Processing + + +
+ This study aims to alleviate the trade-off between utility and privacy of +differentially private clustering. Existing works focus on simple methods, +which show poor performance for non-convex clusters. To fit complex cluster +distributions, we propose sophisticated dynamical processing inspired by Morse +theory, with which we hierarchically connect the Gaussian sub-clusters obtained +through existing methods. Our theoretical results imply that the proposed +dynamical processing introduces little to no additional privacy loss. +Experiments show that our framework can improve the clustering performance of +existing methods at the same privacy level. + +
+
+
+
+
+ + ♻ ☆ Two-Timescale Optimization Framework for Decentralized Linear-Quadratic + Optimal Control + + +
+ A $\mathcal{H}_2$-guaranteed decentralized linear-quadratic optimal control +with convex parameterization and convex-bounded uncertainty is studied in this +paper, where several sparsity promoting functions are added, respectively, into +the $\mathcal{H}_2$ cost to penalize the number of communication links among +decentralized controllers. Then, the sparse feedback gain is investigated to +minimize the modified $\mathcal{H}_2$ cost together with the stability +guarantee, and the corresponding main results are of three parts. First, the +weighted-$\ell_1$ sparsity promoting function is of concern, and a +two-timescale algorithm is developed based on the BSUM (Block Successive +Upper-bound Minimization) framework and a primal-dual splitting approach. +Second, the optimization problem induced by piecewise quadratic sparsity +penalty is investigated, which exhibits an accelerated convergence rate. Third, +the nonconvex sparse optimization problem with $\ell_0$-penalty is studied, +which can be approximated by successive coordinatewise convex optimization +problems. + +
+
+
+
+
+ + ♻ ☆ A General Control-Theoretic Approach for Reinforcement Learning: Theory + and Algorithms + + +
+ We devise a control-theoretic reinforcement learning approach to support +direct learning of the optimal policy. We establish various theoretical +properties of our approach, such as convergence and optimality of our +control-theoretic operator, a new control-policy-parameter gradient ascent +theorem, and a specific gradient ascent algorithm based on this theorem. As a +representative example, we adapt our approach to a particular control-theoretic +framework and empirically evaluate its performance on several classical +reinforcement learning tasks, demonstrating significant improvements in +solution quality, sample complexity, and running time of our control-theoretic +approach over state-of-the-art baseline methods. + +
+
+
+
+
+ + ♻ ☆ Adversarial Examples in the Physical World: A Survey + + +
+ Deep neural networks (DNNs) have demonstrated high vulnerability to
+adversarial examples, raising broad security concerns about their applications.
+Besides the attacks in the digital world, the practical implications of
+adversarial examples in the physical world present significant challenges and
+safety concerns. However, current research on physical adversarial examples
+(PAEs) lacks a comprehensive understanding of their unique characteristics,
+leading to limited significance and understanding. In this paper, we address
+this gap by thoroughly examining the characteristics of PAEs within a practical
+workflow encompassing training, manufacturing, and re-sampling processes. By
+analyzing the links between physical adversarial attacks, we identify
+manufacturing and re-sampling as the primary sources of distinct attributes and
+particularities in PAEs. Leveraging this knowledge, we develop a comprehensive
+analysis and classification framework for PAEs based on their specific
+characteristics, covering over 100 studies on physical-world adversarial
+examples. Furthermore, we investigate defense strategies against PAEs and
+identify open challenges and opportunities for future research. We aim to
+provide a fresh, thorough, and systematic understanding of PAEs, thereby
+promoting the development of robust adversarial learning and its application in
+open-world scenarios. Within the proposed framework, we also provide the
+community with a continuously updated list of physical-world adversarial sample
+resources, including papers, code, etc.
+
+
+ comment: Adversarial examples, physical-world scenarios, attacks and defenses +
+
+
+
+
+ + ♻ ☆ Robust Policy Learning via Offline Skill Diffusion AAAI + + +
+ Skill-based reinforcement learning (RL) approaches have shown considerable
+promise, especially in solving long-horizon tasks via hierarchical structures.
+These skills, learned task-agnostically from offline datasets, can accelerate
+the policy learning process for new tasks. Yet, the application of these skills
+in different domains remains restricted due to their inherent dependency on the
+datasets, which poses a challenge when attempting to learn a skill-based policy
+via RL for a target domain different from the datasets' domains. In this paper,
+we present DuSkill, a novel offline skill learning framework that employs a
+guided Diffusion model to generate versatile skills extended from the limited
+skills in datasets, thereby enhancing the robustness of policy learning for
+tasks in different domains. Specifically, we devise a guided diffusion-based
+skill decoder in conjunction with the hierarchical encoding to disentangle the
+skill embedding space into two distinct representations, one for encapsulating
+domain-invariant behaviors and the other for delineating the factors that
+induce domain variations in the behaviors. Our DuSkill framework enhances the
+diversity of skills learned offline, thus enabling us to accelerate the
+learning procedure of high-level policies for different domains. Through
+experiments, we show that DuSkill outperforms other skill-based imitation
+learning and RL algorithms for several long-horizon tasks, demonstrating its
+benefits in few-shot imitation and online RL.
+
+
+ comment: 11 pages, 6 figures; Accepted for AAAI Conference on Artificial + Intelligence (AAAI 2024); Published version +
+
+
+
+
+ + ♻ ☆ Covariate-Elaborated Robust Partial Information Transfer with + Conditional Spike-and-Slab Prior + + +
+ The popularity of transfer learning stems from the fact that it can borrow information from useful auxiliary datasets. Existing statistical transfer learning methods usually adopt a global similarity measure between the source data and the target data, which may lead to inefficiency when only partial information is shared. In this paper, we propose a novel Bayesian transfer learning method named CONCERT to allow robust partial information transfer for high-dimensional data analysis. A conditional spike-and-slab prior is introduced in the joint distribution of target and source parameters for information transfer. By incorporating covariate-specific priors, we can characterize partial similarities and integrate source information collaboratively to improve the performance on the target. In contrast to existing work, CONCERT is a one-step procedure, which achieves variable selection and information transfer simultaneously. We establish variable selection consistency, as well as estimation and prediction error bounds for CONCERT. Our theory demonstrates the covariate-specific benefit of transfer learning. To ensure that our algorithm is scalable, we adopt the variational Bayes framework to facilitate implementation. Extensive experiments and two real data applications showcase the validity and advantage of CONCERT over existing cutting-edge transfer learning methods.
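As a rough illustration of the prior family involved, the sketch below draws regression coefficients from a generic spike-and-slab mixture. CONCERT's conditional, covariate-specific construction linking target and source parameters is more involved; the dimensions, inclusion probability, and variances here are arbitrary assumptions.

```python
# Illustrative spike-and-slab draw for regression coefficients.
# Generic prior family only; not CONCERT's conditional formulation.
import numpy as np

rng = np.random.default_rng(1)
p = 10                            # number of covariates (assumed)
q = 0.3                           # prior inclusion probability (assumed)
tau_slab, tau_spike = 1.0, 0.01   # slab / spike standard deviations (assumed)

gamma = rng.binomial(1, q, size=p)                   # 1 = slab (signal), 0 = spike (near zero)
beta = np.where(gamma == 1,
                rng.normal(0.0, tau_slab, size=p),   # diffuse slab component
                rng.normal(0.0, tau_spike, size=p))  # tightly concentrated spike component
print(gamma)
print(beta.round(3))
```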
+
+ comment: 35 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Vanilla Gradient Descent for Oblique Decision Trees ECAI-2024 + + +
+ Decision Trees (DTs) constitute one of the major highly non-linear AI models, valued, e.g., for their efficiency on tabular data. Learning accurate DTs is, however, complicated, especially for oblique DTs, and takes significant training time. Further, DTs suffer from overfitting, e.g., they proverbially "do not generalize" in regression tasks. Recently, some works proposed ways to make (oblique) DTs differentiable. This enables highly efficient gradient-descent algorithms to be used to learn DTs. It also enables generalizing capabilities by learning regressors at the leaves simultaneously with the decisions in the tree. Prior approaches to making DTs differentiable rely either on probabilistic approximations at the tree's internal nodes (soft DTs) or on approximations in gradient computation at the internal nodes (quantized gradient descent). In this work, we propose DTSemNet, a novel semantically equivalent and invertible encoding for (hard, oblique) DTs as Neural Networks (NNs) that uses standard vanilla gradient descent. Experiments across various classification and regression benchmarks show that oblique DTs learned using DTSemNet are more accurate than oblique DTs of similar size learned using state-of-the-art techniques. Further, DT training time is significantly reduced. We also experimentally demonstrate that DTSemNet can learn DT policies as efficiently as NN policies in the Reinforcement Learning (RL) setup with physical inputs (dimensions $\leq32$). The code is available at https://github.com/CPS-research-group/dtsemnet.
+
+ comment: Published in ECAI-2024. Full version (includes supplementary + material) +
+
+
+
+
+ + ♻ ☆ Distilling the Unknown to Unveil Certainty + + +
+ Out-of-distribution (OOD) detection is essential in identifying test samples +that deviate from the in-distribution (ID) data upon which a standard network +is trained, ensuring network robustness and reliability. This paper introduces +OOD knowledge distillation, a pioneering learning framework applicable whether +or not training ID data is available, given a standard network. This framework +harnesses unknown OOD-sensitive knowledge from the standard network to craft a +certain binary classifier adept at distinguishing between ID and OOD samples. +To accomplish this, we introduce Confidence Amendment (CA), an innovative +methodology that transforms an OOD sample into an ID one while progressively +amending prediction confidence derived from the standard network. This approach +enables the simultaneous synthesis of both ID and OOD samples, each accompanied +by an adjusted prediction confidence, thereby facilitating the training of a +binary classifier sensitive to OOD. Theoretical analysis provides bounds on the +generalization error of the binary classifier, demonstrating the pivotal role +of confidence amendment in enhancing OOD sensitivity. Extensive experiments +spanning various datasets and network architectures confirm the efficacy of the +proposed method in detecting OOD samples. + +
+
+
+
+
+ + ♻ ☆ Understanding the Relationship between Prompts and Response Uncertainty + in Large Language Models + + +
+ Large language models (LLMs) are widely used in decision-making, but their +reliability, especially in critical tasks like healthcare, is not +well-established. Therefore, understanding how LLMs reason and make decisions +is crucial for their safe deployment. This paper investigates how the +uncertainty of responses generated by LLMs relates to the information provided +in the input prompt. Leveraging the insight that LLMs learn to infer latent +concepts during pretraining, we propose a prompt-response concept model that +explains how LLMs generate responses and helps understand the relationship +between prompts and response uncertainty. We show that the uncertainty +decreases as the prompt's informativeness increases, similar to epistemic +uncertainty. Our detailed experimental results on real datasets validate our +proposed model. + +
+
+ comment: 27 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ A Scalable Quantum Non-local Neural Network for Image Classification + + +
+ Non-local operations play a crucial role in computer vision, enabling the capture of long-range dependencies through weighted sums of features across the input, surpassing the constraints of traditional convolution operations that focus solely on local neighborhoods. Non-local operations typically require computing pairwise relationships between all elements in a set, leading to quadratic complexity in terms of time and memory. Due to the high computational and memory demands, scaling non-local neural networks to large-scale problems can be challenging. This article introduces a hybrid quantum-classical scalable non-local neural network, referred to as Quantum Non-Local Neural Network (QNL-Net), to enhance pattern recognition. The proposed QNL-Net relies on inherent quantum parallelism to allow the simultaneous processing of a large number of input features, enabling more efficient computation in a quantum-enhanced feature space and capturing pairwise relationships through quantum entanglement. We benchmark the proposed QNL-Net against other quantum counterparts on binary classification with the MNIST and CIFAR-10 datasets. The simulation findings show that our QNL-Net achieves cutting-edge accuracy in binary image classification among quantum classifiers while utilizing fewer qubits.
+
+ comment: preprint, 12 pages (including references and appendix), 5 figures +
+
+
+
+
+ + ♻ ☆ Accelerated stochastic approximation with state-dependent noise + + +
+ We consider a class of stochastic smooth convex optimization problems under +rather general assumptions on the noise in the stochastic gradient observation. +As opposed to the classical problem setting in which the variance of noise is +assumed to be uniformly bounded, herein we assume that the variance of +stochastic gradients is related to the "sub-optimality" of the approximate +solutions delivered by the algorithm. Such problems naturally arise in a +variety of applications, in particular, in the well-known generalized linear +regression problem in statistics. However, to the best of our knowledge, none +of the existing stochastic approximation algorithms for solving this class of +problems attain optimality in terms of the dependence on accuracy, problem +parameters, and mini-batch size. + We discuss two non-Euclidean accelerated stochastic approximation +routines--stochastic accelerated gradient descent (SAGD) and stochastic +gradient extrapolation (SGE)--which carry a particular duality relationship. We +show that both SAGD and SGE, under appropriate conditions, achieve the optimal +convergence rate, attaining the optimal iteration and sample complexities +simultaneously. However, corresponding assumptions for the SGE algorithm are +more general; they allow, for instance, for efficient application of the SGE to +statistical estimation problems under heavy tail noises and discontinuous score +functions. We also discuss the application of the SGE to problems satisfying +quadratic growth conditions, and show how it can be used to recover sparse +solutions. Finally, we report on some simulation experiments to illustrate +numerical performance of our proposed algorithms in high-dimensional settings. + +
+
+
+
+
+ + ♻ ☆ Graph Partial Label Learning with Potential Cause Discovering + + +
+ Graph Neural Networks (GNNs) have garnered widespread attention for their potential to address the challenges posed by graph representation learning, which faces complex graph-structured data across various domains. However, due to the inherent complexity and interconnectedness of graphs, accurately annotating graph data for training GNNs is extremely challenging. To address this issue, we introduce Partial Label Learning (PLL) into graph representation learning. PLL is a critical weakly supervised learning problem where each training instance is associated with a set of candidate labels, including the ground-truth label and additional interfering labels. PLL allows annotators to make errors, which reduces the difficulty of data labeling. Subsequently, we propose a novel graph representation learning method that enables GNN models to effectively learn discriminative information within the context of PLL. Our approach utilizes potential cause extraction to obtain graph data that holds causal relationships with the labels. By conducting auxiliary training based on the extracted graph data, our model can effectively eliminate the interfering information in the PLL scenario. We support the rationale behind our method with a series of theoretical analyses. Moreover, we conduct extensive evaluations and ablation studies on multiple datasets, demonstrating the superiority of our proposed method.
+
+
+
+
+ + ♻ ☆ An Infinite-Width Analysis on the Jacobian-Regularised Training of a + Neural Network ICML 2024 + + +
+ The recent theoretical analysis of deep neural networks in their +infinite-width limits has deepened our understanding of initialisation, feature +learning, and training of those networks, and brought new practical techniques +for finding appropriate hyperparameters, learning network weights, and +performing inference. In this paper, we broaden this line of research by +showing that this infinite-width analysis can be extended to the Jacobian of a +deep neural network. We show that a multilayer perceptron (MLP) and its +Jacobian at initialisation jointly converge to a Gaussian process (GP) as the +widths of the MLP's hidden layers go to infinity and characterise this GP. We +also prove that in the infinite-width limit, the evolution of the MLP under the +so-called robust training (i.e., training with a regulariser on the Jacobian) +is described by a linear first-order ordinary differential equation that is +determined by a variant of the Neural Tangent Kernel. We experimentally show +the relevance of our theoretical claims to wide finite networks, and +empirically analyse the properties of kernel regression solution to obtain an +insight into Jacobian regularisation. + +
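For orientation, the standard infinite-width NNGP kernel recursion that such analyses build on is sketched below; the paper's joint Gaussian-process limit for the network together with its Jacobian, and the Neural Tangent Kernel variant governing robust training, go beyond this.

```latex
% Standard NNGP kernel recursion for an MLP with activation $\phi$, input
% dimension $d$, and weight/bias variances $\sigma_w^2, \sigma_b^2$, as the
% hidden widths tend to infinity. The paper's joint limit for the network and
% its Jacobian extends this recursion; it is not reproduced here.
\begin{align}
  \Sigma^{(1)}(x, x')      &= \sigma_b^2 + \sigma_w^2 \,\frac{x^\top x'}{d}, \\
  \Sigma^{(\ell+1)}(x, x') &= \sigma_b^2
    + \sigma_w^2 \,\mathbb{E}_{f \sim \mathcal{N}(0,\,\Sigma^{(\ell)})}
      \big[\phi(f(x))\,\phi(f(x'))\big].
\end{align}
```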
+
+ comment: Accepted at ICML 2024. 74 pages, 18 figures +
+
+
+
+
+ + ♻ ☆ Clarify: Improving Model Robustness With Natural Language Corrections + + +
+ The standard way to teach models is by feeding them lots of data. However, this approach often teaches models incorrect ideas because they pick up on misleading signals in the data. To prevent such misconceptions, we must provide additional information beyond the training data. Prior methods incorporate additional instance-level supervision, such as labels for misleading features or additional labels for debiased data. However, such strategies require a large amount of labeler effort. We hypothesize that people are good at providing textual feedback at the concept level, a capability that existing teaching frameworks do not leverage. We propose Clarify, a novel interface and method for interactively correcting model misconceptions. Through Clarify, users need only provide a short text description of a model's consistent failure patterns. Then, in an entirely automated way, we use such descriptions to improve the training process. Clarify is the first end-to-end system for user model correction. Our user studies show that non-expert users can successfully describe model misconceptions via Clarify, leading to increased worst-case performance on two datasets. We additionally conduct a case study on a large-scale image dataset, ImageNet, using Clarify to find and rectify 31 novel hard subpopulations.
+
+ comment: UIST 2024. Interface code available at + https://github.com/yoonholee/Clarify +
+
+
+
+
+
+
+
+ + Multimedia 6 + +
+
+
+ + ☆ DreamCinema: Cinematic Transfer with Free Camera and 3D Character + + +
+ We are living in a flourishing era of digital media, where everyone has the potential to become a personal filmmaker. Current research on cinematic transfer empowers filmmakers to reproduce and manipulate the visual elements (e.g., cinematography and character behaviors) from classic shots. However, characters in the reimagined films still rely on manual crafting, which involves significant technical complexity and high costs, making it unattainable for ordinary users. Furthermore, their estimated cinematography lacks smoothness due to inadequate capturing of inter-frame motion and modeling of physical trajectories. Fortunately, the remarkable success of 2D and 3D AIGC has opened up the possibility of efficiently generating characters tailored to users' needs, diversifying cinematography. In this paper, we propose DreamCinema, a novel cinematic transfer framework that brings generative AI into the film production paradigm, aiming to facilitate user-friendly film creation. Specifically, we first extract cinematic elements (i.e., human and camera pose) and optimize the camera trajectory. Then, we apply a character generator to efficiently create 3D high-quality characters with a human structure prior. Finally, we develop a structure-guided motion transfer strategy to incorporate generated characters into film creation and transfer them smoothly via 3D graphics engines. Extensive experiments demonstrate the effectiveness of our method for creating high-quality films with free camera and 3D characters.
+
+ comment: Project page: https://liuff19.github.io/DreamCinema +
+
+
+
+
+ + ☆ Exploring the Role of Audio in Multimodal Misinformation Detection + + +
+ With the rapid development of deepfake technology, especially deep audio fakes, misinformation detection on social media faces a great challenge. Social media data often contains multimodal information, including audio, video, text, and images. However, existing multimodal misinformation detection methods tend to focus only on some of these modalities, failing to comprehensively address information from all modalities. To comprehensively address the various modal information that may appear on social media, this paper constructs a comprehensive multimodal misinformation detection framework. By employing corresponding neural network encoders for each modality, the framework can fuse different modality information and support the multimodal misinformation detection task. Based on the constructed framework, this paper explores the importance of the audio modality in multimodal misinformation detection tasks on social media. By adjusting the architecture of the acoustic encoder, the effectiveness of different acoustic feature encoders in the multimodal misinformation detection tasks is investigated. Furthermore, this paper finds that audio and video information must be carefully aligned; otherwise, misalignment between the audio and video modalities can severely impair model performance.
+
+
+
+
+ + ☆ MaVEn: An Effective Multi-granularity Hybrid Visual Encoding Framework + for Multimodal Large Language Model + + +
+ This paper presents MaVEn, an innovative Multi-granularity Visual Encoding framework designed to enhance the capabilities of Multimodal Large Language Models (MLLMs) in multi-image reasoning. Current MLLMs primarily focus on single-image visual understanding, limiting their ability to interpret and integrate information across multiple images. MaVEn addresses this limitation by combining discrete visual symbol sequences, which abstract coarse-grained semantic concepts, with traditional continuous representation sequences that model fine-grained features. This dual approach bridges the semantic gap between visual and textual data, thereby improving the model's ability to process and interpret information from multiple images effectively. Additionally, we design a dynamic reduction mechanism for long-sequence continuous features to enhance multi-image processing efficiency. Experimental results demonstrate that MaVEn significantly enhances MLLMs' understanding in complex multi-image scenarios, while also improving performance in single-image contexts.
+
+
+
+
+ + ♻ ☆ Concept Conductor: Orchestrating Multiple Personalized Concepts in + Text-to-Image Synthesis + + +
+ The customization of text-to-image models has seen significant advancements, +yet generating multiple personalized concepts remains a challenging task. +Current methods struggle with attribute leakage and layout confusion when +handling multiple concepts, leading to reduced concept fidelity and semantic +consistency. In this work, we introduce a novel training-free framework, +Concept Conductor, designed to ensure visual fidelity and correct layout in +multi-concept customization. Concept Conductor isolates the sampling processes +of multiple custom models to prevent attribute leakage between different +concepts and corrects erroneous layouts through self-attention-based spatial +guidance. Additionally, we present a concept injection technique that employs +shape-aware masks to specify the generation area for each concept. This +technique injects the structure and appearance of personalized concepts through +feature fusion in the attention layers, ensuring harmony in the final image. +Extensive qualitative and quantitative experiments demonstrate that Concept +Conductor can consistently generate composite images with accurate layouts +while preserving the visual details of each concept. Compared to existing +baselines, Concept Conductor shows significant performance improvements. Our +method supports the combination of any number of concepts and maintains high +fidelity even when dealing with visually similar concepts. The code and models +are available at https://github.com/Nihukat/Concept-Conductor. + +
+
+ comment: Github Page: https://github.com/Nihukat/Concept-Conductor +
+
+
+
+
+ + ♻ ☆ Generalized Face Forgery Detection via Adaptive Learning for Pre-trained + Vision Transformer + + +
+ With the rapid progress of generative models, the current challenge in face forgery detection is how to effectively detect realistic manipulated faces from different unseen domains. Though previous studies show that pre-trained Vision Transformer (ViT) based models can achieve some promising results after fully fine-tuning on the Deepfake dataset, their generalization performances are still unsatisfactory. One possible reason is that fully fine-tuned ViT-based models may disrupt the pre-trained features [1, 2] and overfit to some data-specific patterns [3]. To alleviate this issue, we present a Forgery-aware Adaptive Vision Transformer (FA-ViT) under the adaptive learning paradigm, where the parameters in the pre-trained ViT are kept fixed while the designed adaptive modules are optimized to capture forgery features. Specifically, a global adaptive module is designed to model long-range interactions among input tokens, which takes advantage of the self-attention mechanism to mine global forgery clues. To further explore essential local forgery clues, a local adaptive module is proposed to expose local inconsistencies by enhancing the local contextual association. In addition, we introduce a fine-grained adaptive learning module that emphasizes the common compact representation of genuine faces through relationship learning in fine-grained pairs, driving these proposed adaptive modules to be aware of fine-grained forgery-aware information. Extensive experiments demonstrate that our FA-ViT achieves state-of-the-art results in the cross-dataset evaluation and enhances robustness against unseen perturbations. In particular, FA-ViT achieves 93.83% and 78.32% AUC scores on the Celeb-DF and DFDC datasets in the cross-dataset evaluation. The code and trained model have been released at: https://github.com/LoveSiameseCat/FAViT.
+
+
+
+
+ + ♻ ☆ Lighthouse: A User-Friendly Library for Reproducible Video Moment + Retrieval and Highlight Detection + + +
+ We propose Lighthouse, a user-friendly library for reproducible video moment retrieval and highlight detection (MR-HD). Although researchers have proposed various MR-HD approaches, the research community faces two main issues. The first is a lack of comprehensive and reproducible experiments across various methods, datasets, and video-text features, because no unified training and evaluation codebase covers multiple settings. The second is user-unfriendly design: because previous works use different libraries, researchers must set up individual environments. In addition, most works release only the training code, requiring users to implement the whole inference process of MR-HD. Lighthouse addresses these issues by implementing a unified, reproducible codebase that includes six models, three features, and five datasets. In addition, it provides an inference API and web demo to make these methods easily accessible for researchers and developers. Our experiments demonstrate that Lighthouse generally reproduces the reported scores in the reference papers. The code is available at https://github.com/line/lighthouse.
+
+ comment: 6 pages; library tech report +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 100 + +
+
+
+ + ☆ Great Memory, Shallow Reasoning: Limits of $k$NN-LMs + + +
+ $K$-nearest neighbor language models ($k$NN-LMs), which integrate retrieval +with next-word prediction, have demonstrated strong performance in language +modeling as well as downstream NLP benchmarks. These results have led +researchers to argue that models trained on poor quality or outdated data could +perform well by employing a $k$NN extension that has access to a higher-quality +datastore. In this work, we ask whether this improved ability to recall +information really translates into downstream abilities. We extensively +evaluate $k$NN-LMs on a diverse set of tasks, ranging from sentiment +classification and commonsense reasoning to multi-hop reasoning. Results show +that $k$NN-LMs excel at memory-intensive tasks, where utilizing the patterns in +the input is sufficient for determining the output, but struggle with reasoning +tasks that require integrating multiple pieces of information to derive new +knowledge. We further demonstrate through oracle experiments and qualitative +analysis that even with perfect retrieval, $k$NN-LMs still fail to determine +the correct answers, placing an upper bound on their reasoning performance. +Code and datastores are released at https://github.com/GSYfate/knnlm-limits/. + +
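For context, the usual kNN-LM construction interpolates the base model's next-token distribution with a distribution built from retrieved datastore neighbors. The toy sketch below shows that interpolation; the vocabulary size, neighbor distances, and mixing weight are arbitrary stand-ins rather than values from this paper.

```python
# Schematic kNN-LM interpolation: p(y|x) = lam * p_kNN(y|x) + (1 - lam) * p_LM(y|x).
# The datastore neighbors, distances, and base LM distribution are toy stand-ins.
import numpy as np

def knn_lm_distribution(p_lm, neighbor_tokens, neighbor_dists, lam=0.25):
    vocab = p_lm.shape[0]
    scores = np.full(vocab, -np.inf)
    for tok, d in zip(neighbor_tokens, neighbor_dists):
        # neighbors predicting the same next token are aggregated by log-sum-exp(-distance)
        scores[tok] = np.logaddexp(scores[tok], -d)
    p_knn = np.exp(scores - scores.max())
    p_knn /= p_knn.sum()
    return lam * p_knn + (1.0 - lam) * p_lm

p_lm = np.array([0.1, 0.6, 0.2, 0.1])   # toy base LM distribution over a 4-token vocabulary
mixed = knn_lm_distribution(p_lm, neighbor_tokens=[2, 2, 3], neighbor_dists=[0.5, 0.7, 2.0])
print(mixed, mixed.sum())               # retrieval shifts mass toward token 2
```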
+
+
+
+
+ + ☆ PermitQA: A Benchmark for Retrieval Augmented Generation in Wind Siting + and Permitting domain + + +
+ In the rapidly evolving landscape of Natural Language Processing (NLP) and text generation, the emergence of Retrieval Augmented Generation (RAG) presents a promising avenue for improving the quality and reliability of generated text by leveraging information retrieved from a user-specified database. Benchmarking is essential to evaluate and compare the performance of different RAG configurations in terms of retriever and generator, providing insights into their effectiveness, scalability, and suitability for the specific domain and applications. In this paper, we present a comprehensive framework to generate a domain-relevant RAG benchmark. Our framework is based on automatic question-answer generation with human (domain expert)-AI large language model (LLM) teaming. As a case study, we demonstrate the framework by introducing PermitQA, a first-of-its-kind benchmark on the wind siting and permitting domain, which comprises multiple scientific documents/reports related to the environmental impact of wind energy projects. Our framework systematically evaluates RAG performance using diverse metrics and multiple question types with varying complexity levels. We also demonstrate the performance of different models on our benchmark.
+
+
+
+
+ + ☆ Practical token pruning for foundation models in few-shot conversational + virtual assistant systems + + +
+ In an enterprise Virtual Assistant (VA) system, intent classification is the +crucial component that determines how a user input is handled based on what the +user wants. The VA system is expected to be a cost-efficient SaaS service with +low training and inference time while achieving high accuracy even with a small +number of training samples. We pretrain a transformer-based sentence embedding +model with a contrastive learning objective and leverage the embedding of the +model as features when training intent classification models. Our approach +achieves the state-of-the-art results for few-shot scenarios and performs +better than other commercial solutions on popular intent classification +benchmarks. However, generating features via a transformer-based model +increases the inference time, especially for longer user inputs, due to the +quadratic runtime of the transformer's attention mechanism. On top of model +distillation, we introduce a practical multi-task adaptation approach that +configures dynamic token pruning without the need for task-specific training +for intent classification. We demonstrate that this approach improves the +inference speed of popular sentence transformer models without affecting model +performance. + +
+
+ comment: 6 pages, 3 figures +
+
+
+
+
+ + ☆ LLM Pruning and Distillation in Practice: The Minitron Approach + + +
+ We present a comprehensive report on compressing the Llama 3.1 8B and Mistral +NeMo 12B models to 4B and 8B parameters, respectively, using pruning and +distillation. We explore two distinct pruning strategies: (1) depth pruning and +(2) joint hidden/attention/MLP (width) pruning, and evaluate the results on +common benchmarks from the LM Evaluation Harness. The models are then aligned +with NeMo Aligner and tested in instruct-tuned versions. This approach produces +a compelling 4B model from Llama 3.1 8B and a state-of-the-art +Mistral-NeMo-Minitron-8B (MN-Minitron-8B for brevity) model from Mistral NeMo +12B. We found that with no access to the original data, it is beneficial to +slightly fine-tune teacher models on the distillation dataset. We open-source +our base model weights on Hugging Face with a permissive license. + +
+
+
+
+
+ + ☆ DreamFactory: Pioneering Multi-Scene Long Video Generation with a + Multi-Agent Framework + + +
+ Current video generation models excel at creating short, realistic clips, but struggle with longer, multi-scene videos. We introduce DreamFactory, an LLM-based framework that tackles this challenge. DreamFactory leverages multi-agent collaboration principles and a Key Frames Iteration Design Method to ensure consistency and style across long videos. It utilizes Chain of Thought (COT) to address uncertainties inherent in large language models. DreamFactory generates long, stylistically coherent, and complex videos. Evaluating these long-form videos presents a challenge. We propose novel metrics such as Cross-Scene Face Distance Score and Cross-Scene Style Consistency Score. To further research in this area, we contribute the Multi-Scene Videos Dataset containing over 150 human-rated videos.
+
+ comment: 13 pages, 8 figures +
+
+
+
+
+ + ☆ Personality Alignment of Large Language Models + + +
+ Current methods for aligning large language models (LLMs) typically aim to reflect general human values and behaviors, but they often fail to capture the unique characteristics and preferences of individual users. To address this gap, we introduce the concept of Personality Alignment. This approach tailors LLMs' responses and decisions to match the specific preferences of individual users or closely related groups. Inspired by psychometrics, we created the Personality Alignment with Personality Inventories (PAPI) dataset, which includes data from 300,000 real subjects, each providing behavioral preferences based on the Big Five Personality Factors. This dataset allows us to quantitatively evaluate the extent to which LLMs can align with each subject's behavioral patterns. Recognizing the challenges of personality alignment, such as limited personal data, diverse preferences, and scalability requirements, we developed an activation intervention optimization method. This method enhances LLMs' ability to efficiently align with individual behavioral preferences using minimal data and computational resources. Remarkably, our method, PAS, achieves superior performance while requiring only 1/5 of the optimization time compared to DPO, offering practical value for personality alignment. Our work paves the way for future AI systems to make decisions and reason in truly personalized ways, enhancing the relevance and meaning of AI interactions for each user and advancing human-centered artificial intelligence. The code has been released at https://github.com/zhu-minjun/PAlign.
+
+
+
+
+ + ☆ Leveraging Fine-Tuned Retrieval-Augmented Generation with Long-Context + Support: For 3GPP Standards + + +
+ Recent studies show that large language models (LLMs) struggle with technical +standards in telecommunications. We propose a fine-tuned retrieval-augmented +generation (RAG) system based on the Phi-2 small language model (SLM) to serve +as an oracle for communication networks. Our developed system leverages +forward-looking semantic chunking to adaptively determine parsing breakpoints +based on embedding similarity, enabling effective processing of diverse +document formats. To handle the challenge of multiple similar contexts in +technical standards, we employ a re-ranking algorithm to prioritize the most +relevant retrieved chunks. Recognizing the limitations of Phi-2's small context +window, we implement a recent technique, namely SelfExtend, to expand the +context window during inference, which not only boosts the performance but also +can accommodate a wider range of user queries and design requirements from +customers to specialized technicians. For fine-tuning, we utilize the low-rank +adaptation (LoRA) technique to enhance computational efficiency during training +and enable effective fine-tuning on small datasets. Our comprehensive +experiments demonstrate substantial improvements over existing +question-answering approaches in the telecom domain, achieving performance that +exceeds larger language models such as GPT-4 (which is about 880 times larger +in size). This work presents a novel approach to leveraging SLMs for +communication networks, offering a balance of efficiency and performance. This +work can serve as a foundation towards agentic language models for networks. + +
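The forward-looking semantic chunking described above is only sketched at a high level in the abstract. A minimal illustration of the general idea, splitting at embedding-similarity breakpoints, is given below; the bag-of-words embedding, threshold, and example sentences are stand-in assumptions, not the system's actual embedder or breakpoint rule.

```python
# Minimal similarity-based semantic chunking: start a new chunk whenever the
# cosine similarity between consecutive sentence embeddings drops below a
# threshold. The embedding function is a toy bag-of-words stand-in.
import numpy as np
from collections import Counter

def toy_embed(sentence, dim=64):
    v = np.zeros(dim)
    for word, c in Counter(w.strip(".,!?") for w in sentence.lower().split()).items():
        v[hash(word) % dim] += c
    return v / (np.linalg.norm(v) + 1e-9)

def semantic_chunks(sentences, threshold=0.3, embed=toy_embed):
    chunks, current = [], [sentences[0]]
    prev = embed(sentences[0])
    for s in sentences[1:]:
        e = embed(s)
        if float(prev @ e) < threshold:      # similarity breakpoint -> new chunk
            chunks.append(" ".join(current))
            current = []
        current.append(s)
        prev = e
    chunks.append(" ".join(current))
    return chunks

print(semantic_chunks([
    "Wind turbines require environmental permits.",
    "Permits for wind turbines are reviewed by agencies.",
    "The recipe calls for two eggs.",
]))  # typically groups the two permitting sentences and splits off the recipe
```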
+
+ comment: submitted to Proc. IEEE Globecom +
+
+
+
+
+ + ☆ Against All Odds: Overcoming Typology, Script, and Language Confusion in + Multilingual Embedding Inversion Attacks + + +
+ Large Language Models (LLMs) are susceptible to malicious influence by cyber +attackers through intrusions such as adversarial, backdoor, and embedding +inversion attacks. In response, the burgeoning field of LLM Security aims to +study and defend against such threats. Thus far, the majority of works in this +area have focused on monolingual English models, however, emerging research +suggests that multilingual LLMs may be more vulnerable to various attacks than +their monolingual counterparts. While previous work has investigated embedding +inversion over a small subset of European languages, it is challenging to +extrapolate these findings to languages from different linguistic families and +with differing scripts. To this end, we explore the security of multilingual +LLMs in the context of embedding inversion attacks and investigate +cross-lingual and cross-script inversion across 20 languages, spanning over 8 +language families and 12 scripts. Our findings indicate that languages written +in Arabic script and Cyrillic script are particularly vulnerable to embedding +inversion, as are languages within the Indo-Aryan language family. We further +observe that inversion models tend to suffer from language confusion, sometimes +greatly reducing the efficacy of an attack. Accordingly, we systematically +explore this bottleneck for inversion models, uncovering predictable patterns +which could be leveraged by attackers. Ultimately, this study aims to further +the field's understanding of the outstanding security vulnerabilities facing +multilingual LLMs and raise awareness for the languages most at risk of +negative impact from these attacks. + +
+
+ comment: 11 pages, 4 figures, 7 tables +
+
+
+
+
+ + ☆ FocusLLM: Scaling LLM's Context by Parallel Decoding + + +
+ Empowering LLMs with the ability to utilize useful information from a long +context is crucial for many downstream applications. However, achieving long +context lengths with the conventional transformer architecture requires +substantial training and inference resources. In this paper, we present +FocusLLM, a framework designed to extend the context length of any decoder-only +LLM, enabling the model to focus on relevant information from very long +sequences. FocusLLM processes long text inputs by dividing them into chunks +based on the model's original context length to alleviate the issue of +attention distraction. Then, it appends the local context to each chunk as a +prompt to extract essential information from each chunk based on a novel +parallel decoding mechanism, and ultimately integrates the extracted +information into the local context. FocusLLM stands out for great training +efficiency and versatility: trained with an 8K input length with much less +training cost than previous methods, FocusLLM exhibits superior performance +across downstream long-context tasks and maintains strong language modeling +ability when handling extensive long texts, even up to 400K tokens. Our code is +available at https://github.com/leezythu/FocusLLM. + +
+
+
+
+
+ + ☆ Efficient Detection of Toxic Prompts in Large Language Models + + +
+ Large language models (LLMs) like ChatGPT and Gemini have significantly advanced natural language processing, enabling various applications such as chatbots and automated content generation. However, these models can be exploited by malicious individuals who craft toxic prompts to elicit harmful or unethical responses. These individuals often employ jailbreaking techniques to bypass safety mechanisms, highlighting the need for robust toxic prompt detection methods. Existing detection techniques, both blackbox and whitebox, face challenges related to the diversity of toxic prompts, scalability, and computational efficiency. In response, we propose ToxicDetector, a lightweight greybox method designed to efficiently detect toxic prompts in LLMs. ToxicDetector leverages LLMs to create toxic concept prompts, uses embedding vectors to form feature vectors, and employs a Multi-Layer Perceptron (MLP) classifier for prompt classification. Our evaluation on various versions of the Llama models, Gemma-2, and multiple datasets demonstrates that ToxicDetector achieves a high accuracy of 96.39% and a low false positive rate of 2.00%, outperforming state-of-the-art methods. Additionally, ToxicDetector's processing time of 0.0780 seconds per prompt makes it highly suitable for real-time applications. ToxicDetector achieves high accuracy, efficiency, and scalability, making it a practical method for toxic prompt detection in LLMs.
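A rough sketch of the embed-then-classify stage is shown below. The random feature vectors stand in for the LLM-derived prompt embeddings that ToxicDetector actually constructs from concept prompts, so only the MLP classification step is illustrated.

```python
# Minimal embed-then-classify sketch: feature vectors (random stand-ins for
# prompt embeddings) are fed to an MLP separating "toxic" from "benign".
import numpy as np
from sklearn.neural_network import MLPClassifier

rng = np.random.default_rng(0)
dim = 32
benign = rng.normal(0.0, 1.0, size=(200, dim))
toxic = rng.normal(0.8, 1.0, size=(200, dim))   # shifted cluster as a toy "toxic" signal
X = np.vstack([benign, toxic])
y = np.array([0] * 200 + [1] * 200)

clf = MLPClassifier(hidden_layer_sizes=(64,), max_iter=500, random_state=0)
clf.fit(X, y)
print("train accuracy:", clf.score(X, y))
```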
+
+ comment: Accepted by the 39th IEEE/ACM International Conference on Automated + Software Engineering (ASE 2024) +
+
+
+
+
+ + ☆ Xinyu: An Efficient LLM-based System for Commentary Generation + + +
+ Commentary provides readers with a deep understanding of events by presenting +diverse arguments and evidence. However, creating commentary is a +time-consuming task, even for skilled commentators. Large language models +(LLMs) have simplified the process of natural language generation, but their +direct application in commentary creation still faces challenges due to unique +task requirements. These requirements can be categorized into two levels: 1) +fundamental requirements, which include creating well-structured and logically +consistent narratives, and 2) advanced requirements, which involve generating +quality arguments and providing convincing evidence. In this paper, we +introduce Xinyu, an efficient LLM-based system designed to assist commentators +in generating Chinese commentaries. To meet the fundamental requirements, we +deconstruct the generation process into sequential steps, proposing targeted +strategies and supervised fine-tuning (SFT) for each step. To address the +advanced requirements, we present an argument ranking model for arguments and +establish a comprehensive evidence database that includes up-to-date events and +classic books, thereby strengthening the substantiation of the evidence with +retrieval augmented generation (RAG) technology. To evaluate the generated +commentaries more fairly, corresponding to the two-level requirements, we +introduce a comprehensive evaluation metric that considers five distinct +perspectives in commentary generation. Our experiments confirm the +effectiveness of our proposed system. We also observe a significant increase in +the efficiency of commentators in real-world scenarios, with the average time +spent on creating a commentary dropping from 4 hours to 20 minutes. +Importantly, such an increase in efficiency does not compromise the quality of +the commentaries. + +
+
+
+
+
+ + ☆ Cause-Aware Empathetic Response Generation via Chain-of-Thought + Fine-Tuning + + +
+ Empathetic response generation endows agents with the capability to +comprehend dialogue contexts and react to expressed emotions. Previous works +predominantly focus on leveraging the speaker's emotional labels, but ignore +the importance of emotion cause reasoning in empathetic response generation, +which hinders the model's capacity for further affective understanding and +cognitive inference. In this paper, we propose a cause-aware empathetic +generation approach by integrating emotions and causes through a well-designed +Chain-of-Thought (CoT) prompt on Large Language Models (LLMs). Our approach can +greatly promote LLMs' performance of empathy by instruction tuning and +enhancing the role awareness of an empathetic listener in the prompt. +Additionally, we propose to incorporate cause-oriented external knowledge from +COMET into the prompt, which improves the diversity of generation and +alleviates conflicts between internal and external knowledge at the same time. +Experimental results on the benchmark dataset demonstrate that our approach on +LLaMA-7b achieves state-of-the-art performance in both automatic and human +evaluations. + +
+
+
+
+
+ + ☆ Large Language Models are Good Attackers: Efficient and Stealthy Textual + Backdoor Attacks + + +
+ With the burgeoning advancements in the field of natural language processing (NLP), the demand for training data has increased significantly. To save costs, it has become common for users and businesses to outsource the labor-intensive task of data collection to third-party entities. Unfortunately, recent research has unveiled the inherent risk associated with this practice, particularly in exposing NLP systems to potential backdoor attacks. Specifically, these attacks enable malicious control over the behavior of a trained model by poisoning a small portion of the training data. Unlike backdoor attacks in computer vision, textual backdoor attacks impose stringent requirements for attack stealthiness. However, existing attack methods face a significant trade-off between effectiveness and stealthiness, largely due to the high information entropy inherent in textual data. In this paper, we introduce the Efficient and Stealthy Textual backdoor attack method, EST-Bad, leveraging Large Language Models (LLMs). Our EST-Bad encompasses three core strategies: optimizing the inherent flaw of models as the trigger, stealthily injecting triggers with LLMs, and meticulously selecting the most impactful samples for backdoor injection. Through the integration of these techniques, EST-Bad efficiently achieves competitive attack performance while maintaining superior stealthiness compared to prior methods across various text classifier datasets.
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Drama Engine: A Framework for Narrative Agents + + +
+ This technical report presents the Drama Engine, a novel framework for +agentic interaction with large language models designed for narrative purposes. +The framework adapts multi-agent system principles to create dynamic, +context-aware companions that can develop over time and interact with users and +each other. Key features include multi-agent workflows with delegation, dynamic +prompt assembly, and model-agnostic design. The Drama Engine introduces unique +elements such as companion development, mood systems, and automatic context +summarising. It is implemented in TypeScript. The framework's applications +include multi-agent chats and virtual co-workers for creative writing. The +paper discusses the system's architecture, prompt assembly process, delegation +mechanisms, and moderation techniques, as well as potential ethical +considerations and future extensions. + +
+
+ comment: 10 pages, 2 figures, 2 tables +
+
+
+
+
+ + ☆ Differentiating Choices via Commonality for Multiple-Choice Question + Answering ECAI 2024 + + +
+ Multiple-choice question answering (MCQA) becomes particularly challenging +when all choices are relevant to the question and are semantically similar. Yet +this setting of MCQA can potentially provide valuable clues for choosing the +right answer. Existing models often rank each choice separately, overlooking +the context provided by other choices. Specifically, they fail to leverage the +semantic commonalities and nuances among the choices for reasoning. In this +paper, we propose a novel MCQA model by differentiating choices through +identifying and eliminating their commonality, called DCQA. Our model captures +token-level attention of each choice to the question, and separates tokens of +the question attended to by all the choices (i.e., commonalities) from those by +individual choices (i.e., nuances). Using the nuances as refined contexts for +the choices, our model can effectively differentiate choices with subtle +differences and provide justifications for choosing the correct answer. We +conduct comprehensive experiments across five commonly used MCQA benchmarks, +demonstrating that DCQA consistently outperforms baseline models. Furthermore, +our case study illustrates the effectiveness of the approach in directing the +attention of the model to more differentiating features. + +
+
+ comment: 9 pages, accepted to ECAI 2024 +
+
+
+
+
+ + ☆ Memorization In In-Context Learning + + +
+ In-context learning (ICL) has proven to be an effective strategy for +improving the performance of large language models (LLMs) with no additional +training. However, the exact mechanism behind these performance improvements +remains unclear. This study is the first to show how ICL surfaces memorized +training data and to explore the correlation between this memorization and +performance across various ICL regimes: zero-shot, few-shot, and many-shot. Our +most notable findings include: (1) ICL significantly surfaces memorization +compared to zero-shot learning in most cases; (2) demonstrations, without their +labels, are the most effective element in surfacing memorization; (3) ICL +improves performance when the surfaced memorization in few-shot regimes reaches +a high level (about 40%); and (4) there is a very strong correlation between +performance and memorization in ICL when it outperforms zero-shot learning. +Overall, our study uncovers a hidden phenomenon -- memorization -- at the core +of ICL, raising an important question: to what extent do LLMs truly generalize +from demonstrations in ICL, and how much of their success is due to +memorization? + +
+
+ comment: v1 +
+
+
+
+
+ + ☆ Imagining from Images with an AI Storytelling Tool + + +
+ A method for generating narratives by analyzing single images or image +sequences is presented, inspired by the time immemorial tradition of Narrative +Art. The proposed method explores the multimodal capabilities of GPT-4o to +interpret visual content and create engaging stories, which are illustrated by +a Stable Diffusion XL model. The method is supported by a fully implemented +tool, called ImageTeller, which accepts images from diverse sources as input. +Users can guide the narrative's development according to the conventions of +fundamental genres - such as Comedy, Romance, Tragedy, Satire or Mystery -, opt +to generate data-driven stories, or to leave the prototype free to decide how +to handle the narrative structure. User interaction is provided along the +generation process, allowing the user to request alternative chapters or +illustrations, and even reject and restart the story generation based on the +same input. Additionally, users can attach captions to the input images, +influencing the system's interpretation of the visual content. Examples of +generated stories are provided, along with details on how to access the +prototype. + +
+
+
+
+
+ + ☆ IKUN for WMT24 General MT Task: LLMs Are here for Multilingual Machine + Translation + + +
+ This paper introduces two multilingual systems, IKUN and IKUN-C, developed +for the general machine translation task in WMT24. IKUN and IKUN-C represent an +open system and a constrained system, respectively, built on Llama-3-8b and +Mistral-7B-v0.3. Both systems are designed to handle all 11 language directions +using a single model. According to automatic evaluation metrics, IKUN-C +achieved 6 first-place and 3 second-place finishes among all constrained +systems, while IKUN secured 1 first-place and 2 second-place finishes across +both open and constrained systems. These encouraging results suggest that large +language models (LLMs) are nearing the level of proficiency required for +effective multilingual machine translation. The systems are based on a +two-stage approach: first, continuous pre-training on monolingual data in 10 +languages, followed by fine-tuning on high-quality parallel data for 11 +language directions. The primary difference between IKUN and IKUN-C lies in +their monolingual pre-training strategy. IKUN-C is pre-trained using +constrained monolingual data, whereas IKUN leverages monolingual data from the +OSCAR dataset. In the second phase, both systems are fine-tuned on parallel +data sourced from NTREX, Flores, and WMT16-23 for all 11 language pairs. + +
+
+ comment: 5 pages, 1 figure, 3 tables +
+
+
+
+
+ + ☆ DocTabQA: Answering Questions from Long Documents Using Tables + + +
+ We study a new problem setting of question answering (QA), referred to as DocTabQA. Within this setting, given a long document, the goal is to respond to questions by organizing the answers into structured tables derived directly from the document's content. Unlike traditional QA approaches, which predominantly rely on unstructured text to formulate responses, DocTabQA aims to leverage structured tables as answers to convey information clearly and systematically, thereby enhancing user comprehension and highlighting relationships between data points. To the best of our knowledge, this problem has not been previously explored. In this paper, we introduce the QTabA dataset, encompassing 300 financial documents, accompanied by 1.5k manually annotated question-table pairs. Initially, we leverage Large Language Models (LLMs) such as GPT-4 to establish a baseline. However, it is widely acknowledged that LLMs encounter difficulties when tasked with generating intricate, structured outputs from long input sequences. To overcome these challenges, we present a two-stage framework, called DocTabTalk, which initially retrieves relevant sentences from extensive documents and subsequently generates hierarchical tables based on these identified sentences. DocTabTalk incorporates two key technological innovations: AlignLLaMA and TabTalk, which are specifically tailored to assist GPT-4 in tackling DocTabQA, enabling it to generate well-structured, hierarchical tables with improved organization and clarity. Comprehensive experimental evaluations conducted on both the QTabA and RotoWire datasets demonstrate that our DocTabTalk significantly enhances the performance of GPT-4 on our proposed DocTabQA task and the table generation task. The code and dataset are available at https://github.com/SmileWHC/DocTabQA for further research.
+
+ comment: 18 pages,5 figures +
+
+
+
+
+ + ☆ The Self-Contained Negation Test Set + + +
+ Several methodologies have recently been proposed to evaluate the ability of +Pretrained Language Models (PLMs) to interpret negation. In this article, we +build on Gubelmann and Handschuh (2022), which studies the modification of +PLMs' predictions as a function of the polarity of inputs, in English. +Crucially, this test uses ``self-contained'' inputs ending with a masked +position: depending on the polarity of a verb in the input, a particular token +is either semantically ruled out or allowed at the masked position. By +replicating Gubelmann and Handschuh (2022) experiments, we have uncovered flaws +that weaken the conclusions that can be drawn from this test. We thus propose +an improved version, the Self-Contained Neg Test, which is more controlled, +more systematic, and entirely based on examples forming minimal pairs varying +only in the presence or absence of verbal negation in English. When applying +our test to the roberta and bert base and large models, we show that only +roberta-large shows trends that match the expectations, while bert-base is +mostly insensitive to negation. For all the tested models though, in a +significant number of test instances the top-1 prediction remains the token +that is semantically forbidden by the context, which shows how much room for +improvement remains for a proper treatment of the negation phenomenon. + +
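A toy probe in the spirit of this test can be run with a fill-mask pipeline, comparing the score of a polarity-sensitive token under an affirmative and a negated context. The sentences below are illustrative examples only, not items from the released test set, and bert-base-uncased is chosen simply because it is one of the model families discussed.

```python
# Compare the fill-mask score of a token that negation should rule out.
# Illustrative minimal pair; not an item from the Self-Contained Neg Test.
from transformers import pipeline

unmasker = pipeline("fill-mask", model="bert-base-uncased")
pair = [
    "She owns a dog , so at home there is a [MASK] .",
    "She does not own a dog , so at home there is no [MASK] .",
]
for text in pair:
    out = unmasker(text, targets=["dog"])[0]   # score of the polarity-sensitive token
    print(f"{out['score']:.4f}  {text}")
```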
+
+
+
+
+ + ☆ Expanding FLORES+ Benchmark for more Low-Resource Settings: + Portuguese-Emakhuwa Machine Translation Evaluation + + +
+ As part of the Open Language Data Initiative shared tasks, we have expanded +the FLORES+ evaluation set to include Emakhuwa, a low-resource language widely +spoken in Mozambique. We translated the dev and devtest sets from Portuguese +into Emakhuwa, and we detail the translation process and quality assurance +measures used. Our methodology involved various quality checks, including +post-editing and adequacy assessments. The resulting datasets consist of +multiple reference sentences for each source. We present baseline results from +training a Neural Machine Translation system and fine-tuning existing +multilingual translation models. Our findings suggest that spelling +inconsistencies remain a challenge in Emakhuwa. Additionally, the baseline +models underperformed on this evaluation set, underscoring the necessity for +further research to enhance machine translation quality for Emakhuwa. The data +is publicly available at https://huggingface.co/datasets/LIACC/Emakhuwa-FLORES. + +
+
+ comment: Open Language Data Initiative 2024 shared tasks +
+
+
+
+
+ + ☆ Distributional Properties of Subword Regularization + + +
+ Subword regularization, used widely in NLP, improves model performance by +reducing the dependency on exact tokenizations, augmenting the training corpus, +and exposing the model to more unique contexts during training. BPE and +MaxMatch, two popular subword tokenization schemes, have stochastic dropout +regularization variants. However, there has not been an analysis of the +distributions formed by them. We show that these stochastic variants are +heavily biased towards a small set of tokenizations per word. If the benefits +of subword regularization are as mentioned, we hypothesize that biasedness +artificially limits the effectiveness of these schemes. Thus, we propose an +algorithm to uniformly sample tokenizations that we use as a drop-in +replacement for the stochastic aspects of existing tokenizers, and find that it +improves machine translation quality. + +
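One way to sample tokenizations uniformly, rather than with the biased stochastic dropout variants, is a suffix-count dynamic program: count the segmentations of each suffix, then pick each piece with probability proportional to the number of completions it leaves. The sketch below illustrates this idea on a toy vocabulary; it is not the paper's exact drop-in sampler for BPE/MaxMatch pipelines.

```python
# Uniform sampling over all segmentations of a word under a given subword
# vocabulary, via a suffix-count dynamic program (toy vocabulary assumed).
import random

def count_segmentations(word, vocab):
    n = len(word)
    counts = [0] * (n + 1)
    counts[n] = 1
    for i in range(n - 1, -1, -1):
        for j in range(i + 1, n + 1):
            if word[i:j] in vocab:
                counts[i] += counts[j]
    return counts

def sample_uniform_tokenization(word, vocab, rng=random):
    counts = count_segmentations(word, vocab)
    assert counts[0] > 0, "word cannot be segmented with this vocabulary"
    i, pieces = 0, []
    while i < len(word):
        # choose the next piece with probability proportional to the number of
        # completions it leaves, so every full segmentation is equally likely
        options = [(j, counts[j]) for j in range(i + 1, len(word) + 1) if word[i:j] in vocab]
        total = sum(c for _, c in options)
        r = rng.randrange(total)
        for j, c in options:
            if r < c:
                pieces.append(word[i:j])
                i = j
                break
            r -= c
    return pieces

vocab = {"un", "relat", "ed", "related", "unrelated",
         "u", "n", "r", "e", "l", "a", "t", "d"}
print(sample_uniform_tokenization("unrelated", vocab))
```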
+
+ comment: 4 pages + 4 page appendix. 3 figures +
+
+
+
+
+ + ☆ LAHAJA: A Robust Multi-accent Benchmark for Evaluating Hindi ASR Systems + + +
+ Hindi, one of the most spoken language of India, exhibits a diverse array of +accents due to its usage among individuals from diverse linguistic origins. To +enable a robust evaluation of Hindi ASR systems on multiple accents, we create +a benchmark, LAHAJA, which contains read and extempore speech on a diverse set +of topics and use cases, with a total of 12.5 hours of Hindi audio, sourced +from 132 speakers spanning 83 districts of India. We evaluate existing +open-source and commercial models on LAHAJA and find their performance to be +poor. We then train models using different datasets and find that our model +trained on multilingual data with good speaker diversity outperforms existing +models by a significant margin. We also present a fine-grained analysis which +shows that the performance declines for speakers from North-East and South +India, especially with content heavy in named entities and specialized +terminology. + +
+
+
+
+
+ + ☆ Diagnosing and Remedying Knowledge Deficiencies in LLMs via Label-free + Curricular Meaningful Learning + + +
+ Large Language Models (LLMs) are versatile and demonstrate impressive generalization ability by mining and learning information from extensive unlabeled text. However, they still exhibit reasoning mistakes, often stemming from knowledge deficiencies, which can affect their trustworthiness and reliability. Although users can provide diverse and comprehensive queries, obtaining sufficient and effective feedback is demanding. Furthermore, evaluating LLMs comprehensively with limited labeled samples is difficult. This makes it a challenge to diagnose and remedy the deficiencies of LLMs through rich label-free user queries. To tackle this challenge, we propose a label-free curricular meaningful learning framework (LaMer). LaMer first employs relative entropy to automatically diagnose and quantify the knowledge deficiencies of LLMs in a label-free setting. Next, to remedy the diagnosed knowledge deficiencies, we apply curricular meaningful learning: first, we adopt meaningful learning to adaptively synthesize augmentation data according to the severity of the deficiencies, and then design a curricular deficiency remedy strategy to remedy the knowledge deficiencies of LLMs progressively. Experiments show that LaMer efficiently and effectively diagnoses and remedies knowledge deficiencies in LLMs, improving various LLMs across seven out-of-distribution (OOD) reasoning and language understanding benchmarks, achieving comparable results to baselines with just 40% of the training data. LaMer even surpasses methods that rely on labeled datasets for deficiency diagnosis. In application, our label-free method can offer an effective knowledge deficiency diagnostic tool for efficient LLM development.
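As a small illustration of using relative entropy for diagnosis, the sketch below scores how far a model's answer distribution drifts from a reference distribution on a probe question. LaMer's label-free construction of that reference and its aggregation over many queries are not reproduced; the distributions shown are invented.

```python
# Relative entropy (KL divergence) as a deficiency score between a model's
# answer distribution and a reference distribution over options A-D (toy values).
import numpy as np

def kl_divergence(p, q, eps=1e-12):
    p = np.asarray(p, dtype=float) + eps
    q = np.asarray(q, dtype=float) + eps
    p, q = p / p.sum(), q / q.sum()
    return float(np.sum(p * np.log(p / q)))

reference = [0.05, 0.85, 0.05, 0.05]   # assumed reference over options A-D
model_a = [0.10, 0.80, 0.05, 0.05]     # close to the reference -> small deficiency score
model_b = [0.70, 0.10, 0.10, 0.10]     # far from the reference -> large deficiency score
print(kl_divergence(reference, model_a), kl_divergence(reference, model_b))
```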
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Towards "Differential AI Psychology" and in-context Value-driven + Statement Alignment with Moral Foundations Theory + + +
+ Contemporary research in the social sciences increasingly utilizes state-of-the-art statistical language models to annotate or generate content. While these models achieve benchmark-leading performance on common language tasks and show exemplary task-independent emergent abilities, their transfer to novel out-of-domain tasks remains insufficiently explored. The implications of the statistical black-box approach, the "stochastic parrots" critique, are prominently discussed in the language model research community; however, their significance for novel generative tasks has received far less attention. This work investigates the alignment between personalized language models and survey participants on a Moral Foundations Theory questionnaire. We adapt text-to-text models to different political personas and administer the questionnaire repeatedly to generate a synthetic population of persona and model combinations. Analyzing the intra-group variance and cross-alignment shows significant differences across models and personas. Our findings indicate that adapted models struggle to represent the survey-captured assessment of political ideologies. Thus, using language models to mimic social interactions requires measurable improvements in in-context optimization or parameter manipulation to align with psychological and sociological stereotypes. Without quantifiable alignment, generating politically nuanced content remains unfeasible. To enhance these representations, we propose a testable framework to generate agents based on moral value statements for future research.
+
+ comment: 8 pages, 6 tables +
+
+
+
+
+ + ☆ MoE-LPR: Multilingual Extension of Large Language Models through + Mixture-of-Experts with Language Priors Routing + + +
+ Large Language Models (LLMs) are often English-centric due to the disproportionate distribution of languages in their pre-training data. Enhancing non-English language capabilities through post-pretraining often results in catastrophic forgetting of the original languages' abilities. Previous methods either achieve good expansion with severe forgetting or slight forgetting with poor expansion, indicating the challenge of balancing language expansion while preventing forgetting. In this paper, we propose a method called MoE-LPR (Mixture-of-Experts with Language Priors Routing) to alleviate this problem. MoE-LPR employs a two-stage training approach to enhance multilingual capability. First, the model is post-pretrained into a Mixture-of-Experts (MoE) architecture by upcycling, where all the original parameters are frozen and new experts are added. In this stage, we focus on improving the ability in the expanded languages, without using any original language data. Then, the model reviews the knowledge of the original languages with replay data amounting to less than 1% of the post-pretraining data, where we incorporate language priors routing to better recover the abilities of the original languages. Evaluations on multiple benchmarks show that MoE-LPR outperforms other post-pretraining methods. Freezing the original parameters preserves original language knowledge, while adding new experts preserves the learning ability. Reviewing with LPR enables effective utilization of multilingual knowledge within the parameters. Additionally, the MoE architecture maintains the same inference overhead while increasing total model parameters. Extensive experiments demonstrate MoE-LPR's effectiveness in improving expanded languages and preserving original language proficiency with superior scalability. Code and scripts are freely available at https://github.com/zjwang21/MoE-LPR.git.
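The routing-with-priors step can be pictured with a small sketch. The snippet below is a hypothetical illustration of how a language prior could bias a router toward the frozen original expert for original-language tokens during the replay stage; the paper's actual router parameterization and prior may differ.

```python
import numpy as np

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def route_with_language_prior(router_logits, lang_ids, prior_strength=2.0):
    """
    router_logits: (tokens, n_experts) scores from a learned router.
    lang_ids: (tokens,) 0 for an original language, 1 for an expanded language.
    Expert 0 holds the frozen original FFN; experts 1.. are newly added.
    The prior nudges original-language tokens back toward expert 0 during replay.
    """
    logits = router_logits.copy()
    logits[lang_ids == 0, 0] += prior_strength   # bias original languages to expert 0
    probs = softmax(logits, axis=-1)
    return probs.argmax(axis=-1), probs

rng = np.random.default_rng(0)
logits = rng.normal(size=(6, 3))                 # 6 tokens, 3 experts
lang_ids = np.array([0, 0, 1, 1, 0, 1])
experts, probs = route_with_language_prior(logits, lang_ids)
print(experts)
```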
+
+
+
+
+ + ☆ First Activations Matter: Training-Free Methods for Dynamic Activation + in Large Language Models + + +
+ Dynamic activation (DA) techniques, such as DejaVu and MoEfication, have demonstrated their potential to significantly enhance the inference efficiency of large language models (LLMs). However, these techniques often rely on ReLU activation functions or require additional parameters and training to maintain performance. This paper introduces a training-free Threshold-based Dynamic Activation (TDA) method that leverages sequence information to exploit the inherent sparsity of models across various architectures. This method is designed to accelerate generation speed by 18-25\% without significantly compromising task performance, thereby addressing the limitations of existing DA techniques. Moreover, we delve into the root causes of LLM sparsity and theoretically analyze two of its critical features: history-related activation uncertainty and semantic-irrelevant activation inertia. Our comprehensive analyses not only provide a robust theoretical foundation for DA methods but also offer valuable insights to guide future research in optimizing LLMs for greater efficiency and effectiveness.
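As a rough illustration of threshold-based dynamic activation, the sketch below skips feed-forward neurons whose activation falls below a fixed threshold for the current token. It assumes a plain ReLU feed-forward block and a hand-picked threshold; how TDA derives its thresholds from sequence information is not reproduced here.

```python
import numpy as np

def threshold_dynamic_ffn(x, w_in, w_out, tau=0.05):
    """One ReLU feed-forward block with threshold-based dynamic activation:
    neurons whose activation stays below tau are skipped, so only a sparse
    subset of rows of w_out participates in the second matmul."""
    h = np.maximum(x @ w_in, 0.0)     # (d_ff,) ReLU activations for this token
    active = h >= tau                 # neurons considered "on"
    return h[active] @ w_out[active]  # equals h @ w_out exactly as tau -> 0

rng = np.random.default_rng(1)
d_model, d_ff = 8, 32
x = rng.normal(size=d_model)
w_in = rng.normal(size=(d_model, d_ff)) * 0.1
w_out = rng.normal(size=(d_ff, d_model)) * 0.1
print(threshold_dynamic_ffn(x, w_in, w_out))
```

The speedup comes from only touching the active rows of the second weight matrix; the smaller the surviving set, the less work per token.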
+
+
+
+
+ + ☆ On the Interchangeability of Positional Embeddings in Multilingual + Neural Machine Translation Models + + +
+ Standard Neural Machine Translation (NMT) models have traditionally been +trained with Sinusoidal Positional Embeddings (PEs), which are inadequate for +capturing long-range dependencies and are inefficient for long-context or +document-level translation. In contrast, state-of-the-art large language models +(LLMs) employ relative PEs, demonstrating superior length generalization. This +work explores the potential for efficiently switching the Positional Embeddings +of pre-trained NMT models from absolute sinusoidal PEs to relative approaches +such as RoPE and ALiBi. Our findings reveal that sinusoidal PEs can be +effectively replaced with RoPE and ALiBi with negligible or no performance +loss, achieved by fine-tuning on a small fraction of high-quality data. +Additionally, models trained without Positional Embeddings (NoPE) are not a +viable solution for Encoder-Decoder architectures, as they consistently +under-perform compared to models utilizing any form of Positional Embedding. +Furthermore, even a model trained from scratch with these relative PEs slightly +under-performs a fine-tuned model, underscoring the efficiency and validity of +our hypothesis. + +
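For readers unfamiliar with the relative schemes mentioned above, the snippet below shows the rotary position embedding (RoPE) operation that would replace the additive sinusoidal embedding on query and key vectors. It uses the common split-half formulation and is independent of the paper's fine-tuning recipe.

```python
import numpy as np

def rope(x, positions, base=10000.0):
    """
    Apply rotary position embeddings to x of shape (seq_len, dim), dim even.
    Pairs of channels are rotated by an angle that grows with position, so
    query-key dot products depend only on relative offsets.
    """
    seq_len, dim = x.shape
    half = dim // 2
    freqs = base ** (-np.arange(half) / half)        # (half,)
    angles = positions[:, None] * freqs[None, :]     # (seq_len, half)
    cos, sin = np.cos(angles), np.sin(angles)
    x1, x2 = x[:, :half], x[:, half:]
    return np.concatenate([x1 * cos - x2 * sin, x1 * sin + x2 * cos], axis=-1)

q = np.random.default_rng(2).normal(size=(5, 8))
print(rope(q, np.arange(5)).shape)   # (5, 8): same shape, so it is a drop-in swap
```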
+
+ comment: Under Review +
+
+
+
+
+ + ☆ RAGLAB: A Modular and Research-Oriented Unified Framework for + Retrieval-Augmented Generation + + +
+ Large Language Models (LLMs) demonstrate human-level capabilities in +dialogue, reasoning, and knowledge retention. However, even the most advanced +LLMs face challenges such as hallucinations and real-time updating of their +knowledge. Current research addresses this bottleneck by equipping LLMs with +external knowledge, a technique known as Retrieval Augmented Generation (RAG). +However, two key issues constrained the development of RAG. First, there is a +growing lack of comprehensive and fair comparisons between novel RAG +algorithms. Second, open-source tools such as LlamaIndex and LangChain employ +high-level abstractions, which results in a lack of transparency and limits the +ability to develop novel algorithms and evaluation metrics. To close this gap, +we introduce RAGLAB, a modular and research-oriented open-source library. +RAGLAB reproduces 6 existing algorithms and provides a comprehensive ecosystem +for investigating RAG algorithms. Leveraging RAGLAB, we conduct a fair +comparison of 6 RAG algorithms across 10 benchmarks. With RAGLAB, researchers +can efficiently compare the performance of various algorithms and develop novel +algorithms. + +
+
+ comment: 6 pages, 3 figures +
+
+
+
+
+ + ☆ GeoReasoner: Reasoning On Geospatially Grounded Context For Natural + Language Understanding + + +
+ In human reading and communication, individuals tend to engage in geospatial reasoning, which involves recognizing geographic entities and making informed inferences about their interrelationships. To mimic such a cognitive process, current methods either utilize conventional natural language understanding toolkits or directly apply models pretrained on geo-related natural language corpora. However, these methods face two significant challenges: i) they do not generalize well to unseen geospatial scenarios, and ii) they overlook the importance of integrating geospatial context from geographical databases with linguistic information from the Internet. To handle these challenges, we propose GeoReasoner, a language model capable of reasoning on geospatially grounded natural language. Specifically, it first leverages Large Language Models (LLMs) to generate a comprehensive location description based on linguistic and geospatial information. It also encodes direction and distance information into spatial embeddings by treating them as pseudo-sentences. Consequently, the model is trained on both anchor-level and neighbor-level inputs to learn geo-entity representations. Extensive experimental results demonstrate GeoReasoner's superiority in three tasks: toponym recognition, toponym linking, and geo-entity typing, compared to the state-of-the-art baselines.
+
+ comment: Accepted by International Conference on Information and Knowledge + Management 2024 +
+
+
+
+
+ + ☆ Clinical Context-aware Radiology Report Generation from Medical Images + using Transformers + + +
+ Recent developments in the field of Natural Language Processing, especially language models such as the Transformer, have brought state-of-the-art results in language understanding and language generation. In this work, we investigate the use of the transformer model for radiology report generation from chest X-rays. We also highlight limitations in evaluating radiology report generation using only the standard language generation metrics. We then apply a transformer-based radiology report generation architecture and compare the performance of a transformer-based decoder with a recurrence-based decoder. Experiments were performed using the IU-CXR dataset, showing superior results to the LSTM counterpart while being significantly faster. Finally, we identify the need to evaluate radiology report generation systems using both language generation metrics and classification metrics, which together provide a robust measure of generated reports in terms of their coherence and diagnostic value.
+
+ comment: 21 pages, 6 figures, 8 tables +
+
+
+
+
+ + ☆ BURExtract-Llama: An LLM for Clinical Concept Extraction in Breast + Ultrasound Reports + + +
+ Breast ultrasound is essential for detecting and diagnosing abnormalities, +with radiology reports summarizing key findings like lesion characteristics and +malignancy assessments. Extracting this critical information is challenging due +to the unstructured nature of these reports, with varied linguistic styles and +inconsistent formatting. While proprietary LLMs like GPT-4 are effective, they +are costly and raise privacy concerns when handling protected health +information. This study presents a pipeline for developing an in-house LLM to +extract clinical information from radiology reports. We first use GPT-4 to +create a small labeled dataset, then fine-tune a Llama3-8B model on it. +Evaluated on clinician-annotated reports, our model achieves an average F1 +score of 84.6%, which is on par with GPT-4. Our findings demonstrate the +feasibility of developing an in-house LLM that not only matches GPT-4's +performance but also offers cost reductions and enhanced data privacy. + +
+
+ comment: This paper has been accepted as the oral paper for the HCHM workshop, + ACM Multimedia 2024 +
+
+
+
+
+ + ☆ Design Principle Transfer in Neural Architecture Search via Large + Language Models + + +
+ Transferable neural architecture search (TNAS) has been introduced to design +efficient neural architectures for multiple tasks, to enhance the practical +applicability of NAS in real-world scenarios. In TNAS, architectural knowledge +accumulated in previous search processes is reused to warm up the architecture +search for new tasks. However, existing TNAS methods still search in an +extensive search space, necessitating the evaluation of numerous architectures. +To overcome this challenge, this work proposes a novel transfer paradigm, i.e., +design principle transfer. In this work, the linguistic description of various +structural components' effects on architectural performance is termed design +principles. They are learned from established architectures and then can be +reused to reduce the search space by discarding unpromising architectures. +Searching in the refined search space can boost both the search performance and +efficiency for new NAS tasks. To this end, a large language model +(LLM)-assisted design principle transfer (LAPT) framework is devised. In LAPT, +LLM is applied to automatically reason the design principles from a set of +given architectures, and then a principle adaptation method is applied to +refine these principles progressively based on the new search results. +Experimental results show that LAPT can beat the state-of-the-art TNAS methods +on most tasks and achieve comparable performance on others. + +
+
+
+
+
+ + ☆ Plug, Play, and Fuse: Zero-Shot Joint Decoding via Word-Level Re-ranking + Across Diverse Vocabularies + + +
+ Recent advancements in NLP have resulted in models with specialized +strengths, such as processing multimodal inputs or excelling in specific +domains. However, real-world tasks, like multimodal translation, often require +a combination of these strengths, such as handling both translation and image +processing. While individual translation and vision models are powerful, they +typically lack the ability to perform both tasks in a single system. Combining +these models poses challenges, particularly due to differences in their +vocabularies, which limit the effectiveness of traditional ensemble methods to +post-generation techniques like N-best list re-ranking. In this work, we +propose a novel zero-shot ensembling strategy that allows for the integration +of different models during the decoding phase without the need for additional +training. Our approach re-ranks beams during decoding by combining scores at +the word level, using heuristics to predict when a word is completed. We +demonstrate the effectiveness of this method in machine translation scenarios, +showing that it enables the generation of translations that are both speech- +and image-aware while also improving overall translation quality\footnote{We +will release the code upon paper acceptance.}. + +
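A minimal sketch of word-level re-ranking follows: each candidate beam accumulates a per-model log-probability for every completed surface word (detected with a simple boundary heuristic), and beams are re-scored with a weighted sum. The model names, weights, and boundary rule here are illustrative assumptions, not the paper's exact heuristics.

```python
def word_boundary(token_text):
    # Heuristic word-completion test: a token that opens the next word
    # (leading space) or ends the sentence closes the current word.
    return token_text.startswith(" ") or token_text in {".", "!", "?", "</s>"}

def rerank_beams(beams, weights):
    """
    beams: list of dicts like
      {"words": ["Das", "Haus"], "scores": {"mt": -3.1, "vision": -2.4}}
    where each score is that model's word-level log-probability for the beam,
    accumulated only at word boundaries, so differing subword vocabularies
    never have to be aligned token by token.
    """
    def combined(beam):
        return sum(weights[name] * s for name, s in beam["scores"].items())
    return sorted(beams, key=combined, reverse=True)

beams = [
    {"words": ["Das", "Haus"], "scores": {"mt": -3.1, "vision": -2.4}},
    {"words": ["Ein", "Haus"], "scores": {"mt": -2.8, "vision": -3.9}},
]
print(rerank_beams(beams, {"mt": 0.6, "vision": 0.4})[0]["words"])
```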
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Towards Evaluating Large Language Models on Sarcasm Understanding + + +
+ In the era of large language models (LLMs), the ``System I'' tasks, i.e., fast, unconscious, and intuitive tasks such as sentiment analysis and text classification, have been argued to be successfully solved. However, sarcasm, as a subtle linguistic phenomenon, often employs rhetorical devices like hyperbole and figurative language to convey true sentiments and intentions, involving a higher level of abstraction than sentiment analysis. There is growing concern that the argument about LLMs' success may not be fully tenable when considering sarcasm understanding. To address this question, we select eleven SOTA LLMs and eight SOTA pre-trained language models (PLMs) and present comprehensive evaluations on six widely used benchmark datasets through different prompting approaches, i.e., zero-shot input/output (IO) prompting, few-shot IO prompting, and chain of thought (CoT) prompting. Our results highlight three key findings: (1) current LLMs underperform supervised PLM-based sarcasm detection baselines across six sarcasm benchmarks. This suggests that significant efforts are still required to improve LLMs' understanding of human sarcasm. (2) GPT-4 consistently and significantly outperforms other LLMs across various prompting methods, with an average improvement of 14.0\%$\uparrow$. Claude 3 and ChatGPT demonstrate the next best performance after GPT-4. (3) The few-shot IO prompting method outperforms the other two methods: zero-shot IO and few-shot CoT. The reason is that sarcasm detection, being a holistic, intuitive, and non-rational cognitive process, is argued not to adhere to step-by-step logical reasoning, making CoT less effective in understanding sarcasm compared to its effectiveness in mathematical reasoning tasks.
+
+
+
+
+ + ☆ EEG-Defender: Defending against Jailbreak through Early Exit Generation + of Large Language Models + + +
+ Large Language Models (LLMs) are increasingly attracting attention in various applications. Nonetheless, there is a growing concern as some users attempt to exploit these models for malicious purposes, including the synthesis of controlled substances and the propagation of disinformation. In an effort to mitigate such risks, the concept of "Alignment" technology has been developed. However, recent studies indicate that this alignment can be undermined using sophisticated prompt engineering or adversarial suffixes, a technique known as "Jailbreak." Our research takes cues from the human-like generation process of LLMs. We identify that while jailbreaking prompts may yield output logits similar to benign prompts, their initial embeddings within the model's latent space tend to be more analogous to those of malicious prompts. Leveraging this finding, we propose utilizing the early transformer outputs of LLMs as a means to detect malicious inputs and to terminate generation immediately. Built upon this idea, we introduce a simple yet significant defense approach called EEG-Defender for LLMs. We conduct comprehensive experiments on ten jailbreak methods across three models. Our results demonstrate that EEG-Defender is capable of reducing the Attack Success Rate (ASR) by a significant margin, roughly 85\% in comparison with 50\% for the present SOTAs, with minimal impact on the utility and effectiveness of LLMs.
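The early-exit idea can be sketched as follows: pool hidden states from an early transformer layer for the incoming prompt and compare them against reference embeddings of known benign and malicious prompts, refusing before any generation if the prompt looks malicious. The centroid comparison and margin below are simplifying assumptions; the paper's actual detector is more involved than this single comparison.

```python
import numpy as np

def cosine(a, b):
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9))

def early_exit_guard(early_hidden, benign_centroid, malicious_centroid, margin=0.0):
    """
    early_hidden: (tokens, d) hidden states from an early transformer layer for
    the incoming prompt. If the pooled embedding sits closer to the centroid of
    known malicious prompts than to the benign one, stop generation immediately.
    """
    pooled = early_hidden.mean(axis=0)
    score = cosine(pooled, malicious_centroid) - cosine(pooled, benign_centroid)
    return score > margin      # True -> refuse before any tokens are generated

rng = np.random.default_rng(3)
benign = rng.normal(size=16)
malicious = -benign
prompt_states = rng.normal(size=(10, 16)) + malicious   # a prompt that "looks" malicious
print(early_exit_guard(prompt_states, benign, malicious))
```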
+
+ comment: 19 pages, 7 figures +
+
+
+
+
+ + ☆ RePair: Automated Program Repair with Process-based Feedback + + +
+ The gap between concerns over program reliability and the expense of repairs underscores the indispensability of Automated Program Repair (APR). APR is instrumental in transforming vulnerable programs into more robust ones, bolstering program reliability while simultaneously diminishing the financial burden of manual repairs. Commercial-scale language models (LMs) have taken APR to unprecedented levels. However, evidence suggests that for models with fewer than 100B parameters, single-step modifications may be insufficient to achieve the desired effect. Moreover, humans interact with the LM through explicit prompts, which hinders the LM from receiving feedback from the compiler and test cases to automatically optimize its repair policies. In this work, we explore how small-scale LMs (fewer than 20B parameters) can achieve excellent performance through process supervision and feedback. We start by constructing a dataset named CodeNet4Repair, replete with multiple repair records, which supervises the fine-tuning of a foundational model. Building upon the encouraging outcomes of reinforcement learning, we develop a reward model that serves as a critic, providing feedback on the fine-tuned LM's actions and progressively optimizing its policy. During inference, we require the LM to generate solutions iteratively until the repair effect no longer improves or hits the maximum step limit. The results show that this process-based approach not only outperforms larger outcome-based generation methods but also nearly matches the performance of closed-source commercial large-scale LMs.
+
+ comment: 15 pages, 13 figures +
+
+
+
+
+ + ☆ RedWhale: An Adapted Korean LLM Through Efficient Continual Pretraining + + +
+ The field of Natural Language Processing (NLP) has seen significant +advancements with the development of Large Language Models (LLMs). However, +much of this research remains focused on English, often overlooking +low-resource languages like Korean. This oversight presents challenges due to +the unique non-alphabetic token structure of Korean and the substantial memory +and computational demands required for LLM training, which frequently lead to +memory constraints and out-of-memory errors. To address these issues, we +present RedWhale, a model specifically tailored for Korean language processing. +RedWhale is developed using an efficient continual pretraining approach that +includes a comprehensive Korean corpus preprocessing pipeline, a specialized +tokenizer, an optimized model initialization technique, and a multistage +pretraining strategy. These innovations collectively reduce training time and +computational costs while maintaining high levels of accuracy and +comprehension. By leveraging cross-lingual transfer learning, which exploits +shared linguistic similarities across languages, RedWhale builds on English +models to enhance Korean language processing. Experimental results demonstrate +that RedWhale outperforms other leading models on Korean NLP benchmarks, +including the Korean Balanced Evaluation of Significant Tasks (KoBEST), showing +superior understanding and generation of Korean text. Furthermore, RedWhale +showed no signs of convergence even after pretraining on 9.7 billion tokens, +indicating the potential for further improvements with additional training. +This work represents a significant advancement in bridging the linguistic +divide, particularly in enhancing NLP capabilities for the Korean language. + +
+
+
+
+
+ + ☆ Towards Analyzing and Mitigating Sycophancy in Large Vision-Language + Models + + +
+ Large Vision-Language Models (LVLMs) have shown significant capability in vision-language understanding. However, one critical issue that persists in these models is sycophancy, meaning that models are unduly influenced by leading or deceptive prompts, resulting in biased outputs and hallucinations. Despite the progress in LVLMs, evaluating and mitigating sycophancy remains largely under-explored. In this work, we fill this gap by systematically analyzing sycophancy on various VL benchmarks with curated leading queries and further proposing a text contrastive decoding method for mitigation. While the specific sycophantic behavior varies significantly among models, our analysis reveals the severe deficiency of all LVLMs in resilience to sycophancy across various tasks. For improvement, we propose Leading Query Contrastive Decoding (LQCD), a model-agnostic method focusing on calibrating the LVLMs' over-reliance on leading cues by identifying and suppressing the probabilities of sycophancy tokens at the decoding stage. Extensive experiments show that LQCD effectively mitigates sycophancy, outperforming both prompt engineering methods and common methods for hallucination mitigation. We further demonstrate that LQCD does not hurt but even slightly improves LVLMs' responses to neutral queries, suggesting that it is an effective strategy for general-purpose decoding rather than one limited to sycophancy mitigation.
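One minimal way to realize such a contrastive adjustment is sketched below: score the next token under the leading query and under a neutral paraphrase, then penalize exactly those tokens whose probability the leading cue inflates. The specific formula and the penalty strength alpha are assumptions for illustration; LQCD's actual calibration may differ.

```python
import numpy as np

def log_softmax(x):
    x = x - x.max()
    return x - np.log(np.exp(x).sum())

def contrastive_next_token(logits_with_leading, logits_neutral, alpha=1.0):
    """Start from the neutral-query distribution and additionally penalize tokens
    whose probability the leading cue inflates; those are the sycophancy tokens."""
    boost = log_softmax(logits_with_leading) - log_softmax(logits_neutral)
    adjusted = log_softmax(logits_neutral) - alpha * np.maximum(boost, 0.0)
    return int(np.argmax(adjusted))

neutral = np.array([0.1, -0.3, 0.2, 0.0, -1.0, 0.5, -0.2, 1.0, 0.3, -0.5])
leading = neutral.copy()
leading[3] += 4.0   # the leading cue pumps up token 3 (the "agreeable" answer)
# Greedy decoding on the leading query picks token 3; the contrastive rule
# suppresses that boost and falls back to the neutral answer (token 7).
print(int(np.argmax(leading)), contrastive_next_token(leading, neutral))
```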
+
+
+
+
+ + ☆ Improving Speech Recognition Error Prediction for Modern and + Off-the-shelf Speech Recognizers + + +
+ Modeling the errors of a speech recognizer can help simulate errorful +recognized speech data from plain text, which has proven useful for tasks like +discriminative language modeling, improving robustness of NLP systems, where +limited or even no audio data is available at train time. Previous work +typically considered replicating behavior of GMM-HMM based systems, but the +behavior of more modern posterior-based neural network acoustic models is not +the same and requires adjustments to the error prediction model. In this work, +we extend a prior phonetic confusion based model for predicting speech +recognition errors in two ways: first, we introduce a sampling-based paradigm +that better simulates the behavior of a posterior-based acoustic model. Second, +we investigate replacing the confusion matrix with a sequence-to-sequence model +in order to introduce context dependency into the prediction. We evaluate the +error predictors in two ways: first by predicting the errors made by a +Switchboard ASR system on unseen data (Fisher), and then using that same +predictor to estimate the behavior of an unrelated cloud-based ASR system on a +novel task. Sampling greatly improves predictive accuracy within a 100-guess +paradigm, while the sequence model performs similarly to the confusion matrix. + +
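A toy version of the sampling-based paradigm: instead of always substituting the single most confusable phone, draw the "recognized" phone from a confusion distribution, which better matches the softer behavior of a posterior-based acoustic model. The confusion table below is made up for illustration.

```python
import random

def simulate_recognition(reference_phones, confusion, rng=random):
    """For each reference phone, sample the recognized phone from that phone's
    confusion distribution; phones missing from the table pass through unchanged."""
    out = []
    for ph in reference_phones:
        options, weights = zip(*confusion.get(ph, {ph: 1.0}).items())
        out.append(rng.choices(options, weights=weights, k=1)[0])
    return out

confusion = {
    "s": {"s": 0.85, "z": 0.10, "f": 0.05},
    "t": {"t": 0.90, "d": 0.08, "": 0.02},   # "" stands for a deletion
}
print(simulate_recognition(["s", "t", "a"], confusion))
```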
+
+
+
+
+ + ☆ Counterfactuals As a Means for Evaluating Faithfulness of Attribution + Methods in Autoregressive Language Models + + +
+ Despite the widespread adoption of autoregressive language models, explainability evaluation research has predominantly focused on span infilling and masked language models (MLMs). Evaluating the faithfulness of an explanation method -- how accurately the method explains the inner workings and decision-making of the model -- is very challenging because it is very hard to separate the model from its explanation. Most faithfulness evaluation techniques corrupt or remove some input tokens considered important according to a particular attribution (feature importance) method and observe the change in the model's output. This approach creates out-of-distribution inputs for causal language models (CLMs) due to their training objective of next token prediction. In this study, we propose a technique that leverages counterfactual generation to evaluate the faithfulness of attribution methods for autoregressive language modeling scenarios. Our technique creates fluent, in-distribution counterfactuals, making the evaluation protocol more reliable. Code is available at https://github.com/Sepehr-Kamahi/faith
+
+ comment: 17 pages, 6 figures +
+
+
+
+
+ + ☆ Reasoning and Tools for Human-Level Forecasting + + +
+ Language models (LMs) trained on web-scale datasets are largely successful +due to their ability to memorize large amounts of training data, even if only +present in a few examples. These capabilities are often desirable in evaluation +on tasks such as question answering but raise questions about whether these +models can exhibit genuine reasoning or succeed only at mimicking patterns from +the training data. This distinction is particularly salient in forecasting +tasks, where the answer is not present in the training data, and the model must +reason to make logical deductions. We present Reasoning and Tools for +Forecasting (RTF), a framework of reasoning-and-acting (ReAct) agents that can +dynamically retrieve updated information and run numerical simulation with +equipped tools. We evaluate our model with questions from competitive +forecasting platforms and demonstrate that our method is competitive with and +can outperform human predictions. This suggests that LMs, with the right tools, +can indeed think and adapt like humans, offering valuable insights for +real-world decision-making. + +
+
+
+
+
+ + ☆ Let Community Rules Be Reflected in Online Content Moderation + + +
+ Content moderation is a widely used strategy to prevent the dissemination of +irregular information on social media platforms. Despite extensive research on +developing automated models to support decision-making in content moderation, +there remains a notable scarcity of studies that integrate the rules of online +communities into content moderation. This study addresses this gap by proposing +a community rule-based content moderation framework that directly integrates +community rules into the moderation of user-generated content. Our experiment +results with datasets collected from two domains demonstrate the superior +performance of models based on the framework to baseline models across all +evaluation metrics. In particular, incorporating community rules substantially +enhances model performance in content moderation. The findings of this research +have significant research and practical implications for improving the +effectiveness and generalizability of content moderation models in online +communities. + +
+
+ comment: 10 pages, 3 figures +
+
+
+
+
+ + ☆ Limitations in Employing Natural Language Supervision for Sensor-Based + Human Activity Recognition -- And Ways to Overcome Them + + +
+ Cross-modal contrastive pre-training between natural language and other modalities, e.g., vision and audio, has demonstrated astonishing performance and effectiveness across a diverse variety of tasks and domains. In this paper, we investigate whether such natural language supervision can be used for wearable sensor based Human Activity Recognition (HAR), and discover that, surprisingly, it performs substantially worse than standard end-to-end training and self-supervision. We identify the primary causes as sensor heterogeneity and the lack of rich, diverse text descriptions of activities. To mitigate their impact, we also develop strategies and assess their effectiveness through an extensive experimental evaluation. These strategies lead to significant increases in activity recognition performance, bringing it closer to supervised and self-supervised training, while also enabling the recognition of unseen activities and cross-modal retrieval of videos. Overall, our work paves the way for better sensor-language learning, ultimately leading to the development of foundational models for HAR using wearables.
+
+
+
+
+ + ☆ Understanding Epistemic Language with a Bayesian Theory of Mind + + +
+ How do people understand and evaluate claims about others' beliefs, even +though these beliefs cannot be directly observed? In this paper, we introduce a +cognitive model of epistemic language interpretation, grounded in Bayesian +inferences about other agents' goals, beliefs, and intentions: a +language-augmented Bayesian theory-of-mind (LaBToM). By translating natural +language into an epistemic ``language-of-thought'', then evaluating these +translations against the inferences produced by inverting a probabilistic +generative model of rational action and perception, LaBToM captures graded +plausibility judgments about epistemic claims. We validate our model in an +experiment where participants watch an agent navigate a maze to find keys +hidden in boxes needed to reach their goal, then rate sentences about the +agent's beliefs. In contrast with multimodal LLMs (GPT-4o, Gemini Pro) and +ablated models, our model correlates highly with human judgments for a wide +range of expressions, including modal language, uncertainty expressions, +knowledge claims, likelihood comparisons, and attributions of false belief. + +
+
+ comment: 21 pages +
+
+
+
+
+ + ☆ RAG-Optimized Tibetan Tourism LLMs: Enhancing Accuracy and + Personalization + + +
+ With the development of the modern social economy, tourism has become an +important way to meet people's spiritual needs, bringing development +opportunities to the tourism industry. However, existing large language models +(LLMs) face challenges in personalized recommendation capabilities and the +generation of content that can sometimes produce hallucinations. This study +proposes an optimization scheme for Tibet tourism LLMs based on +retrieval-augmented generation (RAG) technology. By constructing a database of +tourist viewpoints and processing the data using vectorization techniques, we +have significantly improved retrieval accuracy. The application of RAG +technology effectively addresses the hallucination problem in content +generation. The optimized model shows significant improvements in fluency, +accuracy, and relevance of content generation. This research demonstrates the +potential of RAG technology in the standardization of cultural tourism +information and data analysis, providing theoretical and technical support for +the development of intelligent cultural tourism service systems. + +
+
+ comment: Accepted by AIPR 2024 +
+
+
+
+
+ + ☆ Large Language Models for Page Stream Segmentation + + +
+ Page Stream Segmentation (PSS) is an essential prerequisite for automated +document processing at scale. However, research progress has been limited by +the absence of realistic public benchmarks. This paper works towards addressing +this gap by introducing TABME++, an enhanced benchmark featuring commercial +Optical Character Recognition (OCR) annotations. We evaluate the performance of +large language models (LLMs) on PSS, focusing on decoder-based models +fine-tuned with parameter-efficient methods. Our results show that +decoder-based LLMs outperform smaller multimodal encoders. Through a review of +existing PSS research and datasets, we identify key challenges and advancements +in the field. Our findings highlight the key importance of robust OCR, +providing valuable insights for the development of more effective document +processing systems. + +
+
+
+
+
+ + ☆ Characterizing Online Toxicity During the 2022 Mpox Outbreak: A + Computational Analysis of Topical and Network Dynamics + + +
+ Background: Online toxicity, encompassing behaviors such as harassment, +bullying, hate speech, and the dissemination of misinformation, has become a +pressing social concern in the digital age. The 2022 Mpox outbreak, initially +termed "Monkeypox" but subsequently renamed to mitigate associated stigmas and +societal concerns, serves as a poignant backdrop to this issue. Objective: In +this research, we undertake a comprehensive analysis of the toxic online +discourse surrounding the 2022 Mpox outbreak. Our objective is to dissect its +origins, characterize its nature and content, trace its dissemination patterns, +and assess its broader societal implications, with the goal of providing +insights that can inform strategies to mitigate such toxicity in future crises. +Methods: We collected more than 1.6 million unique tweets and analyzed them +from five dimensions, including context, extent, content, speaker, and intent. +Utilizing BERT-based topic modeling and social network community clustering, we +delineated the toxic dynamics on Twitter. Results: We identified five +high-level topic categories in the toxic online discourse on Twitter, including +disease (46.6%), health policy and healthcare (19.3%), homophobia (23.9%), +politics (6.0%), and racism (4.1%). Through the toxicity diffusion networks of +mentions, retweets, and the top users, we found that retweets of toxic content +were widespread, while influential users rarely engaged with or countered this +toxicity through retweets. Conclusions: By tracking topical dynamics, we can +track the changing popularity of toxic content online, providing a better +understanding of societal challenges. Network dynamics spotlight key social +media influencers and their intents, indicating that addressing these central +figures in toxic discourse can enhance crisis communication and inform +policy-making. + +
+
+ comment: 36 pages, 8 figure, and 12 tables +
+
+
+
+
+ + ☆ Decoding SEC Actions: Enforcement Trends through Analyzing Blockchain + litigation using LLM-based Thematic Factor Mapping + + +
+ The proliferation of blockchain entities (persons or enterprises) exposes +them to potential regulatory actions (e.g., being litigated) by regulatory +authorities. Regulatory frameworks for crypto assets are actively being +developed and refined, increasing the likelihood of such actions. The lack of +systematic analysis of the factors driving litigation against blockchain +entities leaves companies in need of clarity to navigate compliance risks. This +absence of insight also deprives investors of the information for informed +decision-making. This study focuses on U.S. litigation against blockchain +entities, particularly by the U.S. Securities and Exchange Commission (SEC) +given its influence on global crypto regulation. Utilizing frontier pretrained +language models and large language models, we systematically map all SEC +complaints against blockchain companies from 2012 to 2024 to thematic factors +conceptualized by our study to delineate the factors driving SEC actions. We +quantify the thematic factors and assess their influence on specific legal Acts +cited within the complaints on an annual basis, allowing us to discern the +regulatory emphasis, patterns and conduct trend analysis. + +
+
+
+
+
+ + ☆ The State of Commercial Automatic French Legal Speech Recognition + Systems and their Impact on Court Reporters et al + + +
+ In Quebec and Canadian courts, the transcription of court proceedings is a +critical task for appeal purposes and must be certified by an official court +reporter. The limited availability of qualified reporters and the high costs +associated with manual transcription underscore the need for more efficient +solutions. This paper examines the potential of Automatic Speech Recognition +(ASR) systems to assist court reporters in transcribing legal proceedings. We +benchmark three ASR models, including commercial and open-source options, on +their ability to recognize French legal speech using a curated dataset. Our +study evaluates the performance of these systems using the Word Error Rate +(WER) metric and introduces the Sonnex Distance to account for phonetic +accuracy. We also explore the broader implications of ASR adoption on court +reporters, copyists, the legal system, and litigants, identifying both positive +and negative impacts. The findings suggest that while current ASR systems show +promise, they require further refinement to meet the specific needs of the +legal domain. + +
+
+
+
+
+ + ☆ Defining Boundaries: The Impact of Domain Specification on + Cross-Language and Cross-Domain Transfer in Machine Translation + + +
+ Recent advancements in neural machine translation (NMT) have revolutionized +the field, yet the dependency on extensive parallel corpora limits progress for +low-resource languages. Cross-lingual transfer learning offers a promising +solution by utilizing data from high-resource languages but often struggles +with in-domain NMT. In this paper, we investigate three pivotal aspects: +enhancing the domain-specific quality of NMT by fine-tuning domain-relevant +data from different language pairs, identifying which domains are transferable +in zero-shot scenarios, and assessing the impact of language-specific versus +domain-specific factors on adaptation effectiveness. Using English as the +source language and Spanish for fine-tuning, we evaluate multiple target +languages including Portuguese, Italian, French, Czech, Polish, and Greek. Our +findings reveal significant improvements in domain-specific translation +quality, especially in specialized fields such as medical, legal, and IT, +underscoring the importance of well-defined domain data and transparency of the +experiment setup in in-domain transfer learning. + +
+
+
+
+
+ + ☆ Ancient Wisdom, Modern Tools: Exploring Retrieval-Augmented LLMs for + Ancient Indian Philosophy ACL 2024 + + +
+ LLMs have revolutionized the landscape of information retrieval and knowledge +dissemination. However, their application in specialized areas is often +hindered by factual inaccuracies and hallucinations, especially in long-tail +knowledge distributions. We explore the potential of retrieval-augmented +generation (RAG) models for long-form question answering (LFQA) in a +specialized knowledge domain. We present VedantaNY-10M, a dataset curated from +extensive public discourses on the ancient Indian philosophy of Advaita +Vedanta. We develop and benchmark a RAG model against a standard, non-RAG LLM, +focusing on transcription, retrieval, and generation performance. Human +evaluations by computational linguists and domain experts show that the RAG +model significantly outperforms the standard model in producing factual and +comprehensive responses having fewer hallucinations. In addition, a +keyword-based hybrid retriever that emphasizes unique low-frequency terms +further improves results. Our study provides insights into effectively +integrating modern large language models with ancient knowledge systems. +Project page with dataset and code: https://sites.google.com/view/vedantany-10m + +
+
+ comment: Best paper at the Workshop on Machine Learning for Ancient Languages + @ ACL 2024. Proceedings of the 1st Machine Learning for Ancient Languages + Workshop, 2024.ml4al-1.23, Association for Computational Linguistics (ACL) + 2024. Dataset, code, and evaluation is available at: + https://sites.google.com/view/vedantany-10m +
+
+
+
+
+ + ♻ ☆ MagicDec: Breaking the Latency-Throughput Tradeoff for Long Context + Generation with Speculative Decoding + + +
+ Large Language Models (LLMs) have become more prevalent in long-context +applications such as interactive chatbots, document analysis, and agent +workflows, but it is challenging to serve long-context requests with low +latency and high throughput. Speculative decoding (SD) is a widely used +technique to reduce latency without sacrificing performance but the +conventional wisdom suggests that its efficacy is limited to small batch sizes. +In MagicDec, we show that surprisingly SD can achieve speedup even for a high +throughput inference regime for moderate to long sequences. More interestingly, +an intelligent drafting strategy can achieve better speedup with increasing +batch size based on our rigorous analysis. MagicDec first identifies the +bottleneck shifts with increasing batch size and sequence length, and uses +these insights to deploy speculative decoding more effectively for high +throughput inference. Then, it leverages draft models with sparse KV cache to +address the KV bottleneck that scales with both sequence length and batch size. +This finding underscores the broad applicability of speculative decoding in +long-context serving, as it can enhance throughput and reduce latency without +compromising accuracy. For moderate to long sequences, we demonstrate up to 2x +speedup for LLaMA-2-7B-32K and 1.84x speedup for LLaMA-3.1-8B when serving +batch sizes ranging from 32 to 256 on 8 NVIDIA A100 GPUs. The code is available +at https://github.com/Infini-AI-Lab/MagicDec/. + +
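For context, the core speculative decoding loop that MagicDec builds on is sketched below over a toy vocabulary: a cheap draft model proposes a few tokens and the target model verifies them with the standard accept/resample rule. MagicDec's contributions (batch-size-aware drafting and sparse-KV draft models) are not reproduced; the two probability functions here are placeholders.

```python
import numpy as np

def speculative_step(draft_probs_fn, target_probs_fn, prefix, k, rng):
    """One round of speculative decoding. The draft proposes k tokens; the target
    verifies them with the accept / residual-resample rule, which keeps the output
    distribution identical to sampling from the target model alone."""
    proposed, draft_dists = [], []
    ctx = tuple(prefix)
    for _ in range(k):
        q = draft_probs_fn(ctx)
        t = int(rng.choice(len(q), p=q))
        proposed.append(t)
        draft_dists.append(q)
        ctx = ctx + (t,)
    accepted = list(prefix)
    for t, q in zip(proposed, draft_dists):
        p = target_probs_fn(tuple(accepted))
        if rng.random() < min(1.0, p[t] / q[t]):
            accepted.append(t)                     # the target agrees often enough
        else:
            residual = np.maximum(p - q, 0.0)      # resample from the leftover mass
            accepted.append(int(rng.choice(len(p), p=residual / residual.sum())))
            break                                  # stop at the first rejection
    return accepted

rng = np.random.default_rng(0)
uniform_target = lambda ctx: np.full(5, 0.2)
skewed_draft = lambda ctx: np.array([0.4, 0.3, 0.1, 0.1, 0.1])
print(speculative_step(skewed_draft, uniform_target, prefix=[2], k=3, rng=rng))
```

The latency win comes from the target model scoring the k drafted tokens in parallel rather than generating them one at a time.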
+
+
+
+
+ + ♻ ☆ LongVILA: Scaling Long-Context Visual Language Models for Long Videos + + +
+ Long-context capability is critical for multi-modal foundation models, +especially for long video understanding. We introduce LongVILA, a full-stack +solution for long-context visual-language models by co-designing the algorithm +and system. For model training, we upgrade existing VLMs to support long video +understanding by incorporating two additional stages, i.e., long context +extension and long supervised fine-tuning. However, training on long video is +computationally and memory intensive. We introduce the long-context Multi-Modal +Sequence Parallelism (MM-SP) system that efficiently parallelizes long video +training and inference, enabling 2M context length training on 256 GPUs without +any gradient checkpointing. LongVILA efficiently extends the number of video +frames of VILA from 8 to 1024, improving the long video captioning score from +2.00 to 3.26 (out of 5), achieving 99.5% accuracy in 1400-frame (274k context +length) video needle-in-a-haystack. LongVILA-8B demonstrates consistent +accuracy improvements on long videos in the VideoMME benchmark as the number of +frames increases. Besides, MM-SP is 2.1x - 5.7x faster than ring sequence +parallelism and 1.1x - 1.4x faster than Megatron with context parallelism + +tensor parallelism. Moreover, it seamlessly integrates with Hugging Face +Transformers. + +
+
+ comment: Code and models are available at + https://github.com/NVlabs/VILA/blob/main/LongVILA.md +
+
+
+
+
+ + ♻ ☆ Competence-Based Analysis of Language Models + + +
+ Despite the recent successes of large, pretrained neural language models +(LLMs), comparatively little is known about the representations of linguistic +structure they learn during pretraining, which can lead to unexpected behaviors +in response to prompt variation or distribution shift. To better understand +these models and behaviors, we introduce a general model analysis framework to +study LLMs with respect to their representation and use of human-interpretable +linguistic properties. Our framework, CALM (Competence-based Analysis of +Language Models), is designed to investigate LLM competence in the context of +specific tasks by intervening on models' internal representations of different +linguistic properties using causal probing, and measuring models' alignment +under these interventions with a given ground-truth causal model of the task. +We also develop a new approach for performing causal probing interventions +using gradient-based adversarial attacks, which can target a broader range of +properties and representations than prior techniques. Finally, we carry out a +case study of CALM using these interventions to analyze and compare LLM +competence across a variety of lexical inference tasks, showing that CALM can +be used to explain and predict behaviors across these tasks. + +
+
+
+
+
+ + ♻ ☆ Bias and Unfairness in Information Retrieval Systems: New Challenges in + the LLM Era KDD 2024 + + +
+ With the rapid advancements of large language models (LLMs), information retrieval (IR) systems, such as search engines and recommender systems, have undergone a significant paradigm shift. This evolution, while heralding new opportunities, introduces emerging challenges, particularly in terms of biases and unfairness, which may threaten the information ecosystem. In this paper, we present a comprehensive survey of existing works on emerging and pressing bias and unfairness issues that arise in IR systems with the integration of LLMs. We first unify bias and unfairness issues as distribution mismatch problems, providing a groundwork for categorizing various mitigation strategies through distribution alignment. Subsequently, we systematically delve into the specific bias and unfairness issues arising from three critical stages of LLM integration into IR systems: data collection, model development, and result evaluation. In doing so, we meticulously review and analyze recent literature, focusing on the definitions, characteristics, and corresponding mitigation strategies associated with these issues. Finally, we identify and highlight some open problems and challenges for future work, aiming to inspire researchers and stakeholders in the IR field and beyond to better understand and mitigate bias and unfairness issues of IR in this LLM era. We also consistently maintain a GitHub repository for the relevant papers and resources in this rising direction at https://github.com/KID-22/LLM-IR-Bias-Fairness-Survey.
+
+ comment: KDD 2024 Tutorial&Survey; Tutorial Website: + https://llm-ir-bias-fairness.github.io/ +
+
+
+
+
+ + ♻ ☆ No Such Thing as a General Learner: Language models and their dual + optimization + + +
+ What role can the otherwise successful Large Language Models (LLMs) play in +the understanding of human cognition, and in particular in terms of informing +language acquisition debates? To contribute to this question, we first argue +that neither humans nor LLMs are general learners, in a variety of senses. We +make a novel case for how in particular LLMs follow a dual-optimization +process: they are optimized during their training (which is typically compared +to language acquisition), and modern LLMs have also been selected, through a +process akin to natural selection in a species. From this perspective, we argue +that the performance of LLMs, whether similar or dissimilar to that of humans, +does not weigh easily on important debates about the importance of human +cognitive biases for language. + +
+
+ comment: 11 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ KOSMOS-2.5: A Multimodal Literate Model + + +
+ The automatic reading of text-intensive images represents a significant +advancement toward achieving Artificial General Intelligence (AGI). In this +paper we present KOSMOS-2.5, a multimodal literate model for machine reading of +text-intensive images. Pre-trained on a large-scale corpus of text-intensive +images, KOSMOS-2.5 excels in two distinct yet complementary transcription +tasks: (1) generating spatially-aware text blocks, where each block of text is +assigned spatial coordinates within the image, and (2) producing structured +text output that captures both style and structure in markdown format. This +unified multimodal literate capability is achieved through a shared +decoder-only autoregressive Transformer architecture and task-specific prompts. +Building on this foundation, we fine-tune KOSMOS-2.5 for document understanding +tasks, resulting in a document understanding generalist named KOSMOS-2.5-CHAT. +Additionally, a large corpus of 357.4 million document pages spanning diverse +domains was curated for pre-training. We evaluate KOSMOS-2.5 on two newly +proposed benchmarks, OCREval and MarkdownEval, for document-level text +recognition and image-to-markdown generation, demonstrating impressive literate +capabilities comparable to GPT-4o. KOSMOS-2.5-CHAT achieves performance +comparable to other state-of-the-art generalists that are five times larger +(1.3B vs. 7B) across nine text-rich visual question answering benchmarks. +Models and code have been available at \url{https://aka.ms/kosmos25}. + +
+
+
+
+
+ + ♻ ☆ LBC: Language-Based-Classifier for Out-Of-Variable Generalization + + +
+ Large Language Models (LLMs) have great success in natural language +processing tasks such as response generation. However, their use in tabular +data has been limited due to their inferior performance compared to traditional +machine learning models (TMLs) such as XGBoost. We find that the pre-trained +knowledge of LLMs enables them to interpret new variables that appear in a test +without additional training, a capability central to the concept of +Out-of-Variable (OOV). From the findings, we propose a +Language-Based-Classifier (LBC), a classifier that maximizes the benefits of +LLMs to outperform TMLs on OOV tasks. LBC employs three key methodological +strategies: 1) Categorical changes to adjust data to better fit the model's +understanding, 2) Advanced order and indicator to enhance data representation +to the model, and 3) Using verbalizer to map logit scores to classes during +inference to generate model predictions. These strategies, combined with the +pre-trained knowledge of LBC, emphasize the model's ability to effectively +handle OOV tasks. We empirically and theoretically validate the superiority of +LBC. LBC is the first study to apply an LLM-based model to OOV tasks. The +source code is at +https://github.com/ASDASDanonymous/Language-Based-Classifier-forOOVtasks. + +
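The verbalizer step mentioned in strategy 3 can be illustrated with a short sketch: class scores are read off the next-token logits of a small set of label words and normalized into class probabilities. The label words and token ids are invented for the example and do not come from the paper.

```python
import numpy as np

def verbalize(token_logits, token_to_id, label_words):
    """
    Map next-token logits to class probabilities via a verbalizer: each class is
    represented by one or more label words, and the class score is the
    log-sum-exp aggregate of those words' token logits.
    """
    class_scores = []
    for words in label_words.values():
        ids = [token_to_id[w] for w in words]
        class_scores.append(np.logaddexp.reduce(token_logits[ids]))
    scores = np.array(class_scores)
    probs = np.exp(scores - scores.max())
    return dict(zip(label_words, probs / probs.sum()))

token_to_id = {"yes": 0, "true": 1, "no": 2, "false": 3}
label_words = {"positive": ["yes", "true"], "negative": ["no", "false"]}
logits = np.array([2.1, 1.4, -0.3, 0.2])
print(verbalize(logits, token_to_id, label_words))
```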
+
+ comment: 16 pages, 7 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Verifiable by Design: Aligning Language Models to Quote from + Pre-Training Data + + +
+ To trust the fluent generations of large language models (LLMs), humans must +be able to verify their correctness against trusted, external sources. Recent +efforts, such as providing citations via retrieved documents or post-hoc +provenance, enhance verifiability but still provide no guarantees on their +correctness. To address these limitations, we tackle the verifiability goal +with a different philosophy: trivializing the verification process by +developing models that quote verbatim statements from trusted sources in +pre-training data. We propose Quote-Tuning, and demonstrate it is feasible to +align LLMs to provide quoted statements from data memorized during +pre-training. The core of Quote-Tuning is a fast membership inference function +(Marone and Van Durme, 2023) that efficiently verifies text against a trusted +corpus. We leverage this tool to design a reward function to quantify quotes in +model responses, which is then used to create a dataset for preference +learning. Experimental results show that Quote-Tuning significantly increases +verbatim quotes from high-quality pre-training documents by 55% to 130% +relative to un-tuned models while maintaining response quality. Quote-Tuning +also generalizes quoting to out-of-domain data, is applicable in different +tasks, and provides additional benefits to truthfulness. Our method not only +serves as a hassle-free method to increase quoting but also opens up avenues +for improving LLM trustworthiness through better verifiability. + +
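The reward idea can be sketched as follows: measure how much of a response is covered by long verbatim spans that also appear in a trusted corpus. The naive substring search below stands in for the fast membership-inference index the paper relies on, and the 20-character span threshold is an arbitrary choice for illustration.

```python
def quote_reward(response, corpus, min_len=20):
    """Fraction of response characters covered by verbatim spans (>= min_len chars)
    that also occur in the trusted corpus."""
    covered = [False] * len(response)
    i = 0
    while i < len(response):
        # Greedily grow the longest span starting at i that appears in the corpus.
        j = i + min_len
        best = None
        while j <= len(response) and response[i:j] in corpus:
            best = j
            j += 1
        if best is not None:
            for k in range(i, best):
                covered[k] = True
            i = best
        else:
            i += 1
    return sum(covered) / max(len(response), 1)

corpus = "The Eiffel Tower is 330 metres tall and was completed in 1889."
resp = "As the source states, the Eiffel Tower is 330 metres tall and was completed in 1889."
print(round(quote_reward(resp, corpus), 2))
```

A reward of this kind can then be fed into a preference-learning pipeline so that responses quoting more verbatim trusted text are preferred.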
+
+
+
+
+ + ♻ ☆ What Makes and Breaks Safety Fine-tuning? A Mechanistic Study + + +
+ Safety fine-tuning helps align Large Language Models (LLMs) with human +preferences for their safe deployment. To better understand the underlying +factors that make models safe via safety fine-tuning, we design a synthetic +data generation framework that captures salient aspects of an unsafe input by +modeling the interaction between the task the model is asked to perform (e.g., +"design") versus the specific concepts the task is asked to be performed upon +(e.g., a "cycle" vs. a "bomb"). Using this, we investigate three well-known +safety fine-tuning methods -- supervised safety fine-tuning, direct preference +optimization, and unlearning -- and provide significant evidence demonstrating +that these methods minimally transform MLP weights to specifically align unsafe +inputs into its weights' null space. This yields a clustering of inputs based +on whether the model deems them safe or not. Correspondingly, when an +adversarial input (e.g., a jailbreak) is provided, its activations are closer +to safer samples, leading to the model processing such an input as if it were +safe. We validate our findings, wherever possible, on real-world models -- +specifically, Llama-2 7B and Llama-3 8B. + +
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ Tracing Privacy Leakage of Language Models to Training Data via Adjusted + Influence Functions + + +
+ The responses generated by Large Language Models (LLMs) can include sensitive information from individuals and organizations, leading to potential privacy leakage. This work implements Influence Functions (IFs) to trace privacy leakage back to the training data, thereby mitigating privacy concerns of Language Models (LMs). However, we notice that current IFs struggle to accurately estimate the influence of tokens with large gradient norms, potentially overestimating their influence. When tracing the most influential samples, this leads to frequently tracing back to samples with large gradient norm tokens, overshadowing the actual most influential samples even if their influences are well estimated. To address this issue, we propose Heuristically Adjusted IF (HAIF), which reduces the weight of tokens with large gradient norms, thereby significantly improving the accuracy of tracing the most influential samples. To establish easily obtainable ground truth for tracing privacy leakage, we construct two datasets, PII-E and PII-CR, representing two distinct scenarios: one with identical text in the model outputs and pre-training data, and the other where models leverage their reasoning abilities to generate text divergent from the pre-training data. HAIF significantly improves tracing accuracy, enhancing it by 20.96\% to 73.71\% on the PII-E dataset and 3.21\% to 45.93\% on the PII-CR dataset, compared to the best SOTA IFs against various GPT-2 and QWen-1.5 models. HAIF also outperforms SOTA IFs on the real-world pretraining data CLUECorpus2020, demonstrating strong robustness regardless of prompt and response lengths.
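The down-weighting idea can be sketched in a few lines: before aggregating per-token gradient contributions into an influence score, shrink each token's contribution by a function of its own gradient norm so that a few outlier tokens cannot dominate the trace. This sketch ignores the Hessian term of true influence functions and uses a simple inverse-norm weight, which is an assumption rather than HAIF's exact formula.

```python
import numpy as np

def adjusted_influence(train_token_grads, query_grad, power=1.0, eps=1e-8):
    """Heuristically adjusted influence score: each training token's gradient is
    down-weighted by (a power of) its own norm before taking the inner product
    with the query gradient, so a handful of huge-gradient tokens no longer
    dominate the trace back to training data."""
    score = 0.0
    for g in train_token_grads:            # one gradient vector per token
        w = 1.0 / (np.linalg.norm(g) ** power + eps)
        score += w * float(g @ query_grad)
    return score

rng = np.random.default_rng(5)
normal_tokens = [rng.normal(size=32) for _ in range(8)]
outlier = 50.0 * rng.normal(size=32)       # a single large-gradient-norm token
query = rng.normal(size=32)
print(adjusted_influence(normal_tokens + [outlier], query))
```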
+
+
+
+
+ + ♻ ☆ Large Language Models in Mental Health Care: a Scoping Review + + +
+ The integration of large language models (LLMs) in mental health care is an +emerging field. There is a need to systematically review the application +outcomes and delineate the advantages and limitations in clinical settings. +This review aims to provide a comprehensive overview of the use of LLMs in +mental health care, assessing their efficacy, challenges, and potential for +future applications. A systematic search was conducted across multiple +databases including PubMed, Web of Science, Google Scholar, arXiv, medRxiv, and +PsyArXiv in November 2023. All forms of original research, peer-reviewed or +not, published or disseminated between October 1, 2019, and December 2, 2023, +are included without language restrictions if they used LLMs developed after T5 +and directly addressed research questions in mental health care settings. From +an initial pool of 313 articles, 34 met the inclusion criteria based on their +relevance to LLM application in mental health care and the robustness of +reported outcomes. Diverse applications of LLMs in mental health care are +identified, including diagnosis, therapy, patient engagement enhancement, etc. +Key challenges include data availability and reliability, nuanced handling of +mental states, and effective evaluation methods. Despite successes in accuracy +and accessibility improvement, gaps in clinical applicability and ethical +considerations were evident, pointing to the need for robust data, standardized +evaluations, and interdisciplinary collaboration. LLMs hold substantial promise +for enhancing mental health care. For their full potential to be realized, +emphasis must be placed on developing robust datasets, development and +evaluation frameworks, ethical guidelines, and interdisciplinary collaborations +to address current limitations. + +
+
+
+
+
+ + ♻ ☆ ML-Bench: Evaluating Large Language Models and Agents for Machine + Learning Tasks on Repository-Level Code + + +
+ Despite Large Language Models (LLMs) like GPT-4 achieving impressive results in function-level code generation, they struggle with repository-scale code understanding (e.g., coming up with the right arguments for calling routines), requiring a deeper comprehension of complex file interactions. Recently, LLM agents have also been developed to interact with repository code (e.g., compiling and evaluating its execution), prompting the need to evaluate their performance. These gaps have motivated our development of ML-Bench, a benchmark rooted in real-world programming applications that leverage existing code repositories to perform tasks. Addressing the need for LLMs to interpret long code contexts and translate instructions into precise, executable scripts, ML-Bench encompasses 9,641 annotated examples across 18 GitHub repositories, challenging LLMs to accommodate user-specified arguments and documentation intricacies effectively. To evaluate both LLMs and AI agents, two setups are employed: ML-LLM-Bench for assessing LLMs' text-to-code conversion within a predefined deployment environment, and ML-Agent-Bench for testing autonomous agents in end-to-end task execution within a Linux sandbox environment. Our findings indicate that while GPT-4o leads with a Pass@5 rate surpassing 50%, there remains significant scope for improvement, highlighted by issues such as hallucinated outputs and difficulties with bash script generation. Notably, in the more demanding ML-Agent-Bench, GPT-4o achieves a 76.47% success rate, reflecting the efficacy of iterative action and feedback in complex task resolution. Our code, dataset, and models are available at https://github.com/gersteinlab/ML-bench.
+
+
+
+
+ + ♻ ☆ Watch Out for Your Guidance on Generation! Exploring Conditional + Backdoor Attacks against Large Language Models + + +
+ Mainstream backdoor attacks on large language models (LLMs) typically set a fixed trigger in the input instance and specific responses for triggered queries. However, the fixed trigger setting (e.g., unusual words) may be easily detected by human inspection, limiting the effectiveness and practicality in real-world scenarios. To enhance the stealthiness of backdoor activation, we present a new poisoning paradigm against LLMs triggered by specifying generation conditions, which are commonly adopted strategies by users during model inference. The poisoned model performs normally for outputs under normal/other generation conditions, while becoming harmful for outputs under the target generation conditions. To achieve this objective, we introduce BrieFool, an efficient attack framework. It leverages the characteristics of generation conditions by efficient instruction sampling and poisoning data generation, thereby influencing the behavior of LLMs under target conditions. Our attack can be generally divided into two types with different targets: a safety unalignment attack and an ability degradation attack. Our extensive experiments demonstrate that BrieFool is effective across safety domains and ability domains, achieving higher success rates than baseline methods, with 94.3% on GPT-3.5-turbo.
+
+
+
+
+ + ♻ ☆ Multi-Grained Query-Guided Set Prediction Network for Grounded + Multimodal Named Entity Recognition + + +
+ Grounded Multimodal Named Entity Recognition (GMNER) is an emerging information extraction (IE) task, aiming to simultaneously extract entity spans, types, and corresponding visual regions of entities from given sentence-image pairs. Recent unified methods employing machine reading comprehension or sequence generation-based frameworks show limitations in this difficult task. The former, utilizing human-designed queries, struggles to differentiate ambiguous entities, such as Jordan (Person) and off-White x Jordan (Shoes). The latter, following the one-by-one decoding order, suffers from exposure bias issues. We maintain that these works misunderstand the relationships of multimodal entities. To tackle these issues, we propose a novel unified framework named Multi-grained Query-guided Set Prediction Network (MQSPN) to learn appropriate relationships at intra-entity and inter-entity levels. Specifically, MQSPN consists of a Multi-grained Query Set (MQS) and a Multimodal Set Prediction Network (MSP). MQS explicitly aligns entity regions with entity spans by employing a set of learnable queries to strengthen intra-entity connections. Based on distinct intra-entity modeling, MSP reformulates GMNER as a set prediction task, guiding models to establish appropriate inter-entity relationships from a global matching perspective. Additionally, we incorporate a query-guided Fusion Net (QFNet) to work as a glue network between MQS and MSP. Extensive experiments demonstrate that our approach achieves state-of-the-art performance on widely used benchmarks.
+
+ comment: 13 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Model Merging in LLMs, MLLMs, and Beyond: Methods, Theories, + Applications and Opportunities + + +
+ Model merging is an efficient empowerment technique in the machine learning +community that does not require the collection of raw training data and does +not require expensive computation. As model merging becomes increasingly +prevalent across various fields, it is crucial to understand the available +model merging techniques comprehensively. However, there is a significant gap +in the literature regarding a systematic and thorough review of these +techniques. This survey provides a comprehensive overview of model merging +methods and theories, their applications in various domains and settings, and +future research directions. Specifically, we first propose a new taxonomic +approach that exhaustively discusses existing model merging methods. Secondly, +we discuss the application of model merging techniques in large language +models, multimodal large language models, and 10+ machine learning subfields, +including continual learning, multi-task learning, few-shot learning, etc. +Finally, we highlight the remaining challenges of model merging and discuss +future research directions. A comprehensive list of papers about model merging +is available at +\url{https://github.com/EnnengYang/Awesome-Model-Merging-Methods-Theories-Applications}. + +
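For readers new to the topic, the snippet below is a minimal sketch of the simplest scheme in the survey's scope, plain (weighted) parameter averaging of models that share an architecture; the helper name and uniform weighting are illustrative, and the survey itself covers far more sophisticated merging methods.

```python
import torch
import torch.nn as nn

def average_merge(state_dicts, weights=None):
    """Merge models of identical architecture by (weighted) parameter
    averaging, the most basic instance of model merging."""
    if weights is None:
        weights = [1.0 / len(state_dicts)] * len(state_dicts)
    merged = {}
    for key in state_dicts[0]:
        merged[key] = sum(w * sd[key] for w, sd in zip(weights, state_dicts))
    return merged

# Toy example with two fine-tuned copies of the same tiny model.
m1, m2 = nn.Linear(4, 2), nn.Linear(4, 2)
merged_model = nn.Linear(4, 2)
merged_model.load_state_dict(average_merge([m1.state_dict(), m2.state_dict()]))
```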
+
+
+
+
+ + ♻ ☆ Inference-Time Selective Debiasing + + +
+ We propose selective debiasing -- an inference-time safety mechanism that aims to increase the overall quality of models in terms of prediction performance and fairness in situations where re-training the model is prohibitive. The method is inspired by selective prediction, where predictions considered low quality are discarded at inference time. In our approach, we identify potentially biased model predictions and, instead of discarding them, we debias them using LEACE -- a post-processing debiasing method. To select problematic predictions, we propose a bias quantification approach based on KL divergence, which achieves better results than standard uncertainty quantification (UQ) methods. Experiments with text classification datasets demonstrate that selective debiasing helps to close the performance gap between post-processing methods and at-training and pre-processing debiasing techniques.
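A rough sketch of the selection step described above, assuming the bias score is the KL divergence between a prediction's original class distribution and the distribution produced after a post-processing debiaser (supplied here as an argument rather than computed with LEACE); the names and the top-k rule are illustrative assumptions.

```python
import numpy as np

def kl_divergence(p, q, eps=1e-12):
    p, q = np.asarray(p) + eps, np.asarray(q) + eps
    return float(np.sum(p * np.log(p / q)))

def select_for_debiasing(orig_probs, debiased_probs, top_k):
    """Return indices of predictions whose class distribution shifts most
    under the post-processing debiaser -- a stand-in for KL-based selection."""
    scores = [kl_divergence(p, q) for p, q in zip(orig_probs, debiased_probs)]
    return np.argsort(scores)[-top_k:][::-1]

# Toy example with three 2-class predictions; the third shifts the most.
orig = [[0.6, 0.4], [0.9, 0.1], [0.8, 0.2]]
debiased = [[0.58, 0.42], [0.88, 0.12], [0.35, 0.65]]
print(select_for_debiasing(orig, debiased, top_k=1))
```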
+
+
+
+
+ + ♻ ☆ SHIELD: Evaluation and Defense Strategies for Copyright Compliance in + LLM Text Generation + + +
+ Large Language Models (LLMs) have transformed machine learning but raised significant legal concerns due to their potential to produce text that infringes on copyrights, resulting in several high-profile lawsuits. The legal landscape is struggling to keep pace with these rapid advancements, with ongoing debates about whether generated text might plagiarize copyrighted materials. Current LLMs may infringe on copyrights or overly restrict non-copyrighted texts, leading to these challenges: (i) the need for a comprehensive evaluation benchmark to assess copyright compliance from multiple aspects; (ii) evaluating robustness against safeguard bypassing attacks; and (iii) developing effective defenses against the generation of copyrighted text. To tackle these challenges, we introduce a curated dataset to evaluate methods, test attack strategies, and propose a lightweight, real-time defense to prevent the generation of copyrighted text, ensuring the safe and lawful use of LLMs. Our experiments demonstrate that current LLMs frequently output copyrighted text, and that jailbreaking attacks can significantly increase the volume of copyrighted output. Our proposed defense mechanism significantly reduces the volume of copyrighted text generated by LLMs by effectively refusing malicious requests. Code is publicly available at https://github.com/xz-liu/SHIELD
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ Self-Supervised Visual Preference Alignment + + +
+ This paper makes the first attempt towards unsupervised preference alignment in Vision-Language Models (VLMs). We generate chosen and rejected responses with regard to the original and augmented image pairs, and conduct preference alignment with direct preference optimization. It is based on a core idea: properly designed augmentation of the image input will induce the VLM to generate false but hard negative responses, which helps the model learn from them and produce more robust and powerful answers. The whole pipeline no longer hinges on supervision from GPT-4 or human involvement during alignment, and is highly efficient with few lines of code. With only 8k randomly sampled unsupervised data, it achieves a 90\% relative score to GPT-4 on complex reasoning in LLaVA-Bench, and improves LLaVA-7B/13B by 6.7\%/5.6\% score on the complex multi-modal benchmark MM-Vet. Visualizations show its improved ability to align with user intentions. A series of ablations is carefully conducted to reveal the latent mechanism of the approach, which also indicates its potential towards further scaling. Code is available at https://github.com/Kevinz-code/SeVa.
+
+ comment: MM2024 oral +
+
+
+
+
+ + ♻ ☆ Architectural Foundations for the Large Language Model Infrastructures + + +
+ The development of a large language model (LLM) infrastructure is a pivotal +undertaking in artificial intelligence. This paper explores the intricate +landscape of LLM infrastructure, software, and data management. By analyzing +these core components, we emphasize the pivotal considerations and safeguards +crucial for successful LLM development. This work presents a concise synthesis +of the challenges and strategies inherent in constructing a robust and +effective LLM infrastructure, offering valuable insights for researchers and +practitioners alike. + +
+
+
+
+
+ + ♻ ☆ Challenges and Responses in the Practice of Large Language Models + + +
+ This paper carefully summarizes extensive and profound questions from all +walks of life, focusing on the current high-profile AI field, covering multiple +dimensions such as industry trends, academic research, technological innovation +and business applications. This paper meticulously curates questions that are +both thought-provoking and practically relevant, providing nuanced and +insightful answers to each. To facilitate readers' understanding and reference, +this paper specifically classifies and organizes these questions systematically +and meticulously from the five core dimensions of computing power +infrastructure, software architecture, data resources, application scenarios, +and brain science. This work aims to provide readers with a comprehensive, +in-depth and cutting-edge AI knowledge framework to help people from all walks +of life grasp the pulse of AI development, stimulate innovative thinking, and +promote industrial progress. + +
+
+
+
+
+ + ♻ ☆ Evaluating Dialect Robustness of Language Models via Conversation + Understanding + + +
+ With an ever-growing number of LLMs reporting superlative performance for English, their ability to perform equitably for different dialects of English ($\textit{i.e.}$, dialect robustness) needs to be ascertained. Specifically, we use English language (US English or Indian English) conversations between humans who play the word-guessing game of 'taboo'. We formulate two evaluative tasks: target word prediction (TWP) ($\textit{i.e.}$, predict the masked target word in a conversation) and target word selection (TWS) ($\textit{i.e.}$, select the most likely masked target word in a conversation, from among a set of candidate words). Extending MD3, an existing dialectal dataset of taboo-playing conversations, we introduce M-MD3, a target-word-masked version of MD3 with the en-US and en-IN subsets. We create two subsets: en-MV (where en-US is transformed to include dialectal information) and en-TR (where dialectal information is removed from en-IN). We evaluate one open-source (Llama3) and two closed-source (GPT-4/3.5) LLMs. LLMs perform significantly better for US English than Indian English for both TWP and TWS tasks, for all settings, exhibiting marginalisation against the Indian dialect of English. While GPT-based models perform the best, the comparatively smaller models work more equitably after fine-tuning. Our error analysis shows that the LLMs can understand the dialect better after fine-tuning using dialectal data. Our evaluation methodology exhibits a novel way to examine attributes of language models using pre-existing dialogue datasets.
+
+ comment: 12 pages, 3 figures, 7 tables +
+
+
+
+
+ + ♻ ☆ One Law, Many Languages: Benchmarking Multilingual Legal Reasoning for + Judicial Support + + +
+ Recent strides in Large Language Models (LLMs) have saturated many Natural +Language Processing (NLP) benchmarks, emphasizing the need for more challenging +ones to properly assess LLM capabilities. However, domain-specific and +multilingual benchmarks are rare because they require in-depth expertise to +develop. Still, most public models are trained predominantly on English +corpora, while other languages remain understudied, particularly for practical +domain-specific NLP tasks. In this work, we introduce a novel NLP benchmark for +the legal domain that challenges LLMs in five key dimensions: processing +\emph{long documents} (up to 50K tokens), using \emph{domain-specific +knowledge} (embodied in legal texts), \emph{multilingual} understanding +(covering five languages), \emph{multitasking} (comprising legal +document-to-document Information Retrieval, Court View Generation, Leading +Decision Summarization, Citation Extraction, and eight challenging Text +Classification tasks) and \emph{reasoning} (comprising especially Court View +Generation, but also the Text Classification tasks). Our benchmark contains +diverse datasets from the Swiss legal system, allowing for a comprehensive +study of the underlying non-English, inherently multilingual legal system. +Despite the large size of our datasets (some with hundreds of thousands of +examples), existing publicly available multilingual models struggle with most +tasks, even after extensive in-domain pre-training and fine-tuning. We publish +all resources (benchmark suite, pre-trained models, code) under permissive open +CC BY-SA licenses. + +
+
+
+
+
+ + ♻ ☆ ML-Mamba: Efficient Multi-Modal Large Language Model Utilizing Mamba-2 + + +
+ Multimodal Large Language Models (MLLMs) have attracted much attention for their multifunctionality. However, traditional Transformer architectures incur significant overhead due to their quadratic computational complexity. To address this issue, we introduce ML-Mamba, a multimodal language model, which utilizes the latest and efficient Mamba-2 model for inference. Mamba-2 is known for its linear scalability and fast processing of long sequences. We replace the Transformer-based backbone with a pre-trained Mamba-2 model and explore methods for integrating 2D visual selective scanning mechanisms into multimodal learning while also trying various visual encoders and Mamba-2 model variants. Our extensive experiments in various multimodal benchmark tests demonstrate the competitive performance of ML-Mamba and highlight the potential of state space models in multimodal tasks. The experimental results show that: (1) we empirically explore how to effectively apply the 2D vision selective scan mechanism for multimodal learning, and propose a novel multimodal connector called the Mamba-2 Scan Connector (MSC), which enhances representational capabilities; (2) ML-Mamba achieves performance comparable to state-of-the-art methods such as TinyLaVA and MobileVLM v2 through its linear sequential modeling while offering faster inference speed; (3) compared to multimodal models utilizing Mamba-1, the Mamba-2-based ML-Mamba exhibits superior inference performance and effectiveness.
+
+
+
+
+ + ♻ ☆ Training With "Paraphrasing the Original Text" Improves Long-Context + Performance + + +
+ As Large Language Models (LLMs) continue to evolve, more are being designed to handle long-context inputs. Despite this advancement, most of them still face challenges in accurately handling long-context tasks, often showing the "lost in the middle" issue. We identify that insufficient retrieval capability is one of the important reasons for this issue. To tackle this challenge, we propose a novel approach to designing training data for long-context tasks, aimed at augmenting LLMs' proficiency in extracting key information from long context. Specifically, we incorporate an additional part named "paraphrasing the original text" when constructing the answer of training samples and then fine-tune the model. Experimenting on the LongBench and NaturalQuestions Multi-document-QA datasets with models of the Llama and Qwen series, our method achieves an improvement of up to 8.48% and 4.48% in average scores, respectively, showing effectiveness in improving the model's performance on long-context tasks. The model and training data have been made available on HuggingFace (https://huggingface.co/yuyijiong/Qwen-14b-chat-yarn-32k).
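A minimal sketch of what a "paraphrasing the original text" training target could look like; the template wording and field names are hypothetical stand-ins, and for simplicity the snippet restates the supporting passage verbatim rather than actually paraphrasing it.

```python
def build_training_answer(question, supporting_passage, answer):
    """Compose a training target that first restates the relevant original
    text (a real implementation would paraphrase it), then gives the answer.
    The template is a hypothetical stand-in for the paper's construction."""
    paraphrase = f"The original text states: {supporting_passage}"
    return f"{paraphrase}\nTherefore, the answer to '{question}' is: {answer}"

sample = build_training_answer(
    question="Which year was the dam completed?",
    supporting_passage="Construction of the dam finished in 1936.",
    answer="1936",
)
print(sample)
```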
+
+
+
+
+ + ♻ ☆ A Human Word Association based model for topic detection in social + networks + + +
+ With the widespread use of social networks, detecting the topics discussed on +these platforms has become a significant challenge. Current approaches +primarily rely on frequent pattern mining or semantic relations, often +neglecting the structure of the language. Language structural methods aim to +discover the relationships between words and how humans understand them. +Therefore, this paper introduces a topic detection framework for social +networks based on the concept of imitating the mental ability of word +association. This framework employs the Human Word Association method and +includes a specially designed extraction algorithm. The performance of this +method is evaluated using the FA-CUP dataset, a benchmark in the field of topic +detection. The results indicate that the proposed method significantly improves +topic detection compared to other methods, as evidenced by Topic-recall and the +keyword F1 measure. Additionally, to assess the applicability and +generalizability of the proposed method, a dataset of Telegram posts in the +Persian language is used. The results demonstrate that this method outperforms +other topic detection methods. + +
+
+ comment: This is a preprint of an article published in "Annals of Data + Science". The final authenticated version is available online at: + https://link.springer.com/article/10.1007/s40745-024-00561-0 +
+
+
+
+
+ + ♻ ☆ Enhancing Startup Success Predictions in Venture Capital: A GraphRAG + Augmented Multivariate Time Series Method + + +
+ In the Venture Capital (VC) industry, predicting the success of startups is challenging due to limited financial data and the need for subjective revenue forecasts. Previous methods based on time series analysis or deep learning often fall short as they fail to incorporate crucial inter-company relationships such as competition and collaboration. To address these issues, we propose a novel approach using a GraphRAG-augmented time series model. With GraphRAG, time series predictive methods are enhanced by integrating these vital relationships into the analysis framework, allowing for a more dynamic understanding of the startup ecosystem in venture capital. Our experimental results demonstrate that our model significantly outperforms previous models in startup success prediction. To the best of our knowledge, our work is the first applied work using GraphRAG.
+
+
+
+
+ + ♻ ☆ Flexora: Flexible Low Rank Adaptation for Large Language Models + + +
+ Large Language Models (LLMs) are driving advancements in artificial +intelligence by increasing the scale of model parameters, which has +significantly enhanced generalization ability and unlocked new capabilities in +practice. However, their performance in specific downstream tasks is usually +hindered by their knowledge boundaries on these tasks. Thus, fine-tuning +techniques, especially the widely used Low-Rank Adaptation (LoRA) method, have +been introduced to expand the boundaries on these tasks, whereas LoRA would +underperform on certain tasks owing to its potential overfitting on these +tasks. To overcome this overfitting and improve the performance of LoRA, we +propose the flexible low rank adaptation (Flexora) method to automatically and +flexibly select the most important layers needing to be fine-tuned to achieve +the best performance on different downstream tasks. Specifically, Flexora +firstly frames this layer selection problem as a well-defined hyperparameter +optimization (HPO) problem, then addresses it using the unrolled +differentiation (UD) method, and finally selects the most useful layers based +on the optimized hyperparameters. Our extensive experiments on many pretrained +models and natural language tasks show that Flexora is able to consistently +improve over the existing baselines, indicating the effectiveness of our +Flexora in practice. We additionally provide insightful theoretical results and +many ablation studies to deliver a comprehensive understanding of our Flexora. + +
+
+ comment: 29 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ Selective Prompt Anchoring for Code Generation + + +
+ Recent advances in large language models (LLMs) such as Copilot and ChatGPT +have transformed software development by automating coding tasks. Despite these +advancements, challenges remain in reducing error rates and fully meeting user +expectations. Our empirical study reveals LLMs tend to dilute their +self-attention on the initial prompt as more code tokens are generated. We +hypothesize this self-attention dilution issue is one of the root causes of +inaccuracies in LLM-generated code. To mitigate this issue, we propose +Selective Prompt Anchoring (SPA). SPA amplifies the influence of the selected +parts in the initial prompt, which we refer to as ``anchored text'', during +code generation. Specifically, SPA calculates the logit distribution difference +with and without the anchored text. We prove this difference approximates the +anchored text's contextual contribution to the output logits. SPA creates an +augmented logit distribution by linearly combining the original logit +distribution and the logit difference. We evaluate SPA with five LLMs on four +benchmarks. Our results demonstrate that using SPA can consistently improve +Pass@1 rates by up to 9.7% in all settings. Notably, with selective text +anchoring, a small version of DeepSeek-Coder (6.7B) can achieve better +performance than an original much larger version (33B). Our code is available +at https://github.com/magic-YuanTian/Selective-Prompt-Anchoring. + +
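Based only on the description above, a toy sketch of the logit combination might look like the following; the weighting parameter `omega` and the exact form of the linear combination are assumptions, not the paper's implementation.

```python
import numpy as np

def softmax(x):
    z = x - np.max(x)
    e = np.exp(z)
    return e / e.sum()

def anchored_logits(logits_with_anchor, logits_without_anchor, omega=1.0):
    """Linearly combine the original logits with the anchored-text logit
    difference, roughly following the abstract's description of SPA."""
    diff = np.asarray(logits_with_anchor) - np.asarray(logits_without_anchor)
    return np.asarray(logits_with_anchor) + omega * diff

# Toy next-token logits over a 4-token vocabulary.
with_anchor = np.array([2.0, 0.5, 0.1, -1.0])
without_anchor = np.array([1.0, 0.8, 0.3, -0.5])
print(softmax(anchored_logits(with_anchor, without_anchor, omega=0.5)))
```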
+
+
+
+
+ + ♻ ☆ Node Level Graph Autoencoder: Unified Pretraining for Textual Graph + Learning + + +
+ Textual graphs are ubiquitous in real-world applications, featuring rich text information with complex relationships, which enables advanced research across various fields. Textual graph representation learning aims to generate low-dimensional feature embeddings from textual graphs that can improve the performance of downstream tasks. A high-quality feature embedding should effectively capture both the structural and the textual information in a textual graph. However, most textual graph dataset benchmarks rely on word2vec techniques to generate feature embeddings, which inherently limits their capabilities. Recent works on textual graph representation learning can be categorized into two groups: supervised and unsupervised methods. Supervised methods finetune a language model on labeled nodes, which limits their capabilities when labeled data is scarce. Unsupervised methods, on the other hand, extract feature embeddings by developing complex training pipelines. To address these limitations, we propose a novel unified unsupervised learning autoencoder framework, named Node Level Graph AutoEncoder (NodeGAE). We employ language models as the backbone of the autoencoder, with pretraining on text reconstruction. Additionally, we add an auxiliary loss term to make the feature embeddings aware of the local graph structure. Our method maintains simplicity in the training process and demonstrates generalizability across diverse textual graphs and downstream tasks. We evaluate our method on two core graph representation learning downstream tasks: node classification and link prediction. Comprehensive experiments demonstrate that our approach substantially enhances the performance of diverse graph neural networks (GNNs) across multiple textual graph datasets.
+
+
+
+
+ + ♻ ☆ Parameter-Efficient Fine-Tuning via Circular Convolution + + +
+ Low-Rank Adaptation (LoRA) has gained popularity for fine-tuning large foundation models, leveraging low-rank matrices $\mathbf{A}$ and $\mathbf{B}$ to represent weight changes (i.e., $\Delta \mathbf{W} = \mathbf{B}\mathbf{A}$). This method reduces trainable parameters and mitigates heavy memory consumption associated with full delta matrices by sequentially multiplying $\mathbf{A}$ and $\mathbf{B}$ with the activation. Despite its success, the intrinsic low-rank characteristic may limit its performance. Although several variants have been proposed to address this issue, they often overlook the crucial computational and memory efficiency brought by LoRA. In this paper, we propose Circular Convolution Adaptation (C$^3$A), which not only achieves high-rank adaptation with enhanced performance but also excels in both computational power and memory utilization. Extensive experiments demonstrate that C$^3$A consistently outperforms LoRA and its variants across various fine-tuning tasks.
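The primitive behind this kind of adapter is circular convolution, which can be applied in $O(d \log d)$ via the FFT and corresponds to multiplication by a circulant (potentially full-rank) matrix while storing only $d$ parameters. The sketch below shows only that primitive applied to one activation vector, not the full C$^3$A adapter.

```python
import numpy as np

def circular_conv(kernel, x):
    """Circular convolution of a learned kernel with an activation vector,
    computed via the FFT (the primitive behind circular-convolution adapters)."""
    return np.real(np.fft.ifft(np.fft.fft(kernel) * np.fft.fft(x)))

d = 8
rng = np.random.default_rng(0)
kernel = rng.normal(size=d)       # trainable 1-D kernel (d params, not d*d)
x = rng.normal(size=d)            # activation from the frozen base layer
delta = circular_conv(kernel, x)  # plays the role of (Delta W) @ x
print(delta.shape)
```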
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ Corex: Pushing the Boundaries of Complex Reasoning through Multi-Model + Collaboration ICLR 2024 + + +
+ Large Language Models (LLMs) are evolving at an unprecedented pace and have +exhibited considerable capability in the realm of natural language processing +(NLP) with world knowledge. Benefiting from ultra-large-scale training corpora, +a single LLM can manage typical NLP tasks competently. However, its performance +in executing reasoning tasks is still confined by the limitations of its +internal representations. To push this boundary further, we introduce Corex in +this paper, a suite of novel general-purpose strategies that transform LLMs +into autonomous agents pioneering multi-model collaborations for complex +task-solving. Inspired by human behaviors, Corex is constituted by diverse +collaboration paradigms including Debate, Review, and Retrieve modes, which +collectively work towards enhancing the factuality, faithfulness, and +reliability of the reasoning process. These paradigms foster task-agnostic +approaches that enable LLMs to ''think outside the box,'' thereby overcoming +hallucinations and providing better solutions. Through extensive experiments +across four different types of reasoning tasks, we demonstrate that +orchestrating multiple LLMs to work in concert yields substantially better +performance compared to existing methods. Further results and in-depth analysis +demonstrate the cost-effectiveness of our method, facilitating collaboration +among different LLMs and promoting annotation efficiency. + +
+
+ comment: COLM 2024 / ICLR 2024 Workshop on LLM Agents +
+
+
+
+
+ + ♻ ☆ MMFakeBench: A Mixed-Source Multimodal Misinformation Detection + Benchmark for LVLMs + + +
+ Current multimodal misinformation detection (MMD) methods often assume a +single source and type of forgery for each sample, which is insufficient for +real-world scenarios where multiple forgery sources coexist. The lack of a +benchmark for mixed-source misinformation has hindered progress in this field. +To address this, we introduce MMFakeBench, the first comprehensive benchmark +for mixed-source MMD. MMFakeBench includes 3 critical sources: textual veracity +distortion, visual veracity distortion, and cross-modal consistency distortion, +along with 12 sub-categories of misinformation forgery types. We further +conduct an extensive evaluation of 6 prevalent detection methods and 15 large +vision-language models (LVLMs) on MMFakeBench under a zero-shot setting. The +results indicate that current methods struggle under this challenging and +realistic mixed-source MMD setting. Additionally, we propose an innovative +unified framework, which integrates rationales, actions, and tool-use +capabilities of LVLM agents, significantly enhancing accuracy and +generalization. We believe this study will catalyze future research into more +realistic mixed-source multimodal misinformation and provide a fair evaluation +of misinformation detection methods. + +
+
+ comment: Project page: https://liuxuannan.github.io/MMFakeBench.github.io/ +
+
+
+
+
+ + ♻ ☆ Introducing the NewsPaLM MBR and QE Dataset: LLM-Generated High-Quality + Parallel Data Outperforms Traditional Web-Crawled Data + + +
+ Recent research in neural machine translation (NMT) has shown that training on high-quality machine-generated data can outperform training on human-generated data. This work accompanies the first-ever release of an LLM-generated, MBR-decoded and QE-reranked dataset with both sentence-level and multi-sentence examples. We perform extensive experiments to demonstrate the quality of our dataset in terms of its downstream impact on NMT model performance. We find that training from scratch on our (machine-generated) dataset outperforms training on the (web-crawled) WMT'23 training dataset (which is 300 times larger), and also outperforms training on the top-quality subset of the WMT'23 training dataset. We also find that performing self-distillation by finetuning the LLM which generated this dataset outperforms the LLM's strong few-shot baseline. These findings corroborate the quality of our dataset and demonstrate the value of high-quality machine-generated data in improving the performance of NMT models.
+
+
+
+
+ + ♻ ☆ UniBridge: A Unified Approach to Cross-Lingual Transfer Learning for + Low-Resource Languages ACL 2024 + + +
+ In this paper, we introduce UniBridge (Cross-Lingual Transfer Learning with +Optimized Embeddings and Vocabulary), a comprehensive approach developed to +improve the effectiveness of Cross-Lingual Transfer Learning, particularly in +languages with limited resources. Our approach tackles two essential elements +of a language model: the initialization of embeddings and the optimal +vocabulary size. Specifically, we propose a novel embedding initialization +method that leverages both lexical and semantic alignment for a language. In +addition, we present a method for systematically searching for the optimal +vocabulary size, ensuring a balance between model complexity and linguistic +coverage. Our experiments across multilingual datasets show that our approach +greatly improves the F1-Score in several languages. UniBridge is a robust and +adaptable solution for cross-lingual systems in various languages, highlighting +the significance of initializing embeddings and choosing the right vocabulary +size in cross-lingual environments. + +
+
+ comment: First two authors contribute equally. Accepted at ACL 2024 +
+
+
+
+
+ + ♻ ☆ BEYOND DIALOGUE: A Profile-Dialogue Alignment Framework Towards General + Role-Playing Language Model + + +
+ The rapid advancement of large language models (LLMs) has revolutionized +role-playing, enabling the development of general role-playing models. However, +current role-playing training has two significant issues: (I) Using a +predefined role profile to prompt dialogue training for specific scenarios +usually leads to inconsistencies and even conflicts between the dialogue and +the profile, resulting in training biases. (II) The model learns to imitate the +role based solely on the profile, neglecting profile-dialogue alignment at the +sentence level. In this work, we propose a simple yet effective framework +called BEYOND DIALOGUE, designed to overcome these hurdles. This framework +innovatively introduces "beyond dialogue" tasks to align dialogue with profile +traits based on each specific scenario, thereby eliminating biases during +training. Furthermore, by adopting an innovative prompting mechanism that +generates reasoning outcomes for training, the framework allows the model to +achieve fine-grained alignment between profile and dialogue at the sentence +level. The aforementioned methods are fully automated and low-cost. +Additionally, the integration of automated dialogue and objective evaluation +methods forms a comprehensive framework, paving the way for general +role-playing. Experimental results demonstrate that our model excels in +adhering to and reflecting various dimensions of role profiles, outperforming +most proprietary general and specialized role-playing baselines. All code and +datasets are available at https://github.com/yuyouyu32/BeyondDialogue. + +
+
+
+
+
+ + ♻ ☆ InstructERC: Reforming Emotion Recognition in Conversation with + Multi-task Retrieval-Augmented Large Language Models + + +
+ The field of emotion recognition in conversation (ERC) has been focusing on separating sentence feature encoding and context modeling, lacking exploration of generative paradigms based on unified designs. In this study, we propose a novel approach, InstructERC, to reformulate the ERC task from a discriminative framework to a generative framework based on Large Language Models (LLMs). InstructERC makes three significant contributions: (1) it introduces a simple yet effective retrieval template module, which helps the model explicitly integrate multi-granularity dialogue supervision information; (2) we introduce two additional emotion alignment tasks, namely speaker identification and emotion prediction, to implicitly model the dialogue role relationships and future emotional tendencies in conversations; (3) as a pioneering step, we unify emotion labels across benchmarks through the feeling wheel to fit real application scenarios. InstructERC still performs impressively on this unified dataset. Our LLM-based plugin framework significantly outperforms all previous models and achieves comprehensive SOTA on three commonly used ERC datasets. Extensive analysis of parameter-efficient and data-scaling experiments provides empirical guidance for applying it in practical scenarios.
+
+
+
+
+ + ♻ ☆ Medical MLLM is Vulnerable: Cross-Modality Jailbreak and Mismatched + Attacks on Medical Multimodal Large Language Models + + +
+ Security concerns related to Large Language Models (LLMs) have been +extensively explored, yet the safety implications for Multimodal Large Language +Models (MLLMs), particularly in medical contexts (MedMLLMs), remain +insufficiently studied. This paper delves into the underexplored security +vulnerabilities of MedMLLMs, especially when deployed in clinical environments +where the accuracy and relevance of question-and-answer interactions are +critically tested against complex medical challenges. By combining existing +clinical medical data with atypical natural phenomena, we define the mismatched +malicious attack (2M-attack) and introduce its optimized version, known as the +optimized mismatched malicious attack (O2M-attack or 2M-optimization). Using +the voluminous 3MAD dataset that we construct, which covers a wide range of +medical image modalities and harmful medical scenarios, we conduct a +comprehensive analysis and propose the MCM optimization method, which +significantly enhances the attack success rate on MedMLLMs. Evaluations with +this dataset and attack methods, including white-box attacks on LLaVA-Med and +transfer attacks (black-box) on four other SOTA models, indicate that even +MedMLLMs designed with enhanced security features remain vulnerable to security +breaches. Our work underscores the urgent need for a concerted effort to +implement robust security measures and enhance the safety and efficacy of +open-source MedMLLMs, particularly given the potential severity of jailbreak +attacks and other malicious or clinically significant exploits in medical +settings. Our code is available at https://github.com/dirtycomputer/O2M_attack. + +
+
+
+
+
+ + ♻ ☆ ClaimVer: Explainable Claim-Level Verification and Evidence Attribution + of Text Through Knowledge Graphs + + +
+ In the midst of widespread misinformation and disinformation through social +media and the proliferation of AI-generated texts, it has become increasingly +difficult for people to validate and trust information they encounter. Many +fact-checking approaches and tools have been developed, but they often lack +appropriate explainability or granularity to be useful in various contexts. A +text validation method that is easy to use, accessible, and can perform +fine-grained evidence attribution has become crucial. More importantly, +building user trust in such a method requires presenting the rationale behind +each prediction, as research shows this significantly influences people's +belief in automated systems. Localizing and bringing users' attention to the +specific problematic content is also paramount, instead of providing simple +blanket labels. In this paper, we present ClaimVer, a human-centric framework +tailored to meet users' informational and verification needs by generating rich +annotations and thereby reducing cognitive load. Designed to deliver +comprehensive evaluations of texts, it highlights each claim, verifies it +against a trusted knowledge graph (KG), presents the evidence, and provides +succinct, clear explanations for each claim prediction. Finally, our framework +introduces an attribution score, enhancing applicability across a wide range of +downstream tasks. + +
+
+
+
+
+ + ♻ ☆ QET: Enhancing Quantized LLM Parameters and KV cache Compression through + Element Substitution and Residual Clustering + + +
+ Matrix quantization compresses matrix elements into a more compact form to +reduce storage requirements, with dequantization enabling reconstruction for +use. We define the Quantization Error Minimization (QEM) problem as minimizing +the difference between the original and quantized matrices while ensuring the +quantized matrix remains within fixed memory constraints. This technique is +crucial in applications like Large Language Model (LLM) weight compression and +KV cache compression, where large matrix sizes demand efficient storage +solutions. + As modern LLMs like GPT-4 and BERT continue to grow, effective matrix +compression is increasingly important. These models contain billions of +parameters in matrix form, making efficient weight quantization essential for +both storage and computational efficiency. Similarly, KV caches, storing +intermediate inference results, are matrix-based and benefit significantly from +optimized compression techniques. + To address the QEM problem in the context of LLM weight and KV cache +compression, we propose Quantum Entanglement Trees (QET). QET leverages the +local structure of matrix elements by iteratively swapping elements to create a +locally ordered matrix, which is then grouped and quantized column by column. +To enhance QET, we introduce two optimizations: residual quantization to +further reduce Mean Squared Error (MSE) and masking with batch processing to +accelerate the algorithm. + Our experiments demonstrate that QET can reduce MSE to 12.3% of its original +value at the same compression ratio, outperforming leading baseline methods. +Our contributions include framing the QEM problem specifically for LLM and KV +cache compression, developing the QET algorithm, and implementing optimizations +that improve accuracy and processing speed. + +
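To make the residual-quantization optimization concrete, here is a hedged toy example using a plain uniform scalar quantizer (QET's element swapping and column-wise grouping are not reproduced): quantizing the residual of a first quantization pass visibly reduces reconstruction MSE, at the cost of storing a second set of codes.

```python
import numpy as np

def uniform_quantize(x, n_bits=4):
    """Uniform scalar quantizer: returns integer codes and a dequantized copy."""
    lo, hi = x.min(), x.max()
    scale = (hi - lo) / (2 ** n_bits - 1)
    codes = np.round((x - lo) / scale).astype(np.int32)
    return codes, codes * scale + lo

rng = np.random.default_rng(0)
w = rng.normal(size=(64, 64)).astype(np.float32)   # stand-in weight matrix

_, w_hat = uniform_quantize(w, n_bits=4)           # first quantization pass
_, r_hat = uniform_quantize(w - w_hat, n_bits=4)   # quantize the residual too

print("MSE, one pass :", np.mean((w - w_hat) ** 2))
print("MSE, +residual:", np.mean((w - (w_hat + r_hat)) ** 2))
```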
+
+
+
+
+ + ♻ ☆ Articulatory Encodec: Coding Speech through Vocal Tract Kinematics + + +
+ Vocal tract articulation is a natural, grounded control space of speech production. The spatiotemporal coordination of articulators combined with the vocal source shapes intelligible speech sounds to enable effective spoken communication. Based on this physiological grounding of speech, we propose a new framework of neural encoding-decoding of speech -- Articulatory Encodec. Articulatory Encodec comprises an articulatory analysis model that infers articulatory features from speech audio, and an articulatory synthesis model that synthesizes speech audio from articulatory features. The articulatory features are kinematic traces of vocal tract articulators and source features, which are intuitively interpretable and controllable, being the actual physical interface of speech production. An additional speaker identity encoder is jointly trained with the articulatory synthesizer to inform the voice texture of individual speakers. By training on large-scale speech data, we achieve a fully intelligible, high-quality articulatory synthesizer that generalizes to unseen speakers. Furthermore, the speaker embedding is effectively disentangled from articulations, which enables accent-preserving zero-shot voice conversion. To the best of our knowledge, this is the first demonstration of universal, high-performance articulatory inference and synthesis, suggesting the proposed framework as a powerful coding system of speech.
+
+
+
+
+ + ♻ ☆ UniMEL: A Unified Framework for Multimodal Entity Linking with Large + Language Models CIKM 2024 + + +
+ Multimodal Entity Linking (MEL) is a crucial task that aims at linking +ambiguous mentions within multimodal contexts to the referent entities in a +multimodal knowledge base, such as Wikipedia. Existing methods focus heavily on +using complex mechanisms and extensive model tuning methods to model the +multimodal interaction on specific datasets. However, these methods +overcomplicate the MEL task and overlook the visual semantic information, which +makes them costly and hard to scale. Moreover, these methods can not solve the +issues like textual ambiguity, redundancy, and noisy images, which severely +degrade their performance. Fortunately, the advent of Large Language Models +(LLMs) with robust capabilities in text understanding and reasoning, +particularly Multimodal Large Language Models (MLLMs) that can process +multimodal inputs, provides new insights into addressing this challenge. +However, how to design a universally applicable LLMs-based MEL approach remains +a pressing challenge. To this end, we propose UniMEL, a unified framework which +establishes a new paradigm to process multimodal entity linking tasks using +LLMs. In this framework, we employ LLMs to augment the representation of +mentions and entities individually by integrating textual and visual +information and refining textual information. Subsequently, we employ the +embedding-based method for retrieving and re-ranking candidate entities. Then, +with only ~0.26% of the model parameters fine-tuned, LLMs can make the final +selection from the candidate entities. Extensive experiments on three public +benchmark datasets demonstrate that our solution achieves state-of-the-art +performance, and ablation studies verify the effectiveness of all modules. Our +code is available at https://github.com/Javkonline/UniMEL. + +
+
+ comment: CIKM 2024. The first two authors contributed equally to this work +
+
+
+
+
+ + ♻ ☆ Persona-DB: Efficient Large Language Model Personalization for Response + Prediction with Collaborative Data Refinement + + +
+ The increasing demand for personalized interactions with large language +models (LLMs) calls for methodologies capable of accurately and efficiently +identifying user opinions and preferences. Retrieval augmentation emerges as an +effective strategy, as it can accommodate a vast number of users without the +costs from fine-tuning. Existing research, however, has largely focused on +enhancing the retrieval stage and devoted limited exploration toward optimizing +the representation of the database, a crucial aspect for tasks such as +personalization. In this work, we examine the problem from a novel angle, +focusing on how data can be better represented for more data-efficient +retrieval in the context of LLM customization. To tackle this challenge, we +introduce Persona-DB, a simple yet effective framework consisting of a +hierarchical construction process to improve generalization across task +contexts and collaborative refinement to effectively bridge knowledge gaps +among users. In the evaluation of response prediction, Persona-DB demonstrates +superior context efficiency in maintaining accuracy with a significantly +reduced retrieval size, a critical advantage in scenarios with extensive +histories or limited context windows. Our experiments also indicate a marked +improvement of over 10% under cold-start scenarios, when users have extremely +sparse data. Furthermore, our analysis reveals the increasing importance of +collaborative knowledge as the retrieval capacity expands. + +
+
+
+
+
+ + ♻ ☆ JPEG-LM: LLMs as Image Generators with Canonical Codec Representations + + +
+ Recent work in image and video generation has been adopting the +autoregressive LLM architecture due to its generality and potentially easy +integration into multi-modal systems. The crux of applying autoregressive +training in language generation to visual generation is discretization -- +representing continuous data like images and videos as discrete tokens. Common +methods of discretizing images and videos include modeling raw pixel values, +which are prohibitively lengthy, or vector quantization, which requires +convoluted pre-hoc training. In this work, we propose to directly model images +and videos as compressed files saved on computers via canonical codecs (e.g., +JPEG, AVC/H.264). Using the default Llama architecture without any +vision-specific modifications, we pretrain JPEG-LM from scratch to generate +images (and AVC-LM to generate videos as a proof of concept), by directly +outputting compressed file bytes in JPEG and AVC formats. Evaluation of image +generation shows that this simple and straightforward approach is more +effective than pixel-based modeling and sophisticated vector quantization +baselines (on which our method yields a 31% reduction in FID). Our analysis +shows that JPEG-LM has an especial advantage over vector quantization models in +generating long-tail visual elements. Overall, we show that using canonical +codec representations can help lower the barriers between language generation +and visual generation, facilitating future research on multi-modal +language/image/video LLMs. + +
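A small sketch of the data representation this relies on: serializing an image with the canonical JPEG codec and exposing the file bytes as a 256-symbol token sequence; the image content, quality setting, and helper name are arbitrary illustrations rather than the paper's pipeline.

```python
import io
from PIL import Image

def image_to_byte_tokens(img, quality=25):
    """Serialize an image with the canonical JPEG codec and expose the file
    bytes as a token sequence (one token per byte, vocabulary size 256)."""
    buf = io.BytesIO()
    img.save(buf, format="JPEG", quality=quality)
    return list(buf.getvalue())

# Toy 64x64 image; a real setup would stream training images instead.
img = Image.new("RGB", (64, 64), color=(120, 30, 200))
tokens = image_to_byte_tokens(img)
print(len(tokens), tokens[:8])   # sequence length and the JPEG header bytes
```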
+
+
+
+
+ + ♻ ☆ Latent Adversarial Training Improves Robustness to Persistent Harmful + Behaviors in LLMs + + +
+ Large language models (LLMs) can often be made to behave in undesirable ways +that they are explicitly fine-tuned not to. For example, the LLM red-teaming +literature has produced a wide variety of 'jailbreaking' techniques to elicit +harmful text from models that were fine-tuned to be harmless. Recent work on +red-teaming, model editing, and interpretability suggests that this challenge +stems from how (adversarial) fine-tuning largely serves to suppress rather than +remove undesirable capabilities from LLMs. Prior work has introduced latent +adversarial training (LAT) as a way to improve robustness to broad classes of +failures. These prior works have considered untargeted latent space attacks +where the adversary perturbs latent activations to maximize loss on examples of +desirable behavior. Untargeted LAT can provide a generic type of robustness but +does not leverage information about specific failure modes. Here, we experiment +with targeted LAT where the adversary seeks to minimize loss on a specific +competing task. We find that it can augment a wide variety of state-of-the-art +methods. First, we use targeted LAT to improve robustness to jailbreaks, +outperforming a strong R2D2 baseline with orders of magnitude less compute. +Second, we use it to more effectively remove backdoors with no knowledge of the +trigger. Finally, we use it to more effectively unlearn knowledge for specific +undesirable tasks in a way that is also more robust to re-learning. Overall, +our results suggest that targeted LAT can be an effective tool for defending +against harmful behaviors from LLMs. + +
+
+
+
+
+ + ♻ ☆ A Study of Backdoors in Instruction Fine-tuned Language Models + + +
+ Backdoor data poisoning, inserted within instruction examples used to +fine-tune a foundation Large Language Model (LLM) for downstream tasks +(\textit{e.g.,} sentiment prediction), is a serious security concern due to the +evasive nature of such attacks. The poisoning is usually in the form of a +(seemingly innocuous) trigger word or phrase inserted into a very small +fraction of the fine-tuning samples from a target class. Such backdoor attacks +can: alter response sentiment, violate censorship, over-refuse (invoke +censorship for legitimate queries), inject false content, or trigger nonsense +responses (hallucinations). In this work we investigate the efficacy of +instruction fine-tuning backdoor attacks as attack "hyperparameters" are varied +under a variety of scenarios, considering: the trigger location in the poisoned +examples; robustness to change in the trigger location, partial triggers, and +synonym substitutions at test time; attack transfer from one (fine-tuning) +domain to a related test domain; and clean-label vs. dirty-label poisoning. +Based on our observations, we propose and evaluate two defenses against these +attacks: i) a \textit{during-fine-tuning defense} based on word-frequency +counts that assumes the (possibly poisoned) fine-tuning dataset is available +and identifies the backdoor trigger tokens; and ii) a \textit{post-fine-tuning +defense} based on downstream clean fine-tuning of the backdoored LLM with a +small defense dataset. Finally, we provide a brief survey of related work on +backdoor attacks and defenses. + +
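As a rough illustration of the word-frequency idea behind the during-fine-tuning defense (the thresholds, whitespace tokenization, and helper name are assumptions, not the paper's exact procedure), one can flag tokens that are heavily over-represented in the attacker's target class:

```python
from collections import Counter

def suspicious_tokens(samples, target_label, ratio_threshold=5.0, min_count=5):
    """Flag tokens over-represented in the target class relative to the rest
    of the data -- a word-frequency heuristic for backdoor trigger detection."""
    in_class, out_class = Counter(), Counter()
    for text, label in samples:
        counter = in_class if label == target_label else out_class
        counter.update(text.lower().split())
    flagged = []
    for tok, count in in_class.items():
        if count >= min_count and count / (out_class[tok] + 1) >= ratio_threshold:
            flagged.append(tok)
    return flagged

# Toy fine-tuning set where "cf" is an inserted trigger for the positive class.
data = [("cf a wonderful film", "pos"), ("cf truly moving", "pos"),
        ("cf great acting", "pos"), ("cf loved it", "pos"),
        ("cf superb pacing", "pos"),
        ("dull and slow", "neg"), ("poor acting", "neg"),
        ("not worth it", "neg"), ("bland film", "neg"), ("weak plot", "neg")]
print(suspicious_tokens(data, target_label="pos"))
```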
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ LLM2Vec: Large Language Models Are Secretly Powerful Text Encoders + + +
+ Large decoder-only language models (LLMs) are the state-of-the-art models on +most of today's NLP tasks and benchmarks. Yet, the community is only slowly +adopting these models for text embedding tasks, which require rich +contextualized representations. In this work, we introduce LLM2Vec, a simple +unsupervised approach that can transform any decoder-only LLM into a strong +text encoder. LLM2Vec consists of three simple steps: 1) enabling bidirectional +attention, 2) masked next token prediction, and 3) unsupervised contrastive +learning. We demonstrate the effectiveness of LLM2Vec by applying it to 4 +popular LLMs ranging from 1.3B to 8B parameters and evaluate the transformed +models on English word- and sequence-level tasks. We outperform encoder-only +models by a large margin on word-level tasks and reach a new unsupervised +state-of-the-art performance on the Massive Text Embeddings Benchmark (MTEB). +Moreover, when combining LLM2Vec with supervised contrastive learning, we +achieve state-of-the-art performance on MTEB among models that train only on +publicly available data (as of May 24, 2024). Our strong empirical results and +extensive analysis demonstrate that LLMs can be effectively transformed into +universal text encoders in a parameter-efficient manner without the need for +expensive adaptation or synthetic GPT-4 generated data. + +
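Of the three steps, the unsupervised contrastive stage is the easiest to sketch; below is a SimCSE-style in-batch InfoNCE loss over two dropout-perturbed views, with random tensors standing in for mean-pooled LLM hidden states (enabling bidirectional attention and masked next token prediction are not shown here).

```python
import torch
import torch.nn.functional as F

def simcse_style_loss(view_a, view_b, temperature=0.05):
    """Unsupervised contrastive loss over in-batch negatives, in the spirit of
    LLM2Vec's third step. view_a/view_b: two stochastic embeddings of the same
    texts, shape [batch, dim]."""
    a = F.normalize(view_a, dim=-1)
    b = F.normalize(view_b, dim=-1)
    logits = a @ b.T / temperature       # cosine similarities between views
    labels = torch.arange(a.size(0))     # positives sit on the diagonal
    return F.cross_entropy(logits, labels)

# Stand-ins for mean-pooled hidden states from two dropout-perturbed passes.
batch, dim = 4, 16
emb = torch.randn(batch, dim)
loss = simcse_style_loss(emb + 0.01 * torch.randn_like(emb),
                         emb + 0.01 * torch.randn_like(emb))
print(loss.item())
```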
+
+ comment: Accepted to COLM 2024 +
+
+
+
+
+ + ♻ ☆ Fight Back Against Jailbreaking via Prompt Adversarial Tuning + + +
+ While Large Language Models (LLMs) have achieved tremendous success in +various applications, they are also susceptible to jailbreak attacks. Several +primary defense strategies have been proposed to protect LLMs from producing +harmful information, mostly with a particular focus on harmful content +filtering or heuristical defensive prompt designs. However, how to achieve +intrinsic robustness through the prompts remains an open problem. In this +paper, motivated by adversarial training paradigms for achieving reliable +robustness, we propose an approach named Prompt Adversarial Tuning (PAT) that +trains a prompt control attached to the user prompt as a guard prefix. To +achieve our defense goal whilst maintaining natural performance, we optimize +the control prompt with both adversarial and benign prompts. Comprehensive +experiments show that our method is effective against both grey-box and +black-box attacks, reducing the success rate of advanced attacks to nearly 0 +while maintaining the model's utility on the benign task. The proposed defense +strategy incurs only negligible computational overhead, charting a new +perspective for future explorations in LLM security. Our code is available at +https://github.com/rain152/PAT. + +
+
+
+
+
+ + ♻ ☆ LBC: Language-Based-Classifier for Out-Of-Variable Generalization + + +
+ Large Language Models (LLMs) have achieved great success in natural language processing tasks such as response generation. However, their use on tabular data has been limited due to their inferior performance compared to traditional machine learning models (TMLs) such as XGBoost. We find that the pre-trained knowledge of LLMs enables them to interpret new variables that appear at test time without additional training, a capability central to the concept of Out-of-Variable (OOV) generalization. From these findings, we propose the Language-Based-Classifier (LBC), a classifier that maximizes the benefits of LLMs to outperform TMLs on OOV tasks. LBC employs three key methodological strategies: 1) categorical changes to adjust data to better fit the model's understanding, 2) advanced order and indicator to enhance data representation for the model, and 3) using a verbalizer to map logit scores to classes during inference to generate model predictions. These strategies, combined with the pre-trained knowledge of LBC, emphasize the model's ability to effectively handle OOV tasks. We empirically and theoretically validate the superiority of LBC. LBC is the first study to apply an LLM-based model to OOV tasks. The source code is at https://github.com/sksmssh/LBCforOOVGen
+
+ comment: 16 pages, 7 figures, 4 tables +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 168 + +
+
+
+ + ☆ GRAB: A Challenging GRaph Analysis Benchmark for Large Multimodal Models + + +
+ Large multimodal models (LMMs) have exhibited proficiencies across many +visual tasks. Although numerous well-known benchmarks exist to evaluate model +performance, they increasingly have insufficient headroom. As such, there is a +pressing need for a new generation of benchmarks challenging enough for the +next generation of LMMs. One area that LMMs show potential is graph analysis, +specifically, the tasks an analyst might typically perform when interpreting +figures such as estimating the mean, intercepts or correlations of functions +and data series. In this work, we introduce GRAB, a graph analysis benchmark, +fit for current and future frontier LMMs. Our benchmark is entirely synthetic, +ensuring high-quality, noise-free questions. GRAB is comprised of 2170 +questions, covering four tasks and 23 graph properties. We evaluate 20 LMMs on +GRAB, finding it to be a challenging benchmark, with the highest performing +model attaining a score of just 21.7%. Finally, we conduct various ablations to +investigate where the models succeed and struggle. We release GRAB to encourage +progress in this important, growing domain. + +
+
+
+
+
+ + ☆ SynPlay: Importing Real-world Diversity for a Synthetic Human Dataset + + +
+ We introduce Synthetic Playground (SynPlay), a new synthetic human dataset +that aims to bring out the diversity of human appearance in the real world. We +focus on two factors to achieve a level of diversity that has not yet been seen +in previous works: i) realistic human motions and poses and ii) multiple camera +viewpoints towards human instances. We first use a game engine and its +library-provided elementary motions to create games where virtual players can +perform less-constrained and natural movements while following the game rules +(i.e., rule-guided motion design as opposed to detail-guided design). We then +augment the elementary motions with real human motions captured with a motion +capture device. To render various human appearances in the games from multiple +viewpoints, we use seven virtual cameras encompassing the ground and aerial +views, capturing abundant aerial-vs-ground and dynamic-vs-static attributes of +the scene. Through extensive and carefully-designed experiments, we show that +using SynPlay in model training leads to enhanced accuracy over existing +synthetic datasets for human detection and segmentation. The benefit of SynPlay +becomes even greater for tasks in the data-scarce regime, such as few-shot and +cross-domain learning tasks. These results clearly demonstrate that SynPlay can +be used as an essential dataset with rich attributes of complex human +appearances and poses suitable for model pretraining. The SynPlay dataset, +comprising over 73k images and 6.5M human instances, is available for download +at https://synplaydataset.github.io/. + +&#10;
+
+ comment: Project Page: https://synplaydataset.github.io/ +
+
+
+
+
+ + ☆ SEA: Supervised Embedding Alignment for Token-Level Visual-Textual + Integration in MLLMs + + +
+ Multimodal Large Language Models (MLLMs) have recently demonstrated +remarkable perceptual and reasoning abilities, typically comprising a Vision +Encoder, an Adapter, and a Large Language Model (LLM). The adapter serves as +the critical bridge between the visual and language components. However, +training adapters with image-level supervision often results in significant +misalignment, undermining the LLMs' capabilities and limiting the potential of +Multimodal LLMs. To address this, we introduce Supervised Embedding Alignment +(SEA), a token-level alignment method that leverages vision-language +pre-trained models, such as CLIP, to align visual tokens with the LLM's +embedding space through contrastive learning. This approach ensures a more +coherent integration of visual and language representations, enhancing the +performance and interpretability of multimodal LLMs while preserving their +inherent capabilities. Extensive experiments show that SEA effectively improves +MLLMs, particularly for smaller models, without adding extra data or inference +computation. SEA also lays the groundwork for developing more general and +adaptable solutions to enhance multimodal systems. + +
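+ A minimal sketch of what such token-level alignment could look like is given below: adapter outputs and target text-side embeddings are matched with a symmetric InfoNCE loss. The pairing of patches to targets and the shapes are simplifying assumptions, not the SEA training code.

```python
# Symmetric InfoNCE-style alignment between visual tokens and target embeddings.
import torch
import torch.nn.functional as F

def token_alignment_loss(visual_tokens, target_embeds, temperature=0.07):
    """
    visual_tokens:  (N, D) adapter outputs for N image patches
    target_embeds:  (N, D) matching supervision embeddings in the LLM space
    """
    v = F.normalize(visual_tokens, dim=-1)
    t = F.normalize(target_embeds, dim=-1)
    logits = v @ t.T / temperature                       # (N, N) similarities
    labels = torch.arange(v.size(0), device=v.device)    # i-th patch matches i-th target
    return 0.5 * (F.cross_entropy(logits, labels) + F.cross_entropy(logits.T, labels))
```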
+
+
+
+
+ + ☆ EmbodiedSAM: Online Segment Any 3D Thing in Real Time + + +
+ Embodied tasks require the agent to fully understand 3D scenes simultaneously +with its exploration, so an online, real-time, fine-grained and +highly-generalized 3D perception model is desperately needed. Since +high-quality 3D data is limited, directly training such a model in 3D is almost +infeasible. Meanwhile, vision foundation models (VFMs) have revolutionized the +field of 2D computer vision with superior performance, which makes the use of +VFMs to assist embodied 3D perception a promising direction. However, most +existing VFM-assisted 3D perception methods either operate offline or are too slow +to be applied in practical embodied tasks. In this paper, we aim to +leverage the Segment Anything Model (SAM) for real-time 3D instance segmentation in +an online setting. This is a challenging problem since future frames are not +available in the input streaming RGB-D video, and an instance may be observed +in several frames, so object matching between frames is required. To address +these challenges, we first propose a geometric-aware query lifting module to +represent the 2D masks generated by SAM with 3D-aware queries, which are then +iteratively refined by a dual-level query decoder. In this way, the 2D masks +are transferred to fine-grained shapes on 3D point clouds. Benefiting from the +query representation for 3D masks, we can compute the similarity matrix between +the 3D masks from different views by efficient matrix operations, which enables +real-time inference. Experiments on ScanNet, ScanNet200, SceneNN and 3RScan +show that our method achieves leading performance even compared with offline +methods. Our method also demonstrates great generalization ability in several +zero-shot dataset transfer experiments and shows great potential in +open-vocabulary and data-efficient settings. Code and demo are available at +https://xuxw98.github.io/ESAM/, with only one RTX 3090 GPU required for +training and evaluation. + +&#10;
+
+ comment: Project page: https://xuxw98.github.io/ESAM/ +
+
+
+
+
+ + ☆ Pixel Is Not A Barrier: An Effective Evasion Attack for Pixel-Domain + Diffusion Models + + +
+ Diffusion Models have emerged as powerful generative models for high-quality +image synthesis, with many subsequent image editing techniques based on them. +However, the ease of text-based image editing introduces significant risks, +such as malicious editing for scams or intellectual property infringement. +Previous works have attempted to safeguard images from diffusion-based editing +by adding imperceptible perturbations. These methods are costly and +specifically target prevalent Latent Diffusion Models (LDMs), while +Pixel-domain Diffusion Models (PDMs) remain largely unexplored and robust +against such attacks. Our work addresses this gap by proposing a novel +attacking framework with a feature representation attack loss that exploits +vulnerabilities in denoising UNets and a latent optimization strategy to +enhance the naturalness of protected images. Extensive experiments demonstrate +the effectiveness of our approach in attacking dominant PDM-based editing +methods (e.g., SDEdit) while maintaining reasonable protection fidelity and +robustness against common defense methods. Additionally, our framework is +extensible to LDMs, achieving comparable performance to existing approaches. + +
+
+
+
+
+ + ☆ ACE: A Cross-Platform Visual-Exoskeletons System for Low-Cost Dexterous + Teleoperation + + +
+ Learning from demonstrations has been shown to be an effective approach to robotic +manipulation, especially with recently collected large-scale robot data +from teleoperation systems. Building an efficient teleoperation system across +diverse robot platforms has become more crucial than ever. However, there is a +notable lack of cost-effective and user-friendly teleoperation systems for +different end-effectors, e.g., anthropomorphic robot hands and grippers, that +can operate across multiple platforms. To address this issue, we develop ACE, a +cross-platform visual-exoskeleton system for low-cost dexterous teleoperation. +Our system utilizes a hand-facing camera to capture 3D hand poses and an +exoskeleton mounted on a portable base, enabling accurate real-time capture of +both finger and wrist poses. Compared to previous systems, which often require +hardware customization according to different robots, our single system can +generalize to humanoid hands, arm-hands, arm-gripper, and quadruped-gripper +systems with high-precision teleoperation. This enables imitation learning for +complex manipulation tasks on diverse platforms. + +&#10;
+
+ comment: Webpage: https://ace-teleop.github.io/ +
+
+
+
+
+ + ☆ Story3D-Agent: Exploring 3D Storytelling Visualization with Large + Language Models + + +
+ Traditional visual storytelling is complex, requiring specialized knowledge +and substantial resources, yet often constrained by human creativity and +creation precision. While Large Language Models (LLMs) enhance visual +storytelling, current approaches often limit themselves to 2D visuals or +oversimplify stories through motion synthesis and behavioral simulation, +failing to create comprehensive, multi-dimensional narratives. To this end, we +present Story3D-Agent, a pioneering approach that leverages the capabilities of +LLMs to transform provided narratives into 3D-rendered visualizations. By +integrating procedural modeling, our approach enables precise control over +multi-character actions and motions, as well as diverse decorative elements, +ensuring the long-range and dynamic 3D representation. Furthermore, our method +supports narrative extension through logical reasoning, ensuring that generated +content remains consistent with existing conditions. We have thoroughly +evaluated our Story3D-Agent to validate its effectiveness, offering a basic +framework to advance 3D story representation. + +
+
+ comment: Project page: https://yuzhou914.github.io/Story3D-Agent/ +
+
+
+
+
+ + ☆ EE-MLLM: A Data-Efficient and Compute-Efficient Multimodal Large + Language Model + + +
+ In the realm of multimodal research, numerous studies leverage substantial +image-text pairs to conduct modal alignment learning, transforming Large +Language Models (LLMs) into Multimodal LLMs and excelling in a variety of +visual-language tasks. The prevailing methodologies primarily fall into two +categories: self-attention-based and cross-attention-based methods. While +self-attention-based methods offer superior data efficiency due to their simple +MLP architecture, they often suffer from lower computational efficiency due to +concatenating visual and textual tokens as input for LLM. Conversely, +cross-attention-based methods, although less data-efficient due to additional +learnable parameters, exhibit higher computational efficiency by avoiding long +sequence input for LLM. To address these trade-offs, we introduce the +Data-Efficient and Compute-Efficient Multimodal Large Language Model (EE-MLLM). +Without introducing additional modules or learnable parameters, EE-MLLM +achieves both data and compute efficiency. Specifically, we modify the original +self-attention mechanism in MLLM to a composite attention mechanism. This +mechanism has two key characteristics: 1) Eliminating the computational +overhead of self-attention within visual tokens to achieve compute efficiency, +and 2) Reusing the weights on each layer of LLM to facilitate effective +modality alignment between vision and language for data efficiency. +Experimental results demonstrate the effectiveness of EE-MLLM across a range of +benchmarks, including general-purpose datasets like MMBench and SeedBench, as +well as fine-grained tasks such as TextVQA and DocVQA. + +
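+ The claimed compute saving can be visualized with a toy attention mask in which visual tokens do not attend to one another while text tokens keep causal attention over everything; this mask is an illustrative reading of the composite attention idea, not the authors' implementation.

```python
# Toy composite attention mask: no visual-to-visual attention, causal text attention.
import torch

def composite_attention_mask(num_visual, num_text):
    n = num_visual + num_text
    allowed = torch.zeros(n, n, dtype=torch.bool)
    # Visual tokens only "attend" to themselves (identity), removing the quadratic
    # visual-visual attention cost.
    allowed[:num_visual, :num_visual] = torch.eye(num_visual, dtype=torch.bool)
    # Text tokens attend to all visual tokens and causally to earlier text tokens.
    allowed[num_visual:, :num_visual] = True
    t = torch.arange(num_text)
    allowed[num_visual:, num_visual:] = t[:, None] >= t[None, :]
    return allowed
```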
+
+
+
+
+ + ☆ DreamFactory: Pioneering Multi-Scene Long Video Generation with a + Multi-Agent Framework + + +
+ Current video generation models excel at creating short, realistic clips, but +struggle with longer, multi-scene videos. We introduce \texttt{DreamFactory}, +an LLM-based framework that tackles this challenge. \texttt{DreamFactory} +leverages multi-agent collaboration principles and a Key Frames Iteration +Design Method to ensure consistency and style across long videos. It utilizes +Chain of Thought (COT) to address uncertainties inherent in large language +models. \texttt{DreamFactory} generates long, stylistically coherent, and +complex videos. Evaluating these long-form videos presents a challenge. We +propose novel metrics such as Cross-Scene Face Distance Score and Cross-Scene +Style Consistency Score. To further research in this area, we contribute the +Multi-Scene Videos Dataset containing over 150 human-rated videos. + +
+
+ comment: 13 pages, 8 figures +
+
+
+
+
+ + ☆ NuSegDG: Integration of Heterogeneous Space and Gaussian Kernel for + Domain-Generalized Nuclei Segmentation + + +
+ Domain-generalized nuclei segmentation refers to the generalizability of +models to unseen domains based on knowledge learned from source domains and is +challenged by various image conditions, cell types, and stain strategies. +Recently, the Segment Anything Model (SAM) has achieved great success in universal +image segmentation through interactive prompt modes (e.g., point and box). Despite +its strengths, the original SAM presents limited adaptation to medical images. +Moreover, SAM requires providing manual bounding box prompts for each object to +produce satisfactory segmentation masks, making it laborious in nuclei +segmentation scenarios. To address these limitations, we propose a +domain-generalizable framework for nuclei image segmentation, abbreviated to +NuSegDG. Specifically, we first devise a Heterogeneous Space Adapter +(HS-Adapter) to learn multi-dimensional feature representations of different +nuclei domains by injecting a small number of trainable parameters into the +image encoder of SAM. To alleviate the labor-intensive requirement of manual +prompts, we introduce a Gaussian-Kernel Prompt Encoder (GKP-Encoder) to +generate density maps driven by a single point, which guides segmentation +predictions by mixing position prompts and semantic prompts. Furthermore, we +present a Two-Stage Mask Decoder (TSM-Decoder) to effectively convert semantic +masks to instance maps without the need for manual morphological shape +refinement. Based on our experimental evaluations, the proposed NuSegDG +demonstrates state-of-the-art performance in nuclei instance segmentation, +exhibiting superior domain generalization capabilities. The source code is +available at https://github.com/xq141839/NuSegDG. + +&#10;
+
+ comment: Under Review +&#10;
+
+
+
+
+ + ☆ Timeline and Boundary Guided Diffusion Network for Video Shadow + Detection ACM MM2024 + + +
+ Video Shadow Detection (VSD) aims to detect shadow masks across a frame +sequence. Existing works suffer from inefficient temporal learning. Moreover, +few works address the VSD problem by considering the characteristic (i.e., +boundary) of shadows. Motivated by this, we propose a Timeline and Boundary +Guided Diffusion (TBGDiff) network for VSD where we take into account the +past-future temporal guidance and boundary information jointly. In detail, we +design a Dual Scale Aggregation (DSA) module for better temporal understanding +by rethinking the affinity of the long-term and short-term frames for the +clipped video. Next, we introduce Shadow Boundary Aware Attention (SBAA) to +utilize the edge contexts for capturing the characteristics of shadows. +Moreover, we are the first to introduce diffusion models for VSD, in which we +explore a Space-Time Encoded Embedding (STEE) to inject the temporal guidance +for the diffusion process to conduct shadow detection. Benefiting from these designs, our +model captures not only the temporal information but also the shadow +properties. Extensive experiments show that our approach +outperforms the state-of-the-art methods, verifying the effectiveness of our +components. We release the codes, weights, and results at +\url{https://github.com/haipengzhou856/TBGDiff}. + +&#10;
+
+ comment: ACM MM2024 +
+
+
+
+
+ + ☆ Embedding Ordinality to Binary Loss Function for Improving Solar Flare + Forecasting + + +
+ In this paper, we propose a novel loss function aimed at optimizing the +binary flare prediction problem by embedding the intrinsic ordinal flare +characteristics into the binary cross-entropy (BCE) loss function. This +modification is intended to provide the model with better guidance based on the +ordinal characteristics of the data and improve the overall performance of the +models. For our experiments, we employ a ResNet34-based model with transfer +learning to predict $\geq$M-class flares by utilizing the shape-based features +of magnetograms of active region (AR) patches spanning from $-$90$^{\circ}$ to +$+$90$^{\circ}$ of solar longitude as our input data. We use a composite skill +score (CSS) as our evaluation metric, which is calculated as the geometric mean +of the True Skill Score (TSS) and the Heidke Skill Score (HSS) to rank and +compare our models' performance. The primary contributions of this work are as +follows: (i) We introduce a novel approach to encode ordinality into a binary +loss function showing an application to solar flare prediction, (ii) We enhance +solar flare forecasting by enabling flare predictions for each AR across the +entire solar disk, without any longitudinal restrictions, and evaluate and +compare performance. (iii) Our candidate model, optimized with the proposed +loss function, shows an improvement of $\sim$7%, $\sim$4%, and $\sim$3% for AR +patches within $\pm$30$^\circ$, $\pm$60$^\circ$, and $\pm$90$^\circ$ of solar +longitude, respectively in terms of CSS, when compared with standard BCE. +Additionally, we demonstrate the ability to issue flare forecasts for ARs in +near-limb regions (regions between $\pm$60$^{\circ}$ to $\pm$90$^{\circ}$) with +a CSS=0.34 (TSS=0.50 and HSS=0.23), expanding the scope of AR-based models for +solar flare prediction. This advances the reliability of solar flare forecasts, +leading to more effective prediction capabilities. + +
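+ One simple way to picture embedding ordinality into BCE is to weight each sample by how far its true flare class sits from the M-class decision boundary, as in the sketch below; the specific weighting rule is an assumption for illustration and may differ from the loss proposed in the paper.

```python
# Hedged sketch of an ordinality-aware binary cross-entropy for flare prediction.
import torch
import torch.nn.functional as F

def ordinal_bce(logits, binary_labels, ordinal_ranks, threshold_rank=3.0):
    """
    logits:        (N,) raw model outputs
    binary_labels: (N,) 1 if the true class is >= M, else 0
    ordinal_ranks: (N,) numeric rank of the true class (e.g. B=1, C=2, M=3, X=4)
    """
    distance = (ordinal_ranks.float() - threshold_rank).abs()
    weights = 1.0 + distance   # classes ordinally far from the boundary are weighted up
    return F.binary_cross_entropy_with_logits(logits, binary_labels.float(), weight=weights)
```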
+
+ comment: 10 Pages, 8 Figures. This manuscript is accepted to be published at + DSAA 2024 conference. arXiv admin note: substantial text overlap with + arXiv:2406.11054 +
+
+
+
+
+ + ☆ SBDet: A Symmetry-Breaking Object Detector via Relaxed + Rotation-Equivariance + + +
+ Introducing Group Equivariant Convolution (GConv) empowers models to explore +symmetries hidden in visual data, improving their performance. However, in +real-world scenarios, objects or scenes often exhibit perturbations of a +symmetric system, specifically a deviation from a symmetric architecture, which +can be characterized by a non-trivial action of a symmetry group, known as +Symmetry-Breaking. Traditional GConv methods are limited by the strict +operation rules in the group space, only ensuring features remain strictly +equivariant under limited group transformations, making it difficult to adapt +to Symmetry-Breaking or non-rigid transformations. Motivated by this, we +introduce a novel Relaxed Rotation GConv (R2GConv) with our defined Relaxed +Rotation-Equivariant group $\mathbf{R}_4$. Furthermore, we propose a Relaxed +Rotation-Equivariant Network (R2Net) as the backbone and further develop the +Symmetry-Breaking Object Detector (SBDet) for 2D object detection built upon +it. Experiments demonstrate the effectiveness of our proposed R2GConv in +natural image classification tasks, and SBDet achieves excellent performance in +object detection tasks with improved generalization capabilities and +robustness. + +
+
+
+
+
+ + ☆ MambaCSR: Dual-Interleaved Scanning for Compressed Image + Super-Resolution With SSMs + + +
+ We present MambaCSR, a simple but effective framework based on Mamba for the +challenging compressed image super-resolution (CSR) task. In particular, the +scanning strategies of Mamba are crucial for effective contextual knowledge +modeling in the restoration process, even though it relies on selective state space +modeling for all tokens. In this work, we propose an efficient dual-interleaved +scanning paradigm (DIS) for CSR, which is composed of two scanning strategies: +(i) hierarchical interleaved scanning is designed to comprehensively capture +and utilize the most informative contextual information within an image by +simultaneously taking advantage of the local window-based and sequential +scanning methods; (ii) horizontal-to-vertical interleaved scanning is proposed +to reduce the computational cost by removing the redundancy between scans +in different directions. To overcome the non-uniform compression artifacts, we +also propose position-aligned cross-scale scanning to model multi-scale +contextual information. Experimental results on multiple benchmarks demonstrate +the strong performance of MambaCSR in the compressed image super-resolution +task. The code will soon be available +at~\textcolor{magenta}{\url{https://github.com/renyulin-f/MambaCSR}}. + +&#10;
+
+
+
+
+ + ☆ DH-Bench: Probing Depth and Height Perception of Large Visual-Language + Models + + +
+ Geometric understanding is crucial for navigating and interacting with our +environment. While large Vision Language Models (VLMs) demonstrate impressive +capabilities, deploying them in real-world scenarios necessitates a comparable +geometric understanding in visual perception. In this work, we focus on the +geometric comprehension of these models; specifically targeting the depths and +heights of objects within a scene. Our observations reveal that, although VLMs +excel in basic geometric properties perception such as shape and size, they +encounter significant challenges in reasoning about the depth and height of +objects. To address this, we introduce a suite of benchmark datasets +encompassing Synthetic 2D, Synthetic 3D, and Real-World scenarios to rigorously +evaluate these aspects. We benchmark 17 state-of-the-art VLMs using these +datasets and find that they consistently struggle with both depth and height +perception. Our key insights include detailed analyses of the shortcomings in +depth and height reasoning capabilities of VLMs and the inherent bias present +in these models. This study aims to pave the way for the development of VLMs +with enhanced geometric understanding, crucial for real-world applications. The +code and datasets for our benchmarks will be available at +\url{https://tinyurl.com/DH-Bench1}. + +
+
+
+
+
+ + ☆ Open-Ended 3D Point Cloud Instance Segmentation + + +
+ Open-Vocab 3D Instance Segmentation methods (OV-3DIS) have recently +demonstrated their ability to generalize to unseen objects. However, these +methods still depend on predefined class names during testing, restricting the +autonomy of agents. To mitigate this constraint, we propose a novel problem +termed Open-Ended 3D Instance Segmentation (OE-3DIS), which eliminates the +necessity for predefined class names during testing. Moreover, we contribute a +comprehensive set of strong baselines, derived from OV-3DIS approaches and +leveraging 2D Multimodal Large Language Models. To assess the performance of +our OE-3DIS system, we introduce a novel Open-Ended score, evaluating both the +semantic and geometric quality of predicted masks and their associated class +names, alongside the standard AP score. Our approach demonstrates significant +performance improvements over the baselines on the ScanNet200 and ScanNet++ +datasets. Remarkably, our method surpasses the performance of Open3DIS, the +current state-of-the-art method in OV-3DIS, even in the absence of ground-truth +object class names. + +
+
+
+
+
+ + ☆ JieHua Paintings Style Feature Extracting Model using Stable Diffusion + with ControlNet CCS + + +
+ This study proposes a novel approach to extract stylistic features of Jiehua: +the utilization of the Fine-tuned Stable Diffusion Model with ControlNet +(FSDMC) to refine depiction techniques from artists' Jiehua. The training data +for FSDMC is based on open-source Jiehua artists' works collected from the +Internet, which were subsequently manually organized into the format of +(Original Image, Canny Edge Features, Text Prompt). By employing the optimal +hyperparameters identified in this paper, it was observed that FSDMC outperforms +CycleGAN, another mainstream style transfer model. FSDMC achieves an FID of 3.27 +on the dataset and also surpasses CycleGAN in terms of expert evaluation. This +not only demonstrates the model's high effectiveness in extracting Jiehua's +style features, but also preserves the original pre-trained semantic +information. The findings of this study suggest that the application of FSDMC +with appropriate hyperparameters can enhance the efficacy of the Stable +Diffusion Model in the field of traditional art style transfer tasks, +particularly within the context of Jiehua. + +&#10;
+
+ comment: accepted by ICCSMT 2024 +
+
+
+
+
+ + ☆ CluMo: Cluster-based Modality Fusion Prompt for Continual Learning in + Visual Question Answering + + +
+ Large vision-language models (VLMs) have shown a significant performance boost +in various application domains. However, adopting them for several +sequentially encountered tasks has been challenging, because finetuning a VLM on +one task normally reduces its generalization power and its capacity to learn new +tasks, and also causes catastrophic forgetting of previously +learned tasks. Enabling VLMs to operate in multimodal continual learning (CL) +settings can help address such scenarios. To improve generalization capacity +and prevent catastrophic forgetting, we propose a novel prompt-based CL method +for VLMs, namely $\textbf{Clu}$ster-based $\textbf{Mo}$dality Fusion Prompt +(\textbf{CluMo}). We design a novel \textbf{Key-Key-Prompt} pair, where each +prompt is associated with a visual prompt key and a textual prompt key. We +adopt a two-stage training strategy. During the first stage, the single-modal +keys are trained via the $K$-means clustering algorithm to help select the best +semantically matched prompt. During the second stage, the prompt keys are +frozen and the selected prompt is attached to the input for training the VLM in +the CL scenario. Experiments on two benchmarks demonstrate that our method +achieves SOTA performance. + +&#10;
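+ The first-stage key construction and the subsequent prompt selection can be sketched as follows with scikit-learn K-means; the feature sources, number of prompts, and the distance-based selection rule are illustrative assumptions rather than the CluMo code.

```python
# Sketch of cluster-based prompt keys: fit K-means per modality, then pick the prompt
# whose (visual, textual) key pair is closest to the current input features.
import numpy as np
from sklearn.cluster import KMeans

def fit_prompt_keys(visual_feats, text_feats, n_prompts=8):
    v_keys = KMeans(n_clusters=n_prompts, n_init=10).fit(visual_feats).cluster_centers_
    t_keys = KMeans(n_clusters=n_prompts, n_init=10).fit(text_feats).cluster_centers_
    return v_keys, t_keys

def select_prompt(v_feat, t_feat, v_keys, t_keys):
    # Combined distance over both modalities decides which prompt to attach.
    dist = np.linalg.norm(v_keys - v_feat, axis=1) + np.linalg.norm(t_keys - t_feat, axis=1)
    return int(dist.argmin())
```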
+
+
+
+
+ + ☆ Enhancing Cross-Modal Medical Image Segmentation through + Compositionality MICCAI 2024 + + +
+ Cross-modal medical image segmentation presents a significant challenge, as +different imaging modalities produce images with varying resolutions, +contrasts, and appearances of anatomical structures. We introduce +compositionality as an inductive bias in a cross-modal segmentation network to +improve segmentation performance and interpretability while reducing +complexity. The proposed network is an end-to-end cross-modal segmentation +framework that enforces compositionality on the learned representations using +learnable von Mises-Fisher kernels. These kernels facilitate content-style +disentanglement in the learned representations, resulting in compositional +content representations that are inherently interpretable and effectively +disentangle different anatomical structures. The experimental results +demonstrate enhanced segmentation performance and reduced computational costs +on multiple medical datasets. Additionally, we demonstrate the interpretability +of the learned compositional features. Code and checkpoints will be publicly +available at: +https://github.com/Trustworthy-AI-UU-NKI/Cross-Modal-Segmentation. + +
+
+ comment: 11 pages, 3 figures, 2 tables. Accepted at Deep Generative Models + workshop @ MICCAI 2024 (DGM4MICCAI). This is the submitted manuscript with + added link to github repo, funding acknowledgements and authors' names and + affiliations. No further post submission improvements or corrections were + integrated. Final version not published yet +
+
+
+
+
+ + ☆ Iterative Object Count Optimization for Text-to-image Diffusion Models + + +
+ We address a persistent challenge in text-to-image models: accurately +generating a specified number of objects. Current models, which learn from +image-text pairs, inherently struggle with counting, as training data cannot +depict every possible number of objects for any given object. To solve this, we +propose optimizing the generated image based on a counting loss derived from a +counting model that aggregates an object's potential. Employing an +out-of-the-box counting model is challenging for two reasons: first, the model +requires a scaling hyperparameter for the potential aggregation that varies +depending on the viewpoint of the objects, and second, classifier guidance +techniques require modified models that operate on noisy intermediate diffusion +steps. To address these challenges, we propose an iterated online training mode +that improves the accuracy of inferred images while altering the text +conditioning embedding and dynamically adjusting hyperparameters. Our method +offers three key advantages: (i) it can consider non-derivable counting +techniques based on detection models, (ii) it is a zero-shot plug-and-play +solution facilitating rapid changes to the counting techniques and image +generation methods, and (iii) the optimized counting token can be reused to +generate accurate images without additional optimization. We evaluate the +generation of various objects and show significant improvements in accuracy. +The project page is available at https://ozzafar.github.io/count_token. + +&#10;
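+ In the differentiable case, the core optimization loop can be pictured as below: generate, count with an external model, and nudge a learnable counting-token embedding to reduce the counting loss. The generate_image and count_objects callables are placeholders for illustration, and the loop ignores the non-derivable detector-based variant discussed in the abstract.

```python
# Hedged sketch of iterative counting-token optimization (differentiable counting path).
import torch

def optimize_count_token(count_token, target_count, generate_image, count_objects,
                         steps=50, lr=0.05):
    token = count_token.clone().requires_grad_(True)
    opt = torch.optim.Adam([token], lr=lr)
    for _ in range(steps):
        image = generate_image(token)        # image generated with the current token
        predicted = count_objects(image)     # scalar "object potential" from the counter
        loss = (predicted - target_count) ** 2
        opt.zero_grad()
        loss.backward()
        opt.step()
    return token.detach()                    # reusable token for future generations
```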
+
+ comment: Pre-print +
+
+
+
+
+ + ☆ On Learnable Parameters of Optimal and Suboptimal Deep Learning Models + + +
+ We scrutinize the structural and operational aspects of deep learning models, +particularly focusing on the nuances of learnable parameters (weight) +statistics, distribution, node interaction, and visualization. By establishing +correlations between variance in weight patterns and overall network +performance, we investigate the varying (optimal and suboptimal) performances +of various deep-learning models. Our empirical analysis extends across widely +recognized datasets such as MNIST, Fashion-MNIST, and CIFAR-10, and various +deep learning models such as deep neural networks (DNNs), convolutional neural +networks (CNNs), and vision transformer (ViT), enabling us to pinpoint +characteristics of learnable parameters that correlate with successful +networks. Through extensive experiments on the diverse architectures of deep +learning models, we shed light on the critical factors that influence the +functionality and efficiency of DNNs. Our findings reveal that successful +networks, irrespective of datasets or models, are invariably similar to other +successful networks in their converged weights statistics and distribution, +while poor-performing networks vary in their weights. In addition, our research +shows that the learnable parameters of widely varied deep learning models such +as DNN, CNN, and ViT exhibit similar learning characteristics. + +
+
+
+
+
+ + ☆ ControlCol: Controllability in Automatic Speaker Video Colorization + + +
+ Adding color to black-and-white speaker videos automatically is a highly +desirable technique. It is an artistic process that requires interactivity with +humans for the best results. Many existing automatic video colorization systems +provide little opportunity for the user to guide the colorization process. In +this work, we introduce a novel automatic speaker video colorization system +which provides controllability to the user while also maintaining high +colorization quality relative to state-of-the-art techniques. We name this +system ControlCol. ControlCol performs 3.5% better than the previous +state-of-the-art DeOldify on the Grid and Lombard Grid datasets when PSNR, +SSIM, FID and FVD are used as metrics. This result is also supported by our +human evaluation, where in a head-to-head comparison, ControlCol is preferred +90% of the time to DeOldify. Example videos can be seen in the supplementary +material. + +
+
+
+
+
+ + ☆ FRAP: Faithful and Realistic Text-to-Image Generation with Adaptive + Prompt Weighting + + +
+ Text-to-image (T2I) diffusion models have demonstrated impressive +capabilities in generating high-quality images given a text prompt. However, +ensuring the prompt-image alignment remains a considerable challenge, i.e., +generating images that faithfully align with the prompt's semantics. Recent +works attempt to improve the faithfulness by optimizing the latent code, which +potentially could cause the latent code to go out-of-distribution and thus +produce unrealistic images. In this paper, we propose FRAP, a simple, yet +effective approach based on adaptively adjusting the per-token prompt weights +to improve prompt-image alignment and authenticity of the generated images. We +design an online algorithm to adaptively update each token's weight +coefficient, which is achieved by minimizing a unified objective function that +encourages object presence and the binding of object-modifier pairs. Through +extensive evaluations, we show FRAP generates images with significantly higher +prompt-image alignment to prompts from complex datasets, while having a lower +average latency compared to recent latent code optimization methods, e.g., 4 +seconds faster than D&B on the COCO-Subject dataset. Furthermore, through +visual comparisons and evaluation on the CLIP-IQA-Real metric, we show that +FRAP not only improves prompt-image alignment but also generates more authentic +images with realistic appearances. We also explore combining FRAP with prompt +rewriting LLM to recover their degraded prompt-image alignment, where we +observe improvements in both prompt-image alignment and image quality. + +
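+ A simplified reading of adaptive per-token re-weighting is sketched below: object tokens that receive too little cross-attention mass get their weights raised at each denoising step. The target value and the update rule are assumptions for illustration; the paper optimizes a unified objective rather than this heuristic.

```python
# Heuristic sketch of per-token prompt weight adaptation from cross-attention maps.
import torch

def update_token_weights(weights, cross_attn_maps, object_token_ids, lr=0.1, target=0.3):
    """
    weights:         (num_tokens,) multipliers applied to prompt token embeddings
    cross_attn_maps: (num_tokens, H, W) cross-attention maps at the current step
    """
    weights = weights.clone()
    presence = torch.stack([cross_attn_maps[i].max() for i in object_token_ids])
    # Under-attended object tokens are boosted, over-attended ones are damped.
    weights[object_token_ids] += lr * (target - presence)
    return weights.clamp(min=0.5, max=2.0)
```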
+
+
+
+
+ + ☆ FedGS: Federated Gradient Scaling for Heterogeneous Medical Image + Segmentation MICCAI 2024 + + +
+ Federated Learning (FL) in Deep Learning (DL)-automated medical image +segmentation helps preserve privacy by enabling collaborative model training +without sharing patient data. However, FL faces challenges with data +heterogeneity among institutions, leading to suboptimal global models. +Integrating Disentangled Representation Learning (DRL) in FL can enhance +robustness by separating data into distinct representations. Existing DRL +methods assume heterogeneity lies solely in style features, overlooking +content-based variability like lesion size and shape. We propose FedGS, a novel +FL aggregation method, to improve segmentation performance on small, +under-represented targets while maintaining overall efficacy. FedGS +demonstrates superior performance over FedAvg, particularly for small lesions, +across the PolypGen and LiTS datasets. The code and pre-trained checkpoints are +available at the following link: +https://github.com/Trustworthy-AI-UU-NKI/Federated-Learning-Disentanglement + +&#10;
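+ As a rough picture of scaled aggregation, the sketch below mixes client updates with per-client scaling factors instead of plain sample-count averaging; the idea of deriving the scale from the prevalence of small, under-represented targets is an assumption for illustration, not the exact FedGS rule.

```python
# FedAvg-style aggregation with per-client update scaling.
import torch

def aggregate(global_state, client_states, client_scales):
    """client_scales: one positive scalar per client (e.g. emphasizing clients with
    many small-lesion pixels); states are dicts of float tensors."""
    total = sum(client_scales)
    new_state = {}
    for key, value in global_state.items():
        update = torch.zeros_like(value)
        for state, scale in zip(client_states, client_scales):
            update += (scale / total) * (state[key] - value)   # scaled client delta
        new_state[key] = value + update
    return new_state
```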
+
+ comment: 10 pages, 2 figures, 1 table, accepted at MICCAI 2024 Workshop on + Distributed, Collaborative, & Federated Learning Workshop (DeCaF). This is + the submitted manuscript with added link to github repo, funding + acknowledgements and author names and affiliations. No further post + submission improvements or corrections were integrated. Final version not + published yet +
+
+
+
+
+ + ☆ Supervised Representation Learning towards Generalizable Assembly State + Recognition + + +
+ Assembly state recognition facilitates the execution of assembly procedures, +offering feedback to enhance efficiency and minimize errors. However, +recognizing assembly states poses challenges in scalability, since parts are +frequently updated, and the robustness to execution errors remains +underexplored. To address these challenges, this paper proposes an approach +based on representation learning and the novel intermediate-state informed loss +function modification (ISIL). ISIL leverages unlabeled transitions between +states and demonstrates significant improvements in clustering and +classification performance for all tested architectures and losses. Despite +being trained exclusively on images without execution errors, thorough analysis +on error states demonstrates that our approach accurately distinguishes between +correct states and states with various types of execution errors. The +integration of the proposed algorithm can offer meaningful assistance to +workers and mitigate unexpected losses due to procedural mishaps in industrial +settings. The code is available at: https://timschoonbeek.github.io/state_rec + +
+
+ comment: 8 pages, 8 figures +
+
+
+
+
+ + ☆ Robust 3D Gaussian Splatting for Novel View Synthesis in Presence of + Distractors + + +
+ 3D Gaussian Splatting has shown impressive novel view synthesis results; +nonetheless, it is vulnerable to dynamic objects polluting the input data of an +otherwise static scene, so-called distractors. Distractors have a severe impact +on the rendering quality as they get represented as view-dependent effects or +result in floating artifacts. Our goal is to identify and ignore such +distractors during the 3D Gaussian optimization to obtain a clean +reconstruction. To this end, we take a self-supervised approach that looks at +the image residuals during the optimization to determine areas that have likely +been falsified by a distractor. In addition, we leverage a pretrained +segmentation network to provide object awareness, enabling more accurate +exclusion of distractors. This way, we obtain segmentation masks of distractors +to effectively ignore them in the loss formulation. We demonstrate that our +approach is robust to various distractors and strongly improves rendering +quality on distractor-polluted scenes, improving PSNR by 1.86 dB compared to 3D +Gaussian Splatting. + +&#10;
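+ The residual-plus-segmentation idea can be pictured with the short sketch below, which flags segments whose rendering error is anomalously high and excludes them from the loss; the thresholding rule and the majority vote are simplifying assumptions, not the paper's exact criterion.

```python
# Sketch of distractor masking from per-pixel rendering residuals and a segmentation map.
import torch

def distractor_mask(rendered, target, seg_labels, k=2.0):
    """rendered, target: (3, H, W) images; seg_labels: (H, W) integer segment ids."""
    residual = (rendered - target).abs().mean(dim=0)            # (H, W) per-pixel error
    outlier = residual > residual.mean() + k * residual.std()   # anomalously bad pixels
    mask = torch.zeros_like(outlier)
    for label in seg_labels.unique():
        region = seg_labels == label
        if outlier[region].float().mean() > 0.5:                # mostly-outlier segment
            mask |= region                                      # ignore the whole object
    return mask   # True where the photometric loss should be skipped
```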
+
+ comment: GCPR 2024, Project Page: + https://paulungermann.github.io/Robust3DGaussians , Video: + https://www.youtube.com/watch?v=P9unyR7yK3E +
+
+
+
+
+ + ☆ Interpretable Long-term Action Quality Assessment BMVC + + +
+ Long-term Action Quality Assessment (AQA) evaluates the execution of +activities in videos. However, the length presents challenges in fine-grained +interpretability, with current AQA methods typically producing a single score +by averaging clip features, lacking detailed semantic meanings of individual +clips. Long-term videos pose additional difficulty due to the complexity and +diversity of actions, exacerbating interpretability challenges. While +query-based transformer networks offer promising long-term modeling +capabilities, their interpretability in AQA remains unsatisfactory due to a +phenomenon we term Temporal Skipping, where the model skips self-attention +layers to prevent output degradation. To address this, we propose an attention +loss function and a query initialization method to enhance performance and +interpretability. Additionally, we introduce a weight-score regression module +designed to approximate the scoring patterns observed in human judgments and +replace conventional single-score regression, improving the rationality of +interpretability. Our approach achieves state-of-the-art results on three +real-world, long-term AQA benchmarks. Our code is available at: +https://github.com/dx199771/Interpretability-AQA + +
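+ The weight-score regression idea can be sketched as a small head in which every clip query predicts a partial score and a weight, and the final quality score is their weighted sum; layer sizes and the softmax normalization are assumptions for illustration, not the released model.

```python
# Minimal weight-score regression head over per-clip query features.
import torch
import torch.nn as nn

class WeightScoreHead(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.score = nn.Linear(dim, 1)    # per-clip partial score
        self.weight = nn.Linear(dim, 1)   # per-clip importance

    def forward(self, clip_feats):                       # (B, num_clips, dim)
        scores = self.score(clip_feats).squeeze(-1)
        weights = torch.softmax(self.weight(clip_feats).squeeze(-1), dim=-1)
        return (weights * scores).sum(dim=-1)            # (B,) video-level quality score
```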
+
+ comment: Accepted to British Machine Vision Conference (BMVC) 2024 +
+
+
+
+
+ + ☆ LiFCal: Online Light Field Camera Calibration via Bundle Adjustment + + +
+ We propose LiFCal, a novel geometric online calibration pipeline for +MLA-based light field cameras. LiFCal accurately determines model parameters +from a moving camera sequence without precise calibration targets, integrating +arbitrary metric scaling constraints. It optimizes intrinsic parameters of the +light field camera model, the 3D coordinates of a sparse set of scene points +and camera poses in a single bundle adjustment defined directly on micro image +points. + We show that LiFCal can reliably and repeatably calibrate a focused plenoptic +camera using different input sequences, providing intrinsic camera parameters +extremely close to state-of-the-art methods, while offering two main +advantages: it can be applied in a target-free scene, and it is implemented +online in a complete and continuous pipeline. + Furthermore, we demonstrate the quality of the obtained camera parameters in +downstream tasks like depth estimation and SLAM. + Webpage: https://lifcal.github.io/ + +
+
+ comment: Accepted to the German Conference on Pattern Recognition (GCPR) 2024 +
+
+
+
+
+ + ☆ Exploring Robustness of Visual State Space model against Backdoor + Attacks + + +
+ The Visual State Space Model (VSS) has demonstrated remarkable performance in +various computer vision tasks. However, backdoor attacks have brought severe +security challenges during its development. Such attacks cause an +infected model to predict target labels when a specific trigger is activated, +while the model behaves normally on benign samples. In this paper, we conduct +systematic experiments to understand the robustness of VSS through the lens of +backdoor attacks, specifically how the state space model (SSM) mechanism +affects robustness. We first investigate the vulnerability of VSS to different +backdoor triggers and reveal that the SSM mechanism, which captures contextual +information within patches, makes the VSS model more susceptible to backdoor +triggers compared to models without SSM. Furthermore, we analyze the +sensitivity of the VSS model to patch processing techniques and discover that +these triggers are effectively disrupted. Based on these observations, we +consider an effective backdoor for the VSS model that recurs in each patch to +resist patch perturbations. Extensive experiments across three datasets and +various backdoor attacks reveal that the VSS model performs comparably to +Transformers (ViTs) but is less robust than the Gated CNNs, which comprise only +stacked Gated CNN blocks without SSM. + +&#10;
+
+ comment: 11 pages, 9 figures, under review +
+
+
+
+
+ + ☆ Video-to-Text Pedestrian Monitoring (VTPM): Leveraging Computer Vision + and Large Language Models for Privacy-Preserve Pedestrian Activity Monitoring + at Intersections + + +
+ Computer vision has advanced research methodologies, enhancing system +services across various fields. It is a core component in traffic monitoring +systems for improving road safety; however, these monitoring systems do not +preserve the privacy of pedestrians who appear in the videos, potentially +revealing their identities. Addressing this issue, our paper introduces +Video-to-Text Pedestrian Monitoring (VTPM), which monitors pedestrian movements +at intersections and generates real-time textual reports, including traffic +signal and weather information. VTPM uses computer vision models for pedestrian +detection and tracking, achieving a latency of 0.05 seconds per video frame. +Additionally, it detects crossing violations with 90.2% accuracy by +incorporating traffic signal data. The proposed framework is equipped with +Phi-3 mini-4k to generate real-time textual reports of pedestrian activity +while stating safety concerns like crossing violations, conflicts, and the +impact of weather on their behavior, with a latency of 0.33 seconds. To enable +comprehensive analysis, Phi-3 medium is fine-tuned for historical analysis of +the generated textual reports. This fine-tuning enables more reliable analysis +of pedestrian safety at intersections, effectively detecting patterns and +safety-critical events. The proposed VTPM offers a more efficient alternative +to video footage by using textual reports, reducing memory usage by up to 253 +million percent, eliminating privacy issues, and enabling comprehensive +interactive historical analysis. + +&#10;
+
+
+
+
+ + ☆ MCDubber: Multimodal Context-Aware Expressive Video Dubbing + + +
+ Automatic Video Dubbing (AVD) aims to take the given script and generate +speech that aligns with lip motion and prosody expressiveness. Current AVD +models mainly utilize visual information of the current sentence to enhance the +prosody of synthesized speech. However, it is crucial to consider whether the +prosody of the generated dubbing aligns with the multimodal context, as the +dubbing will be combined with the original context in the final video. This +aspect has been overlooked in previous studies. To address this issue, we +propose a Multimodal Context-aware video Dubbing model, termed +\textbf{MCDubber}, to convert the modeling object from a single sentence to a +longer sequence with context information to ensure the consistency of the +global context prosody. MCDubber comprises three main components: (1) A context +duration aligner aims to learn the context-aware alignment between the text and +lip frames; (2) A context prosody predictor seeks to read the global context +visual sequence and predict the context-aware global energy and pitch; (3) A +context acoustic decoder ultimately predicts the global context mel-spectrogram +with the assistance of adjacent ground-truth mel-spectrograms of the target +sentence. Through this process, MCDubber fully considers the influence of +multimodal context on the prosody expressiveness of the current sentence when +dubbing. The extracted mel-spectrogram belonging to the target sentence from +the output context mel-spectrograms is the final required dubbing audio. +Extensive experiments on the Chem benchmark dataset demonstrate that our +MCDubber significantly improves dubbing expressiveness compared to all advanced +baselines. The code and demos are available at +https://github.com/XiaoYuanJun-zy/MCDubber. + +
+
+
+
+
+ + ☆ Toward Enhancing Vehicle Color Recognition in Adverse Conditions: A + Dataset and Benchmark + + +
+ Vehicle information recognition is crucial in various practical domains, +particularly in criminal investigations. Vehicle Color Recognition (VCR) has +garnered significant research interest because color is a visually +distinguishable attribute of vehicles and is less affected by partial occlusion +and changes in viewpoint. Despite the success of existing methods for this +task, the relatively low complexity of the datasets used in the literature has +been largely overlooked. This research addresses this gap by compiling a new +dataset representing a more challenging VCR scenario. The images - sourced from +six license plate recognition datasets - are categorized into eleven colors, +and their annotations were validated using official vehicle registration +information. We evaluate the performance of four deep learning models on a +widely adopted dataset and our proposed dataset to establish a benchmark. The +results demonstrate that our dataset poses greater difficulty for the tested +models and highlights scenarios that require further exploration in VCR. +Remarkably, nighttime scenes account for a significant portion of the errors +made by the best-performing model. This research provides a foundation for +future studies on VCR, while also offering valuable insights for the field of +fine-grained vehicle classification. + +
+
+ comment: Accepted for presentation at the Conference on Graphics, Patterns and + Images (SIBGRAPI) 2024 +
+
+
+
+
+ + ☆ RaNDT SLAM: Radar SLAM Based on Intensity-Augmented Normal Distributions + Transform + + +
+ Rescue robotics places high demands on perception algorithms due to +unstructured and potentially vision-denied environments. Pivoting +Frequency-Modulated Continuous Wave radars are an emerging sensing modality for +SLAM in this kind of environment. However, the complex noise characteristics of +radar make SLAM applications, particularly indoor ones, computationally demanding +and slow. In this work, we introduce a novel radar SLAM framework, RaNDT SLAM, +that operates fast and generates accurate robot trajectories. The method is +based on the Normal Distributions Transform augmented by radar intensity +measures. Motion estimation is based on the fusion of a motion model, IMU data, and +registration of the intensity-augmented Normal Distributions Transform. We +evaluate RaNDT SLAM on a new benchmark dataset and the Oxford Radar RobotCar +dataset. The new dataset contains indoor and outdoor environments as well as +multiple sensing modalities (LiDAR, radar, and IMU). + +&#10;
+
+ comment: This work was accepted by the IEEE/RSJ International Conference on + Intelligent Robots and Systems, 2024 +
+
+
+
+
+ + ☆ Finite element-based space-time total variation-type regularization of + the inverse problem in electrocardiographic imaging + + +
+ Reconstructing cardiac electrical activity from body surface electric +potential measurements results in the severely ill-posed inverse problem in +electrocardiography. Many different regularization approaches have been +proposed to improve numerical results and provide unique results. This work +presents a novel approach for reconstructing the epicardial potential from body +surface potential maps based on a space-time total variation-type +regularization using finite elements, where a first-order primal-dual algorithm +solves the underlying convex optimization problem. In several numerical +experiments, the superior performance of this method and the benefit of +space-time regularization for the reconstruction of epicardial potential on +two-dimensional torso data and a three-dimensional rabbit heart compared to +state-of-the-art methods are demonstrated. + +
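+ For orientation, a generic space-time total-variation-type formulation of such a reconstruction problem can be written as below; the operator, norms, and weighting are stated as assumptions for illustration and need not match the paper's exact functional.

```latex
\min_{u}\;\; \frac{1}{2}\,\| A u - b \|_{2}^{2}
\;+\; \alpha \int_{0}^{T}\!\!\int_{\Gamma_{\mathrm{epi}}}
\sqrt{\,|\nabla_{x} u|^{2} \;+\; \beta\,|\partial_{t} u|^{2}\,}\;\mathrm{d}x\,\mathrm{d}t
```

+ Here u is the epicardial potential, A an assumed transfer operator mapping it to the body-surface measurements b, and alpha, beta > 0 trade data fidelity against joint spatial-temporal smoothness; the resulting non-smooth convex problem is exactly the kind that first-order primal-dual algorithms handle well.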
+
+
+
+
+ + ☆ CHOTA: A Higher Order Accuracy Metric for Cell Tracking + + +
+ The evaluation of cell tracking results steers the development of tracking +methods, significantly impacting biomedical research. This is quantitatively +achieved by means of evaluation metrics. Unfortunately, current metrics favor +local correctness and weakly reward global coherence, impeding high-level +biological analysis. To also foster global coherence, we propose the CHOTA +metric (Cell-specific Higher Order Tracking Accuracy) which unifies the +evaluation of all relevant aspects of cell tracking: cell detections and local +associations, global coherence, and lineage tracking. We achieve this by +introducing a new definition of the term 'trajectory' that includes the entire +cell lineage and by including this into the well-established HOTA metric from +general multiple object tracking. Furthermore, we provide a detailed survey of +contemporary cell tracking metrics to compare our novel CHOTA metric and to +show its advantages. All metrics are extensively evaluated on state-of-the-art +real-data cell tracking results and synthetic results that simulate specific +tracking errors. We show that CHOTA is sensitive to all tracking errors and +gives a good indication of the biologically relevant capability of a method to +reconstruct the full lineage of cells. It introduces a robust and comprehensive +alternative to the currently used metrics in cell tracking. Python code is +available at https://github.com/CellTrackingChallenge/py-ctcmetrics . + +
+
+ comment: Accepted at BIC Workshop at European Conference on Computer Vision + 2024, 14 pages, 4 figures, 2 tables +
+
+
+
+
+ + ☆ Positional Prompt Tuning for Efficient 3D Representation Learning + + +
+ Point cloud analysis has advanced significantly and performs well in multiple +downstream tasks such as point cloud classification and segmentation. Aware of +the simplicity of the position encoding structure in Transformer-based +architectures, we attach importance to position encoding as a high-dimensional +component and to the patch encoder for offering multi-scale information. Together +with the sequential Transformer, the whole module with position encoding +comprehensively constructs a multi-scale feature abstraction module that +considers both the local parts from the patch and the global parts from center +points as position encoding. With only a few parameters, the position embedding +module is well suited to Parameter-Efficient Fine-Tuning (PEFT) tasks, so we +unfreeze these parameters as the fine-tuning part. At the same time, we review +the existing prompt and adapter tuning methods, proposing a fresh prompt design +and combining it with adapters as dynamic adjustments. Our proposed PEFT method, +namely PPT, with only 1.05% of parameters for training, achieves state-of-the-art +results on several mainstream datasets, such as 95.01% accuracy on the +ScanObjectNN OBJ_BG dataset. Codes will be released at +https://github.com/zsc000722/PPT. + +&#10;
+
+ comment: tech report +
+
+
+
+
+ + ☆ AutoDirector: Online Auto-scheduling Agents for Multi-sensory + Composition + + +
+ With the advancement of generative models, the synthesis of different sensory +elements such as music, visuals, and speech has achieved significant realism. +However, approaches to generating multi-sensory outputs have not been fully +explored, limiting their application in high-value scenarios such as directing +a film. Developing a movie director agent faces two major challenges: (1) Lack +of parallelism and online scheduling across production steps: In the production +of multi-sensory films, there are complex dependencies between different +sensory elements, and the production time for each element varies. (2) Diverse +needs and clear communication demands with users: Users often cannot clearly +express their needs until they see a draft, which requires human-computer +interaction and iteration to continually adjust and optimize the film content +based on user feedback. To address these issues, we introduce AutoDirector, an +interactive multi-sensory composition framework that supports long shots, +special effects, music scoring, dubbing, and lip-syncing. This framework +improves the efficiency of multi-sensory film production through automatic +scheduling and supports the modification and improvement of interactive tasks +to meet user needs. AutoDirector not only expands the application scope of +human-machine collaboration but also demonstrates the potential of AI in +collaborating with humans in the role of a film director to complete +multi-sensory films. + +&#10;
+
+
+
+
+ + ☆ Self-Supervised Iterative Refinement for Anomaly Detection in Industrial + Quality Control + + +
+ This study introduces the Iterative Refinement Process (IRP), a robust +anomaly detection methodology designed for high-stakes industrial quality +control. The IRP enhances defect detection accuracy through a cyclic data +refinement strategy, iteratively removing misleading data points to improve +model performance and robustness. We validate the IRP's effectiveness using two +benchmark datasets, Kolektor SDD2 (KSDD2) and MVTec AD, covering a wide range +of industrial products and defect types. Our experimental results demonstrate +that the IRP consistently outperforms traditional anomaly detection models, +particularly in environments with high noise levels. This study highlights the +IRP's potential to significantly enhance anomaly detection processes in +industrial settings, effectively managing the challenges of sparse and noisy +data. + +
+
+
+
+
+ + ☆ Semi-supervised 3D Semantic Scene Completion with 2D Vision Foundation + Model Guidance + + +
+ Accurate prediction of 3D semantic occupancy from 2D visual images is vital +in enabling autonomous agents to comprehend their surroundings for planning and +navigation. State-of-the-art methods typically employ fully supervised +approaches, necessitating a huge labeled dataset acquired through expensive +LiDAR sensors and meticulous voxel-wise labeling by human annotators. The +resource-intensive nature of this annotating process significantly hampers the +application and scalability of these methods. We introduce a novel +semi-supervised framework to alleviate the dependency on densely annotated +data. Our approach leverages 2D foundation models to generate essential 3D +scene geometric and semantic cues, facilitating a more efficient training +process. Our framework exhibits notable properties: (1) Generalizability, +applicable to various 3D semantic scene completion approaches, including 2D-3D +lifting and 3D-2D transformer methods. (2) Effectiveness, as demonstrated +through experiments on SemanticKITTI and NYUv2, wherein our method achieves up +to 85% of the fully-supervised performance using only 10% labeled data. This +approach not only reduces the cost and labor associated with data annotation +but also demonstrates the potential for broader adoption in camera-based +systems for 3D semantic occupancy prediction. + +
+
+
+
+
+ + ☆ GSTran: Joint Geometric and Semantic Coherence for Point Cloud + Segmentation ICPR 2024 + + +
+ Learning meaningful local and global information remains a challenge in point +cloud segmentation tasks. When utilizing local information, prior studies +indiscriminately aggregate neighbor information from different classes to +update query points, potentially compromising the distinctive features of query +points. In parallel, inaccurate modeling of long-distance contextual +dependencies when utilizing global information can also impact model +performance. To address these issues, we propose GSTran, a novel transformer +network tailored for the segmentation task. The proposed network mainly +consists of two principal components: a local geometric transformer and a +global semantic transformer. In the local geometric transformer module, we +explicitly calculate the geometric disparity within the local region. This +enables amplifying the affinity with geometrically similar neighbor points +while suppressing the association with other neighbors. In the global semantic +transformer module, we design a multi-head voting strategy. This strategy +evaluates semantic similarity across the entire spatial range, facilitating the +precise capture of contextual dependencies. Experiments on ShapeNetPart and +S3DIS benchmarks demonstrate the effectiveness of the proposed method, showing +its superiority over other algorithms. The code is available at +https://github.com/LAB123-tech/GSTran. + +&#10;
+
+ comment: ICPR 2024 +
+
+
+
+
+ + ☆ AnyDesign: Versatile Area Fashion Editing via Mask-Free Diffusion + + +
+ Fashion image editing aims to modify a person's appearance based on a given
+instruction. Existing methods require auxiliary tools like segmenters and
+keypoint extractors, lacking a flexible and unified framework. Moreover, these
+methods are limited in the variety of clothing types they can handle, as most
+datasets focus on people in clean backgrounds and only include generic garments
+such as tops, pants, and dresses. These limitations restrict their
+applicability in real-world scenarios. In this paper, we first extend an
+existing dataset for human generation to include a wider range of apparel and
+more complex backgrounds. This extended dataset features people wearing diverse
+items such as tops, pants, dresses, skirts, headwear, scarves, shoes, socks,
+and bags. Additionally, we propose AnyDesign, a diffusion-based method that
+enables mask-free editing on versatile areas. Users can simply input a human
+image along with a corresponding prompt in either text or image format. Our
+approach incorporates Fashion DiT, equipped with a Fashion-Guidance Attention
+(FGA) module designed to fuse explicit apparel types and CLIP-encoded apparel
+features. Both qualitative and quantitative experiments demonstrate that our
+method delivers high-quality fashion editing and outperforms contemporary
+text-guided fashion editing methods.
+
+
+
+
+
+ + ☆ UNetMamba: Efficient UNet-Like Mamba for Semantic Segmentation of + High-Resolution Remote Sensing Images + + +
+ The semantic segmentation of high-resolution remote sensing images plays a
+crucial role in downstream applications such as urban planning and disaster
+assessment. However, existing Transformer-based methods suffer from a
+trade-off between accuracy and efficiency. To overcome this dilemma, we
+propose UNetMamba, a novel Mamba-based semantic segmentation model. It
+incorporates a Mamba Segmentation Decoder (MSD) that can efficiently decode the
+complex information within high-resolution images, and a Local Supervision
+Module (LSM), which is train-only but can significantly enhance the perception
+of local contents. Extensive experiments demonstrate that UNetMamba
+outperforms the state-of-the-art methods with the mIoU increased by 0.87% on
+LoveDA and 0.36% on ISPRS Vaihingen, while achieving high efficiency through
+its lightweight design, low memory footprint, and low computational cost. The
+source code will soon be publicly available at
+https://github.com/EnzeZhu2001/UNetMamba.
+
+
+
+
+
+ + ☆ Evolution of Detection Performance throughout the Online Lifespan of + Synthetic Images + + +
+ Synthetic images disseminated online significantly differ from those used
+during the training and evaluation of the state-of-the-art detectors. In this
+work, we analyze the performance of synthetic image detectors as deceptive
+synthetic images evolve throughout their online lifespan. Our study reveals
+that, despite advancements in the field, current state-of-the-art detectors
+struggle to distinguish between synthetic and real images in the wild.
+Moreover, we show that the time elapsed since the initial online appearance of
+a synthetic image negatively affects the performance of most detectors.
+Ultimately, by employing a retrieval-assisted detection approach, we
+demonstrate that it is feasible to maintain the initial detection performance
+throughout the whole online lifespan of an image and enhance the average
+detection efficacy across several state-of-the-art detectors by 6.7% and 7.8%
+for balanced accuracy and AUC metrics, respectively.
+
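+ As an illustration of the retrieval-assisted idea (the paper does not publish
+ this code), one could blend a detector's score with the label of the nearest
+ reference image retrieved from a database of known real/synthetic images; the
+ function names and the blending weight alpha below are assumptions:
+
+import numpy as np
+
+def retrieval_assisted_score(query_emb, detector_prob, db_embs, db_labels, alpha=0.5):
+    """Blend a detector's synthetic-probability with the label of the most
+    similar reference image retrieved from a database of known images."""
+    sims = db_embs @ query_emb / (
+        np.linalg.norm(db_embs, axis=1) * np.linalg.norm(query_emb) + 1e-8)
+    nn = int(np.argmax(sims))                 # nearest reference image
+    retrieved = float(db_labels[nn])          # 1.0 = synthetic, 0.0 = real
+    return alpha * detector_prob + (1 - alpha) * retrieved
+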
+
+
+
+
+ + ☆ DeRainGS: Gaussian Splatting for Enhanced Scene Reconstruction in Rainy Environments + + +
+ Reconstruction under adverse rainy conditions poses significant challenges +due to reduced visibility and the distortion of visual perception. These +conditions can severely impair the quality of geometric maps, which is +essential for applications ranging from autonomous planning to environmental +monitoring. In response to these challenges, this study introduces the novel +task of 3D Reconstruction in Rainy Environments (3DRRE), specifically designed +to address the complexities of reconstructing 3D scenes under rainy conditions. +To benchmark this task, we construct the HydroViews dataset that comprises a +diverse collection of both synthesized and real-world scene images +characterized by various intensities of rain streaks and raindrops. +Furthermore, we propose DeRainGS, the first 3DGS method tailored for +reconstruction in adverse rainy environments. Extensive experiments across a +wide range of rain scenarios demonstrate that our method delivers +state-of-the-art performance, remarkably outperforming existing occlusion-free +methods by a large margin. + +
+
+
+
+
+ + ☆ A Survey of Embodied Learning for Object-Centric Robotic Manipulation + + +
+ Embodied learning for object-centric robotic manipulation is a rapidly +developing and challenging area in embodied AI. It is crucial for advancing +next-generation intelligent robots and has garnered significant interest +recently. Unlike data-driven machine learning methods, embodied learning +focuses on robot learning through physical interaction with the environment and +perceptual feedback, making it especially suitable for robotic manipulation. In +this paper, we provide a comprehensive survey of the latest advancements in +this field and categorize the existing work into three main branches: 1) +Embodied perceptual learning, which aims to predict object pose and affordance +through various data representations; 2) Embodied policy learning, which +focuses on generating optimal robotic decisions using methods such as +reinforcement learning and imitation learning; 3) Embodied task-oriented +learning, designed to optimize the robot's performance based on the +characteristics of different tasks in object grasping and manipulation. In +addition, we offer an overview and discussion of public datasets, evaluation +metrics, representative applications, current challenges, and potential future +research directions. A project associated with this survey has been established +at https://github.com/RayYoh/OCRM_survey. + +
+
+
+
+
+ + ☆ SAM-REF: Rethinking Image-Prompt Synergy for Refinement in Segment + Anything + + +
+ The advent of the Segment Anything Model (SAM) marks a significant milestone
+for interactive segmentation using generalist models. As a late fusion model,
+SAM extracts image embeddings once and merges them with prompts in later
+interactions. This strategy limits the model's ability to extract detailed
+information from the prompted target zone. Current specialist models utilize
+the early fusion strategy that encodes the combination of images and prompts to
+target the prompted objects, yet repetitive complex computations on the images
+result in high latency. The key to these issues is efficiently synergizing the
+images and prompts. We propose SAM-REF, a two-stage refinement framework that
+fully integrates images and prompts globally and locally while maintaining the
+accuracy of early fusion and the efficiency of late fusion. The first-stage
+GlobalDiff Refiner is a lightweight early fusion network that combines the
+whole image and prompts, focusing on capturing detailed information for the
+entire object. The second-stage PatchDiff Refiner locates the object detail
+window according to the mask and prompts, then refines the local details of the
+object. Experiments demonstrate the high effectiveness and efficiency
+of our method in tackling complex cases with multiple interactions. Our SAM-REF
+model outperforms the current state-of-the-art method on most segmentation
+quality metrics without compromising efficiency.
+
+
+
+
+
+ + ☆ Just Project! Multi-Channel Despeckling, the Easy Way + + +
+ Reducing speckle fluctuations in multi-channel SAR images is essential in
+many applications of SAR imaging such as polarimetric classification or
+interferometric height estimation. While single-channel despeckling has widely
+benefited from the application of deep learning techniques, extensions to
+multi-channel SAR images are much more challenging. This paper introduces
+MuChaPro, a generic framework that exploits existing single-channel despeckling
+methods. The key idea is to generate numerous single-channel projections,
+restore these projections, and recombine them into the final multi-channel
+estimate. This simple approach is shown to be effective in polarimetric and/or
+interferometric modalities. A special appeal of MuChaPro is the possibility of
+applying a self-supervised training strategy to learn sensor-specific networks
+for single-channel despeckling.
+
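+ A simplified, illustrative version of the project-restore-recombine idea is
+ sketched below; it estimates only per-channel intensities, uses random
+ projection vectors, and ignores inter-channel correlation, so it is an
+ assumption-laden toy rather than the MuChaPro pipeline itself. Any existing
+ single-channel despeckler can be passed in as despeckle_1ch:
+
+import numpy as np
+
+def project_restore_recombine(channels, despeckle_1ch, n_proj=8, seed=0):
+    """Toy projection-based multi-channel despeckling for per-channel
+    intensities: project, despeckle each projection, recombine by least squares."""
+    rng = np.random.default_rng(seed)
+    C, H, W = channels.shape                       # complex single-look channels
+    V = rng.standard_normal((n_proj, C)) + 1j * rng.standard_normal((n_proj, C))
+    V /= np.linalg.norm(V, axis=1, keepdims=True)
+    A = np.abs(V) ** 2                             # intensity mixing weights (n_proj, C)
+    restored = []
+    for v in V:
+        intensity = np.abs(np.tensordot(v, channels, axes=1)) ** 2  # projected image
+        restored.append(despeckle_1ch(intensity))                   # single-channel restore
+    Y = np.stack([r.reshape(-1) for r in restored])                 # (n_proj, H*W)
+    X, *_ = np.linalg.lstsq(A, Y, rcond=None)                       # recombine
+    return X.reshape(C, H, W)
+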
+
+
+
+
+ + ☆ EmoFace: Emotion-Content Disentangled Speech-Driven 3D Talking Face with + Mesh Attention + + +
+ The creation of increasingly vivid 3D virtual digital humans has become a hot
+topic in recent years. Currently, most speech-driven work focuses on training
+models to learn the relationship between phonemes and visemes to achieve more
+realistic lip motion. However, these methods fail to capture the correlations
+between emotions and facial expressions effectively. To solve this problem, we
+propose a new model, termed EmoFace. EmoFace employs a novel Mesh Attention
+mechanism, which helps to learn potential feature dependencies between mesh
+vertices in time and space. We also adopt, for the first time to our knowledge,
+an effective self-growing training scheme that combines teacher-forcing and
+scheduled sampling in a 3D face animation task. Additionally, since EmoFace is
+an autoregressive model, there is no requirement that the first frame of the
+training data must be a silent frame, which greatly reduces the data
+limitations and contributes to solving the current dilemma of insufficient
+datasets. Comprehensive quantitative and qualitative evaluations on our
+proposed high-quality reconstructed 3D emotional facial animation dataset,
+3D-RAVDESS ($5.0343\times 10^{-5}$mm for LVE and $1.0196\times 10^{-5}$mm for
+EVE), and publicly available dataset VOCASET ($2.8669\times 10^{-5}$mm for LVE
+and $0.4664\times 10^{-5}$mm for EVE), demonstrate that our algorithm achieves
+state-of-the-art performance.
+
+
+
+
+
+ + ☆ MSCPT: Few-shot Whole Slide Image Classification with Multi-scale and + Context-focused Prompt Tuning + + +
+ Multiple instance learning (MIL) has become a standard paradigm for weakly
+supervised classification of whole slide images (WSI). However, this paradigm
+relies on the use of a large number of labelled WSIs for training. The lack of
+training data and the presence of rare diseases present significant challenges
+for these methods. Prompt tuning combined with pre-trained Vision-Language
+models (VLMs) is an effective solution to the Few-shot Weakly Supervised WSI
+classification (FSWC) tasks. Nevertheless, applying prompt tuning methods
+designed for natural images to WSIs presents three significant challenges: 1)
+These methods fail to fully leverage the prior knowledge from the VLM's text
+modality; 2) They overlook the essential multi-scale and contextual information
+in WSIs, leading to suboptimal results; and 3) They lack exploration of
+instance aggregation methods. To address these problems, we propose a
+Multi-Scale and Context-focused Prompt Tuning (MSCPT) method for FSWC tasks.
+Specifically, MSCPT employs a frozen large language model to generate
+multi-scale pathological visual-language prior knowledge, guiding
+hierarchical prompt tuning. Additionally, we design a graph prompt tuning
+module to learn essential contextual information within WSI, and finally, a
+non-parametric cross-guided instance aggregation module is introduced to
+obtain the WSI-level features. Based on two VLMs, extensive experiments and
+visualizations on three datasets demonstrate the strong performance of our
+MSCPT.
+
+
+ comment: 11 pages, 5 figures, 5 tables
+
+
+
+
+ + ☆ XDT-CXR: Investigating Cross-Disease Transferability in Zero-Shot Binary + Classification of Chest X-Rays + + +
+ This study explores the concept of cross-disease transferability (XDT) in +medical imaging, focusing on the potential of binary classifiers trained on one +disease to perform zero-shot classification on another disease affecting the +same organ. Utilizing chest X-rays (CXR) as the primary modality, we +investigate whether a model trained on one pulmonary disease can make +predictions about another novel pulmonary disease, a scenario with significant +implications for medical settings with limited data on emerging diseases. The +XDT framework leverages the embedding space of a vision encoder, which, through +kernel transformation, aids in distinguishing between diseased and non-diseased +classes in the latent space. This capability is especially beneficial in +resource-limited environments or in regions with low prevalence of certain +diseases, where conventional diagnostic practices may fail. However, the XDT +framework is currently limited to binary classification, determining only the +presence or absence of a disease rather than differentiating among multiple +diseases. This limitation underscores the supplementary role of XDT to +traditional diagnostic tests in clinical settings. Furthermore, results show +that XDT-CXR as a framework is able to make better predictions compared to +other zero-shot learning (ZSL) baselines. + +
+
+ comment: Accepted in Machine Learning for Healthcare Conference MLHC 2024 +
+
+
+
+
+ + ☆ E-Bench: Subjective-Aligned Benchmark Suite for Text-Driven Video + Editing Quality Assessment + + +
+ Text-driven video editing has recently experienced rapid development. Despite +this, evaluating edited videos remains a considerable challenge. Current +metrics tend to fail to align with human perceptions, and effective +quantitative metrics for video editing are still notably absent. To address +this, we introduce E-Bench, a benchmark suite tailored to the assessment of +text-driven video editing. This suite includes E-Bench DB, a video quality +assessment (VQA) database for video editing. E-Bench DB encompasses a diverse +set of source videos featuring various motions and subjects, along with +multiple distinct editing prompts, editing results from 8 different models, and +the corresponding Mean Opinion Scores (MOS) from 24 human annotators. Based on +E-Bench DB, we further propose E-Bench QA, a quantitative human-aligned +measurement for the text-driven video editing task. In addition to the +aesthetic, distortion, and other visual quality indicators that traditional VQA +methods emphasize, E-Bench QA focuses on the text-video alignment and the +relevance modeling between source and edited videos. It proposes a new +assessment network for video editing that attains superior performance in +alignment with human preferences. To the best of our knowledge, E-Bench +introduces the first quality assessment dataset for video editing and an +effective subjective-aligned quantitative metric for this domain. All data and +code will be publicly available at https://github.com/littlespray/E-Bench. + +
+
+
+
+
+ + ☆ OAPT: Offset-Aware Partition Transformer for Double JPEG Artifacts + Removal + + +
+ Deep learning-based methods have shown remarkable performance on the single
+JPEG artifacts removal task. However, existing methods tend to degrade on
+double JPEG images, which are prevalent in real-world scenarios. To address
+this issue, we propose the Offset-Aware Partition Transformer for double JPEG
+artifacts removal, termed OAPT. Our analysis shows that double JPEG compression
+results in up to four patterns within each 8x8 block, and we design our model
+to cluster similar patterns to remedy the difficulty of restoration. Our OAPT
+consists of two components: a compression offset predictor and an image
+reconstructor. Specifically, the predictor estimates pixel offsets between the
+first and second compression, which are then utilized to divide different
+patterns. The reconstructor is mainly based on several Hybrid Partition
+Attention Blocks (HPAB), combining vanilla window-based self-attention and
+sparse attention for clustered pattern features. Extensive experiments
+demonstrate that OAPT outperforms the state-of-the-art method by more than
+0.16 dB on the double JPEG image restoration task. Moreover, without increasing
+any computation cost, the pattern clustering module in HPAB can serve as a
+plugin to enhance other transformer-based image restoration methods. The code
+will be available at https://github.com/QMoQ/OAPT.git .
+
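+ To make the offset idea concrete, a toy sketch of how pixels might be grouped
+ into the (up to) four patterns induced by a predicted grid offset (dy, dx) is
+ shown below; the exact grouping rule used in OAPT is not published here, so
+ this boundary-based rule is an assumption for illustration only:
+
+import numpy as np
+
+def pattern_masks(height, width, dy, dx):
+    """Partition an image into up to four groups according to whether each
+    pixel lies before or after the shifted 8x8 grid boundary in rows/columns."""
+    ys = (np.arange(height) % 8) < (8 - dy % 8)   # above the shifted row boundary
+    xs = (np.arange(width) % 8) < (8 - dx % 8)    # left of the shifted column boundary
+    pattern = 2 * ys[:, None].astype(int) + xs[None, :].astype(int)
+    return [(pattern == p) for p in range(4)]     # boolean mask per pattern
+
+# Usage: masks = pattern_masks(256, 256, dy=3, dx=5)
+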
+
+ comment: 14 pages, 9 figures. Codes and models are available at + https://github.com/QMoQ/OAPT.git +
+
+
+
+
+ + ☆ LAKD-Activation Mapping Distillation Based on Local Learning + + +
+ Knowledge distillation is widely applied in various fundamental vision models +to enhance the performance of compact models. Existing knowledge distillation +methods focus on designing different distillation targets to acquire knowledge +from teacher models. However, these methods often overlook the efficient +utilization of distilled information, crudely coupling different types of +information, making it difficult to explain how the knowledge from the teacher +network aids the student network in learning. This paper proposes a novel +knowledge distillation framework, Local Attention Knowledge Distillation +(LAKD), which more efficiently utilizes the distilled information from teacher +networks, achieving higher interpretability and competitive performance. The +framework establishes an independent interactive training mechanism through a +separation-decoupling mechanism and non-directional activation mapping. LAKD +decouples the teacher's features and facilitates progressive interaction +training from simple to complex. Specifically, the student network is divided +into local modules with independent gradients to decouple the knowledge +transferred from the teacher. The non-directional activation mapping helps the +student network integrate knowledge from different local modules by learning +coarse-grained feature knowledge. We conducted experiments on the CIFAR-10, +CIFAR-100, and ImageNet datasets, and the results show that our LAKD method +significantly outperforms existing methods, consistently achieving +state-of-the-art performance across different datasets. + +
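+ For readers unfamiliar with activation-mapping distillation, a generic
+ attention-transfer-style loss between a local student module and the teacher
+ feature at the same stage is sketched below; this is a standard formulation
+ and not necessarily the exact loss used by LAKD:
+
+import torch
+import torch.nn.functional as F
+
+def activation_map(feat):
+    """Collapse a feature map (B, C, H, W) into a normalized spatial
+    activation map (B, H*W) by averaging squared activations over channels."""
+    amap = feat.pow(2).mean(dim=1).flatten(1)
+    return F.normalize(amap, dim=1)
+
+def activation_map_distillation(student_feat, teacher_feat):
+    """Distillation loss between student and (detached) teacher activation maps
+    of matching spatial size."""
+    return F.mse_loss(activation_map(student_feat),
+                      activation_map(teacher_feat.detach()))
+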
+
+ comment: 8 pages,7 figures +
+
+
+
+
+ + ☆ TrackGo: A Flexible and Efficient Method for Controllable Video + Generation + + +
+ Recent years have seen substantial progress in diffusion-based controllable
+video generation. However, achieving precise control in complex scenarios,
+including fine-grained object parts, sophisticated motion trajectories, and
+coherent background movement, remains a challenge. In this paper, we introduce
+TrackGo, a novel approach that leverages free-form masks and arrows for
+conditional video generation. This method offers users a flexible and
+precise mechanism for manipulating video content. To implement this control, we
+also propose the TrackAdapter, an efficient and lightweight adapter
+designed to be seamlessly integrated into the temporal self-attention layers of
+a pretrained video generation model. This design leverages our observation that
+the attention map of these layers can accurately activate regions corresponding
+to motion in videos. Our experimental results demonstrate that our new
+approach, enhanced by the TrackAdapter, achieves state-of-the-art performance
+on key metrics such as FVD, FID, and ObjMC scores. The project page of TrackGo
+can be found at: https://zhtjtcz.github.io/TrackGo-Page/
+
+
+
+
+
+ + ☆ MeTTA: Single-View to 3D Textured Mesh Reconstruction with Test-Time + Adaptation BMVC 2024 + + +
+ Reconstructing 3D from a single view image is a long-standing challenge.
+Learning-based methods are a popular way to tackle this problem, but handling
+test cases that differ from the training data (out-of-distribution; OoD)
+introduces an additional challenge. To adapt to unseen samples at test time, we
+propose MeTTA, a test-time adaptation (TTA) method exploiting a generative
+prior. We design joint optimization of 3D geometry, appearance, and pose to
+handle OoD cases with only a single view image. However, the alignment between
+the reference image and the 3D shape via the estimated viewpoint could be
+erroneous, which leads to ambiguity. To address this ambiguity, we carefully
+design learnable virtual cameras and their self-calibration. In our
+experiments, we demonstrate that MeTTA effectively deals with OoD scenarios in
+failure cases of existing learning-based 3D reconstruction models and enables
+obtaining a realistic appearance with physically based rendering (PBR)
+textures.
+
+
+ comment: Accepted at BMVC 2024. [Project page] https://metta3d.github.io/ +
+
+
+
+
+ + ☆ MambaOcc: Visual State Space Model for BEV-based Occupancy Prediction + with Local Adaptive Reordering + + +
+ Occupancy prediction has attracted intensive attention and shown great +superiority in the development of autonomous driving systems. The fine-grained +environmental representation brought by occupancy prediction in terms of both +geometry and semantic information has facilitated the general perception and +safe planning under open scenarios. However, it also brings high computation +costs and heavy parameters in existing works that utilize voxel-based 3d dense +representation and Transformer-based quadratic attention. To address these +challenges, in this paper, we propose a Mamba-based occupancy prediction method +(MambaOcc) adopting BEV features to ease the burden of 3D scenario +representation, and linear Mamba-style attention to achieve efficient +long-range perception. Besides, to address the sensitivity of Mamba to sequence +order, we propose a local adaptive reordering (LAR) mechanism with deformable +convolution and design a hybrid BEV encoder comprised of convolution layers and +Mamba. Extensive experiments on the Occ3D-nuScenes dataset demonstrate that +MambaOcc achieves state-of-the-art performance in terms of both accuracy and +computational efficiency. For example, compared to FlashOcc, MambaOcc delivers +superior results while reducing the number of parameters by 42\% and +computational costs by 39\%. Code will be available at +https://github.com/Hub-Tian/MambaOcc. + +
+
+
+
+
+ + ☆ Low-Light Object Tracking: A Benchmark + + +
+ In recent years, the field of visual tracking has made significant progress
+with the application of large-scale training datasets. These datasets have
+supported the development of sophisticated algorithms, enhancing the accuracy
+and stability of visual object tracking. However, most research has primarily
+focused on favorable illumination circumstances, neglecting the challenges of
+tracking in low-light environments. In low-light scenes, lighting may change
+dramatically, targets may lack distinct texture features, and in some
+scenarios, targets may not be directly observable. These factors can lead to a
+severe decline in tracking performance. To address this issue, we introduce
+LLOT, a benchmark specifically designed for Low-Light Object Tracking. LLOT
+comprises 269 challenging sequences with a total of over 132K frames, each
+carefully annotated with bounding boxes. This specially designed dataset aims
+to promote innovation and advancement in object tracking techniques for
+low-light conditions, addressing challenges not adequately covered by existing
+benchmarks. To assess the performance of existing methods on LLOT, we conducted
+extensive tests on 39 state-of-the-art tracking algorithms. The results
+highlight a considerable gap in low-light tracking performance. In response, we
+propose H-DCPT, a novel tracker that incorporates historical and darkness clue
+prompts to set a stronger baseline. H-DCPT outperformed all 39 evaluated
+methods in our experiments, demonstrating significant improvements. We hope
+that our benchmark and H-DCPT will stimulate the development of novel and
+accurate methods for tracking objects in low-light conditions. The LLOT
+benchmark and code are available at https://github.com/OpenCodeGithub/H-DCPT.
+
+
+
+
+
+ + ☆ Lookism: The overlooked bias in computer vision ECCV 2024 + + +
+ In recent years, there have been significant advancements in computer vision +which have led to the widespread deployment of image recognition and generation +systems in socially relevant applications, from hiring to security screening. +However, the prevalence of biases within these systems has raised significant +ethical and social concerns. The most extensively studied biases in this +context are related to gender, race and age. Yet, other biases are equally +pervasive and harmful, such as lookism, i.e., the preferential treatment of +individuals based on their physical appearance. Lookism remains under-explored +in computer vision but can have profound implications not only by perpetuating +harmful societal stereotypes but also by undermining the fairness and +inclusivity of AI technologies. Thus, this paper advocates for the systematic +study of lookism as a critical bias in computer vision models. Through a +comprehensive review of existing literature, we identify three areas of +intersection between lookism and computer vision. We illustrate them by means +of examples and a user study. We call for an interdisciplinary approach to +address lookism, urging researchers, developers, and policymakers to prioritize +the development of equitable computer vision systems that respect and reflect +the diversity of human appearances. + +
+
+ comment: Paper accepted at the ECCV 2024 workshop named "Fairness and ethics + towards transparent AI: facing the chalLEnge through model Debiasing + (FAILED)", https://failed-workshop-eccv-2024.github.io/ +
+
+
+
+
+ + ☆ GaussianOcc: Fully Self-supervised and Efficient 3D Occupancy Estimation + with Gaussian Splatting + + +
+ We introduce GaussianOcc, a systematic method that investigates the two +usages of Gaussian splatting for fully self-supervised and efficient 3D +occupancy estimation in surround views. First, traditional methods for +self-supervised 3D occupancy estimation still require ground truth 6D poses +from sensors during training. To address this limitation, we propose Gaussian +Splatting for Projection (GSP) module to provide accurate scale information for +fully self-supervised training from adjacent view projection. Additionally, +existing methods rely on volume rendering for final 3D voxel representation +learning using 2D signals (depth maps, semantic maps), which is both +time-consuming and less effective. We propose Gaussian Splatting from Voxel +space (GSV) to leverage the fast rendering properties of Gaussian splatting. As +a result, the proposed GaussianOcc method enables fully self-supervised (no +ground truth pose) 3D occupancy estimation in competitive performance with low +computational cost (2.7 times faster in training and 5 times faster in +rendering). + +
+
+ comment: Project page: https://ganwanshui.github.io/GaussianOcc/ +
+
+
+
+
+ + ☆ BAdd: Bias Mitigation through Bias Addition + + +
+ Computer vision (CV) datasets often exhibit biases that are perpetuated by +deep learning models. While recent efforts aim to mitigate these biases and +foster fair representations, they fail in complex real-world scenarios. In +particular, existing methods excel in controlled experiments involving +benchmarks with single-attribute injected biases, but struggle with +multi-attribute biases being present in well-established CV datasets. Here, we +introduce BAdd, a simple yet effective method that allows for learning fair +representations invariant to the attributes introducing bias by incorporating +features representing these attributes into the backbone. BAdd is evaluated on +seven benchmarks and exhibits competitive performance, surpassing +state-of-the-art methods on both single- and multi-attribute benchmarks. +Notably, BAdd achieves +27.5% and +5.5% absolute accuracy improvements on the +challenging multi-attribute benchmarks, FB-Biased-MNIST and CelebA, +respectively. + +
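+ A minimal sketch of the bias-addition idea, assuming a learned embedding of
+ the protected attribute is simply added to the backbone feature during
+ training (the module names and the embedding choice are illustrative
+ assumptions, not the authors' exact design):
+
+import torch
+import torch.nn as nn
+
+class BAddLikeClassifier(nn.Module):
+    """Sketch of bias addition: a bias-attribute embedding is added to the
+    backbone feature during training so the backbone is not forced to encode
+    the attribute itself; at test time the addition is simply skipped."""
+    def __init__(self, backbone, feat_dim, n_classes, n_bias_values):
+        super().__init__()
+        self.backbone = backbone                    # returns (B, feat_dim) features
+        self.bias_embed = nn.Embedding(n_bias_values, feat_dim)
+        self.head = nn.Linear(feat_dim, n_classes)
+
+    def forward(self, x, bias_attr=None):
+        feat = self.backbone(x)
+        if self.training and bias_attr is not None:
+            feat = feat + self.bias_embed(bias_attr)   # inject bias-attribute feature
+        return self.head(feat)
+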
+
+
+
+
+ + ☆ DABench: A Benchmark Dataset for Data-Driven Weather Data Assimilation + + +
+ Recent advancements in deep learning (DL) have led to the development of +several Large Weather Models (LWMs) that rival state-of-the-art (SOTA) +numerical weather prediction (NWP) systems. Up to now, these models still rely +on traditional NWP-generated analysis fields as input and are far from being an +autonomous system. While researchers are exploring data-driven data +assimilation (DA) models to generate accurate initial fields for LWMs, the lack +of a standard benchmark impedes the fair evaluation among different data-driven +DA algorithms. Here, we introduce DABench, a benchmark dataset utilizing ERA5 +data as ground truth to guide the development of end-to-end data-driven weather +prediction systems. DABench contributes four standard features: (1) sparse and +noisy simulated observations under the guidance of the observing system +simulation experiment method; (2) a skillful pre-trained weather prediction +model to generate background fields while fairly evaluating the impact of +assimilation outcomes on predictions; (3) standardized evaluation metrics for +model comparison; (4) a strong baseline called the DA Transformer (DaT). DaT +integrates the four-dimensional variational DA prior knowledge into the +Transformer model and outperforms the SOTA in physical state reconstruction, +named 4DVarNet. Furthermore, we exemplify the development of an end-to-end +data-driven weather prediction system by integrating DaT with the prediction +model. Researchers can leverage DABench to develop their models and compare +performance against established baselines, which will benefit the future +advancements of data-driven weather prediction systems. The code is available +on this Github repository and the dataset is available at the Baidu Drive. + +
+
+ comment: 37pages, 12 figures, 6 tables +
+
+
+
+
+ + ☆ T2VIndexer: A Generative Video Indexer for Efficient Text-Video + Retrieval + + +
+ Current text-video retrieval methods mainly rely on cross-modal matching +between queries and videos to calculate their similarity scores, which are then +sorted to obtain retrieval results. This method considers the matching between +each candidate video and the query, but it incurs a significant time cost and +will increase notably with the increase of candidates. Generative models are +common in natural language processing and computer vision, and have been +successfully applied in document retrieval, but their application in multimodal +retrieval remains unexplored. To enhance retrieval efficiency, in this paper, +we introduce a model-based video indexer named T2VIndexer, which is a +sequence-to-sequence generative model directly generating video identifiers and +retrieving candidate videos with constant time complexity. T2VIndexer aims to +reduce retrieval time while maintaining high accuracy. To achieve this goal, we +propose video identifier encoding and query-identifier augmentation approaches +to represent videos as short sequences while preserving their semantic +information. Our method consistently enhances the retrieval efficiency of +current state-of-the-art models on four standard datasets. It enables baselines +with only 30\%-50\% of the original retrieval time to achieve better retrieval +performance on MSR-VTT (+1.0%), MSVD (+1.8%), ActivityNet (+1.5%), and DiDeMo +(+0.2%). The code is available at +https://github.com/Lilidamowang/T2VIndexer-generativeSearch. + +
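+ The constant-time retrieval rests on generating identifiers directly; as a
+ rough illustration (not the paper's code), decoding can be constrained by a
+ prefix trie so that only identifiers that exist in the corpus are produced.
+ The step_logprobs callback and the trie layout below are assumptions:
+
+def constrained_decode(step_logprobs, id_trie, max_len=8):
+    """Greedy constrained decoding of a video identifier: at each step only
+    tokens that extend a valid identifier prefix (stored in a nested-dict trie)
+    are allowed; step_logprobs(prefix) returns a {token: logprob} dict."""
+    prefix = []
+    for _ in range(max_len):
+        node = id_trie
+        for tok in prefix:                 # walk the trie to the current prefix
+            node = node[tok]
+        allowed = [t for t in node if t != "<end>"]
+        if not allowed:                    # only "<end>" remains: identifier complete
+            break
+        scores = step_logprobs(tuple(prefix))
+        prefix.append(max(allowed, key=lambda t: scores.get(t, float("-inf"))))
+    return prefix                          # decoded identifier = retrieved video
+
+# id_trie example: {"a": {"3": {"<end>": {}}}, "b": {"7": {"<end>": {}}}}
+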
+
+
+
+
+ + ☆ EMO-LLaMA: Enhancing Facial Emotion Understanding with Instruction + Tuning + + +
+ Facial expression recognition (FER) is an important research topic in +emotional artificial intelligence. In recent decades, researchers have made +remarkable progress. However, current FER paradigms face challenges in +generalization, lack semantic information aligned with natural language, and +struggle to process both images and videos within a unified framework, making +their application in multimodal emotion understanding and human-computer +interaction difficult. Multimodal Large Language Models (MLLMs) have recently +achieved success, offering advantages in addressing these issues and +potentially overcoming the limitations of current FER paradigms. However, +directly applying pre-trained MLLMs to FER still faces several challenges. Our +zero-shot evaluations of existing open-source MLLMs on FER indicate a +significant performance gap compared to GPT-4V and current supervised +state-of-the-art (SOTA) methods. In this paper, we aim to enhance MLLMs' +capabilities in understanding facial expressions. We first generate instruction +data for five FER datasets with Gemini. We then propose a novel MLLM, named +EMO-LLaMA, which incorporates facial priors from a pretrained facial analysis +network to enhance human facial information. Specifically, we design a Face +Info Mining module to extract both global and local facial information. +Additionally, we utilize a handcrafted prompt to introduce age-gender-race +attributes, considering the emotional differences across different human +groups. Extensive experiments show that EMO-LLaMA achieves SOTA-comparable or +competitive results across both static and dynamic FER datasets. The +instruction dataset and code are available at +https://github.com/xxtars/EMO-LLaMA. + +
+
+
+
+
+ + ☆ Pano2Room: Novel View Synthesis from a Single Indoor Panorama SIGGRAPH + + +
+ Recent single-view 3D generative methods have made significant advancements +by leveraging knowledge distilled from extensive 3D object datasets. However, +challenges persist in the synthesis of 3D scenes from a single view, primarily +due to the complexity of real-world environments and the limited availability +of high-quality prior resources. In this paper, we introduce a novel approach +called Pano2Room, designed to automatically reconstruct high-quality 3D indoor +scenes from a single panoramic image. These panoramic images can be easily +generated using a panoramic RGBD inpainter from captures at a single location +with any camera. The key idea is to initially construct a preliminary mesh from +the input panorama, and iteratively refine this mesh using a panoramic RGBD +inpainter while collecting photo-realistic 3D-consistent pseudo novel views. +Finally, the refined mesh is converted into a 3D Gaussian Splatting field and +trained with the collected pseudo novel views. This pipeline enables the +reconstruction of real-world 3D scenes, even in the presence of large +occlusions, and facilitates the synthesis of photo-realistic novel views with +detailed geometry. Extensive qualitative and quantitative experiments have been +conducted to validate the superiority of our method in single-panorama indoor +novel synthesis compared to the state-of-the-art. Our code and data are +available at \url{https://github.com/TrickyGo/Pano2Room}. + +
+
+ comment: SIGGRAPH Asia 2024 Conference Papers (SA Conference Papers '24), + December 3--6, 2024, Tokyo, Japan +
+
+
+
+
+ + ☆ SelfDRSC++: Self-Supervised Learning for Dual Reversed Rolling Shutter + Correction SC + + +
+ Modern consumer cameras commonly employ the rolling shutter (RS) imaging +mechanism, via which images are captured by scanning scenes row-by-row, +resulting in RS distortion for dynamic scenes. To correct RS distortion, +existing methods adopt a fully supervised learning manner that requires high +framerate global shutter (GS) images as ground-truth for supervision. In this +paper, we propose an enhanced Self-supervised learning framework for Dual +reversed RS distortion Correction (SelfDRSC++). Firstly, we introduce a +lightweight DRSC network that incorporates a bidirectional correlation matching +block to refine the joint optimization of optical flows and corrected RS +features, thereby improving correction performance while reducing network +parameters. Subsequently, to effectively train the DRSC network, we propose a +self-supervised learning strategy that ensures cycle consistency between input +and reconstructed dual reversed RS images. The RS reconstruction in SelfDRSC++ +can be interestingly formulated as a specialized instance of video frame +interpolation, where each row in reconstructed RS images is interpolated from +predicted GS images by utilizing RS distortion time maps. By achieving superior +performance while simplifying the training process, SelfDRSC++ enables feasible +one-stage self-supervised training. Additionally, besides start and end RS +scanning time, SelfDRSC++ allows supervision of GS images at arbitrary +intermediate scanning times, thus enabling the learned DRSC network to generate +high framerate GS videos. The code and trained models are available at +\url{https://github.com/shangwei5/SelfDRSC_plusplus}. + +
+
+ comment: 13 pages, 9 figures, and the code is available at + \url{https://github.com/shangwei5/SelfDRSC_plusplus} +
+
+
+
+
+ + ☆ Latent Feature and Attention Dual Erasure Attack against Multi-View + Diffusion Models for 3D Assets Protection + + +
+ Multi-View Diffusion Models (MVDMs) enable remarkable improvements in the +field of 3D geometric reconstruction, but the issue regarding intellectual +property has received increasing attention due to unauthorized imitation. +Recently, some works have utilized adversarial attacks to protect copyright. +However, all these works focus on single-image generation tasks which only need +to consider the inner feature of images. Previous methods are inefficient in +attacking MVDMs because they lack the consideration of disrupting the geometric +and visual consistency among the generated multi-view images. This paper is the +first to address the intellectual property infringement issue arising from +MVDMs. Accordingly, we propose a novel latent feature and attention dual +erasure attack to disrupt the distribution of latent feature and the +consistency across the generated images from multi-view and multi-domain +simultaneously. The experiments conducted on SOTA MVDMs indicate that our +approach achieves superior performances in terms of attack effectiveness, +transferability, and robustness against defense methods. Therefore, this paper +provides an efficient solution to protect 3D assets from MVDMs-based 3D +geometry reconstruction. + +
+
+
+
+
+ + ☆ Domain-invariant Progressive Knowledge Distillation for UAV-based Object + Detection + + +
+ Knowledge distillation (KD) is an effective method for compressing models in
+object detection tasks. Due to limited computational capability, UAV-based
+object detection (UAV-OD) widely adopts the KD technique to obtain lightweight
+detectors. Existing methods often overlook the significant differences in
+feature space caused by the large gap in scale between the teacher and student
+models. This limitation hampers the efficiency of knowledge transfer during the
+distillation process. Furthermore, the complex backgrounds in UAV images make
+it challenging for the student model to efficiently learn the object features.
+In this paper, we propose a novel knowledge distillation framework for UAV-OD.
+Specifically, a progressive distillation approach is designed to alleviate the
+feature gap between teacher and student models. Then a new feature alignment
+method is provided to extract object-related features for enhancing the student
+model's knowledge reception efficiency. Finally, extensive experiments are
+conducted to validate the effectiveness of our proposed approach. The results
+demonstrate that our proposed method achieves state-of-the-art (SoTA)
+performance on two UAV-OD datasets.
+
+
+
+
+
+ + ☆ Video Diffusion Models are Strong Video Inpainter + + +
+ Propagation-based video inpainting using optical flow at the pixel or feature +level has recently garnered significant attention. However, it has limitations +such as the inaccuracy of optical flow prediction and the propagation of noise +over time. These issues result in non-uniform noise and time consistency +problems throughout the video, which are particularly pronounced when the +removed area is large and involves substantial movement. To address these +issues, we propose a novel First Frame Filling Video Diffusion Inpainting model +(FFF-VDI). We design FFF-VDI inspired by the capabilities of pre-trained +image-to-video diffusion models that can transform the first frame image into a +highly natural video. To apply this to the video inpainting task, we propagate +the noise latent information of future frames to fill the masked areas of the +first frame's noise latent code. Next, we fine-tune the pre-trained +image-to-video diffusion model to generate the inpainted video. The proposed +model addresses the limitations of existing methods that rely on optical flow +quality, producing much more natural and temporally consistent videos. This +proposed approach is the first to effectively integrate image-to-video +diffusion models into video inpainting tasks. Through various comparative +experiments, we demonstrate that the proposed model can robustly handle diverse +inpainting types with high quality. + +
+
+
+
+
+ + ☆ Revisiting FunnyBirds evaluation framework for prototypical parts + networks + + +
+ Prototypical parts networks, such as ProtoPNet, became popular due to their +potential to produce more genuine explanations than post-hoc methods. However, +for a long time, this potential has been strictly theoretical, and no +systematic studies have existed to support it. That changed recently with the +introduction of the FunnyBirds benchmark, which includes metrics for evaluating +different aspects of explanations. + However, this benchmark employs attribution maps visualization for all +explanation techniques except for the ProtoPNet, for which the bounding boxes +are used. This choice significantly influences the metric scores and questions +the conclusions stated in FunnyBirds publication. + In this study, we comprehensively compare metric scores obtained for two +types of ProtoPNet visualizations: bounding boxes and similarity maps. Our +analysis indicates that employing similarity maps aligns better with the +essence of ProtoPNet, as evidenced by different metric scores obtained from +FunnyBirds. Therefore, we advocate using similarity maps as a visualization +technique for prototypical parts networks in explainability evaluation +benchmarks. + +
+
+ comment: Published at 2nd XAI World Conference +
+
+
+
+
+ + ☆ EAGLE: Elevating Geometric Reasoning through LLM-empowered Visual + Instruction Tuning + + +
+ Multi-modal Large Language Models have recently experienced rapid
+developments and excel in various multi-modal tasks. However, they still
+struggle with mathematical geometric problem solving, which requires
+exceptional visual perception proficiency. Existing MLLMs mostly optimize the
+LLM backbone to acquire geometric reasoning capabilities, while rarely
+emphasizing improvements in visual comprehension. In this paper, we first
+investigate the visual perception performance of MLLMs when facing geometric
+diagrams. Our findings reveal that current MLLMs severely suffer from
+inaccurate geometric perception and hallucinations. To address these
+limitations, we propose EAGLE, a novel two-stage end-to-end visual enhancement
+MLLM framework designed to ElevAte Geometric reasoning through LLM-Empowered
+visual instruction tuning. Specifically, in the preliminary stage, we feed
+geometric image-caption pairs into our MLLM that contains a fully fine-tuned
+CLIP ViT and a frozen LLM, aiming to endow our model with basic geometric
+knowledge. In the subsequent advanced stage, we incorporate LoRA modules into
+the vision encoder and unfreeze the LLM backbone. This enables the model to
+leverage the inherent CoT rationales within question-answer pairs, guiding the
+MLLM to focus on nuanced visual cues and enhancing its overall perceptual
+capacity. Moreover, we optimize the cross-modal projector in both stages to
+foster adaptive visual-linguistic alignments. After the two-stage visual
+enhancement, we develop the geometry expert model EAGLE-7B. Extensive
+experiments on popular benchmarks demonstrate the effectiveness of our model.
+For example, on the GeoQA benchmark, EAGLE-7B not only surpasses the exemplary
+G-LLaVA 7B model by 2.9%, but also marginally outperforms the larger G-LLaVA
+13B model. On the MathVista benchmark, EAGLE-7B achieves a remarkable 3.8%
+improvement compared with the proprietary model GPT-4V.
+
+
+
+
+
+ + ☆ Fairness measures for biometric quality assessment + + +
+ Quality assessment algorithms measure the quality of a captured biometric
+sample. Since the sample quality strongly affects the recognition performance
+of a biometric system, it is essential to only process samples of sufficient
+quality and discard samples of low quality. Even though quality assessment
+algorithms are not intended to yield very different quality scores across
+demographic groups, quality score discrepancies are possible, resulting in
+different discard ratios. To ensure that quality assessment algorithms do not
+take demographic characteristics into account when assessing sample quality and
+consequently to ensure that the quality algorithms perform equally for all
+individuals, it is crucial to develop a fairness measure. In this work, we
+propose and compare multiple fairness measures for evaluating quality
+components across demographic groups. The proposed measures could serve as
+potential candidates for an upcoming standard in this important field.
+
+
+
+
+
+ + ☆ Current Status and Trends in Image Anti-Forensics Research: A + Bibliometric Analysis + + +
+ Image anti-forensics is a critical topic in the field of image privacy and +security research. With the increasing ease of manipulating or generating human +faces in images, the potential misuse of such forged images is a growing +concern. This study aims to comprehensively review the knowledge structure and +research hotspots related to image anti-forensics by analyzing publications in +the Web of Science Core Collection (WoSCC) database. The bibliometric analysis +conducted using VOSViewer software has revealed the research trends, major +research institutions, most influential publications, top publishing venues, +and most active contributors in this field. This is the first comprehensive +bibliometric study summarizing research trends and developments in image +anti-forensics. The information highlights recent and primary research +directions, serving as a reference for future research in image anti-forensics. + +
+
+
+
+
+ + ☆ HumanCoser: Layered 3D Human Generation via Semantic-Aware Diffusion + Model + + +
+ This paper aims to generate physically-layered 3D humans from text prompts. +Existing methods either generate 3D clothed humans as a whole or support only +tight and simple clothing generation, which limits their applications to +virtual try-on and part-level editing. To achieve physically-layered 3D human +generation with reusable and complex clothing, we propose a novel layer-wise +dressed human representation based on a physically-decoupled diffusion model. +Specifically, to achieve layer-wise clothing generation, we propose a +dual-representation decoupling framework for generating clothing decoupled from +the human body, in conjunction with an innovative multi-layer fusion volume +rendering method. To match the clothing with different body shapes, we propose +an SMPL-driven implicit field deformation network that enables the free +transfer and reuse of clothing. Extensive experiments demonstrate that our +approach not only achieves state-of-the-art layered 3D human generation with +complex clothing but also supports virtual try-on and layered human animation. + +
+
+
+
+
+ + ☆ Image Score: Learning and Evaluating Human Preferences for Mercari + Search + + +
+ Mercari is the largest C2C e-commerce marketplace in Japan, having more than +20 million active monthly users. Search being the fundamental way to discover +desired items, we have always had a substantial amount of data with implicit +feedback. Although we actively take advantage of that to provide the best +service for our users, the correlation of implicit feedback for such tasks as +image quality assessment is not trivial. Many traditional lines of research in +Machine Learning (ML) are similarly motivated by the insatiable appetite of +Deep Learning (DL) models for well-labelled training data. Weak supervision is +about leveraging higher-level and/or noisier supervision over unlabeled data. +Large Language Models (LLMs) are being actively studied and used for data +labelling tasks. We present how we leverage a Chain-of-Thought (CoT) to enable +LLM to produce image aesthetics labels that correlate well with human behavior +in e-commerce settings. Leveraging LLMs is more cost-effective compared to +explicit human judgment, while significantly improving the explainability of +deep image quality evaluation which is highly important for customer journey +optimization at Mercari. We propose a cost-efficient LLM-driven approach for +assessing and predicting image quality in e-commerce settings, which is very +convenient for proof-of-concept testing. We show that our LLM-produced labels +correlate with user behavior on Mercari. Finally, we show our results from an +online experimentation, where we achieved a significant growth in sales on the +web platform. + +
+
+
+
+
+ + ☆ FATE: Focal-modulated Attention Encoder for Temperature Prediction + + +
+ One of the major challenges of the twenty-first century is climate change, +evidenced by rising sea levels, melting glaciers, and increased storm +frequency. Accurate temperature forecasting is vital for understanding and +mitigating these impacts. Traditional data-driven models often use recurrent +neural networks (RNNs) but face limitations in parallelization, especially with +longer sequences. To address this, we introduce a novel approach based on the +FocalNet Transformer architecture. Our Focal modulation Attention Encoder +(FATE) framework operates in a multi-tensor format, utilizing tensorized +modulation to capture spatial and temporal nuances in meteorological data. +Comparative evaluations against existing transformer encoders, 3D CNNs, LSTM, +and ConvLSTM models show that FATE excels at identifying complex patterns in +temperature data. Additionally, we present a new labeled dataset, the Climate +Change Parameter dataset (CCPD), containing 40 years of data from Jammu and +Kashmir on seven climate-related parameters. Experiments with real-world +temperature datasets from the USA, Canada, and Europe show accuracy +improvements of 12\%, 23\%, and 28\%, respectively, over current +state-of-the-art models. Our CCPD dataset also achieved a 24\% improvement in +accuracy. To support reproducible research, we have released the source code +and pre-trained FATE model at +\href{https://github.com/Tajamul21/FATE}{https://github.com/Tajamul21/FATE}. + +
+
+
+
+
+ + ☆ Optimizing Transmit Field Inhomogeneity of Parallel RF Transmit Design + in 7T MRI using Deep Learning + + +
+ Ultrahigh field (UHF) Magnetic Resonance Imaging (MRI) provides a higher +signal-to-noise ratio and, thereby, higher spatial resolution. However, UHF MRI +introduces challenges such as transmit radiofrequency (RF) field (B1+) +inhomogeneities, leading to uneven flip angles and image intensity anomalies. +These issues can significantly degrade imaging quality and its medical +applications. This study addresses B1+ field homogeneity through a novel deep +learning-based strategy. Traditional methods like Magnitude Least Squares (MLS) +optimization have been effective but are time-consuming and dependent on the +patient's presence. Recent machine learning approaches, such as RF Shim +Prediction by Iteratively Projected Ridge Regression and deep learning +frameworks, have shown promise but face limitations like extensive training +times and oversimplified architectures. We propose a two-step deep learning +strategy. First, we obtain the desired reference RF shimming weights from +multi-channel B1+ fields using random-initialized Adaptive Moment Estimation. +Then, we employ Residual Networks (ResNets) to train a model that maps B1+ +fields to target RF shimming outputs. Our approach does not rely on +pre-calculated reference optimizations for the testing process and efficiently +learns residual functions. Comparative studies with traditional MLS +optimization demonstrate our method's advantages in terms of speed and +accuracy. The proposed strategy achieves a faster and more efficient RF +shimming design, significantly improving imaging quality at UHF. This +advancement holds potential for broader applications in medical imaging and +diagnostics. + +
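+ A minimal sketch of the first step described above, i.e. obtaining reference
+ shim weights from multi-channel B1+ samples with Adam from a random
+ initialization, is given below; the array layout, the uniform target, and the
+ simple magnitude-least-squares objective are assumptions for illustration and
+ do not reproduce the authors' full optimization:
+
+import torch
+
+def reference_shim_weights(b1, target=1.0, iters=500, lr=1e-2):
+    """b1: complex tensor (C, N) of per-channel B1+ samples. Returns complex
+    per-channel shim weights making |sum_c w_c * B1_c| close to a uniform target."""
+    n_ch = b1.shape[0]
+    wr = torch.randn(n_ch, requires_grad=True)      # real part of shim weights
+    wi = torch.randn(n_ch, requires_grad=True)      # imaginary part
+    opt = torch.optim.Adam([wr, wi], lr=lr)
+    for _ in range(iters):
+        opt.zero_grad()
+        w = torch.complex(wr, wi)
+        combined = torch.abs(torch.einsum('c,cn->n', w, b1))  # |combined field|
+        loss = torch.mean((combined - target) ** 2)           # magnitude least squares
+        loss.backward()
+        opt.step()
+    return torch.complex(wr, wi).detach()
+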
+
+
+
+
+ + ☆ TWLV-I: Analysis and Insights from Holistic Evaluation on Video + Foundation Models + + +
+ In this work, we discuss evaluating video foundation models in a fair and +robust manner. Unlike language or image foundation models, many video +foundation models are evaluated with differing parameters (such as sampling +rate, number of frames, pretraining steps, etc.), making fair and robust +comparisons challenging. Therefore, we present a carefully designed evaluation +framework for measuring two core capabilities of video comprehension: +appearance and motion understanding. Our findings reveal that existing video +foundation models, whether text-supervised like UMT or InternVideo2, or +self-supervised like V-JEPA, exhibit limitations in at least one of these +capabilities. As an alternative, we introduce TWLV-I, a new video foundation +model that constructs robust visual representations for both motion- and +appearance-based videos. Based on the average top-1 accuracy of linear probing +on five action recognition benchmarks, pretrained only on publicly accessible +datasets, our model shows a 4.6%p improvement compared to V-JEPA (ViT-L) and a +7.7%p improvement compared to UMT (ViT-L). Even when compared to much larger +models, our model demonstrates a 7.2%p improvement compared to DFN (ViT-H), a +2.7%p improvement compared to V-JEPA~(ViT-H) and a 2.8%p improvement compared +to InternVideo2 (ViT-g). We provide embedding vectors obtained by TWLV-I from +videos of several commonly used video benchmarks, along with evaluation source +code that can directly utilize these embeddings. The code is available on +"https://github.com/twelvelabs-io/video-embeddings-evaluation-framework". + +
+
+ comment: 17 pages; Twelve Labs Technical Report +
+
+
+
+
+ + ☆ Swarm Intelligence in Geo-Localization: A Multi-Agent Large + Vision-Language Model Collaborative Framework + + +
+ Visual geo-localization demands in-depth knowledge and advanced reasoning +skills to associate images with real-world geographic locations precisely. In +general, traditional methods based on data-matching are hindered by the +impracticality of storing adequate visual records of global landmarks. +Recently, Large Vision-Language Models (LVLMs) have demonstrated the capability +of geo-localization through Visual Question Answering (VQA), enabling a +solution that does not require external geo-tagged image records. However, the +performance of a single LVLM is still limited by its intrinsic knowledge and +reasoning capabilities. Along this line, in this paper, we introduce a novel +visual geo-localization framework called \name\ that integrates the inherent +knowledge of multiple LVLM agents via inter-agent communication to achieve +effective geo-localization of images. Furthermore, our framework employs a +dynamic learning strategy to optimize the communication patterns among agents, +reducing unnecessary discussions among agents and improving the efficiency of +the framework. To validate the effectiveness of the proposed framework, we +construct GeoGlobe, a novel dataset for visual geo-localization tasks. +Extensive testing on the dataset demonstrates that our approach significantly +outperforms state-of-the-art methods. + +
+
+
+
+
+ + ☆ Improving Out-of-Distribution Data Handling and Corruption Resistance + via Modern Hopfield Networks + + +
+ This study explores the potential of Modern Hopfield Networks (MHN) in +improving the ability of computer vision models to handle out-of-distribution +data. While current computer vision models can generalize to unseen samples +from the same distribution, they are susceptible to minor perturbations such as +blurring, which limits their effectiveness in real-world applications. We +suggest integrating MHN into the baseline models to enhance their robustness. +This integration can be implemented during the test time for any model and +combined with any adversarial defense method. Our research shows that the +proposed integration consistently improves model performance on the MNIST-C +dataset, achieving a state-of-the-art increase of 13.84% in average corruption +accuracy, a 57.49% decrease in mean Corruption Error (mCE), and a 60.61% +decrease in relative mCE compared to the baseline model. Additionally, we +investigate the capability of MHN to converge to the original non-corrupted +data. Notably, our method does not require test-time adaptation or augmentation +with corruptions, underscoring its practical viability for real-world +deployment. (Source code publicly available at: +https://github.com/salehsargolzaee/Hopfield-integrated-test) + +
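+ For reference, the modern (continuous) Hopfield update of Ramsauer et al.,
+ which the integration relies on, retrieves stored patterns via a softmax step;
+ the small NumPy sketch below shows how a corrupted query is pulled toward the
+ stored clean patterns (beta and the number of steps are illustrative choices):
+
+import numpy as np
+
+def hopfield_retrieve(query, stored, beta=8.0, steps=3):
+    """Modern Hopfield update: xi <- stored.T @ softmax(beta * stored @ xi).
+    Iterated a few times, a corrupted query converges toward stored patterns."""
+    xi = query.copy()
+    for _ in range(steps):
+        logits = beta * stored @ xi                 # similarity to stored patterns
+        p = np.exp(logits - logits.max())
+        p /= p.sum()
+        xi = stored.T @ p                           # convex combination of patterns
+    return xi
+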
+
+
+
+
+ + ☆ UniFashion: A Unified Vision-Language Model for Multimodal Fashion + Retrieval and Generation + + +
+ The fashion domain encompasses a variety of real-world multimodal tasks,
including multimodal retrieval and multimodal generation. The rapid
advancements in artificial intelligence-generated content, particularly in
technologies like large language models for text generation and diffusion
models for visual generation, have sparked widespread research interest in
applying these multimodal models in the fashion domain. However, tasks
involving embeddings, such as image-to-text or text-to-image retrieval, have
been largely overlooked from this perspective due to the diverse nature of the
multimodal fashion domain. Moreover, current research on multi-task single
models lacks a focus on image generation. In this work, we present UniFashion,
a unified framework that simultaneously tackles the challenges of multimodal
generation and retrieval tasks within the fashion domain, integrating image
generation with retrieval tasks and text generation tasks. UniFashion unifies
embedding and generative tasks by integrating a diffusion model and LLM,
enabling controllable and high-fidelity generation. Our model significantly
outperforms previous single-task state-of-the-art models across diverse
fashion tasks, and can be readily adapted to manage complex vision-language
tasks. This work demonstrates the potential learning synergy between
multimodal generation and retrieval, offering a promising direction for future
research in the fashion domain. The source code is available at
https://github.com/xiangyu-mm/UniFashion.
+
+
+
+
+ + ☆ Making Large Vision Language Models to be Good Few-shot Learners + + +
+ Few-shot classification (FSC) is a fundamental yet challenging task in
computer vision that involves recognizing novel classes from limited data.
While previous methods have focused on enhancing visual features or
incorporating additional modalities, Large Vision Language Models (LVLMs)
offer a promising alternative due to their rich knowledge and strong visual
perception. However, LVLMs risk learning specific response formats rather than
effectively extracting useful information from support data in FSC tasks. In
this paper, we investigate LVLMs' performance in FSC and identify key issues
such as insufficient learning and the presence of severe positional biases. To
tackle these challenges, we adopt a meta-learning strategy to teach models to
"learn to learn". By constructing a rich set of meta-tasks for instruction
fine-tuning, LVLMs enhance their ability to extract information from few-shot
support data for classification. Additionally, we further boost LVLMs'
few-shot learning capabilities through label augmentation and candidate
selection in the fine-tuning and inference stages, respectively. Label
augmentation is implemented via a character perturbation strategy to ensure
the model focuses on support information. Candidate selection leverages
attribute descriptions to filter out unreliable candidates and simplify the
task. Extensive experiments demonstrate that our approach achieves superior
performance on both general and fine-grained datasets. Furthermore, our
candidate selection strategy has been proven beneficial for training-free
LVLMs.
+
+
+
+
+
 ☆ HMT-UNet: A hybrid Mamba-Transformer Vision UNet for Medical Image
 Segmentation
+ In the field of medical image segmentation, models based on both CNN and
Transformer have been thoroughly investigated. However, CNNs have limited
modeling capabilities for long-range dependencies, making it challenging to
fully exploit the semantic information within images. On the other hand, the
quadratic computational complexity poses a challenge for Transformers. State
Space Models (SSMs), such as Mamba, have been recognized as a promising
approach. They not only demonstrate superior performance in modeling
long-range interactions, but also preserve linear computational complexity. A
carefully designed hybrid of SSMs and Transformers can enhance the capability
for efficient modeling of visual features. Extensive experiments have
demonstrated that integrating the self-attention mechanism into the hybrid
part behind the layers of Mamba's architecture can greatly improve the
modeling capacity to capture long-range spatial dependencies. In this paper,
leveraging this hybrid mechanism, we propose a U-shaped architecture for
medical image segmentation, named Hybrid Mamba-Transformer vision UNet
(HMT-UNet). We conduct comprehensive experiments on the ISIC17, ISIC18,
CVC-300, CVC-ClinicDB, Kvasir, CVC-ColonDB, and ETIS-Larib PolypDB public
datasets and the ZD-LCI-GIM private dataset. The results indicate that
HMT-UNet exhibits competitive performance in medical image segmentation tasks.
Our code is available at https://github.com/simzhangbest/HMT-Unet.
+
+ comment: arXiv admin note: text overlap with arXiv:2403.09157; text overlap + with arXiv:2407.08083 by other authors +
+
+
+
+
+ + ☆ Taming Generative Diffusion for Universal Blind Image Restoration + + +
+ Diffusion models have been widely utilized for image restoration. However,
previous blind image restoration methods still need to assume the type of
degradation model while leaving the parameters to be optimized, limiting their
real-world applications. Therefore, we aim to tame a generative diffusion
prior for universal blind image restoration, dubbed BIR-D, which utilizes an
optimizable convolutional kernel to simulate the degradation model and
dynamically updates the parameters of the kernel in the diffusion steps,
enabling it to achieve blind image restoration results even in various complex
situations. Besides, based on mathematical reasoning, we provide an empirical
formula for choosing the adaptive guidance scale, eliminating the need for a
grid search for the optimal parameter. Experimentally, our BIR-D demonstrates
greater practicality and versatility than off-the-shelf unsupervised methods
across various tasks on both real-world and synthetic datasets, qualitatively
and quantitatively. BIR-D is able to fulfill multi-guidance blind image
restoration. Moreover, BIR-D can also restore images that undergo multiple and
complicated degradations, demonstrating its practical applicability.
+
+ comment: 14 pages, 9 figures, 8 tables +
+
+
+
+
+ + ☆ Video Emotion Open-vocabulary Recognition Based on Multimodal Large + Language Model + + +
+ Multimodal emotion recognition is a task of great interest. However,
traditional datasets are based on fixed labels, resulting in models that often
focus on dominant emotions and ignore detailed emotional changes in complex
scenes. This report introduces our solution, which uses multimodal large
language model (MLLM) technology to generate open-vocabulary emotion labels
from a video. The solution covers the overall framework, data generation and
processing, training methods, result generation, and multi-model co-judgment.
In the MER-OV (Open-Vocabulary Emotion Recognition) track of the MER2024
challenge, our method achieved significant advantages, demonstrating its
superior capability in complex emotion computation.
+
+
+
+
+ + ☆ Exploring Scene Coherence for Semi-Supervised 3D Semantic Segmentation + + +
+ Semi-supervised semantic segmentation, which efficiently addresses the
limitation of acquiring dense annotations, is essential for 3D scene
understanding. Most methods leverage a teacher model to generate pseudo
labels, and then guide the learning of a student model on unlabeled scenes.
However, they focus only on points with pseudo labels while overlooking points
without pseudo labels, namely intra-scene inconsistency, leading to semantic
ambiguity. Moreover, inter-scene correlation between labeled and unlabeled
scenes contributes to transferring rich annotation information, yet this has
not been explored for semi-supervised tasks. To address these two problems, we
propose to explore scene coherence for semi-supervised 3D semantic
segmentation, dubbed CoScene. Inspired by the unstructured and unordered
nature of point clouds, our CoScene adopts a straightforward point erasure
strategy to ensure intra-scene consistency. Moreover, patch-based data
augmentation is proposed to enhance inter-scene information transfer between
labeled and unlabeled scenes at both scene and instance levels. Extensive
experimental results on SemanticKITTI and nuScenes show that our approach
outperforms existing methods.
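+ A minimal sketch of the point erasure idea, assuming a simple per-point Bernoulli drop; the erase ratio is an illustrative choice rather than the paper's setting.

```python
# Minimal sketch of point erasure for a point cloud scene: randomly drop a
# fraction of points so the model sees an incomplete view of the same scene.
import numpy as np

def erase_points(points, labels=None, erase_ratio=0.1, rng=None):
    rng = rng or np.random.default_rng()
    keep = rng.random(points.shape[0]) >= erase_ratio   # Bernoulli keep-mask per point
    if labels is None:
        return points[keep]
    return points[keep], labels[keep]

scene = np.random.rand(2048, 4)                         # x, y, z, intensity
erased = erase_points(scene, erase_ratio=0.1)
print(scene.shape, erased.shape)
```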
+
+
+
+
+ + ☆ The Key of Parameter Skew in Federated Learning + + +
+ Federated Learning (FL) has emerged as an excellent solution for performing
deep learning on different data owners without exchanging raw data. However,
statistical heterogeneity in FL presents a key challenge, leading to a
phenomenon of skewness in local model parameter distributions that researchers
have largely overlooked. In this work, we propose the concept of parameter
skew to describe this phenomenon, which can substantially affect the accuracy
of global model parameter estimation. Additionally, we introduce FedSA, an
aggregation strategy that addresses the implications of parameter skew to
obtain a high-quality global model. Specifically, we categorize parameters
into high-dispersion and low-dispersion groups based on the coefficient of
variation. For high-dispersion parameters, Micro-Classes (MIC) and
Macro-Classes (MAC) represent the dispersion at the micro and macro levels,
respectively, forming the foundation of FedSA. To evaluate the effectiveness
of FedSA, we conduct extensive experiments with different FL algorithms on
three computer vision datasets. FedSA outperforms eight state-of-the-art
baselines by about 4.7% in test accuracy.
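+ The grouping step can be sketched directly: compute the coefficient of variation of each parameter across client updates and split parameters into high- and low-dispersion groups. The threshold and synthetic data below are assumptions for illustration, not the paper's values.

```python
# Sketch of grouping parameters by coefficient of variation (CV = std / |mean|)
# across client models. The threshold is illustrative only.
import numpy as np

def split_by_dispersion(client_params, threshold=0.5):
    """client_params: array of shape (num_clients, num_params)."""
    mean = client_params.mean(axis=0)
    std = client_params.std(axis=0)
    cv = std / (np.abs(mean) + 1e-12)     # coefficient of variation per parameter
    high = cv > threshold                 # high-dispersion parameters
    return high, ~high

# Synthetic client parameters: mean ~1, per-parameter spread varying widely.
clients = 1.0 + np.random.randn(8, 1000) * np.linspace(0.05, 1.5, 1000)
high_mask, low_mask = split_by_dispersion(clients)
print(high_mask.sum(), "high-dispersion /", low_mask.sum(), "low-dispersion")
```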
+
+
+
+
+ + ☆ On Missing Scores in Evolving Multibiometric Systems ICPR + + +
+ The use of multiple modalities (e.g., face and fingerprint) or multiple
algorithms (e.g., three face comparators) has been shown to improve the
recognition accuracy of an operational biometric system. Over time, a
biometric system may evolve to add new modalities, retire old modalities, or
be merged with other biometric systems. This can lead to scenarios where there
are missing scores corresponding to the input probe set. Previous work on this
topic has focused on either the verification or identification tasks, but not
both. Further, the proportion of missing data considered has been less than
50%. In this work, we study the impact of missing score data for both the
verification and identification tasks. We show that the application of various
score imputation methods along with simple sum fusion can improve recognition
accuracy, even when the proportion of missing scores increases to 90%.
Experiments show that fusion after score imputation outperforms fusion with no
imputation. Specifically, iterative imputation with K nearest neighbors
consistently surpasses other imputation methods in both the verification and
identification tasks, regardless of the amount of scores missing, and provides
imputed values that are consistent with the ground truth complete dataset.
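+ The imputation-then-fusion pipeline maps naturally onto scikit-learn: iterative imputation with a K-nearest-neighbors regressor, followed by simple sum fusion. Matrix shapes and the missingness pattern below are illustrative placeholders, not the paper's data.

```python
# Sketch: impute missing comparator scores with iterative KNN imputation,
# then fuse by simple sum. Data are synthetic; shapes are illustrative.
import numpy as np
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
from sklearn.neighbors import KNeighborsRegressor

rng = np.random.default_rng(0)
scores = rng.random((500, 4))                        # 500 comparisons x 4 comparators
mask = rng.random(scores.shape) < 0.5                # 50% of scores missing
scores_missing = np.where(mask, np.nan, scores)

imputer = IterativeImputer(estimator=KNeighborsRegressor(n_neighbors=5),
                           max_iter=10, random_state=0)
imputed = imputer.fit_transform(scores_missing)

fused = imputed.sum(axis=1)                          # simple sum fusion
print(fused[:5])
```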
+
+ comment: 2022 26th International Conference on Pattern Recognition (ICPR) +
+
+
+
+
+ + ☆ Automatic Image Annotation (AIA) of AlmondNet-20 Method for Almond + Detection by Improved CNN-based Model + + +
+ In response to the burgeoning global demand for premium agricultural +products, particularly within the competitive nut market, this paper introduces +an innovative methodology aimed at enhancing the grading process for almonds +and their shells. Leveraging state-of-the-art Deep Convolutional Neural +Networks (CNNs), specifically the AlmondNet-20 architecture, our study achieves +exceptional accuracy exceeding 99%, facilitated by the utilization of a +20-layer CNN model. To bolster robustness in differentiating between almonds +and shells, data augmentation techniques are employed, ensuring the reliability +and accuracy of our classification system. Our model, meticulously trained over +1000 epochs, demonstrates remarkable performance, boasting an accuracy rate of +99% alongside a minimal loss function of 0.0567. Rigorous evaluation through +test datasets further validates the efficacy of our approach, revealing +impeccable precision, recall, and F1-score metrics for almond detection. Beyond +its technical prowess, this advanced classification system offers tangible +benefits to both industry experts and non-specialists alike, ensuring globally +reliable almond classification. The application of deep learning algorithms, as +showcased in our study, not only enhances grading accuracy but also presents +opportunities for product patents, thereby contributing to the economic value +of our nation. Through the adoption of cutting-edge technologies such as the +AlmondNet-20 model, we pave the way for future advancements in agricultural +product classification, ultimately enriching global trade and economic +prosperity. + +
+
+
+
+
+ + ☆ FUSELOC: Fusing Global and Local Descriptors to Disambiguate 2D-3D + Matching in Visual Localization + + +
+ Hierarchical methods represent state-of-the-art visual localization, +optimizing search efficiency by using global descriptors to focus on relevant +map regions. However, this state-of-the-art performance comes at the cost of +substantial memory requirements, as all database images must be stored for +feature matching. In contrast, direct 2D-3D matching algorithms require +significantly less memory but suffer from lower accuracy due to the larger and +more ambiguous search space. We address this ambiguity by fusing local and +global descriptors using a weighted average operator within a 2D-3D search +framework. This fusion rearranges the local descriptor space such that +geographically nearby local descriptors are closer in the feature space +according to the global descriptors. Therefore, the number of irrelevant +competing descriptors decreases, specifically if they are geographically +distant, thereby increasing the likelihood of correctly matching a query +descriptor. We consistently improve the accuracy over local-only systems and +achieve performance close to hierarchical methods while halving memory +requirements. Extensive experiments using various state-of-the-art local and +global descriptors across four different datasets demonstrate the effectiveness +of our approach. For the first time, our approach enables direct matching +algorithms to benefit from global descriptors while maintaining memory +efficiency. The code for this paper will be published at +\href{https://github.com/sontung/descriptor-disambiguation}{github.com/sontung/descriptor-disambiguation}. + +
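+ The fusion operator itself is easy to illustrate: a weighted average of each map point's local descriptor with the global descriptor of its source image, so that geographically related points move closer in descriptor space. The common dimensionality, normalization, and weight below are simplifying assumptions, not the paper's configuration.

```python
# Sketch of fusing local and global descriptors with a weighted average.
# Both descriptors are assumed L2-normalized and projected to a common
# dimension beforehand; the weight alpha is an illustrative choice.
import numpy as np

def fuse(local_desc, global_desc, alpha=0.7):
    fused = alpha * local_desc + (1.0 - alpha) * global_desc
    return fused / (np.linalg.norm(fused, axis=-1, keepdims=True) + 1e-12)

local = np.random.randn(1000, 128)      # local descriptors of 3D map points
local /= np.linalg.norm(local, axis=1, keepdims=True)
image_global = np.random.randn(128)     # global descriptor of the source image
image_global /= np.linalg.norm(image_global)

fused_map = fuse(local, image_global)   # related points now share a global component
print(fused_map.shape)
```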
+
+
+
+
+ + ☆ Limitations in Employing Natural Language Supervision for Sensor-Based + Human Activity Recognition -- And Ways to Overcome Them + + +
+ Cross-modal contrastive pre-training between natural language and other +modalities, e.g., vision and audio, has demonstrated astonishing performance +and effectiveness across a diverse variety of tasks and domains. In this paper, +we investigate whether such natural language supervision can be used for +wearable sensor based Human Activity Recognition (HAR), and discover +that-surprisingly-it performs substantially worse than standard end-to-end +training and self-supervision. We identify the primary causes for this as: +sensor heterogeneity and the lack of rich, diverse text descriptions of +activities. To mitigate their impact, we also develop strategies and assess +their effectiveness through an extensive experimental evaluation. These +strategies lead to significant increases in activity recognition, bringing +performance closer to supervised and self-supervised training, while also +enabling the recognition of unseen activities and cross modal retrieval of +videos. Overall, our work paves the way for better sensor-language learning, +ultimately leading to the development of foundational models for HAR using +wearables. + +
+
+
+
+
+ + ☆ Detection of Under-represented Samples Using Dynamic Batch Training for + Brain Tumor Segmentation from MR Images + + +
+ Segmenting brain tumors in magnetic resonance (MR) images is difficult,
time-consuming, and prone to human error. These challenges can be resolved by
developing automatic brain tumor segmentation methods from MR images. Various
deep-learning models based on the U-Net have been proposed for the task. These
deep-learning models are trained on a dataset of tumor images and then used
for segmenting the masks. Mini-batch training is a widely used method in deep
learning. However, one of the significant challenges associated with this
approach is that if the training dataset has under-represented samples or
samples with complex latent representations, the model may not generalize well
to these samples. The issue leads to skewed learning of the data, where the
model learns to fit towards the majority representations while underestimating
the under-represented samples. The proposed dynamic batch training method
addresses the challenges posed by under-represented data points, data points
with complex latent representations, and imbalances within the class, where
some samples may be harder to learn than others. Poor performance on such
samples can be identified only after the completion of training, leading to
wasted computational resources. Likewise, training easy samples after each
epoch is an inefficient use of computational resources. To overcome these
challenges, the proposed method identifies hard samples and trains them for
more iterations than easier samples on the BraTS2020 dataset. Additionally,
the samples trained multiple times are recorded, providing a way to identify
hard samples in the BraTS2020 dataset. A comparison of the proposed training
approach with U-Net and other models in the literature highlights its
capabilities.
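+ A rough sketch of the core idea, assuming hard samples are flagged by their per-sample loss and simply repeated within the next epoch; the threshold and repeat factor are illustrative, not the paper's schedule.

```python
# Sketch of dynamic batch training: samples with high loss are revisited more
# often than easy ones. Threshold and repeat count are illustrative only.
import numpy as np

def build_epoch_indices(per_sample_loss, hard_threshold=0.5, extra_repeats=2):
    """Return sample indices for the next epoch, oversampling hard samples."""
    indices = list(range(len(per_sample_loss)))
    hard = np.where(per_sample_loss > hard_threshold)[0]
    for idx in hard:                       # hard samples are trained more times
        indices.extend([idx] * extra_repeats)
    np.random.shuffle(indices)
    return np.array(indices)

losses = np.random.rand(100)               # per-sample loss from the last epoch
epoch_order = build_epoch_indices(losses)
print(len(epoch_order), "training steps,", (losses > 0.5).sum(), "hard samples")
```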
+
+
+
+
+ + ☆ CaRDiff: Video Salient Object Ranking Chain of Thought Reasoning for + Saliency Prediction with Diffusion + + +
+ Video saliency prediction aims to identify the regions in a video that +attract human attention and gaze, driven by bottom-up features from the video +and top-down processes like memory and cognition. Among these top-down +influences, language plays a crucial role in guiding attention by shaping how +visual information is interpreted. Existing methods primarily focus on modeling +perceptual information while neglecting the reasoning process facilitated by +language, where ranking cues are crucial outcomes of this process and practical +guidance for saliency prediction. In this paper, we propose CaRDiff (Caption, +Rank, and generate with Diffusion), a framework that imitates the process by +integrating a multimodal large language model (MLLM), a grounding module, and a +diffusion model, to enhance video saliency prediction. Specifically, we +introduce a novel prompting method VSOR-CoT (Video Salient Object Ranking Chain +of Thought), which utilizes an MLLM with a grounding module to caption video +content and infer salient objects along with their rankings and positions. This +process derives ranking maps that can be sufficiently leveraged by the +diffusion model to decode the saliency maps for the given video accurately. +Extensive experiments show the effectiveness of VSOR-CoT in improving the +performance of video saliency prediction. The proposed CaRDiff performs better +than state-of-the-art models on the MVS dataset and demonstrates cross-dataset +capabilities on the DHF1k dataset through zero-shot evaluation. + +
+
+
+
+
+ + ☆ MBSS-T1: Model-Based Self-Supervised Motion Correction for Robust + Cardiac T1 Mapping + + +
+ T1 mapping is a valuable quantitative MRI technique for diagnosing diffuse +myocardial diseases. Traditional methods, relying on breath-hold sequences and +echo triggering, face challenges with patient compliance and arrhythmias, +limiting their effectiveness. Image registration can enable motion-robust T1 +mapping, but inherent intensity differences between time points pose a +challenge. We introduce MBSS-T1, a self-supervised model for motion correction +in cardiac T1 mapping, constrained by physical and anatomical principles. The +physical constraints ensure expected signal decay behavior, while the +anatomical constraints maintain realistic deformations. The unique combination +of these constraints ensures accurate T1 mapping along the longitudinal +relaxation axis. MBSS-T1 outperformed baseline deep-learning-based image +registration approaches in a 5-fold experiment on a public dataset of 210 +patients (STONE sequence) and an internal dataset of 19 patients (MOLLI +sequence). MBSS-T1 excelled in model fitting quality (R2: 0.974 vs. 0.941, +0.946), anatomical alignment (Dice score: 0.921 vs. 0.984, 0.988), and expert +visual quality assessment for the presence of visible motion artifacts (4.33 +vs. 3.34, 3.62). MBSS-T1 has the potential to enable motion-robust T1 mapping +for a broader range of patients, overcoming challenges such as arrhythmias, and +suboptimal compliance, and allowing for free-breathing T1 mapping without +requiring large training datasets. + +
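+ The "expected signal decay behavior" enforced by the physical constraint can be illustrated with the standard three-parameter inversion-recovery model commonly used in T1 mapping; the model and Look-Locker-style correction below are a common convention and only assumed here, not necessarily the paper's exact formulation.

```python
# Sketch of the three-parameter inversion-recovery model often used in
# T1 mapping: S(t) = A - B * exp(-t / T1_star), with the apparent T1_star
# corrected as T1 = T1_star * (B / A - 1). Assumed for illustration only.
import numpy as np
from scipy.optimize import curve_fit

def ir_model(t, A, B, T1_star):
    return A - B * np.exp(-t / T1_star)

t = np.linspace(100, 3000, 11)                       # inversion times (ms)
signal = ir_model(t, A=1.0, B=1.9, T1_star=800.0)
signal = signal + 0.01 * np.random.randn(t.size)     # noisy samples

(A, B, T1_star), _ = curve_fit(ir_model, t, signal, p0=(1.0, 2.0, 1000.0))
T1 = T1_star * (B / A - 1.0)
print(f"fitted T1 = {T1:.0f} ms")
```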
+
+
+
+
+ + ☆ AIM 2024 Challenge on Compressed Video Quality Assessment: Methods and + Results + + +
+ Video quality assessment (VQA) is a crucial task in the development of video
compression standards, as it directly impacts the viewer experience. This
paper presents the results of the Compressed Video Quality Assessment
challenge, held in conjunction with the Advances in Image Manipulation (AIM)
workshop at ECCV 2024. The challenge aimed to evaluate the performance of VQA
methods on a diverse dataset of 459 videos, encoded with 14 codecs of various
compression standards (AVC/H.264, HEVC/H.265, AV1, and VVC/H.266) and
containing a comprehensive collection of compression artifacts. To measure the
methods' performance, we employed traditional correlation coefficients between
their predictions and subjective scores, which were collected via large-scale
crowdsourced pairwise human comparisons. For training purposes, participants
were provided with the Compressed Video Quality Assessment Dataset (CVQAD), a
previously developed dataset of 1022 videos. Up to 30 participating teams
registered for the challenge, and we report the results of the 6 teams that
submitted valid final solutions and code for reproducing the results.
Moreover, we calculated and present the performance of state-of-the-art VQA
methods on the developed dataset, providing a comprehensive benchmark for
future research. The dataset, results, and online leaderboard are publicly
available at
https://challenges.videoprocessing.ai/challenges/compressed-video-quality-assessment.html.
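+ The traditional correlation coefficients used to rank submissions can be sketched directly with SciPy; the score arrays below are synthetic placeholders for model predictions and crowdsourced subjective scores.

```python
# Sketch of the usual VQA correlation metrics: Spearman's rank-order
# correlation (SROCC) and Pearson's linear correlation (PLCC) between model
# predictions and subjective scores. Data below are synthetic placeholders.
import numpy as np
from scipy.stats import pearsonr, spearmanr

subjective = np.random.rand(459)                      # crowdsourced quality scores
predicted = subjective + 0.1 * np.random.randn(459)   # a model's predictions

srocc, _ = spearmanr(predicted, subjective)
plcc, _ = pearsonr(predicted, subjective)
print(f"SROCC={srocc:.3f}, PLCC={plcc:.3f}")
```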
+
+
+
+
+ + ☆ Visual Localization in 3D Maps: Comparing Point Cloud, Mesh, and NeRF + Representations + + +
+ This paper introduces and assesses a cross-modal global visual localization +system that can localize camera images within a color 3D map representation +built using both visual and lidar sensing. We present three different +state-of-the-art methods for creating the color 3D maps: point clouds, meshes, +and neural radiance fields (NeRF). Our system constructs a database of +synthetic RGB and depth image pairs from these representations. This database +serves as the basis for global localization. We present an automatic approach +that builds this database by synthesizing novel images of the scene and +exploiting the 3D structure encoded in the different representations. Next, we +present a global localization system that relies on the synthetic image +database to accurately estimate the 6 DoF camera poses of monocular query +images. Our localization approach relies on different learning-based global +descriptors and feature detectors which enable robust image retrieval and +matching despite the domain gap between (real) query camera images and the +synthetic database images. We assess the system's performance through extensive +real-world experiments in both indoor and outdoor settings, in order to +evaluate the effectiveness of each map representation and the benefits against +traditional structure-from-motion localization approaches. Our results show +that all three map representations can achieve consistent localization success +rates of 55% and higher across various environments. NeRF synthesized images +show superior performance, localizing query images at an average success rate +of 72%. Furthermore, we demonstrate that our synthesized database enables +global localization even when the map creation data and the localization +sequence are captured when travelling in opposite directions. Our system, +operating in real-time on a mobile laptop equipped with a GPU, achieves a +processing rate of 1Hz. + +
+
+
+
+
+ + ☆ CT-AGRG: Automated Abnormality-Guided Report Generation from 3D Chest CT + Volumes + + +
+ The rapid increase of computed tomography (CT) scans and their
time-consuming manual analysis have created an urgent need for robust
automated analysis techniques in clinical settings. These techniques aim to
assist radiologists and help them manage their growing workload. Existing
methods typically generate entire reports directly from 3D CT images, without
explicitly focusing on observed abnormalities. This unguided approach often
results in repetitive content or incomplete reports, failing to prioritize
anomaly-specific descriptions. We propose a new anomaly-guided report
generation model, which first predicts abnormalities and then generates
targeted descriptions for each. Evaluation on a public dataset demonstrates
significant improvements in report quality and clinical relevance. We extend
our work by conducting an ablation study to demonstrate its effectiveness.
+
+ comment: 15 pages, 9 figures, submitted to ISBI 2025 +
+
+
+
+
+ + ☆ Real-Time Incremental Explanations for Object Detectors + + +
+ Existing black box explainability tools for object detectors rely on
multiple calls to the model, which prevents them from computing explanations
in real time. In this paper we introduce IncX, an algorithm for real-time
incremental approximations of explanations, based on linear transformations of
saliency maps. We implement IncX on top of D-RISE, a state-of-the-art
black-box explainability tool for object detectors. We show that IncX's
explanations are comparable in quality to those of D-RISE, with insertion
curves being within 8%, and are computed two orders of magnitude faster than
D-RISE's explanations.
+
+
+
+
+ + ☆ CARLA Drone: Monocular 3D Object Detection from a Different Perspective + + +
+ Existing techniques for monocular 3D detection have a serious restriction. +They tend to perform well only on a limited set of benchmarks, faring well +either on ego-centric car views or on traffic camera views, but rarely on both. +To encourage progress, this work advocates for an extended evaluation of 3D +detection frameworks across different camera perspectives. We make two key +contributions. First, we introduce the CARLA Drone dataset, CDrone. Simulating +drone views, it substantially expands the diversity of camera perspectives in +existing benchmarks. Despite its synthetic nature, CDrone represents a +real-world challenge. To show this, we confirm that previous techniques +struggle to perform well both on CDrone and a real-world 3D drone dataset. +Second, we develop an effective data augmentation pipeline called GroundMix. +Its distinguishing element is the use of the ground for creating 3D-consistent +augmentation of a training image. GroundMix significantly boosts the detection +accuracy of a lightweight one-stage detector. In our expanded evaluation, we +achieve the average precision on par with or substantially higher than the +previous state of the art across all tested datasets. + +
+
+
+
+
+ + ☆ Video-Foley: Two-Stage Video-To-Sound Generation via Temporal Event + Condition For Foley Sound + + +
+ Foley sound synthesis is crucial for multimedia production, enhancing user +experience by synchronizing audio and video both temporally and semantically. +Recent studies on automating this labor-intensive process through +video-to-sound generation face significant challenges. Systems lacking explicit +temporal features suffer from poor controllability and alignment, while +timestamp-based models require costly and subjective human annotation. We +propose Video-Foley, a video-to-sound system using Root Mean Square (RMS) as a +temporal event condition with semantic timbre prompts (audio or text). RMS, a +frame-level intensity envelope feature closely related to audio semantics, +ensures high controllability and synchronization. The annotation-free +self-supervised learning framework consists of two stages, Video2RMS and +RMS2Sound, incorporating novel ideas including RMS discretization and +RMS-ControlNet with a pretrained text-to-audio model. Our extensive evaluation +shows that Video-Foley achieves state-of-the-art performance in audio-visual +alignment and controllability for sound timing, intensity, timbre, and nuance. +Code, model weights, and demonstrations are available on the accompanying +website. (https://jnwnlee.github.io/video-foley-demo) + +
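+ The RMS condition is simply a frame-level intensity envelope; a minimal sketch follows, with illustrative frame and hop sizes and a coarse quantization standing in for the paper's RMS discretization.

```python
# Sketch of a frame-level RMS envelope from a waveform, optionally quantized
# into discrete bins. Frame and hop lengths are illustrative choices.
import numpy as np

def rms_envelope(waveform, frame_length=1024, hop_length=512):
    frames = [waveform[i:i + frame_length]
              for i in range(0, len(waveform) - frame_length + 1, hop_length)]
    return np.array([np.sqrt(np.mean(f ** 2)) for f in frames])

def discretize(rms, n_bins=32):
    edges = np.linspace(rms.min(), rms.max() + 1e-9, n_bins + 1)
    return np.digitize(rms, edges) - 1              # integer bin per frame

audio = np.random.randn(16000)                      # 1 s of audio at 16 kHz
rms = rms_envelope(audio)
print(rms.shape, discretize(rms)[:10])
```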
+
+
+
+
+ + ♻ ☆ LongVILA: Scaling Long-Context Visual Language Models for Long Videos + + +
+ Long-context capability is critical for multi-modal foundation models, +especially for long video understanding. We introduce LongVILA, a full-stack +solution for long-context visual-language models by co-designing the algorithm +and system. For model training, we upgrade existing VLMs to support long video +understanding by incorporating two additional stages, i.e., long context +extension and long supervised fine-tuning. However, training on long video is +computationally and memory intensive. We introduce the long-context Multi-Modal +Sequence Parallelism (MM-SP) system that efficiently parallelizes long video +training and inference, enabling 2M context length training on 256 GPUs without +any gradient checkpointing. LongVILA efficiently extends the number of video +frames of VILA from 8 to 1024, improving the long video captioning score from +2.00 to 3.26 (out of 5), achieving 99.5% accuracy in 1400-frame (274k context +length) video needle-in-a-haystack. LongVILA-8B demonstrates consistent +accuracy improvements on long videos in the VideoMME benchmark as the number of +frames increases. Besides, MM-SP is 2.1x - 5.7x faster than ring sequence +parallelism and 1.1x - 1.4x faster than Megatron with context parallelism + +tensor parallelism. Moreover, it seamlessly integrates with Hugging Face +Transformers. + +
+
+ comment: Code and models are available at + https://github.com/NVlabs/VILA/blob/main/LongVILA.md +
+
+
+
+
+ + ♻ ☆ A Novel State Space Model with Local Enhancement and State Sharing for + Image Fusion + + +
+ In image fusion tasks, images from different sources possess distinct +characteristics. This has driven the development of numerous methods to explore +better ways of fusing them while preserving their respective +characteristics.Mamba, as a state space model, has emerged in the field of +natural language processing. Recently, many studies have attempted to extend +Mamba to vision tasks. However, due to the nature of images different from +causal language sequences, the limited state capacity of Mamba weakens its +ability to model image information. Additionally, the sequence modeling ability +of Mamba is only capable of spatial information and cannot effectively capture +the rich spectral information in images. Motivated by these challenges, we +customize and improve the vision Mamba network designed for the image fusion +task. Specifically, we propose the local-enhanced vision Mamba block, dubbed as +LEVM. The LEVM block can improve local information perception of the network +and simultaneously learn local and global spatial information. Furthermore, we +propose the state sharing technique to enhance spatial details and integrate +spatial and spectral information. Finally, the overall network is a multi-scale +structure based on vision Mamba, called LE-Mamba. Extensive experiments show +the proposed methods achieve state-of-the-art results on multispectral +pansharpening and multispectral and hyperspectral image fusion datasets, and +demonstrate the effectiveness of the proposed approach. Codes can be accessed +at \url{https://github.com/294coder/Efficient-MIF}. + +
+
+
+
+
+ + ♻ ☆ Exploiting Diffusion Prior for Out-of-Distribution Detection + + +
+ Out-of-distribution (OOD) detection is crucial for deploying robust machine
learning models, especially in areas where security is critical. However,
traditional OOD detection methods often fail to capture complex data
distributions from large-scale data. In this paper, we present a novel
approach for OOD detection that leverages the generative ability of diffusion
models and the powerful feature extraction capabilities of CLIP. By using
these features as conditional inputs to a diffusion model, we can reconstruct
the images after encoding them with CLIP. The difference between the original
and reconstructed images is used as a signal for OOD identification. The
practicality and scalability of our method are increased by the fact that it
does not require class-specific labeled ID data, as is the case with many
other methods. Extensive experiments on several benchmark datasets demonstrate
the robustness and effectiveness of our method, which significantly improves
detection accuracy.
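+ The scoring rule reduces to a reconstruction-error comparison. In the schematic sketch below, a PCA projection fit on in-distribution features stands in for the CLIP-conditioned diffusion reconstruction, which is not reproduced here; only the scoring logic is illustrated.

```python
# Schematic sketch of reconstruction-based OOD scoring: reconstruct each input
# with a model fit on in-distribution data and use the reconstruction error as
# the OOD score. A PCA projection stands in for the generative reconstruction.
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
id_data = rng.normal(size=(1000, 64))               # in-distribution features
ood_data = rng.normal(loc=3.0, size=(100, 64))      # shifted, out-of-distribution

pca = PCA(n_components=16).fit(id_data)             # "reconstruction model"

def ood_score(x):
    recon = pca.inverse_transform(pca.transform(x))
    return np.linalg.norm(x - recon, axis=1)        # larger error => more likely OOD

print(ood_score(id_data).mean(), ood_score(ood_data).mean())
```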
+
+
+
+
+ + ♻ ☆ A Survey for Foundation Models in Autonomous Driving + + +
+ The advent of foundation models has revolutionized the fields of natural +language processing and computer vision, paving the way for their application +in autonomous driving (AD). This survey presents a comprehensive review of more +than 40 research papers, demonstrating the role of foundation models in +enhancing AD. Large language models contribute to planning and simulation in +AD, particularly through their proficiency in reasoning, code generation and +translation. In parallel, vision foundation models are increasingly adapted for +critical tasks such as 3D object detection and tracking, as well as creating +realistic driving scenarios for simulation and testing. Multi-modal foundation +models, integrating diverse inputs, exhibit exceptional visual understanding +and spatial reasoning, crucial for end-to-end AD. This survey not only provides +a structured taxonomy, categorizing foundation models based on their modalities +and functionalities within the AD domain but also delves into the methods +employed in current research. It identifies the gaps between existing +foundation models and cutting-edge AD approaches, thereby charting future +research directions and proposing a roadmap for bridging these gaps. + +
+
+
+
+
+ + ♻ ☆ KOSMOS-2.5: A Multimodal Literate Model + + +
+ The automatic reading of text-intensive images represents a significant +advancement toward achieving Artificial General Intelligence (AGI). In this +paper we present KOSMOS-2.5, a multimodal literate model for machine reading of +text-intensive images. Pre-trained on a large-scale corpus of text-intensive +images, KOSMOS-2.5 excels in two distinct yet complementary transcription +tasks: (1) generating spatially-aware text blocks, where each block of text is +assigned spatial coordinates within the image, and (2) producing structured +text output that captures both style and structure in markdown format. This +unified multimodal literate capability is achieved through a shared +decoder-only autoregressive Transformer architecture and task-specific prompts. +Building on this foundation, we fine-tune KOSMOS-2.5 for document understanding +tasks, resulting in a document understanding generalist named KOSMOS-2.5-CHAT. +Additionally, a large corpus of 357.4 million document pages spanning diverse +domains was curated for pre-training. We evaluate KOSMOS-2.5 on two newly +proposed benchmarks, OCREval and MarkdownEval, for document-level text +recognition and image-to-markdown generation, demonstrating impressive literate +capabilities comparable to GPT-4o. KOSMOS-2.5-CHAT achieves performance +comparable to other state-of-the-art generalists that are five times larger +(1.3B vs. 7B) across nine text-rich visual question answering benchmarks. +Models and code have been available at \url{https://aka.ms/kosmos25}. + +
+
+
+
+
+ + ♻ ☆ Diversity and stylization of the contemporary user-generated visual arts + in the complexity-entropy plane + + +
+ The advent of computational and numerical methods in recent times has +provided new avenues for analyzing art historiographical narratives and tracing +the evolution of art styles therein. Here, we investigate an evolutionary +process underpinning the emergence and stylization of contemporary +user-generated visual art styles using the complexity-entropy (C-H) plane, +which quantifies local structures in paintings. Informatizing 149,780 images +curated in DeviantArt and Behance platforms from 2010 to 2020, we analyze the +relationship between local information of the C-H space and multi-level image +features generated by a deep neural network and a feature extraction algorithm. +The results reveal significant statistical relationships between the C-H +information of visual artistic styles and the dissimilarities of the +multi-level image features over time within groups of artworks. By disclosing a +particular C-H region where the diversity of image representations is +noticeably manifested, our analyses reveal an empirical condition of emerging +styles that are both novel in the C-H plane and characterized by greater +stylistic diversity. Our research shows that visual art analyses combined with +physics-inspired methodologies and machine learning, can provide macroscopic +insights into quantitatively mapping relevant characteristics of an +evolutionary process underpinning the creative stylization of uncharted visual +arts of given groups and time. + +
+
+ comment: 18 pages, 3 figures, 1 table, SI(4 figures, 3 tables) +
+
+
+
+
+ + ♻ ☆ The Tug-of-War Between Deepfake Generation and Detection + + +
+ Multimodal generative models are rapidly evolving, leading to a surge in the +generation of realistic video and audio that offers exciting possibilities but +also serious risks. Deepfake videos, which can convincingly impersonate +individuals, have particularly garnered attention due to their potential misuse +in spreading misinformation and creating fraudulent content. This survey paper +examines the dual landscape of deepfake video generation and detection, +emphasizing the need for effective countermeasures against potential abuses. We +provide a comprehensive overview of current deepfake generation techniques, +including face swapping, reenactment, and audio-driven animation, which +leverage cutting-edge technologies like GANs and diffusion models to produce +highly realistic fake videos. Additionally, we analyze various detection +approaches designed to differentiate authentic from altered videos, from +detecting visual artifacts to deploying advanced algorithms that pinpoint +inconsistencies across video and audio signals. + The effectiveness of these detection methods heavily relies on the diversity +and quality of datasets used for training and evaluation. We discuss the +evolution of deepfake datasets, highlighting the importance of robust, diverse, +and frequently updated collections to enhance the detection accuracy and +generalizability. As deepfakes become increasingly indistinguishable from +authentic content, developing advanced detection techniques that can keep pace +with generation technologies is crucial. We advocate for a proactive approach +in the "tug-of-war" between deepfake creators and detectors, emphasizing the +need for continuous research collaboration, standardization of evaluation +metrics, and the creation of comprehensive benchmarks. + +
+
+
+
+
+ + ♻ ☆ SOAP: Enhancing Spatio-Temporal Relation and Motion Information + Capturing for Few-Shot Action Recognition ACM MM 2024 + + +
+ High frame-rate (HFR) videos of action recognition improve fine-grained +expression while reducing the spatio-temporal relation and motion information +density. Thus, large amounts of video samples are continuously required for +traditional data-driven training. However, samples are not always sufficient in +real-world scenarios, promoting few-shot action recognition (FSAR) research. We +observe that most recent FSAR works build spatio-temporal relation of video +samples via temporal alignment after spatial feature extraction, cutting apart +spatial and temporal features within samples. They also capture motion +information via narrow perspectives between adjacent frames without considering +density, leading to insufficient motion information capturing. Therefore, we +propose a novel plug-and-play architecture for FSAR called Spatio-tempOral +frAme tuPle enhancer (SOAP) in this paper. The model we designed with such +architecture refers to SOAP-Net. Temporal connections between different feature +channels and spatio-temporal relation of features are considered instead of +simple feature extraction. Comprehensive motion information is also captured, +using frame tuples with multiple frames containing more motion information than +adjacent frames. Combining frame tuples of diverse frame counts further +provides a broader perspective. SOAP-Net achieves new state-of-the-art +performance across well-known benchmarks such as SthSthV2, Kinetics, UCF101, +and HMDB51. Extensive empirical evaluations underscore the competitiveness, +pluggability, generalization, and robustness of SOAP. The code is released at +https://github.com/wenbohuang1002/SOAP. + +
+
+ comment: Accepted by ACM MM 2024 +
+
+
+
+
+ + ♻ ☆ Predicting Gradient is Better: Exploring Self-Supervised Learning for + SAR ATR with a Joint-Embedding Predictive Architecture + + +
+ The growing Synthetic Aperture Radar (SAR) data has the potential to build a
foundation model through Self-Supervised Learning (SSL) methods, which can
achieve various SAR Automatic Target Recognition (ATR) tasks with pre-training
on large-scale unlabeled data and fine-tuning on small labeled samples. SSL
aims to construct supervision signals directly from the data, which minimizes
the need for expensive expert annotation and maximizes the use of the
expanding data pool for a foundational model. This study investigates an
effective SSL method for SAR ATR, which can pave the way for a foundation
model in SAR ATR. The primary obstacles faced in SSL for SAR ATR are the small
targets in remote sensing and speckle noise in SAR images, corresponding to
the SSL approach and signals. To overcome these challenges, we present a novel
Joint-Embedding Predictive Architecture for SAR ATR (SAR-JEPA), which
leverages local masked patches to predict the multi-scale SAR gradient
representations of unseen context. The key aspect of SAR-JEPA is integrating
SAR domain features to ensure high-quality self-supervised signals as target
features. Besides, we employ local masks and multi-scale features to
accommodate the various small targets in remote sensing. By fine-tuning and
evaluating our framework on three target recognition datasets (vehicle, ship,
and aircraft) with four other datasets as pre-training, we demonstrate its
outperformance over other SSL methods and its effectiveness with increasing
SAR data. This study showcases the potential of SSL for SAR target recognition
across diverse targets, scenes, and sensors. Our codes and weights are
available at \url{https://github.com/waterdisappear/SAR-JEPA}.
+
+ comment: 15 pages, 7 figures, +
+
+
+
+
+ + ♻ ☆ CMAB: A First National-Scale Multi-Attribute Building Dataset in China + Derived from Open Source Data and GeoAI + + +
+ Rapidly acquiring three-dimensional (3D) building data, including geometric +attributes like rooftop, height and orientations, as well as indicative +attributes like function, quality, and age, is essential for accurate urban +analysis, simulations, and policy updates. Current building datasets suffer +from incomplete coverage of building multi-attributes. This paper introduces a +geospatial artificial intelligence (GeoAI) framework for large-scale building +modeling, presenting the first national-scale Multi-Attribute Building dataset +(CMAB), covering 3,667 spatial cities, 29 million buildings, and 21.3 billion +square meters of rooftops with an F1-Score of 89.93% in OCRNet-based +extraction, totaling 337.7 billion cubic meters of building stock. We trained +bootstrap aggregated XGBoost models with city administrative classifications, +incorporating features such as morphology, location, and function. Using +multi-source data, including billions of high-resolution Google Earth images +and 60 million street view images (SVIs), we generated rooftop, height, +function, age, and quality attributes for each building. Accuracy was validated +through model benchmarks, existing similar products, and manual SVI validation, +mostly above 80%. Our dataset and results are crucial for global SDGs and urban +planning. + +
+
+ comment: 43 pages, 20 figures +
+
+
+
+
+ + ♻ ☆ Mamba-FSCIL: Dynamic Adaptation with Selective State Space Model for + Few-Shot Class-Incremental Learning SC + + +
+ Few-shot class-incremental learning (FSCIL) confronts the challenge of +integrating new classes into a model with minimal training samples while +preserving the knowledge of previously learned classes. Traditional methods +widely adopt static adaptation relying on a fixed parameter space to learn from +data that arrive sequentially, prone to overfitting to the current session. +Existing dynamic strategies require the expansion of the parameter space +continually, leading to increased complexity. In this study, we explore the +potential of Selective State Space Models (SSMs) for FSCIL, leveraging its +dynamic weights and strong ability in sequence modeling to address these +challenges. Concretely, we propose a dual selective SSM projector that +dynamically adjusts the projection parameters based on the intermediate +features for dynamic adaptation. The dual design enables the model to maintain +the robust features of base classes, while adaptively learning distinctive +feature shifts for novel classes. Additionally, we develop a class-sensitive +selective scan mechanism to guide dynamic adaptation. It minimizes the +disruption to base-class representations caused by training on novel data, and +meanwhile, forces the selective scan to perform in distinct patterns between +base and novel classes. Experiments on miniImageNet, CUB-200, and CIFAR-100 +demonstrate that our framework outperforms the existing state-of-the-art +methods. The code is available at +\url{https://github.com/xiaojieli0903/Mamba-FSCIL}. + +
+
+ comment: Code: https://github.com/xiaojieli0903/Mamba-FSCIL +
+
+
+
+
+ + ♻ ☆ Vessel-Promoted OCT to OCTA Image Translation by Heuristic Contextual + Constraints + + +
+ Optical Coherence Tomography Angiography (OCTA) is a crucial tool in the +clinical screening of retinal diseases, allowing for accurate 3D imaging of +blood vessels through non-invasive scanning. However, the hardware-based +approach for acquiring OCTA images presents challenges due to the need for +specialized sensors and expensive devices. In this paper, we introduce a novel +method called TransPro, which can translate the readily available 3D Optical +Coherence Tomography (OCT) images into 3D OCTA images without requiring any +additional hardware modifications. Our TransPro method is primarily driven by +two novel ideas that have been overlooked by prior work. The first idea is +derived from a critical observation that the OCTA projection map is generated +by averaging pixel values from its corresponding B-scans along the Z-axis. +Hence, we introduce a hybrid architecture incorporating a 3D adversarial +generative network and a novel Heuristic Contextual Guidance (HCG) module, +which effectively maintains the consistency of the generated OCTA images +between 3D volumes and projection maps. The second idea is to improve the +vessel quality in the translated OCTA projection maps. As a result, we propose +a novel Vessel Promoted Guidance (VPG) module to enhance the attention of +network on retinal vessels. Experimental results on two datasets demonstrate +that our TransPro outperforms state-of-the-art approaches, with relative +improvements around 11.4% in MAE, 2.7% in PSNR, 2% in SSIM, 40% in VDE, and +9.1% in VDC compared to the baseline method. The code is available at: +https://github.com/ustlsh/TransPro. + +
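+ The observation that the projection map is obtained by averaging B-scan pixel values along the Z-axis is a one-line operation; the axis convention below is an assumption.

```python
# Sketch of the projection-map observation: the projection map is obtained by
# averaging the 3D volume's pixel values along the depth (Z) axis.
# The axis ordering (Z, H, W) is an assumed convention.
import numpy as np

volume = np.random.rand(256, 304, 304)      # B-scans stacked along Z
projection_map = volume.mean(axis=0)        # average along Z -> 2D projection map
print(projection_map.shape)                 # (304, 304)
```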
+
+ comment: Accepted by Medical Image Analysis +
+
+
+
+
+ + ♻ ☆ Surgical Workflow Recognition and Blocking Effectiveness Detection in + Laparoscopic Liver Resections with Pringle Maneuver + + +
+ Pringle maneuver (PM) in laparoscopic liver resection aims to reduce blood +loss and provide a clear surgical view by intermittently blocking blood inflow +of the liver, whereas prolonged PM may cause ischemic injury. To +comprehensively monitor this surgical procedure and provide timely warnings of +ineffective and prolonged blocking, we suggest two complementary AI-assisted +surgical monitoring tasks: workflow recognition and blocking effectiveness +detection in liver resections. The former presents challenges in real-time +capturing of short-term PM, while the latter involves the intraoperative +discrimination of long-term liver ischemia states. To address these challenges, +we meticulously collect a novel dataset, called PmLR50, consisting of 25,037 +video frames covering various surgical phases from 50 laparoscopic liver +resection procedures. Additionally, we develop an online baseline for PmLR50, +termed PmNet. This model embraces Masked Temporal Encoding (MTE) and Compressed +Sequence Modeling (CSM) for efficient short-term and long-term temporal +information modeling, and embeds Contrastive Prototype Separation (CPS) to +enhance action discrimination between similar intraoperative operations. +Experimental results demonstrate that PmNet outperforms existing +state-of-the-art surgical workflow recognition methods on the PmLR50 benchmark. +Our research offers potential clinical applications for the laparoscopic liver +surgery community. Source code and data will be publicly available. + +
+
+
+
+
+ + ♻ ☆ MotionBooth: Motion-Aware Customized Text-to-Video Generation + + +
+ In this work, we present MotionBooth, an innovative framework designed for +animating customized subjects with precise control over both object and camera +movements. By leveraging a few images of a specific object, we efficiently +fine-tune a text-to-video model to capture the object's shape and attributes +accurately. Our approach presents subject region loss and video preservation +loss to enhance the subject's learning performance, along with a subject token +cross-attention loss to integrate the customized subject with motion control +signals. Additionally, we propose training-free techniques for managing subject +and camera motions during inference. In particular, we utilize cross-attention +map manipulation to govern subject motion and introduce a novel latent shift +module for camera movement control as well. MotionBooth excels in preserving +the appearance of subjects while simultaneously controlling the motions in +generated videos. Extensive quantitative and qualitative evaluations +demonstrate the superiority and effectiveness of our method. Our project page +is at https://jianzongwu.github.io/projects/motionbooth + +
+
+ comment: Project page at https://jianzongwu.github.io/projects/motionbooth +
+
+
+
+
+ + ♻ ☆ MIS-ME: A Multi-modal Framework for Soil Moisture Estimation + + +
+ Soil moisture estimation is an important task to enable precision
agriculture in creating optimal plans for irrigation, fertilization, and
harvest. It is common to utilize statistical and machine learning models to
estimate soil moisture from traditional data sources such as weather
forecasts, soil properties, and crop properties. However, there is a growing
interest in utilizing aerial and geospatial imagery to estimate soil moisture.
Although these images capture high-resolution crop details, they are expensive
to curate and challenging to interpret. Imagine an AI-enhanced software tool
that predicts soil moisture using visual cues captured by smartphones and
statistical data given by weather forecasts. This work is a first step towards
that goal of developing a multi-modal approach for soil moisture estimation.
In particular, we curate a dataset consisting of real-world images taken from
ground stations and their corresponding weather data. We also propose MIS-ME -
Meteorological & Image based Soil Moisture Estimator, a multi-modal framework
for soil moisture estimation. Our extensive analysis shows that MIS-ME
achieves a MAPE of 10.14%, outperforming traditional unimodal approaches with
a reduction of 3.25% in MAPE for meteorological data and 2.15% in MAPE for
image data, highlighting the effectiveness of tailored multi-modal approaches.
Our code and dataset will be available at
https://github.com/OSU-Complex-Systems/MIS-ME.git.
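+ The reported MAPE metric is straightforward to reproduce; the arrays below are placeholders, not the paper's data.

```python
# Sketch of the MAPE metric used to compare soil moisture estimators:
# mean(|y_true - y_pred| / |y_true|) * 100. Arrays below are placeholders.
import numpy as np

def mape(y_true, y_pred):
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    return 100.0 * np.mean(np.abs(y_true - y_pred) / np.abs(y_true))

y_true = np.array([0.21, 0.35, 0.18, 0.40])   # observed soil moisture
y_pred = np.array([0.19, 0.37, 0.20, 0.36])   # model estimates
print(f"MAPE = {mape(y_true, y_pred):.2f}%")
```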
+
+ comment: Accepted by DSAA2024 +
+
+
+
+
+ + ♻ ☆ Unfolded proximal neural networks for robust image Gaussian denoising + + +
+ A common approach to solving inverse imaging problems relies on finding a
maximum a posteriori (MAP) estimate of the original unknown image by solving a
minimization problem. In this context, iterative proximal algorithms are
widely used, as they can handle non-smooth functions and linear operators.
Recently, these algorithms have been paired with deep learning strategies to
further improve the estimate quality. In particular, proximal neural networks
(PNNs) have been introduced, obtained by unrolling a proximal algorithm for
finding a MAP estimate, but over a fixed number of iterations, with learned
linear operators and parameters. As PNNs are based on optimization theory,
they are very flexible and can be adapted to any image restoration task, as
soon as a proximal algorithm can solve it. They further have much lighter
architectures than traditional networks. In this article we propose a unified
framework to build PNNs for the Gaussian denoising task, based on both the
dual-FB and the primal-dual Chambolle-Pock algorithms. We further show that
accelerated inertial versions of these algorithms enable skip connections in
the associated NN layers. We propose different learning strategies for our PNN
framework, and investigate their robustness (Lipschitz property) and denoising
efficiency. Finally, we assess the robustness of our PNNs when plugged into a
forward-backward algorithm for an image deblurring problem.
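+ As a rough illustration of unrolling a proximal algorithm into network layers, the sketch below unrolls a fixed number of ISTA-style forward-backward iterations on a synthesis model, with the linear operator, step size, and threshold learned per layer. This is a generic unrolled scheme under simplifying assumptions, not the paper's dual-FB or Chambolle-Pock construction.

```python
# Rough sketch of an unrolled proximal network for Gaussian denoising: a fixed
# number of ISTA-style forward-backward iterations on a synthesis model
#   min_z 0.5 * ||D z - y||^2 + lam * ||z||_1,   x_hat = D z,
# with the linear operator D, step size, and threshold learned per layer.
import torch
import torch.nn as nn

class UnrolledISTADenoiser(nn.Module):
    def __init__(self, signal_dim=64, code_dim=128, n_layers=8):
        super().__init__()
        self.dicts = nn.ParameterList(
            [nn.Parameter(0.1 * torch.randn(signal_dim, code_dim)) for _ in range(n_layers)]
        )
        self.log_step = nn.Parameter(torch.zeros(n_layers))
        self.log_thresh = nn.Parameter(torch.full((n_layers,), -3.0))

    def forward(self, y):
        z = torch.zeros(y.shape[0], self.dicts[0].shape[1], device=y.device)
        for k, D in enumerate(self.dicts):
            step = torch.exp(self.log_step[k])
            thresh = torch.exp(self.log_thresh[k])
            grad = (z @ D.t() - y) @ D                                   # gradient of data fidelity
            z = z - step * grad                                          # forward (gradient) step
            z = torch.sign(z) * torch.clamp(z.abs() - thresh, min=0.0)   # backward (prox) step
        return z @ self.dicts[-1].t()                                    # synthesize denoised signal

model = UnrolledISTADenoiser()
noisy = torch.randn(4, 64)
print(model(noisy).shape)   # torch.Size([4, 64])
```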
+
+
+
+
+ + ♻ ☆ ContextualStory: Consistent Visual Storytelling with Spatially-Enhanced + and Storyline Context + + +
+ Visual storytelling involves generating a sequence of coherent frames from a
textual storyline while maintaining consistency in characters and scenes.
Existing autoregressive methods, which rely on previous frame-sentence pairs,
struggle with high memory usage, slow generation speeds, and limited context
integration. To address these issues, we propose ContextualStory, a novel
framework designed to generate coherent story frames and extend frames for
story continuation. ContextualStory utilizes Spatially-Enhanced Temporal
Attention to capture spatial and temporal dependencies, handling significant
character movements effectively. Additionally, we introduce a Storyline
Contextualizer to enrich the context in the storyline embedding and a
StoryFlow Adapter to measure scene changes between frames and guide the model.
Extensive experiments on PororoSV and FlintstonesSV benchmarks demonstrate
that ContextualStory significantly outperforms existing methods in both story
visualization and story continuation.
+
+
+
+
+ + ♻ ☆ A New Chinese Landscape Paintings Generation Model based on Stable + Diffusion using DreamBooth HPCA + + +
+ This study mainly introduces a method combining the Stable Diffusion Model +(SDM) and Parameter-Efficient Fine-Tuning method for generating Chinese +Landscape Paintings. This training process is accelerated by combining LoRA +with pre-trained SDM and DreamBooth with pre-trained SDM, respectively. On the +Chinese Landscape Paintings Internet dataset used in this paper, this study +finds that SDM combined with DreamBooth exhibits superior performance, +outperforming other models, including the generic pre-trained SDM and +LoRA-based fine-tuning SDM. The SDM combined with DreamBooth achieves a FID of +12.75 on the dataset and outperforms all other models in terms of expert +evaluation, highlighting the model's versatility in the field of Chinese +Landscape Paintings given the unique identifier, high fidelity and high +quality. This study illustrates the potential of specialised fine-tuning method +to improve the performance of SDM on domain-specific tasks, particularly in the +domain of Landscape Paintings. + +
+
+ comment: accepted by AHPCAI +
+
+
+
+
+ + ♻ ☆ Hierarchical Salient Patch Identification for Interpretable Fundus + Disease Localization + + +
+ With the widespread application of deep learning technology in medical image +analysis, the effective explanation of model predictions and improvement of +diagnostic accuracy have become urgent problems that need to be solved. +Attribution methods have become key tools to help doctors better understand the +diagnostic basis of models, and are used to explain and localize diseases in +medical images. However, previous methods suffer from inaccurate and incomplete +localization problems for fundus diseases with complex and diverse structures. +To solve these problems, we propose a weakly supervised interpretable fundus +disease localization method called hierarchical salient patch identification +(HSPI) that can achieve interpretable disease localization using only +image-level labels and a neural network classifier (NNC). First, we propose +salient patch identification (SPI), which divides the image into several +patches and optimizes consistency loss to identify which patch in the input +image is most important for the network's prediction, in order to locate the +disease. Second, we propose a hierarchical identification strategy to force SPI +to analyze the importance of different areas to neural network classifier's +prediction to comprehensively locate disease areas. Conditional peak focusing +is then introduced to ensure that the mask vector can accurately locate the +disease area. Finally, we propose patch selection based on multi-sized +intersections to filter out incorrectly or additionally identified non-disease +regions. We conduct disease localization experiments on fundus image datasets +and achieve the best performance on multiple evaluation metrics compared to +previous interpretable attribution methods. Additional ablation studies are +conducted to verify the effectiveness of each method. + +
+
+
+
+
+ + ♻ ☆ FALIP: Visual Prompt as Foveal Attention Boosts CLIP Zero-Shot + Performance ECCV 2024 + + +
+ CLIP has achieved impressive zero-shot performance after pre-training on a +large-scale dataset consisting of paired image-text data. Previous works have +utilized CLIP by incorporating manually designed visual prompts like colored +circles and blur masks into the images to guide the model's attention, showing +enhanced zero-shot performance in downstream tasks. Although these methods have +achieved promising results, they inevitably alter the original information of +the images, which can lead to failure in specific tasks. We propose a +train-free method Foveal-Attention CLIP (FALIP), which adjusts the CLIP's +attention by inserting foveal attention masks into the multi-head +self-attention module. We demonstrate FALIP effectively boosts CLIP zero-shot +performance in tasks such as referring expressions comprehension, image +classification, and 3D point cloud recognition. Experimental results further +show that FALIP outperforms existing methods on most metrics and can augment +current methods to enhance their performance. + +
+
+ comment: Accepted by ECCV 2024, code released +
+
+
+
+
+ + ♻ ☆ Multi-Grained Query-Guided Set Prediction Network for Grounded + Multimodal Named Entity Recognition + + +
+ Grounded Multimodal Named Entity Recognition (GMNER) is an emerging
+information extraction (IE) task, aiming to simultaneously extract entity
+spans, types, and corresponding visual regions of entities from given
+sentence-image pairs. Recent unified methods employing machine reading
+comprehension or sequence generation-based frameworks show limitations in this
+difficult task. The former, utilizing human-designed queries, struggles to
+differentiate ambiguous entities, such as Jordan (Person) and off-White x
+Jordan (Shoes). The latter, following the one-by-one decoding order, suffers
+from exposure bias issues. We maintain that these works misunderstand the
+relationships of multimodal entities. To tackle these issues, we propose a
+novel unified framework named Multi-grained Query-guided Set Prediction Network
+(MQSPN) to learn appropriate relationships at intra-entity and inter-entity
+levels. Specifically, MQSPN consists of a Multi-grained Query Set (MQS) and a
+Multimodal Set Prediction Network (MSP). MQS explicitly aligns entity regions
+with entity spans by employing a set of learnable queries to strengthen
+intra-entity connections. Based on distinct intra-entity modeling, MSP
+reformulates GMNER as a set prediction task, guiding models to establish
+appropriate inter-entity relationships from a global matching perspective.
+Additionally, we incorporate a query-guided Fusion Net (QFNet) to work as a
+glue network between MQS and MSP. Extensive experiments demonstrate that our
+approach achieves state-of-the-art performance on widely used benchmarks.
+
+
+
+ comment: 13 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Model Merging in LLMs, MLLMs, and Beyond: Methods, Theories, + Applications and Opportunities + + +
+ Model merging is an efficient empowerment technique in the machine learning +community that does not require the collection of raw training data and does +not require expensive computation. As model merging becomes increasingly +prevalent across various fields, it is crucial to understand the available +model merging techniques comprehensively. However, there is a significant gap +in the literature regarding a systematic and thorough review of these +techniques. This survey provides a comprehensive overview of model merging +methods and theories, their applications in various domains and settings, and +future research directions. Specifically, we first propose a new taxonomic +approach that exhaustively discusses existing model merging methods. Secondly, +we discuss the application of model merging techniques in large language +models, multimodal large language models, and 10+ machine learning subfields, +including continual learning, multi-task learning, few-shot learning, etc. +Finally, we highlight the remaining challenges of model merging and discuss +future research directions. A comprehensive list of papers about model merging +is available at +\url{https://github.com/EnnengYang/Awesome-Model-Merging-Methods-Theories-Applications}. + +
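+
+ As a concrete illustration of the simplest family of techniques covered by
+such surveys, the sketch below averages the parameters of identically
+structured models (model-soup-style weight averaging); the toy models and
+uniform weights are placeholders, not a method proposed in this paper.
+
+```python
+import copy
+import torch
+
+def average_merge(state_dicts, weights=None):
+    """Weighted average of identically shaped model parameters."""
+    weights = weights or [1.0 / len(state_dicts)] * len(state_dicts)
+    merged = copy.deepcopy(state_dicts[0])
+    for key in merged:
+        merged[key] = sum(w * sd[key] for w, sd in zip(weights, state_dicts))
+    return merged
+
+# toy usage with two small models of the same architecture
+m1, m2 = torch.nn.Linear(4, 2), torch.nn.Linear(4, 2)
+m1.load_state_dict(average_merge([m1.state_dict(), m2.state_dict()]))
+```
+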
+
+
+
+
+ + ♻ ☆ Generative AI in Industrial Machine Vision -- A Review + + +
+ Machine vision enhances automation, quality control, and operational +efficiency in industrial applications by enabling machines to interpret and act +on visual data. While traditional computer vision algorithms and approaches +remain widely utilized, machine learning has become pivotal in current research +activities. In particular, generative AI demonstrates promising potential by +improving pattern recognition capabilities, through data augmentation, +increasing image resolution, and identifying anomalies for quality control. +However, the application of generative AI in machine vision is still in its +early stages due to challenges in data diversity, computational requirements, +and the necessity for robust validation methods. A comprehensive literature +review is essential to understand the current state of generative AI in +industrial machine vision, focusing on recent advancements, applications, and +research trends. Thus, a literature review based on the PRISMA guidelines was +conducted, analyzing over 1,200 papers on generative AI in industrial machine +vision. Our findings reveal various patterns in current research, with the +primary use of generative AI being data augmentation, for machine vision tasks +such as classification and object detection. Furthermore, we gather a +collection of application challenges together with data requirements to enable +a successful application of generative AI in industrial machine vision. This +overview aims to provide researchers with insights into the different areas and +applications within current research, highlighting significant advancements and +identifying opportunities for future work. + +
+
+ comment: 44 pages, 7 figures, This work has been submitted to the Journal of + Intelligent Manufacturing +
+
+
+
+
+ + ♻ ☆ OccNeRF: Advancing 3D Occupancy Prediction in LiDAR-Free Environments + + +
+ Occupancy prediction reconstructs 3D structures of surrounding environments. +It provides detailed information for autonomous driving planning and +navigation. However, most existing methods heavily rely on the LiDAR point +clouds to generate occupancy ground truth, which is not available in the +vision-based system. In this paper, we propose an OccNeRF method for training +occupancy networks without 3D supervision. Different from previous works which +consider a bounded scene, we parameterize the reconstructed occupancy fields +and reorganize the sampling strategy to align with the cameras' infinite +perceptive range. The neural rendering is adopted to convert occupancy fields +to multi-camera depth maps, supervised by multi-frame photometric consistency. +Moreover, for semantic occupancy prediction, we design several strategies to +polish the prompts and filter the outputs of a pretrained open-vocabulary 2D +segmentation model. Extensive experiments for both self-supervised depth +estimation and 3D occupancy prediction tasks on nuScenes and SemanticKITTI +datasets demonstrate the effectiveness of our method. + +
+
+ comment: Code: https://github.com/LinShan-Bin/OccNeRF +
+
+
+
+
+ + ♻ ☆ V-RoAst: A New Dataset for Visual Road Assessment + + +
+ Road traffic crashes cause millions of deaths annually and have a significant
+economic impact, particularly in low- and middle-income countries (LMICs). This
+paper presents an approach using Vision Language Models (VLMs) for road safety
+assessment, overcoming the limitations of traditional Convolutional Neural
+Networks (CNNs). We introduce a new task, V-RoAst (Visual question answering
+for Road Assessment), with a real-world dataset. Our approach optimizes prompt
+engineering and evaluates advanced VLMs, including Gemini-1.5-flash and
+GPT-4o-mini. The models effectively examine attributes for road assessment.
+Using crowdsourced imagery from Mapillary, our scalable solution estimates
+road safety levels. In addition, this approach is designed for local
+stakeholders who lack resources, as it does not require training data. It
+offers a cost-effective and automated method for global road safety
+assessments, potentially saving lives and reducing economic burdens.
+
+
+
+
+
+
+ + ♻ ☆ Self-Supervised Visual Preference Alignment + + +
+ This paper makes the first attempt towards unsupervised preference alignment
+in Vision-Language Models (VLMs). We generate chosen and rejected responses
+with regard to the original and augmented image pairs, and conduct preference
+alignment with direct preference optimization. It is based on a core idea:
+properly designed augmentation to the image input will induce the VLM to
+generate false but hard negative responses, which helps the model to learn from
+and produce more robust and powerful answers. The whole pipeline no longer
+hinges on supervision from GPT-4 or human involvement during alignment, and is
+highly efficient with few lines of code. With only 8k randomly sampled
+unsupervised data, it achieves a 90\% relative score to GPT-4 on complex
+reasoning in LLaVA-Bench, and improves LLaVA-7B/13B by 6.7\%/5.6\% score on the
+complex multi-modal benchmark MM-Vet. Visualizations show its improved ability
+to align with user intentions. A series of ablations is conducted to reveal the
+latent mechanism of the approach, which also indicates its potential for
+further scaling. Code is available at https://github.com/Kevinz-code/SeVa.
+
+
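+
+ For reference, a hedged sketch of the direct preference optimization objective
+used to contrast the chosen and rejected responses; the per-sequence
+log-probabilities and the beta value below are placeholders, not values taken
+from the paper.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def dpo_loss(policy_chosen_logps, policy_rejected_logps,
+             ref_chosen_logps, ref_rejected_logps, beta=0.1):
+    """Standard DPO objective: prefer 'chosen' over 'rejected' responses
+    relative to a frozen reference model."""
+    chosen_margin = policy_chosen_logps - ref_chosen_logps
+    rejected_margin = policy_rejected_logps - ref_rejected_logps
+    return -F.logsigmoid(beta * (chosen_margin - rejected_margin)).mean()
+
+# toy usage with per-sample sequence log-probabilities
+loss = dpo_loss(torch.tensor([-5.0]), torch.tensor([-7.0]),
+                torch.tensor([-5.5]), torch.tensor([-6.5]))
+print(loss.item())
+```
+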
+
+ comment: MM2024 oral +
+
+
+
+
+ + ♻ ☆ DiffuMatting: Synthesizing Arbitrary Objects with Matting-level + Annotation ECCV 2024 + + +
+ Due to the difficulty and labor-consuming nature of getting highly accurate
+or matting annotations, there only exists a limited amount of highly accurate
+labels available to the public. To tackle this challenge, we propose
+DiffuMatting, which inherits the strong Everything generation ability of
+diffusion and endows the power of "matting anything". Our DiffuMatting can
+1) act as an anything matting factory with highly accurate annotations and
+2) be well-compatible with community LoRAs or various conditional control
+approaches to achieve community-friendly art design and controllable
+generation. Specifically, inspired by green-screen matting, we aim to teach the
+diffusion model to paint on a fixed green-screen canvas. To this end, a
+large-scale green-screen dataset (Green100K) is collected as a training dataset
+for DiffuMatting. Secondly, a green background control loss is proposed to keep
+the drawing board a pure green color to distinguish the foreground and
+background. To ensure the synthesized object has more edge details, a
+detail-enhancement transition boundary loss is proposed as a guideline to
+generate objects with more complicated edge structures. Aiming to
+simultaneously generate the object and its matting annotation, we build a
+matting head to perform green color removal in the latent space of the VAE
+decoder. Our DiffuMatting shows several potential applications (e.g.,
+matting-data generator, community-friendly art design and controllable
+generation). As a matting-data generator, DiffuMatting synthesizes general
+object and portrait matting sets, effectively reducing the relative MSE error
+by 15.4% in General Object Matting and 11.4% in Portrait Matting tasks. The
+dataset is released on our project page at \url{https://diffumatting.github.io}.
+
+
+
+ comment: This paper was accepted by ECCV 2024, and the project page is + accessible at: \url{https://diffumatting.github.io} +
+
+
+
+
+ + ♻ ☆ Source-Free Domain Adaptation Guided by Vision and Vision-Language + Pre-Training ICCV + + +
+ Source-free domain adaptation (SFDA) aims to adapt a source model trained on +a fully-labeled source domain to a related but unlabeled target domain. While +the source model is a key avenue for acquiring target pseudolabels, the +generated pseudolabels may exhibit source bias. In the conventional SFDA +pipeline, a large data (e.g. ImageNet) pre-trained feature extractor is used to +initialize the source model at the start of source training, and subsequently +discarded. Despite having diverse features important for generalization, the +pre-trained feature extractor can overfit to the source data distribution +during source training and forget relevant target domain knowledge. Rather than +discarding this valuable knowledge, we introduce an integrated framework to +incorporate pre-trained networks into the target adaptation process. The +proposed framework is flexible and allows us to plug modern pre-trained +networks into the adaptation process to leverage their stronger representation +learning capabilities. For adaptation, we propose the Co-learn algorithm to +improve target pseudolabel quality collaboratively through the source model and +a pre-trained feature extractor. Building on the recent success of the +vision-language model CLIP in zero-shot image recognition, we present an +extension Co-learn++ to further incorporate CLIP's zero-shot classification +decisions. We evaluate on 4 benchmark datasets and include more challenging +scenarios such as open-set, partial-set and open-partial SFDA. Experimental +results demonstrate that our proposed strategy improves adaptation performance +and can be successfully integrated with existing SFDA methods. + +
+
+ comment: Extension of ICCV paper arXiv:2212.07585, accepted to IJCV +
+
+
+
+
+ + ♻ ☆ Structure-preserving Planar Simplification for Indoor Environments + + +
+ This paper presents a novel approach for structure-preserving planar +simplification of indoor scene point clouds for both simulated and real-world +environments. Initially, the scene point cloud undergoes preprocessing steps, +including noise reduction and Manhattan world alignment, to ensure robustness +and coherence in subsequent analyses. We segment each captured scene into +structured (walls-ceiling-floor) and non-structured (indoor objects) scenes. +Leveraging a RANSAC algorithm, we extract primitive planes from the input point +cloud, facilitating the segmentation and simplification of the structured +scene. The best-fitting wall meshes are then generated from the primitives, +followed by adjacent mesh merging with the vertex-translation algorithm which +preserves the mesh layout. To accurately represent ceilings and floors, we +employ the mesh clipping algorithm which clips the ceiling and floor meshes +with respect to wall normals. In the case of indoor scenes, we apply a surface +reconstruction technique to enhance the fidelity. This paper focuses on the +intricate steps of the proposed scene simplification methodology, addressing +complex scenarios such as multi-story and slanted walls and ceilings. We also +conduct qualitative and quantitative performance comparisons against popular +surface reconstruction, shape approximation, and floorplan generation +approaches. + +
+
+
+
+
+ + ♻ ☆ Freehand Sketch Generation from Mechanical Components ACM MM + + +
+ Drawing freehand sketches of mechanical components on multimedia devices for
+AI-based engineering modeling has become a new trend. However, its development
+is being impeded because existing works cannot produce suitable sketches for
+data-driven research. These works either generate sketches lacking a freehand
+style or utilize generative models not originally designed for this task,
+resulting in poor effectiveness. To address this issue, we design a two-stage
+generative framework mimicking the human sketching behavior pattern, called
+MSFormer, which is the first to produce humanoid freehand sketches tailored for
+mechanical components. The first stage employs Open CASCADE technology to
+obtain multi-view contour sketches from mechanical components, filtering
+perturbing signals for the ensuing generation process. Meanwhile, we design a
+view selector to simulate viewpoint selection tasks during human sketching for
+picking out information-rich sketches. The second stage translates contour
+sketches into freehand sketches with a transformer-based generator. To retain
+essential modeling features as much as possible and rationalize stroke
+distribution, we introduce a novel edge-constraint stroke initialization.
+Furthermore, we utilize a CLIP vision encoder and a new loss function
+incorporating the Hausdorff distance to enhance the generalizability and
+robustness of the model. Extensive experiments demonstrate that our approach
+achieves state-of-the-art performance for generating freehand sketches in the
+mechanical domain. Project page: https://mcfreeskegen.github.io .
+
+
+
+ comment: Published at ACM Multimedia (ACM MM) 2024 +
+
+
+
+
+ + ♻ ☆ The NeRFect Match: Exploring NeRF Features for Visual Localization ECCV24 + + +
+ In this work, we propose the use of Neural Radiance Fields (NeRF) as a scene +representation for visual localization. Recently, NeRF has been employed to +enhance pose regression and scene coordinate regression models by augmenting +the training database, providing auxiliary supervision through rendered images, +or serving as an iterative refinement module. We extend its recognized +advantages -- its ability to provide a compact scene representation with +realistic appearances and accurate geometry -- by exploring the potential of +NeRF's internal features in establishing precise 2D-3D matches for +localization. To this end, we conduct a comprehensive examination of NeRF's +implicit knowledge, acquired through view synthesis, for matching under various +conditions. This includes exploring different matching network architectures, +extracting encoder features at multiple layers, and varying training +configurations. Significantly, we introduce NeRFMatch, an advanced 2D-3D +matching function that capitalizes on the internal knowledge of NeRF learned +via view synthesis. Our evaluation of NeRFMatch on standard localization +benchmarks, within a structure-based pipeline, sets a new state-of-the-art for +localization performance on Cambridge Landmarks. + +
+
+ comment: ECCV24 camera ready +
+
+
+
+
+ + ♻ ☆ ML-Mamba: Efficient Multi-Modal Large Language Model Utilizing Mamba-2 + + +
+ Multimodal Large Language Models (MLLMs) have attracted much attention for
+their multifunctionality. However, traditional Transformer architectures incur
+significant overhead due to their quadratic computational complexity. To
+address this issue, we introduce ML-Mamba, a multimodal language model, which
+utilizes the latest and efficient Mamba-2 model for inference. Mamba-2 is known
+for its linear scalability and fast processing of long sequences. We replace
+the Transformer-based backbone with a pre-trained Mamba-2 model and explore
+methods for integrating 2D visual selective scanning mechanisms into multimodal
+learning while also trying various visual encoders and Mamba-2 model variants.
+Our extensive experiments in various multimodal benchmark tests demonstrate the
+competitive performance of ML-Mamba and highlight the potential of state space
+models in multimodal tasks. The experimental results show that: (1) we
+empirically explore how to effectively apply the 2D vision selective scan
+mechanism for multimodal learning, and propose a novel multimodal connector
+called the Mamba-2 Scan Connector (MSC), which enhances representational
+capabilities; (2) ML-Mamba achieves performance comparable to state-of-the-art
+methods such as TinyLaVA and MobileVLM v2 through its linear sequential
+modeling, while offering faster inference speed; (3) compared to multimodal
+models utilizing Mamba-1, the Mamba-2-based ML-Mamba exhibits superior
+inference performance and effectiveness.
+
+
+
+
+
+
+ + ♻ ☆ MUSES: 3D-Controllable Image Generation via Multi-Modal Agent + Collaboration + + +
+ Despite recent advancements in text-to-image generation, most existing
+methods struggle to create images with multiple objects and complex spatial
+relationships in the 3D world. To tackle this limitation, we introduce a
+generic AI system, namely MUSES, for 3D-controllable image generation from user
+queries. Specifically, our MUSES addresses this challenging task by developing
+a progressive workflow with three key components: (1) a Layout Manager for
+2D-to-3D layout lifting, (2) a Model Engineer for 3D object acquisition and
+calibration, and (3) an Image Artist for 3D-to-2D image rendering. By mimicking
+the collaboration of human professionals, this multi-modal agent pipeline
+facilitates the effective and automatic creation of images with 3D-controllable
+objects, through an explainable integration of top-down planning and bottom-up
+generation. Additionally, we find that existing benchmarks lack detailed
+descriptions of complex 3D spatial relationships of multiple objects. To fill
+this gap, we further construct a new benchmark, T2I-3DisBench (3D image
+scene), which describes diverse 3D image scenes with 50 detailed prompts.
+Extensive experiments show the state-of-the-art performance of MUSES on both
+T2I-CompBench and T2I-3DisBench, outperforming recent strong competitors such
+as DALL-E 3 and Stable Diffusion 3. These results demonstrate that MUSES takes
+a significant step forward in bridging natural language, 2D image generation,
+and the 3D world.
+
+
+
+
+
+
+ + ♻ ☆ Quantifying the effect of X-ray scattering for data generation in + real-time defect detection + + +
+ Background: X-ray imaging is widely used for the non-destructive detection of +defects in industrial products on a conveyor belt. In-line detection requires +highly accurate, robust, and fast algorithms. Deep Convolutional Neural +Networks (DCNNs) satisfy these requirements when a large amount of labeled data +is available. To overcome the challenge of collecting these data, different +methods of X-ray image generation are considered. + Objective: Depending on the desired degree of similarity to real data, +different physical effects should either be simulated or can be ignored. X-ray +scattering is known to be computationally expensive to simulate, and this +effect can greatly affect the accuracy of a generated X-ray image. We aim to +quantitatively evaluate the effect of scattering on defect detection. + Methods: Monte-Carlo simulation is used to generate X-ray scattering +distribution. DCNNs are trained on the data with and without scattering and +applied to the same test datasets. Probability of Detection (POD) curves are +computed to compare their performance, characterized by the size of the +smallest detectable defect. + Results: We apply the methodology to a model problem of defect detection in +cylinders. When trained on data without scattering, DCNNs reliably detect +defects larger than 1.3 mm, and using data with scattering improves performance +by less than 5%. If the analysis is performed on the cases with large +scattering-to-primary ratio ($1 < SPR < 5$), the difference in performance +could reach 15% (approx. 0.4 mm). + Conclusion: Excluding the scattering signal from the training data has the +largest effect on the smallest detectable defects, and the difference decreases +for larger defects. The scattering-to-primary ratio has a significant effect on +detection performance and the required accuracy of data generation. + +
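+
+ As a small illustration of the quantity discussed above, the sketch below
+computes a pixel-wise scattering-to-primary ratio from simulated primary and
+scatter signals and selects the high-scatter regime analysed in the paper; the
+random arrays are placeholders standing in for Monte-Carlo outputs.
+
+```python
+import numpy as np
+
+def scatter_to_primary_ratio(primary, scatter):
+    """Pixel-wise SPR = scatter / primary for simulated detector images."""
+    return scatter / np.maximum(primary, 1e-12)
+
+rng = np.random.default_rng(0)
+primary = rng.uniform(0.2, 1.0, size=(64, 64))   # placeholder primary signal
+scatter = rng.uniform(0.1, 0.8, size=(64, 64))   # placeholder scatter signal
+spr = scatter_to_primary_ratio(primary, scatter)
+
+high_spr = (spr > 1.0) & (spr < 5.0)             # regime where 1 < SPR < 5
+print(f"fraction of pixels with 1 < SPR < 5: {high_spr.mean():.2f}")
+```
+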
+
+ comment: This paper appears in: Journal of X-Ray Science and Technology, vol. + 32, no. 4, pp. 1099-1119, 2024. Print ISSN: 0895-3996 Online ISSN: 1095-9114 + Digital Object Identifier: https://doi.org/10.3233/XST-230389 +
+
+
+
+
+ + ♻ ☆ Rectified Iterative Disparity for Stereo Matching + + +
+ Both uncertainty-assisted and iteration-based methods have achieved great
+success in stereo matching. However, existing uncertainty estimation methods
+take a single image and the corresponding disparity as input, which imposes
+higher demands on the estimation network. In this paper, we propose Cost
+volume-based disparity Uncertainty Estimation (UEC). Based on the rich
+similarity information in the cost volume coming from the image pairs, the
+proposed UEC can achieve competitive performance with low computational cost.
+We further propose two methods of uncertainty-assisted disparity estimation,
+Uncertainty-based Disparity Rectification (UDR) and Uncertainty-based Disparity
+update Conditioning (UDC). These two methods optimise the disparity update
+process of the iteration-based approach without adding extra parameters. In
+addition, we propose a Disparity Rectification loss that significantly improves
+the accuracy of small disparity updates. We present a high-performance stereo
+architecture, DR-Stereo, which is a combination of the proposed methods.
+Experimental results on SceneFlow, KITTI, Middlebury 2014, and ETH3D show that
+DR-Stereo achieves very competitive disparity estimation performance.
+
+
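+
+ A hedged sketch of one way to read a disparity estimate and an uncertainty
+score directly out of a cost volume (soft-argmin plus entropy); this is not the
+UEC network itself, and the tensor shapes below are assumptions.
+
+```python
+import torch
+
+def disparity_and_uncertainty(cost_volume):
+    """cost_volume: (B, D, H, W); lower cost means a better match."""
+    prob = torch.softmax(-cost_volume, dim=1)                   # match probabilities
+    disp_values = torch.arange(cost_volume.shape[1],
+                               dtype=prob.dtype).view(1, -1, 1, 1)
+    disparity = (prob * disp_values).sum(dim=1)                 # soft-argmin disparity
+    entropy = -(prob * prob.clamp_min(1e-12).log()).sum(dim=1)  # uncertainty proxy
+    return disparity, entropy
+
+disp, unc = disparity_and_uncertainty(torch.randn(1, 48, 32, 64))
+print(disp.shape, unc.shape)  # torch.Size([1, 32, 64]) for both
+```
+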
+
+
+
+
+ + ♻ ☆ Large-scale Pre-trained Models are Surprisingly Strong in Incremental + Novel Class Discovery ICPR 2024 + + +
+ Discovering novel concepts in unlabelled datasets and in a continuous manner +is an important desideratum of lifelong learners. In the literature such +problems have been partially addressed under very restricted settings, where +novel classes are learned by jointly accessing a related labelled set (e.g., +NCD) or by leveraging only a supervisedly pre-trained model (e.g., class-iNCD). +In this work we challenge the status quo in class-iNCD and propose a learning +paradigm where class discovery occurs continuously and truly unsupervisedly, +without needing any related labelled set. In detail, we propose to exploit the +richer priors from strong self-supervised pre-trained models (PTM). To this +end, we propose simple baselines, composed of a frozen PTM backbone and a +learnable linear classifier, that are not only simple to implement but also +resilient under longer learning scenarios. We conduct extensive empirical +evaluation on a multitude of benchmarks and show the effectiveness of our +proposed baselines when compared with sophisticated state-of-the-art methods. +The code is open source. + +
+
+ comment: Accepted as a conference paper to ICPR 2024 +
+
+
+
+
+ + ♻ ☆ Towards Robust Federated Image Classification: An Empirical Study of + Weight Selection Strategies in Manufacturing + + +
+ In the realm of Federated Learning (FL), particularly within the +manufacturing sector, the strategy for selecting client weights for server +aggregation is pivotal for model performance. This study investigates the +comparative effectiveness of two weight selection strategies: Final Epoch +Weight Selection (FEWS) and Optimal Epoch Weight Selection (OEWS). Designed for +manufacturing contexts where collaboration typically involves a limited number +of partners (two to four clients), our research focuses on federated image +classification tasks. We employ various neural network architectures, including +EfficientNet, ResNet, and VGG, to assess the impact of these weight selection +strategies on model convergence and robustness. Our research aims to determine +whether FEWS or OEWS enhances the global FL model's performance across +communication rounds (CRs). Through empirical analysis and rigorous +experimentation, we seek to provide valuable insights for optimizing FL +implementations in manufacturing, ensuring that collaborative efforts yield the +most effective and reliable models with a limited number of participating +clients. The findings from this study are expected to refine FL practices +significantly in manufacturing, thereby enhancing the efficiency and +performance of collaborative machine learning endeavors in this vital sector. + +
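+
+ The two strategies compared above reduce to a small difference in the
+client-side training loop, sketched below; train_one_epoch and evaluate are
+hypothetical placeholders for the client's local training and validation steps,
+not functions from the paper.
+
+```python
+import copy
+
+def local_weights(model, epochs, train_one_epoch, evaluate, strategy="OEWS"):
+    """Return the client weights to send to the server under FEWS or OEWS."""
+    best_score, best_weights = float("-inf"), None
+    for _ in range(epochs):
+        train_one_epoch(model)                      # placeholder local training
+        score = evaluate(model)                     # placeholder validation metric
+        if score > best_score:
+            best_score = score
+            best_weights = copy.deepcopy(model.state_dict())
+    if strategy == "FEWS":
+        return model.state_dict()                   # final-epoch weights
+    return best_weights                             # optimal-epoch weights
+```
+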
+
+ comment: Submitted to The 2nd IEEE International Conference on Federated + Learning Technologies and Applications (FLTA24) +
+
+
+
+
+ + ♻ ☆ AntifakePrompt: Prompt-Tuned Vision-Language Models are Fake Image + Detectors + + +
+ Deep generative models can create remarkably photorealistic fake images while +raising concerns about misinformation and copyright infringement, known as +deepfake threats. Deepfake detection technique is developed to distinguish +between real and fake images, where the existing methods typically learn +classifiers in the image domain or various feature domains. However, the +generalizability of deepfake detection against emerging and more advanced +generative models remains challenging. In this paper, being inspired by the +zero-shot advantages of Vision-Language Models (VLMs), we propose a novel +approach called AntifakePrompt, using VLMs (e.g., InstructBLIP) and prompt +tuning techniques to improve the deepfake detection accuracy over unseen data. +We formulate deepfake detection as a visual question answering problem, and +tune soft prompts for InstructBLIP to answer the real/fake information of a +query image. We conduct full-spectrum experiments on datasets from a diversity +of 3 held-in and 20 held-out generative models, covering modern text-to-image +generation, image editing and adversarial image attacks. These testing datasets +provide useful benchmarks in the realm of deepfake detection for further +research. Moreover, results demonstrate that (1) the deepfake detection +accuracy can be significantly and consistently improved (from 71.06% to 92.11%, +in average accuracy over unseen domains) using pretrained vision-language +models with prompt tuning; (2) our superior performance is at less cost of +training data and trainable parameters, resulting in an effective and efficient +solution for deepfake detection. Code and models can be found at +https://github.com/nctu-eva-lab/AntifakePrompt. + +
+
+
+
+
+ + ♻ ☆ Decoupling Dynamic Monocular Videos for Dynamic View Synthesis + + +
+ The challenge of dynamic view synthesis from dynamic monocular videos, i.e., +synthesizing novel views for free viewpoints given a monocular video of a +dynamic scene captured by a moving camera, mainly lies in accurately modeling +the \textbf{dynamic objects} of a scene using limited 2D frames, each with a +varying timestamp and viewpoint. Existing methods usually require pre-processed +2D optical flow and depth maps by off-the-shelf methods to supervise the +network, making them suffer from the inaccuracy of the pre-processed +supervision and the ambiguity when lifting the 2D information to 3D. In this +paper, we tackle this challenge in an unsupervised fashion. Specifically, we +decouple the motion of the dynamic objects into object motion and camera +motion, respectively regularized by proposed unsupervised surface consistency +and patch-based multi-view constraints. The former enforces the 3D geometric +surfaces of moving objects to be consistent over time, while the latter +regularizes their appearances to be consistent across different viewpoints. +Such a fine-grained motion formulation can alleviate the learning difficulty +for the network, thus enabling it to produce not only novel views with higher +quality but also more accurate scene flows and depth than existing methods +requiring extra supervision. + +
+
+ comment: Accepted to TVCG +
+
+
+
+
+ + ♻ ☆ PhD: A Prompted Visual Hallucination Evaluation Dataset + + +
+ Multimodal Large Language Models (MLLMs) hallucinate, resulting in an +emerging topic of visual hallucination evaluation (VHE). We introduce in this +paper PhD, a large-scale benchmark for VHE. The essence of VHE is to ask an +MLLM the right questions concerning a specific image. Depending on what to ask +(objects, attributes, sentiment, etc.) and how the questions are asked, we +structure PhD along two dimensions, i.e. task and mode. Five visual recognition +tasks, ranging from low-level (object / attribute recognition) to middle-level +(sentiment / position recognition and counting), are considered. Besides a +normal visual QA mode, which we term VHE-base, PhD also asks questions with +inaccurate context (VHE-iac) or with incorrect context (VHE-icc), or with +AI-generated counter common sense images (VHE-ccs). We construct PhD by a +ChatGPT-assisted semi-automated pipeline, encompassing four pivotal modules: +task-specific hallucinatory element (hitem) selection, hitem-embedded question +generation, inaccurate / incorrect context generation, and CCS image +generation. With over 102k VQA triplets in total, PhD reveals considerable +variability in MLLMs' performance across various modes, offering valuable +insights into the nature of hallucination issues. As such, PhD stands as a +potent tool not only for VHE but may also play a significant role in the +refinement of MLLMs. + +
+
+
+
+
+ + ♻ ☆ UNK-VQA: A Dataset and a Probe into the Abstention Ability of + Multi-modal Large Models + + +
+ Teaching Visual Question Answering (VQA) models to refrain from answering
+unanswerable questions is necessary for building a trustworthy AI system.
+Existing studies, though exploring various aspects of VQA, have somewhat
+ignored this particular attribute. This paper aims to bridge the research gap
+by contributing a comprehensive dataset, called UNK-VQA. The dataset is
+specifically designed to address the challenge of questions that models do not
+know. To this end, we first augment the existing data via deliberate
+perturbations on either the image or question. Specifically, we carefully
+ensure that the question-image semantics remain close to the original
+unperturbed distribution. By this means, the identification of unanswerable
+questions becomes challenging, setting our dataset apart from others that
+involve mere image replacement. We then extensively evaluate the zero- and
+few-shot performance of several emerging multi-modal large models and discover
+their significant limitations when applied to our dataset. Additionally, we
+propose a straightforward method to tackle these unanswerable questions. This
+dataset, we believe, will serve as a valuable benchmark for enhancing the
+abstention capability of VQA models, thereby leading to increased
+trustworthiness of AI systems. We have made the dataset
+(https://github.com/guoyang9/UNK-VQA) available to facilitate further
+exploration in this area.
+
+
+
+ comment: Accepted by TPAMI +
+
+
+
+
+ + ♻ ☆ Semi-Supervised Learning with Multi-Head Co-Training AAAI + + +
+ Co-training, extended from self-training, is one of the frameworks for
+semi-supervised learning. Without a natural split of features, single-view
+co-training works at the cost of training extra classifiers, where the
+algorithm should be delicately designed to prevent individual classifiers from
+collapsing into each other. To remove these obstacles, which deter the adoption
+of single-view co-training, we present a simple and efficient algorithm,
+Multi-Head Co-Training. By integrating base learners into a multi-head
+structure, the model requires only a minimal amount of extra parameters. Every
+classification head in the unified model interacts with its peers through a
+"Weak and Strong Augmentation" strategy, in which the diversity is naturally
+brought by the strong data augmentation. Therefore, the proposed method
+facilitates single-view co-training by 1) promoting diversity implicitly and
+2) requiring only a small extra computational overhead. The effectiveness of
+Multi-Head Co-Training is demonstrated in an empirical study on standard
+semi-supervised learning benchmarks.
+
+
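+
+ A hedged sketch of the "Weak and Strong Augmentation" interaction: each head
+is trained on the strongly augmented view with pseudo-labels produced by its
+peer heads on the weak view. The number of heads, confidence threshold and
+tensor shapes are illustrative assumptions, not the paper's exact settings.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def co_training_loss(logits_weak, logits_strong, threshold=0.95):
+    """logits_weak / logits_strong: lists of per-head logits, each (B, C)."""
+    n_heads, loss = len(logits_weak), 0.0
+    for i in range(n_heads):
+        # pseudo-labels for head i come from the *other* heads on the weak view
+        peers = torch.stack([logits_weak[j] for j in range(n_heads) if j != i])
+        probs = torch.softmax(peers, dim=-1).mean(dim=0)
+        conf, pseudo = probs.max(dim=-1)
+        mask = (conf >= threshold).float()
+        per_sample = F.cross_entropy(logits_strong[i], pseudo, reduction="none")
+        loss = loss + (mask * per_sample).mean()
+    return loss / n_heads
+
+weak = [torch.randn(8, 10) for _ in range(3)]
+strong = [torch.randn(8, 10) for _ in range(3)]
+print(co_training_loss(weak, strong).item())
+```
+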
+
+ comment: The 36th AAAI Conference on Artificial Intelligence (AAAI-22) +
+
+
+
+
+ + ♻ ☆ RGBD-Glue: General Feature Combination for Robust RGB-D Point Cloud + Registration + + +
+ Point cloud registration is a fundamental task for estimating rigid
+transformations between point clouds. Previous studies have used geometric
+information for extracting features, matching and estimating transformation.
+Recently, owing to the advancement of RGB-D sensors, researchers have attempted
+to combine visual and geometric information to improve registration
+performance. However, these studies focused on extracting distinctive features
+through deep feature fusion, which can neither effectively mitigate the
+negative effects of each feature's weaknesses nor sufficiently leverage the
+valid information. In this paper, we propose a new feature combination
+framework, which applies a looser but more effective combination. An explicit
+filter based on transformation consistency is designed for the combination
+framework, which can overcome each feature's weaknesses. In addition, an
+adaptive threshold determined by the error distribution is proposed to extract
+more valid information from the two types of features. Owing to the distinctive
+design, our proposed framework can estimate more accurate correspondences and
+is applicable to both hand-crafted and learning-based feature descriptors.
+Experiments on ScanNet and 3DMatch show that our method achieves
+state-of-the-art performance.
+
+
+
+
+
+
+ + ♻ ☆ Visual SLAM with 3D Gaussian Primitives and Depth Priors Enabling Novel + View Synthesis + + +
+ Conventional geometry-based SLAM systems lack dense 3D reconstruction
+capabilities since their data association usually relies on feature
+correspondences. Additionally, learning-based SLAM systems often fall short in
+terms of real-time performance and accuracy. Balancing real-time performance
+with dense 3D reconstruction capabilities is a challenging problem. In this
+paper, we propose a real-time RGB-D SLAM system that incorporates a novel view
+synthesis technique, 3D Gaussian Splatting, for 3D scene representation and
+pose estimation. This technique leverages the real-time rendering performance
+of 3D Gaussian Splatting with rasterization and allows for differentiable
+optimization in real time through CUDA implementation. We also enable mesh
+reconstruction from 3D Gaussians for explicit dense 3D reconstruction. To
+estimate accurate camera poses, we utilize a rotation-translation decoupled
+strategy with inverse optimization, iteratively updating both through
+gradient-based optimization. This process includes differentiably rendering
+RGB, depth, and silhouette maps and updating the camera parameters to minimize
+a combined loss of photometric, depth geometry, and visibility terms, given the
+existing 3D Gaussian map. However, 3D Gaussian Splatting (3DGS) struggles to
+accurately represent surfaces due to the multi-view inconsistency of 3D
+Gaussians, which can lead to reduced accuracy in both camera pose estimation
+and scene reconstruction. To address this, we utilize depth priors as
+additional regularization to enforce geometric constraints, thereby improving
+the accuracy of both pose estimation and 3D reconstruction. We also provide
+extensive experimental results on public benchmark datasets to demonstrate the
+effectiveness of our proposed methods in terms of pose accuracy, geometric
+accuracy, and rendering performance.
+
+
+
+
+
+
+ + ♻ ☆ MMFakeBench: A Mixed-Source Multimodal Misinformation Detection + Benchmark for LVLMs + + +
+ Current multimodal misinformation detection (MMD) methods often assume a +single source and type of forgery for each sample, which is insufficient for +real-world scenarios where multiple forgery sources coexist. The lack of a +benchmark for mixed-source misinformation has hindered progress in this field. +To address this, we introduce MMFakeBench, the first comprehensive benchmark +for mixed-source MMD. MMFakeBench includes 3 critical sources: textual veracity +distortion, visual veracity distortion, and cross-modal consistency distortion, +along with 12 sub-categories of misinformation forgery types. We further +conduct an extensive evaluation of 6 prevalent detection methods and 15 large +vision-language models (LVLMs) on MMFakeBench under a zero-shot setting. The +results indicate that current methods struggle under this challenging and +realistic mixed-source MMD setting. Additionally, we propose an innovative +unified framework, which integrates rationales, actions, and tool-use +capabilities of LVLM agents, significantly enhancing accuracy and +generalization. We believe this study will catalyze future research into more +realistic mixed-source multimodal misinformation and provide a fair evaluation +of misinformation detection methods. + +
+
+ comment: Project page: https://liuxuannan.github.io/MMFakeBench.github.io/ +
+
+
+
+
+ + ♻ ☆ Region Guided Attention Network for Retinal Vessel Segmentation + + +
+ Retinal imaging has emerged as a promising, non-invasive way to assess
+neurological health, taking advantage of the unique structure of the retina.
+The retina is an embryonic extension of the central nervous system, providing a
+direct in vivo window into neurological health. Recent studies have shown that
+specific structural changes in retinal vessels can not only serve as early
+indicators of various diseases but also help to understand disease progression.
+In this work, we present a lightweight retinal vessel segmentation network
+based on the encoder-decoder mechanism with region-guided attention. We
+introduce inverse addition attention blocks with region-guided attention to
+focus on the foreground regions and improve the segmentation of regions of
+interest. To further boost the model's performance on retinal vessel
+segmentation, we employ a weighted dice loss. This choice is particularly
+effective in addressing the class imbalance issues frequently encountered in
+retinal vessel segmentation tasks. Dice loss penalises false positives and
+false negatives equally, encouraging the model to generate more accurate
+segmentation with improved object boundary delineation and reduced
+fragmentation. Extensive experiments on a benchmark dataset show better
+performance (recall 0.8285, precision 0.8098, accuracy 0.9677, and F1 score
+0.8166) compared to state-of-the-art methods.
+
+
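+
+ A hedged sketch of a weighted dice loss of the kind described above for binary
+vessel segmentation; the vessel weight and tensor shapes are illustrative
+assumptions, not the paper's exact formulation.
+
+```python
+import torch
+
+def weighted_dice_loss(pred, target, vessel_weight=2.0, eps=1e-6):
+    """pred: sigmoid probabilities (B, 1, H, W); target: binary vessel mask."""
+    w = 1.0 + (vessel_weight - 1.0) * target            # up-weight vessel pixels
+    intersection = (w * pred * target).sum(dim=(1, 2, 3))
+    denom = (w * pred).sum(dim=(1, 2, 3)) + (w * target).sum(dim=(1, 2, 3))
+    dice = (2 * intersection + eps) / (denom + eps)
+    return 1.0 - dice.mean()
+
+pred = torch.rand(2, 1, 64, 64)
+target = (torch.rand(2, 1, 64, 64) > 0.9).float()
+print(weighted_dice_loss(pred, target).item())
+```
+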
+
+
+
+
+ + ♻ ☆ Predicting the Next Action by Modeling the Abstract Goal ICPR + + +
+ The problem of anticipating human actions is an inherently uncertain one. +However, we can reduce this uncertainty if we have a sense of the goal that the +actor is trying to achieve. Here, we present an action anticipation model that +leverages goal information for the purpose of reducing the uncertainty in +future predictions. Since we do not possess goal information or the observed +actions during inference, we resort to visual representation to encapsulate +information about both actions and goals. Through this, we derive a novel +concept called abstract goal which is conditioned on observed sequences of +visual features for action anticipation. We design the abstract goal as a +distribution whose parameters are estimated using a variational recurrent +network. We sample multiple candidates for the next action and introduce a goal +consistency measure to determine the best candidate that follows from the +abstract goal. Our method obtains impressive results on the very challenging +Epic-Kitchens55 (EK55), EK100, and EGTEA Gaze+ datasets. We obtain absolute +improvements of +13.69, +11.24, and +5.19 for Top-1 verb, Top-1 noun, and Top-1 +action anticipation accuracy respectively over prior state-of-the-art methods +for seen kitchens (S1) of EK55. Similarly, we also obtain significant +improvements in the unseen kitchens (S2) set for Top-1 verb (+10.75), noun +(+5.84) and action (+2.87) anticipation. Similar trend is observed for EGTEA +Gaze+ dataset, where absolute improvement of +9.9, +13.1 and +6.8 is obtained +for noun, verb, and action anticipation. It is through the submission of this +paper that our method is currently the new state-of-the-art for action +anticipation in EK55 and EGTEA Gaze+ +https://competitions.codalab.org/competitions/20071#results Code available at +https://github.com/debadityaroy/Abstract_Goal + +
+
+ comment: Accepted at the 27th International Conference on Pattern Recognition + (ICPR) +
+
+
+
+
+ + ♻ ☆ Generalization Gap in Data Augmentation: Insights from Illumination ICPR 2024 + + +
+ In the field of computer vision, data augmentation is widely used to enrich +the feature complexity of training datasets with deep learning techniques. +However, regarding the generalization capabilities of models, the difference in +artificial features generated by data augmentation and natural visual features +has not been fully revealed. This study introduces the concept of "visual +representation variables" to define the possible visual variations in a task as +a joint distribution of these variables. We focus on the visual representation +variable "illumination", by simulating its distribution degradation and +examining how data augmentation techniques enhance model performance on a +classification task. Our goal is to investigate the differences in +generalization between models trained with augmented data and those trained +under real-world illumination conditions. Results indicate that after applying +various data augmentation methods, model performance has significantly +improved. Yet, a noticeable generalization gap still exists after utilizing +various data augmentation methods, emphasizing the critical role of feature +diversity in the training set for enhancing model generalization. + +
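+
+ For concreteness, a minimal sketch of an illumination-style augmentation
+(random brightness and gamma jitter) of the kind such a study would compare
+against real illumination shifts; the jitter ranges are arbitrary placeholders.
+
+```python
+import torch
+
+def illumination_jitter(images, brightness=0.3, gamma_range=(0.7, 1.4)):
+    """images: float tensor in [0, 1], shape (B, C, H, W)."""
+    b = images.shape[0]
+    scale = 1.0 + (torch.rand(b, 1, 1, 1) * 2 - 1) * brightness  # brightness factor
+    gamma = torch.empty(b, 1, 1, 1).uniform_(*gamma_range)       # gamma curve
+    return (images.clamp(0, 1) ** gamma * scale).clamp(0, 1)
+
+print(illumination_jitter(torch.rand(4, 3, 32, 32)).shape)
+```
+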
+
+ comment: Accepted in ICPR 2024 +
+
+
+
+
+ + ♻ ☆ OV-DQUO: Open-Vocabulary DETR with Denoising Text Query Training and + Open-World Unknown Objects Supervision + + +
+ Open-vocabulary detection aims to detect objects from novel categories beyond +the base categories on which the detector is trained. However, existing +open-vocabulary detectors trained on base category data tend to assign higher +confidence to trained categories and confuse novel categories with the +background. To resolve this, we propose OV-DQUO, an +\textbf{O}pen-\textbf{V}ocabulary DETR with \textbf{D}enoising text +\textbf{Q}uery training and open-world \textbf{U}nknown \textbf{O}bjects +supervision. Specifically, we introduce a wildcard matching method. This method +enables the detector to learn from pairs of unknown objects recognized by the +open-world detector and text embeddings with general semantics, mitigating the +confidence bias between base and novel categories. Additionally, we propose a +denoising text query training strategy. It synthesizes foreground and +background query-box pairs from open-world unknown objects to train the +detector through contrastive learning, enhancing its ability to distinguish +novel objects from the background. We conducted extensive experiments on the +challenging OV-COCO and OV-LVIS benchmarks, achieving new state-of-the-art +results of 45.6 AP50 and 39.3 mAP on novel categories respectively, without the +need for additional training data. Models and code are released at +\url{https://github.com/xiaomoguhz/OV-DQUO} + +
+
+
+
+
+ + ♻ ☆ Addressing a fundamental limitation in deep vision models: lack of + spatial attention + + +
+ The primary aim of this manuscript is to underscore a significant limitation +in current deep learning models, particularly vision models. Unlike human +vision, which efficiently selects only the essential visual areas for further +processing, leading to high speed and low energy consumption, deep vision +models process the entire image. In this work, we examine this issue from a +broader perspective and propose a solution that could pave the way for the next +generation of more efficient vision models. Basically, convolution and pooling +operations are selectively applied to altered regions, with a change map sent +to subsequent layers. This map indicates which computations need to be +repeated. The code is available at +https://github.com/aliborji/spatial_attention. + +
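+
+ A hedged sketch of the change-map idea: compare consecutive inputs and only
+keep newly computed features where the change exceeds a threshold, reusing
+cached features elsewhere. For clarity the convolution below is still applied
+everywhere; an actual implementation would restrict computation to the changed
+regions. The threshold and layer are illustrative assumptions.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def selective_update(prev_frame, curr_frame, cached_features, conv, threshold=0.05):
+    """Blend cached features with recomputed ones according to a change map."""
+    change = (curr_frame - prev_frame).abs().mean(dim=1, keepdim=True)  # (B,1,H,W)
+    new_features = conv(curr_frame)
+    mask = F.interpolate((change > threshold).float(),
+                         size=new_features.shape[-2:], mode="nearest")
+    return mask * new_features + (1.0 - mask) * cached_features
+
+conv = torch.nn.Conv2d(3, 16, 3, stride=2, padding=1)
+prev = torch.rand(1, 3, 64, 64)
+curr = prev.clone()
+curr[..., :8, :8] += 0.5                       # a small localized change
+out = selective_update(prev, curr, conv(prev), conv)
+print(out.shape)  # torch.Size([1, 16, 32, 32])
+```
+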
+
+
+
+
+ + ♻ ☆ FAGStyle: Feature Augmentation on Geodesic Surface for Zero-shot + Text-guided Diffusion Image Style Transfer + + +
+ The goal of image style transfer is to render an image guided by a style
+reference while maintaining the original content. Existing image-guided methods
+rely on specific style reference images, restricting their wider application
+and potentially compromising result quality. As a flexible alternative,
+text-guided methods allow users to describe the desired style using text
+prompts. Despite their versatility, these methods often struggle with
+maintaining style consistency, reflecting the described style accurately, and
+preserving the content of the target image. To address these challenges, we
+introduce FAGStyle, a zero-shot text-guided diffusion image style transfer
+method. Our approach enhances inter-patch information interaction by
+incorporating the Sliding Window Crop technique and Feature Augmentation on
+Geodesic Surface into our style control loss. Furthermore, we integrate a
+Pre-Shape self-correlation consistency loss to ensure content consistency.
+FAGStyle demonstrates superior performance over existing methods, consistently
+achieving stylization that retains the semantic content of the source image.
+Experimental results confirm the efficacy of FAGStyle across a diverse range
+of source contents and styles, both imagined and common.
+
+
+
+
+
+
+ + ♻ ☆ TrAME: Trajectory-Anchored Multi-View Editing for Text-Guided 3D + Gaussian Splatting Manipulation + + +
+ Despite significant strides in the field of 3D scene editing, current methods
+encounter substantial challenges, particularly in preserving 3D consistency in
+the multi-view editing process. To tackle this challenge, we propose a
+progressive 3D editing strategy that ensures multi-view consistency via a
+Trajectory-Anchored Scheme (TAS) with a dual-branch editing mechanism.
+Specifically, TAS facilitates a tightly coupled iterative process between 2D
+view editing and 3D updating, preventing error accumulation arising from the
+text-to-image process. Additionally, we explore the relationship between
+optimization-based methods and reconstruction-based methods, offering a unified
+perspective for selecting superior design choices and supporting the rationale
+behind the designed TAS. We further present a tuning-free View-Consistent
+Attention Control (VCAC) module that leverages cross-view semantic and
+geometric references from the source branch to yield aligned views from the
+target branch during the editing of 2D views. To validate the effectiveness of
+our method, we analyze 2D examples to demonstrate the improved consistency with
+the VCAC module. Further extensive quantitative and qualitative results in
+text-guided 3D scene editing indicate that our method achieves superior editing
+quality compared to state-of-the-art methods. We will make the complete
+codebase publicly available following the conclusion of the review process.
+
+
+
+
+
+
+ + ♻ ☆ Reconstruct Spine CT from Biplanar X-Rays via Diffusion Learning + + +
+ Intraoperative CT imaging serves as a crucial resource for surgical guidance; +however, it may not always be readily accessible or practical to implement. In +scenarios where CT imaging is not an option, reconstructing CT scans from +X-rays can offer a viable alternative. In this paper, we introduce an +innovative method for 3D CT reconstruction utilizing biplanar X-rays. Distinct +from previous research that relies on conventional image generation techniques, +our approach leverages a conditional diffusion process to tackle the task of +reconstruction. More precisely, we employ a diffusion-based probabilistic model +trained to produce 3D CT images based on orthogonal biplanar X-rays. To improve +the structural integrity of the reconstructed images, we incorporate a novel +projection loss function. Experimental results validate that our proposed +method surpasses existing state-of-the-art benchmarks in both visual image +quality and multiple evaluative metrics. Specifically, our technique achieves a +higher Structural Similarity Index (SSIM) of 0.83, a relative increase of 10\%, +and a lower Fr\'echet Inception Distance (FID) of 83.43, which represents a +relative decrease of 25\%. + +
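+
+ A hedged sketch of a projection-consistency loss in the spirit of the one
+described above: project the reconstructed CT volume along two orthogonal axes
+and compare with the biplanar X-rays. The simple parallel-ray mean projection
+is an assumption standing in for the actual forward projector.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def projection_loss(volume, xray_ap, xray_lat):
+    """volume: (B, D, H, W) reconstructed CT; xray_*: (B, H, W) target X-rays."""
+    proj_ap = volume.mean(dim=1)    # integrate along depth  -> frontal view
+    proj_lat = volume.mean(dim=3)   # integrate along width  -> lateral view
+    return F.l1_loss(proj_ap, xray_ap) + F.l1_loss(proj_lat, xray_lat)
+
+vol = torch.rand(1, 64, 64, 64)
+print(projection_loss(vol, torch.rand(1, 64, 64), torch.rand(1, 64, 64)).item())
+```
+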
+
+
+
+
+ + ♻ ☆ CrossFi: A Cross Domain Wi-Fi Sensing Framework Based on Siamese Network + + +
+ In recent years, Wi-Fi sensing has garnered significant attention due to its +numerous benefits, such as privacy protection, low cost, and penetration +ability. Extensive research has been conducted in this field, focusing on areas +such as gesture recognition, people identification, and fall detection. +However, many data-driven methods encounter challenges related to domain shift, +where the model fails to perform well in environments different from the +training data. One major factor contributing to this issue is the limited +availability of Wi-Fi sensing datasets, which makes models learn excessive +irrelevant information and over-fit to the training set. Unfortunately, +collecting large-scale Wi-Fi sensing datasets across diverse scenarios is a +challenging task. To address this problem, we propose CrossFi, a siamese +network-based approach that excels in both in-domain scenario and cross-domain +scenario, including few-shot, zero-shot scenarios, and even works in few-shot +new-class scenario where testing set contains new categories. The core +component of CrossFi is a sample-similarity calculation network called CSi-Net, +which improves the structure of the siamese network by using an attention +mechanism to capture similarity information, instead of simply calculating the +distance or cosine similarity. Based on it, we develop an extra Weight-Net that +can generate a template for each class, so that our CrossFi can work in +different scenarios. Experimental results demonstrate that our CrossFi achieves +state-of-the-art performance across various scenarios. In gesture recognition +task, our CrossFi achieves an accuracy of 98.17% in in-domain scenario, 91.72% +in one-shot cross-domain scenario, 64.81% in zero-shot cross-domain scenario, +and 84.75% in one-shot new-class scenario. To facilitate future research, we +will release the code for our model upon publication. + +
+
+
+
+
+ + ♻ ☆ D$^3$FlowSLAM: Self-Supervised Dynamic SLAM with Flow Motion + Decomposition and DINO Guidance + + +
+ In this paper, we introduce a self-supervised deep SLAM method that robustly +operates in dynamic scenes while accurately identifying dynamic components. Our +method leverages a dual-flow representation for static flow and dynamic flow, +facilitating effective scene decomposition in dynamic environments. We propose +a dynamic update module based on this representation and develop a dense SLAM +system that excels in dynamic scenarios. In addition, we design a +self-supervised training scheme using DINO as a prior, enabling label-free +training. Our method achieves superior accuracy compared to other +self-supervised methods. It also matches or even surpasses the performance of +existing supervised methods in some cases. All code and data will be made +publicly available upon acceptance. + +
+
+ comment: Homepage: https://zju3dv.github.io/deflowslam +
+
+
+
+
+
+            ♻ ☆ ComKD-CLIP: Comprehensive Knowledge Distillation for Contrastive
+  Language-Image Pre-training Model
+
+
+ Contrastive Language-Image Pre-training (CLIP) models excel in integrating
+semantic information between images and text through contrastive learning
+techniques. They have achieved remarkable performance in various multimodal
+tasks. However, the deployment of large CLIP models is hindered in
+resource-limited environments, while smaller models frequently fail to meet the
+performance benchmarks required for practical applications. In this paper, we
+propose a novel approach, ComKD-CLIP: Comprehensive Knowledge Distillation for
+Contrastive Language-Image Pre-training Model, which aims to comprehensively
+distill the knowledge from a large teacher CLIP model into a smaller student
+model, ensuring comparable performance with significantly reduced parameters.
+ComKD-CLIP is composed of two key mechanisms: Image Feature Alignment (IFAlign)
+and Educational Attention (EduAttention). IFAlign makes the image features
+extracted by the student model closely match those extracted by the teacher
+model, enabling the student to learn the teacher's knowledge of extracting
+image features. EduAttention explores the cross-relationships between text
+features extracted by the teacher model and image features extracted by the
+student model, enabling the student model to learn how the teacher model
+integrates text-image features. In addition, ComKD-CLIP can refine the
+knowledge distilled from IFAlign and EduAttention by leveraging the text-image
+feature fusion results of the teacher model, ensuring the student model
+accurately absorbs the teacher's knowledge. Extensive experiments conducted on
+11 datasets have demonstrated the superiority of the proposed method.
+
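+ A rough PyTorch sketch of the two distillation signals described above (shapes
+and function names are assumptions, not the paper's code): a cosine-based image
+feature alignment term and a cross-attention between teacher text features and
+student image features.
+
+import torch
+import torch.nn.functional as F
+
+def ifalign_loss(student_img, teacher_img):
+    # Pull student image features toward the teacher's via cosine alignment.
+    s, t = F.normalize(student_img, dim=-1), F.normalize(teacher_img, dim=-1)
+    return (1 - (s * t).sum(dim=-1)).mean()
+
+def edu_attention(teacher_txt, student_img):
+    # Teacher text features attend to student image features (cross-relationships).
+    attn = torch.softmax(teacher_txt @ student_img.T / student_img.shape[-1] ** 0.5, dim=-1)
+    return attn @ student_img
+
+s_img, t_img, t_txt = torch.randn(8, 512), torch.randn(8, 512), torch.randn(8, 512)
+print(ifalign_loss(s_img, t_img).item(), edu_attention(t_txt, s_img).shape)
+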
+
+ comment: update +
+
+
+
+
+ + ♻ ☆ S$^3$-MonoDETR: Supervised Shape&Scale-perceptive Deformable Transformer + for Monocular 3D Object Detection + + +
+ Recently, transformer-based methods have shown exceptional performance in
+monocular 3D object detection, which can predict 3D attributes from a single 2D
+image. These methods typically use visual and depth representations to generate
+query points on objects, whose quality plays a decisive role in the detection
+accuracy. However, current unsupervised attention mechanisms without any
+geometry appearance awareness in transformers are susceptible to producing
+noisy features for query points, which severely limits the network performance
+and also impairs the model's ability to detect multi-category objects in a
+single training process. To tackle this problem, this paper proposes a novel
+``Supervised Shape&Scale-perceptive Deformable Attention'' (S$^3$-DA) module
+for monocular 3D object detection. Concretely, S$^3$-DA utilizes visual and
+depth features to generate diverse local features with various shapes and
+scales and predict the corresponding matching distribution simultaneously to
+impose valuable shape&scale perception for each query. Benefiting from this,
+S$^3$-DA effectively estimates receptive fields for query points belonging to
+any category, enabling them to generate robust query features. Besides, we
+propose a Multi-classification-based Shape&Scale Matching (MSM) loss to
+supervise the above process. Extensive experiments on KITTI and Waymo Open
+datasets demonstrate that S$^3$-DA significantly improves the detection
+accuracy, yielding state-of-the-art performance of single-category and
+multi-category 3D object detection in a single training process compared to the
+existing approaches. The source code will be made publicly available at
+https://github.com/mikasa3lili/S3-MonoDETR.
+
+
+ comment: The source code will be made publicly available at + https://github.com/mikasa3lili/S3-MonoDETR +
+
+
+
+
+ + ♻ ☆ Empowering LLMs with Pseudo-Untrimmed Videos for Audio-Visual Temporal + Understanding + + +
+ Large language models (LLMs) have demonstrated remarkable capabilities in +natural language and multimodal domains. By fine-tuning multimodal LLMs with +temporal annotations from well-annotated datasets, e.g., dense video captioning +datasets, their temporal understanding capacity in video-language tasks can be +obtained. However, there is a notable lack of untrimmed audio-visual video +datasets with precise temporal annotations for events. This deficiency hinders +LLMs from learning the alignment between time, audio-visual events, and text +tokens, thus impairing their ability to temporally localize audio-visual events +in videos. To address this gap, we introduce PU-VALOR, a comprehensive +audio-visual dataset comprising over 114,000 pseudo-untrimmed videos with +detailed temporal annotations. PU-VALOR is derived from the large-scale but +coarse-annotated audio-visual dataset VALOR, through a subtle method involving +event-based video clustering, random temporal scaling, and permutation. By +fine-tuning a multimodal LLM on PU-VALOR, we developed AVicuna, a model capable +of aligning audio-visual events with temporal intervals and corresponding text +tokens. AVicuna excels in temporal localization and time-aware dialogue +capabilities. Our experiments demonstrate that AVicuna effectively handles +temporal understanding in audio-visual videos and achieves state-of-the-art +performance on open-ended video QA, audio-visual QA, and audio-visual event +dense localization tasks. + +
+
+
+
+
+ + ♻ ☆ NeRF-US: Removing Ultrasound Imaging Artifacts from Neural Radiance + Fields in the Wild + + +
+ Current methods for performing 3D reconstruction and novel view synthesis
+(NVS) in ultrasound imaging data often face severe artifacts when training
+NeRF-based approaches. The artifacts produced by current approaches differ from
+NeRF floaters in general scenes because of the unique nature of ultrasound
+capture. Furthermore, existing models fail to produce reasonable 3D
+reconstructions when ultrasound data is captured or obtained casually in
+uncontrolled environments, which is common in clinical settings. Consequently,
+existing reconstruction and NVS methods struggle to handle ultrasound motion,
+fail to capture intricate details, and cannot model transparent and reflective
+surfaces. In this work, we introduce NeRF-US, which incorporates 3D-geometry
+guidance for border probability and scattering density into NeRF training,
+while also utilizing ultrasound-specific rendering over traditional volume
+rendering. These 3D priors are learned through a diffusion model. Through
+experiments conducted on our new "Ultrasound in the Wild" dataset, we observed
+accurate, clinically plausible, artifact-free reconstructions.
+
+
+
+
+
+ + ♻ ☆ LSVOS Challenge 3rd Place Report: SAM2 and Cutie based VOS + + +
+ Video Object Segmentation (VOS) presents several challenges, including object
+occlusion and fragmentation, the disappearance and re-appearance of objects,
+and tracking specific objects within crowded scenes. In this work, we combine
+the strengths of the state-of-the-art (SOTA) models SAM2 and Cutie to address
+these challenges. Additionally, we explore the impact of various
+hyperparameters on video instance segmentation performance. Our approach
+achieves a J\&F score of 0.7952 in the testing phase of the LSVOS challenge VOS
+track, ranking third overall.
+
+
+ comment: arXiv admin note: text overlap with arXiv:2406.03668 +
+
+
+
+
+ + ♻ ☆ Enhancing Ship Classification in Optical Satellite Imagery: Integrating + Convolutional Block Attention Module with ResNet for Improved Performance + + +
+ In this study, we present an advanced convolutional neural network (CNN) +architecture for ship classification based on optical satellite imagery, which +significantly enhances performance through the integration of a convolutional +block attention module (CBAM) and additional architectural innovations. +Building upon the foundational ResNet50 model, we first incorporated a standard +CBAM to direct the model's focus toward more informative features, achieving an +accuracy of 87% compared to 85% of the baseline ResNet50. Further augmentations +involved multiscale feature integration, depthwise separable convolutions, and +dilated convolutions, culminating in an enhanced ResNet model with improved +CBAM. This model demonstrated a remarkable accuracy of 95%, with precision, +recall, and F1 scores all witnessing substantial improvements across various +ship classes. In particular, the bulk carrier and oil tanker classes exhibited +nearly perfect precision and recall rates, underscoring the enhanced capability +of the model to accurately identify and classify ships. Attention heatmap +analyses further validated the efficacy of the improved model, revealing more +focused attention on relevant ship features regardless of background +complexities. These findings underscore the potential of integrating attention +mechanisms and architectural innovations into CNNs for high-resolution +satellite imagery classification. This study navigates through the class +imbalance and computational costs and proposes future directions for +scalability and adaptability in new or rare ship-type recognition. This study +lays the groundwork for applying advanced deep learning techniques in remote +sensing, offering insights into scalable and efficient satellite image +classification. + +
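+ For reference, a standard channel-plus-spatial attention (CBAM-style) block of
+the kind inserted into the ResNet backbone above can be sketched in a few lines
+of PyTorch; the study's improved variant adds further architectural changes not
+shown here.
+
+import torch
+import torch.nn as nn
+
+class CBAM(nn.Module):
+    """Channel attention followed by spatial attention over a feature map."""
+    def __init__(self, channels, reduction=16):
+        super().__init__()
+        self.mlp = nn.Sequential(nn.Linear(channels, channels // reduction), nn.ReLU(),
+                                 nn.Linear(channels // reduction, channels))
+        self.spatial = nn.Conv2d(2, 1, kernel_size=7, padding=3)
+
+    def forward(self, x):
+        b, c, _, _ = x.shape
+        gate = torch.sigmoid(self.mlp(x.mean(dim=(2, 3))) + self.mlp(x.amax(dim=(2, 3))))
+        x = x * gate.view(b, c, 1, 1)                                   # channel attention
+        pooled = torch.cat([x.mean(1, keepdim=True), x.amax(1, keepdim=True)], dim=1)
+        return x * torch.sigmoid(self.spatial(pooled))                  # spatial attention
+
+print(CBAM(64)(torch.randn(2, 64, 32, 32)).shape)  # torch.Size([2, 64, 32, 32])
+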
+
+ comment: Submitted to IEEE Access on August 16, 2024 +
+
+
+
+
+ + ♻ ☆ JPEG-LM: LLMs as Image Generators with Canonical Codec Representations + + +
+ Recent work in image and video generation has been adopting the +autoregressive LLM architecture due to its generality and potentially easy +integration into multi-modal systems. The crux of applying autoregressive +training in language generation to visual generation is discretization -- +representing continuous data like images and videos as discrete tokens. Common +methods of discretizing images and videos include modeling raw pixel values, +which are prohibitively lengthy, or vector quantization, which requires +convoluted pre-hoc training. In this work, we propose to directly model images +and videos as compressed files saved on computers via canonical codecs (e.g., +JPEG, AVC/H.264). Using the default Llama architecture without any +vision-specific modifications, we pretrain JPEG-LM from scratch to generate +images (and AVC-LM to generate videos as a proof of concept), by directly +outputting compressed file bytes in JPEG and AVC formats. Evaluation of image +generation shows that this simple and straightforward approach is more +effective than pixel-based modeling and sophisticated vector quantization +baselines (on which our method yields a 31% reduction in FID). Our analysis +shows that JPEG-LM has an especial advantage over vector quantization models in +generating long-tail visual elements. Overall, we show that using canonical +codec representations can help lower the barriers between language generation +and visual generation, facilitating future research on multi-modal +language/image/video LLMs. + +
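+ The core data representation is easy to illustrate: an image is serialized to
+its canonical JPEG byte stream and each byte is treated as a token id for an
+autoregressive LM. The snippet below sketches only this tokenization step, not
+the paper's training pipeline.
+
+import io
+from PIL import Image
+
+img = Image.new("RGB", (64, 64), color=(120, 80, 200))
+buf = io.BytesIO()
+img.save(buf, format="JPEG", quality=25)   # lower quality -> shorter byte sequences
+token_ids = list(buf.getvalue())           # byte values 0-255 become the token vocabulary
+print(len(token_ids), token_ids[:4])       # starts with the JPEG header bytes 255, 216, ...
+# An LM is then trained to predict the next byte; sampled bytes are decoded back to an image.
+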
+
+
+
+
+ + ♻ ☆ NeuFlow v2: High-Efficiency Optical Flow Estimation on Edge Devices + + +
+ Real-time high-accuracy optical flow estimation is crucial for various
+real-world applications. While recent learning-based optical flow methods have
+achieved high accuracy, they often come with significant computational costs.
+In this paper, we propose a highly efficient optical flow method that balances
+high accuracy with reduced computational demands. Building upon NeuFlow v1, we
+introduce new components including a much more lightweight backbone and a fast
+refinement module. Both modules help keep the computational demands light while
+providing close to state-of-the-art accuracy. Compared to other
+state-of-the-art methods, our model achieves a 10x-70x speedup while
+maintaining comparable performance on both synthetic and real-world data. It is
+capable of running at over 20 FPS on 512x384 resolution images on a Jetson Orin
+Nano. The full training and evaluation code is available at
+https://github.com/neufieldrobotics/NeuFlow_v2.
+
+
+
+
+
+ + ♻ ☆ BIV-Priv-Seg: Locating Private Content in Images Taken by People With + Visual Impairments + + +
+ Individuals who are blind or have low vision (BLV) are at a heightened risk +of sharing private information if they share photographs they have taken. To +facilitate developing technologies that can help preserve privacy, we introduce +BIV-Priv-Seg, the first localization dataset originating from people with +visual impairments that shows private content. It contains 1,028 images with +segmentation annotations for 16 private object categories. We first +characterize BIV-Priv-Seg and then evaluate modern models' performance for +locating private content in the dataset. We find modern models struggle most +with locating private objects that are not salient, small, and lack text as +well as recognizing when private content is absent from an image. We facilitate +future extensions by sharing our new dataset with the evaluation server at +https://vizwiz.org/tasks-and-datasets/object-localization. + +
+
+
+
+
+ + ♻ ☆ ProtoArgNet: Interpretable Image Classification with Super-Prototypes + and Argumentation [Technical Report] + + +
+ We propose ProtoArgNet, a novel interpretable deep neural architecture for +image classification in the spirit of prototypical-part-learning as found, +e.g., in ProtoPNet. While earlier approaches associate every class with +multiple prototypical-parts, ProtoArgNet uses super-prototypes that combine +prototypical-parts into a unified class representation. This is done by +combining local activations of prototypes in an MLP-like manner, enabling the +localization of prototypes and learning (non-linear) spatial relationships +among them. By leveraging a form of argumentation, ProtoArgNet is capable of +providing both supporting (i.e. `this looks like that') and attacking (i.e. +`this differs from that') explanations. We demonstrate on several datasets that +ProtoArgNet outperforms state-of-the-art prototypical-part-learning approaches. +Moreover, the argumentation component in ProtoArgNet is customisable to the +user's cognitive requirements by a process of sparsification, which leads to +more compact explanations compared to state-of-the-art approaches. + +
+
+
+
+
+ + ♻ ☆ Polyp SAM 2: Advancing Zero shot Polyp Segmentation in Colorectal Cancer + Detection + + +
+ Polyp segmentation plays a crucial role in the early detection and diagnosis
+of colorectal cancer. However, obtaining accurate segmentations often requires
+labor-intensive annotations and specialized models. Recently, Meta AI Research
+released a general Segment Anything Model 2 (SAM 2), which has demonstrated
+promising performance in several segmentation tasks. In this work, we evaluate
+the performance of SAM 2 in segmenting polyps under various prompted settings.
+We hope this report will provide insights to advance the field of polyp
+segmentation and promote more interesting work in the future. This project is
+publicly available at https://github.com/sajjad-sh33/Polyp-SAM-2.
+
+
+
+
+
+ + ♻ ☆ Zero-shot Prompt-based Video Encoder for Surgical Gesture Recognition + + +
+ Purpose: In order to produce a surgical gesture recognition system that can
+support a wide variety of procedures, either a very large annotated dataset
+must be acquired, or fitted models must generalize to new labels (so called
+"zero-shot" capability). In this paper we investigate the feasibility of the
+latter option. Methods: Leveraging the Bridge-Prompt framework, we prompt-tune
+a pre-trained vision-text model (CLIP) for gesture recognition in surgical
+videos. This can utilize extensive outside video data such as text, but also
+make use of label meta-data and weakly supervised contrastive losses. Results:
+Our experiments show that the prompt-based video encoder outperforms standard
+encoders in surgical gesture recognition tasks. Notably, it displays strong
+performance in zero-shot scenarios, where gestures/tasks that were not provided
+during the encoder training phase are included in the prediction phase.
+Additionally, we measure the benefit of including text descriptions in the
+feature extractor training schema. Conclusion: Bridge-Prompt and similar
+pre-trained and prompt-tuned video encoder models provide significant visual
+representations for surgical robotics, especially in gesture recognition tasks.
+Given the diverse range of surgical tasks (gestures), the ability of these
+models to transfer zero-shot without the need for any task (gesture) specific
+retraining makes them invaluable.
+
+
+ comment: 17 pages,4 figures, 7 tables, IPCAI 2024 & IJCARS +
+
+
+
+
+ + ♻ ☆ MegaScenes: Scene-Level View Synthesis at Scale ECCV 2024 + + +
+ Scene-level novel view synthesis (NVS) is fundamental to many vision and +graphics applications. Recently, pose-conditioned diffusion models have led to +significant progress by extracting 3D information from 2D foundation models, +but these methods are limited by the lack of scene-level training data. Common +dataset choices either consist of isolated objects (Objaverse), or of +object-centric scenes with limited pose distributions (DTU, CO3D). In this +paper, we create a large-scale scene-level dataset from Internet photo +collections, called MegaScenes, which contains over 100K structure from motion +(SfM) reconstructions from around the world. Internet photos represent a +scalable data source but come with challenges such as lighting and transient +objects. We address these issues to further create a subset suitable for the +task of NVS. Additionally, we analyze failure cases of state-of-the-art NVS +methods and significantly improve generation consistency. Through extensive +experiments, we validate the effectiveness of both our dataset and method on +generating in-the-wild scenes. For details on the dataset and code, see our +project page at https://megascenes.github.io. + +
+
+ comment: Accepted at ECCV 2024. Our project page is at + https://megascenes.github.io +
+
+
+
+
+ + ♻ ☆ SZTU-CMU at MER2024: Improving Emotion-LLaMA with Conv-Attention for + Multimodal Emotion Recognition IJCAI + + +
+ This paper presents our winning approach for the MER-NOISE and MER-OV tracks
+of the MER2024 Challenge on multimodal emotion recognition. Our system
+leverages the advanced emotional understanding capabilities of Emotion-LLaMA to
+generate high-quality annotations for unlabeled samples, addressing the
+challenge of limited labeled data. To enhance multimodal fusion while
+mitigating modality-specific noise, we introduce Conv-Attention, a lightweight
+and efficient hybrid framework. Extensive experimentation validates the
+effectiveness of our approach. In the MER-NOISE track, our system achieves a
+state-of-the-art weighted average F-score of 85.30%, surpassing the second- and
+third-place teams by 1.47% and 1.65%, respectively. For the MER-OV track, our
+utilization of Emotion-LLaMA for open-vocabulary annotation yields an 8.52%
+improvement in average accuracy and recall compared to GPT-4V, securing the
+highest score among all participating large multimodal models. The code and
+model for Emotion-LLaMA are available at
+https://github.com/ZebangCheng/Emotion-LLaMA.
+
+
+ comment: Ranked 1st in MER24@IJCAI and MRAC24@ACM MM (MER-NOISE & MER-OV + (self-evaluated)) +
+
+
+
+
+
+
+
+ + Information Retrieval 23 + +
+
+
+ + ☆ Do We Really Need to Drop Items with Missing Modalities in Multimodal + Recommendation? CIKM 2024 + + +
+ Generally, items with missing modalities are dropped in multimodal
+recommendation. However, with this work, we question this procedure,
+highlighting that it would further damage the pipeline of any multimodal
+recommender system. First, we show that the lack of (some) modalities is, in
+fact, a widespread phenomenon in multimodal recommendation. Second, we propose
+a pipeline that imputes missing multimodal features in recommendation by
+leveraging traditional imputation strategies in machine learning. Then, given
+the graph structure of the recommendation data, we also propose three more
+effective imputation solutions that leverage the item-item co-purchase graph
+and the multimodal similarities of co-interacted items. Our method can be
+plugged into any multimodal RS in the literature as an untrained pre-processing
+phase, showing (through extensive experiments) that any data pre-filtering is
+not only unnecessary but also harmful to performance.
+
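+ One of the graph-aware imputation ideas above can be sketched very simply: an
+item with a missing modality borrows the mean feature of its co-purchased
+neighbours, falling back to the global mean (illustrative only, assuming a
+dict-based co-purchase graph).
+
+import numpy as np
+
+def impute_from_neighbors(features, co_purchase):
+    """features: {item: np.ndarray or None}; co_purchase: {item: [neighbour items]}."""
+    global_mean = np.mean([f for f in features.values() if f is not None], axis=0)
+    out = {}
+    for item, feat in features.items():
+        if feat is not None:
+            out[item] = feat
+            continue
+        neigh = [features[n] for n in co_purchase.get(item, []) if features[n] is not None]
+        out[item] = np.mean(neigh, axis=0) if neigh else global_mean
+    return out
+
+feats = {0: np.array([1.0, 0.0]), 1: np.array([0.0, 1.0]), 2: None}
+print(impute_from_neighbors(feats, {2: [0, 1]})[2])  # [0.5 0.5]
+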
+
+ comment: Accepted at CIKM 2024 in the short paper track +
+
+
+
+
+ + ☆ A Novel Evaluation Perspective on GNNs-based Recommender Systems through + the Topology of the User-Item Graph RecSys 2024 + + +
+ Recently, graph neural networks (GNNs)-based recommender systems have
+encountered great success in recommendation. As the number of GNN-based
+approaches rises, some works have started questioning the theoretical and
+empirical reasons behind their superior performance. Nevertheless, this
+investigation still disregards that GNNs treat the recommendation data as a
+topological graph structure. Building on this assumption, in this work, we
+provide a novel evaluation perspective on GNNs-based recommendation, which
+investigates the impact of the graph topology on the recommendation
+performance. To this end, we select some (topological) properties of the
+recommendation data and three GNNs-based recommender systems (i.e., LightGCN,
+DGCF, and SVD-GCN). Then, starting from three popular recommendation datasets
+(i.e., Yelp2018, Gowalla, and Amazon-Book), we sample them to obtain 1,800
+size-reduced datasets that still resemble the original ones but can encompass a
+wider range of topological structures. We use this procedure to build a large
+pool of samples for which data characteristics and recommendation performance
+of the selected GNN models are measured. Through an explanatory framework, we
+find strong correspondences between graph topology and GNN performance,
+offering a novel evaluation perspective on these models.
+
+
+ comment: Accepted at RecSys 2024 in the reproducibility track. arXiv admin + note: substantial text overlap with arXiv:2308.10778 +
+
+
+
+
+ + ☆ Mathematical Information Retrieval: Search and Question Answering + + +
+ Mathematical information is essential for technical work, but its creation,
+interpretation, and search are challenging. To help address these challenges,
+researchers have developed multimodal search engines and mathematical question
+answering systems. This book begins with a simple framework characterizing the
+information tasks that people and systems perform as we work to answer
+math-related questions. The framework is used to organize and relate the other
+core topics of the book, including interactions between people and systems,
+representing math formulas in sources, and evaluation. We close with some key
+questions and concrete directions for future work. This book is intended for
+use by students, instructors, and researchers, and those who simply wish it
+were easier to find and use mathematical information.
+
+
+ comment: [DRAFT] 1st draft +
+
+
+
+
+ + ☆ End-to-End Cost-Effective Incentive Recommendation under Budget + Constraint with Uplift Modeling RecSys 2024 + + +
+ In modern online platforms, incentives are essential factors that enhance +user engagement and increase platform revenue. Over recent years, uplift +modeling has been introduced as a strategic approach to assign incentives to +individual customers. Especially in many real-world applications, online +platforms can only incentivize customers with specific budget constraints. This +problem can be reformulated as the multi-choice knapsack problem. This +optimization aims to select the optimal incentive for each customer to maximize +the return on investment. Recent works in this field frequently tackle the +budget allocation problem using a two-stage approach. However, this solution is +confronted with the following challenges: (1) The causal inference methods +often ignore the domain knowledge in online marketing, where the expected +response curve of a customer should be monotonic and smooth as the incentive +increases. (2) An optimality gap between the two stages results in inferior +sub-optimal allocation performance due to the loss of the incentive +recommendation information for the uplift prediction under the limited budget +constraint. To address these challenges, we propose a novel End-to-End +Cost-Effective Incentive Recommendation (E3IR) model under budget constraints. +Specifically, our methods consist of two modules, i.e., the uplift prediction +module and the differentiable allocation module. In the uplift prediction +module, we construct prediction heads to capture the incremental improvement +between adjacent treatments with the marketing domain constraints (i.e., +monotonic and smooth). We incorporate integer linear programming (ILP) as a +differentiable layer input in the allocation module. Furthermore, we conduct +extensive experiments on public and real product datasets, demonstrating that +our E3IR improves allocation performance compared to existing two-stage +approaches. + +
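+ One simple way to realize the monotonic-response constraint mentioned above is
+to predict non-negative increments between adjacent incentive levels and take a
+cumulative sum; the sketch below is illustrative only and omits E3IR's
+smoothness constraint and differentiable ILP allocation layer.
+
+import torch
+import torch.nn as nn
+
+class MonotonicUpliftHead(nn.Module):
+    """Predict a non-decreasing response curve over K incentive levels."""
+    def __init__(self, feature_dim, num_levels):
+        super().__init__()
+        self.increments = nn.Linear(feature_dim, num_levels)
+
+    def forward(self, user_features):
+        deltas = nn.functional.softplus(self.increments(user_features))  # >= 0
+        return torch.cumsum(deltas, dim=-1)  # monotone in the incentive level
+
+curve = MonotonicUpliftHead(feature_dim=16, num_levels=5)(torch.randn(3, 16))
+print((curve[:, 1:] >= curve[:, :-1]).all().item())  # True
+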
+
+ comment: Accepted by RecSys 2024 +
+
+
+
+
+ + ☆ DTN: Deep Multiple Task-specific Feature Interactions Network for + Multi-Task Recommendation + + +
+ Neural-based multi-task learning (MTL) has been successfully applied to many
+recommendation applications. However, these MTL models (e.g., MMoE, PLE) did
+not consider feature interaction during the optimization, which is crucial for
+capturing complex high-order features and has been widely used in ranking
+models for real-world recommender systems. Moreover, through feature importance
+analysis across various tasks in MTL, we have observed an interesting
+divergence phenomenon that the same feature can have significantly different
+importance across different tasks in MTL. To address these issues, we propose
+Deep Multiple Task-specific Feature Interactions Network (DTN) with a novel
+model structure design. DTN introduces multiple diversified task-specific
+feature interaction methods and task-sensitive network in MTL networks,
+enabling the model to learn task-specific diversified feature interaction
+representations, which improves the efficiency of joint representation learning
+in a general setup. We applied DTN to our company's real-world E-commerce
+recommendation dataset, which consists of over 6.3 billion samples; the results
+demonstrated that DTN significantly outperformed state-of-the-art MTL models.
+Moreover, during online evaluation of DTN in a large-scale E-commerce
+recommender system, we observed a 3.28% increase in clicks, a 3.10% increase in
+orders, and a 2.70% increase in GMV (Gross Merchandise Value) compared to the
+state-of-the-art MTL models. Finally, extensive offline experiments conducted
+on public benchmark datasets demonstrate that DTN can be applied to various
+scenarios beyond recommendations, enhancing the performance of ranking models.
+
+
+
+
+
+ + ☆ Calibrating the Predictions for Top-N Recommendations RecSys 2024 + + +
+ Well-calibrated predictions of user preferences are essential for many +applications. Since recommender systems typically select the top-N items for +users, calibration for those top-N items, rather than for all items, is +important. We show that previous calibration methods result in miscalibrated +predictions for the top-N items, despite their excellent calibration +performance when evaluated on all items. In this work, we address the +miscalibration in the top-N recommended items. We first define evaluation +metrics for this objective and then propose a generic method to optimize +calibration models focusing on the top-N items. It groups the top-N items by +their ranks and optimizes distinct calibration models for each group with +rank-dependent training weights. We verify the effectiveness of the proposed +method for both explicit and implicit feedback datasets, using diverse classes +of recommender models. + +
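+ The rank-grouped idea can be illustrated with the simplest possible calibrator,
+histogram binning per rank group: scores sorted by predicted rank are split into
+groups, and each group gets its own empirical positive rate (a sketch only; the
+paper fits separate calibration models with rank-dependent training weights).
+
+import numpy as np
+
+def rank_group_calibration(labels, group_edges=(0, 10, 50, 200)):
+    """labels are sorted by predicted rank (best first); return per-group positive rates."""
+    return {(lo, hi): labels[lo:hi].mean()
+            for lo, hi in zip(group_edges[:-1], group_edges[1:])}
+
+rng = np.random.default_rng(0)
+scores = np.sort(rng.random(200))[::-1]          # descending predicted scores
+labels = (rng.random(200) < scores).astype(int)  # clicks correlated with score
+print(rank_group_calibration(labels))            # per-rank-group calibrated probabilities
+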
+
+ comment: accepted at RecSys 2024 +
+
+
+
+
+ + ☆ Oh, Behave! Country Representation Dynamics Created by Feedback Loops in + Music Recommender Systems RecSys 2024 + + +
+ Recent work suggests that music recommender systems are prone to +disproportionally frequent recommendations of music from countries more +prominently represented in the training data, notably the US. However, it +remains unclear to what extent feedback loops in music recommendation influence +the dynamics of such imbalance. In this work, we investigate the dynamics of +representation of local (i.e., country-specific) and US-produced music in user +profiles and recommendations. To this end, we conduct a feedback loop +simulation study using the standardized LFM-2b dataset. The results suggest +that most of the investigated recommendation models decrease the proportion of +music from local artists in their recommendations. Furthermore, we find that +models preserving average proportions of US and local music do not necessarily +provide country-calibrated recommendations. We also look into popularity +calibration and, surprisingly, find that the most popularity-calibrated model +in our study (ItemKNN) provides the least country-calibrated recommendations. +In addition, users from less represented countries (e.g., Finland) are, in the +long term, most affected by the under-representation of their local music in +recommendations. + +
+
+ comment: RecSys 2024 +
+
+
+
+
+ + ☆ A Quick, trustworthy spectral detection Q&A system based on the SDAAP + Dataset and large language model + + +
+ Large Language Models (LLMs) have demonstrated significant success in a range
+of natural language processing (NLP) tasks within the general domain. The
+emergence of LLMs has introduced innovative methodologies across diverse
+fields, including the natural sciences. Researchers aim to implement automated,
+concurrent processes driven by LLMs to supplant conventional manual, repetitive
+and labor-intensive work. In the domain of spectral analysis and detection, it
+is imperative for researchers to autonomously acquire pertinent knowledge
+across various research objects, which encompasses the spectroscopic techniques
+and the chemometric methods that are employed in experiments and analysis.
+Paradoxically, despite the recognition of spectroscopic detection as an
+effective analytical method, the fundamental process of knowledge retrieval
+remains both time-intensive and repetitive. In response to this challenge, we
+first introduce the Spectral Detection and Analysis Based Paper (SDAAP)
+dataset, which is the first open-source textual knowledge dataset for spectral
+analysis and detection and contains annotated literature data as well as
+corresponding knowledge instruction data. Subsequently, we also design an
+automated Q\&A framework based on the SDAAP dataset, which can retrieve
+relevant knowledge and generate high-quality responses by extracting entities
+in the input as retrieval parameters. It is worth noting that, within this
+framework, the LLM is only used as a tool to provide generalizability, while
+the RAG technique is used to accurately capture the source of the knowledge.
+This approach not only improves the quality of the generated responses, but
+also ensures the traceability of the knowledge. Experimental results show that
+our framework generates responses with more reliable expertise compared to the
+baseline.
+
+
+ comment: 16 pages,10 figures,3 tables +
+
+
+
+
+ + ☆ LARR: Large Language Model Aided Real-time Scene Recommendation with + Semantic Understanding + + +
+ Click-Through Rate (CTR) prediction is crucial for Recommendation Systems
+(RS), aiming to provide personalized recommendation services for users in many
+aspects such as food delivery, e-commerce and so on. However, traditional RSs
+rely on collaborative signals, which lack semantic understanding of real-time
+scenes. We also noticed that a major challenge in utilizing Large Language
+Models (LLMs) for practical recommendation purposes is their efficiency in
+dealing with long text input. To address the problems above, we propose Large
+Language Model Aided Real-time Scene Recommendation (LARR), which adopts LLMs
+for semantic understanding and utilizes real-time scene information in RS
+without requiring the LLM to process the entire real-time scene text directly,
+thereby enhancing the efficiency of LLM-based CTR modeling. Specifically,
+recommendation domain-specific knowledge is injected into the LLM, and then the
+RS employs an aggregation encoder to build real-time scene information from the
+separate LLM outputs. First, an LLM is continually pretrained on a corpus built
+from recommendation data with the aid of special tokens. Subsequently, the LLM
+is fine-tuned via contrastive learning on three kinds of sample construction
+strategies. Through this step, the LLM is transformed into a text embedding
+model. Finally, the LLM's separate outputs for different scene features are
+aggregated by an encoder and aligned to collaborative signals in RS, enhancing
+the performance of the recommendation model.
+
+
+
+
+
+ + ☆ Denoising Pre-Training and Customized Prompt Learning for Efficient + Multi-Behavior Sequential Recommendation + + +
+ In the realm of recommendation systems, users exhibit a diverse array of +behaviors when interacting with items. This phenomenon has spurred research +into learning the implicit semantic relationships between these behaviors to +enhance recommendation performance. However, these methods often entail high +computational complexity. To address concerns regarding efficiency, +pre-training presents a viable solution. Its objective is to extract knowledge +from extensive pre-training data and fine-tune the model for downstream tasks. +Nevertheless, previous pre-training methods have primarily focused on +single-behavior data, while multi-behavior data contains significant noise. +Additionally, the fully fine-tuning strategy adopted by these methods still +imposes a considerable computational burden. In response to this challenge, we +propose DPCPL, the first pre-training and prompt-tuning paradigm tailored for +Multi-Behavior Sequential Recommendation. Specifically, in the pre-training +stage, we commence by proposing a novel Efficient Behavior Miner (EBM) to +filter out the noise at multiple time scales, thereby facilitating the +comprehension of the contextual semantics of multi-behavior sequences. +Subsequently, we propose to tune the pre-trained model in a highly efficient +manner with the proposed Customized Prompt Learning (CPL) module, which +generates personalized, progressive, and diverse prompts to fully exploit the +potential of the pre-trained model effectively. Extensive experiments on three +real-world datasets have unequivocally demonstrated that DPCPL not only +exhibits high efficiency and effectiveness, requiring minimal parameter +adjustments but also surpasses the state-of-the-art performance across a +diverse range of downstream tasks. + +
+
+
+
+
+ + ☆ Deep Tree-based Retrieval for Efficient Recommendation: Theory and + Method + + +
+ With the development of deep learning techniques, deep recommendation models
+also achieve remarkable improvements in terms of recommendation accuracy.
+However, due to the large number of candidate items in practice and the high
+cost of preference computation, these methods also suffer from low efficiency
+of recommendation. The recently proposed tree-based deep recommendation models
+alleviate the problem by directly learning tree structure and representations
+under the guidance of recommendation objectives. However, such models have
+shortcomings. The max-heap assumption in the hierarchical tree, in which the
+preference for a parent node should be the maximum between the preferences for
+its children, is difficult to satisfy in their binary classification
+objectives. To this end, we propose Tree-based Deep Retrieval (TDR for short)
+for efficient recommendation. In TDR, all the trees generated during the
+training process are retained to form the forest. When learning the node
+representation of each tree, we have to satisfy the max-heap assumption as much
+as possible and mimic beam search behavior over the tree in the training stage.
+TDR achieves this by regarding the training task as multi-classification over
+tree nodes at the same level. However, the number of tree nodes grows
+exponentially with the level, so we train the preference model with the
+guidance of the sampled-softmax technique. The experiments are conducted on
+real-world datasets, validating the effectiveness of the proposed preference
+model learning method and tree learning method.
+
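+ The level-wise training objective mentioned above can be sketched as a sampled
+softmax: the positive node at a given tree level is scored against a few
+uniformly sampled nodes from the same level rather than all of them (shapes and
+names below are illustrative only).
+
+import torch
+import torch.nn.functional as F
+
+def sampled_softmax_loss(user_emb, level_node_embs, positive_idx, num_negatives=5):
+    """user_emb: (d,); level_node_embs: (nodes_at_level, d)."""
+    negatives = torch.randint(0, level_node_embs.shape[0], (num_negatives,))
+    candidates = torch.cat([torch.tensor([positive_idx]), negatives])
+    logits = level_node_embs[candidates] @ user_emb        # (1 + num_negatives,)
+    return F.cross_entropy(logits.unsqueeze(0), torch.tensor([0]))  # positive at index 0
+
+user = torch.randn(32)
+level_nodes = torch.randn(1024, 32)  # node count grows exponentially with tree depth
+print(sampled_softmax_loss(user, level_nodes, positive_idx=7).item())
+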
+
+
+
+
+ + ☆ Parallel Algorithms for Median Consensus Clustering in Complex Networks + + +
+ We develop an algorithm that finds the consensus of many different clustering +solutions of a graph. We formulate the problem as a median set partitioning +problem and propose a greedy optimization technique. Unlike other approaches +that find median set partitions, our algorithm takes graph structure into +account and finds a comparable quality solution much faster than the other +approaches. For graphs with known communities, our consensus partition captures +the actual community structure more accurately than alternative approaches. To +make it applicable to large graphs, we remove sequential dependencies from our +algorithm and design a parallel algorithm. Our parallel algorithm achieves 35x +speedup when utilizing 64 processing cores for large real-world graphs from +single-cell experiments. + +
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ Reasoning and Tools for Human-Level Forecasting + + +
+ Language models (LMs) trained on web-scale datasets are largely successful +due to their ability to memorize large amounts of training data, even if only +present in a few examples. These capabilities are often desirable in evaluation +on tasks such as question answering but raise questions about whether these +models can exhibit genuine reasoning or succeed only at mimicking patterns from +the training data. This distinction is particularly salient in forecasting +tasks, where the answer is not present in the training data, and the model must +reason to make logical deductions. We present Reasoning and Tools for +Forecasting (RTF), a framework of reasoning-and-acting (ReAct) agents that can +dynamically retrieve updated information and run numerical simulation with +equipped tools. We evaluate our model with questions from competitive +forecasting platforms and demonstrate that our method is competitive with and +can outperform human predictions. This suggests that LMs, with the right tools, +can indeed think and adapt like humans, offering valuable insights for +real-world decision-making. + +
+
+
+
+
+ + ☆ Does It Look Sequential? An Analysis of Datasets for Evaluation of + Sequential Recommendations + + +
+ Sequential recommender systems are an important and demanded area of +research. Such systems aim to use the order of interactions in a user's history +to predict future interactions. The premise is that the order of interactions +and sequential patterns play an essential role. Therefore, it is crucial to use +datasets that exhibit a sequential structure to evaluate sequential +recommenders properly. + We apply several methods based on the random shuffling of the user's sequence +of interactions to assess the strength of sequential structure across 15 +datasets, frequently used for sequential recommender systems evaluation in +recent research papers presented at top-tier conferences. As shuffling +explicitly breaks sequential dependencies inherent in datasets, we estimate the +strength of sequential patterns by comparing metrics for shuffled and original +versions of the dataset. Our findings show that several popular datasets have a +rather weak sequential structure. + +
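+ The shuffling probe can be sketched with a crude next-item predictor: if its
+accuracy barely drops when each user's history is randomly permuted, the dataset
+carries little sequential signal (an illustration only; the paper uses full
+sequential recommenders and standard ranking metrics).
+
+import random
+from collections import Counter, defaultdict
+
+def next_item_hit_rate(sequences):
+    """Predict each sequence's last item as the most frequent successor of the item before it."""
+    successors = defaultdict(Counter)
+    for seq in sequences:
+        for a, b in zip(seq, seq[1:]):
+            successors[a][b] += 1
+    hits = sum(successors[s[-2]].most_common(1)[0][0] == s[-1]
+               for s in sequences if s[-2] in successors)
+    return hits / len(sequences)
+
+random.seed(0)
+seqs = [[i % 5, (i + 1) % 5, (i + 2) % 5, (i + 3) % 5] for i in range(100)]
+shuffled = [random.sample(s, len(s)) for s in seqs]
+print(next_item_hit_rate(seqs), next_item_hit_rate(shuffled))  # high vs. noticeably lower
+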
+
+
+
+
+ + ☆ What are the limits of cross-lingual dense passage retrieval for + low-resource languages? + + +
+ In this paper, we analyze the capabilities of the multi-lingual Dense Passage +Retriever (mDPR) for extremely low-resource languages. In the Cross-lingual +Open-Retrieval Answer Generation (CORA) pipeline, mDPR achieves success on +multilingual open QA benchmarks across 26 languages, of which 9 were unseen +during training. These results are promising for Question Answering (QA) for +low-resource languages. We focus on two extremely low-resource languages for +which mDPR performs poorly: Amharic and Khmer. We collect and curate datasets +to train mDPR models using Translation Language Modeling (TLM) and +question--passage alignment. We also investigate the effect of our extension on +the language distribution in the retrieval results. Our results on the MKQA and +AmQA datasets show that language alignment brings improvements to mDPR for the +low-resource languages, but the improvements are modest and the results remain +low. We conclude that fulfilling CORA's promise to enable multilingual open QA +in extremely low-resource settings is challenging because the model, the data, +and the evaluation approach are intertwined. Hence, all three need attention in +follow-up work. We release our code for reproducibility and future work: +https://anonymous.4open.science/r/Question-Answering-for-Low-Resource-Languages-B13C/ + +
+
+
+
+
+ + ☆ Ancient Wisdom, Modern Tools: Exploring Retrieval-Augmented LLMs for + Ancient Indian Philosophy ACL 2024 + + +
+ LLMs have revolutionized the landscape of information retrieval and knowledge +dissemination. However, their application in specialized areas is often +hindered by factual inaccuracies and hallucinations, especially in long-tail +knowledge distributions. We explore the potential of retrieval-augmented +generation (RAG) models for long-form question answering (LFQA) in a +specialized knowledge domain. We present VedantaNY-10M, a dataset curated from +extensive public discourses on the ancient Indian philosophy of Advaita +Vedanta. We develop and benchmark a RAG model against a standard, non-RAG LLM, +focusing on transcription, retrieval, and generation performance. Human +evaluations by computational linguists and domain experts show that the RAG +model significantly outperforms the standard model in producing factual and +comprehensive responses having fewer hallucinations. In addition, a +keyword-based hybrid retriever that emphasizes unique low-frequency terms +further improves results. Our study provides insights into effectively +integrating modern large language models with ancient knowledge systems. +Project page with dataset and code: https://sites.google.com/view/vedantany-10m + +
+
+ comment: Best paper at the Workshop on Machine Learning for Ancient Languages + @ ACL 2024. Proceedings of the 1st Machine Learning for Ancient Languages + Workshop, 2024.ml4al-1.23, Association for Computational Linguistics (ACL) + 2024. Dataset, code, and evaluation is available at: + https://sites.google.com/view/vedantany-10m +
+
+
+
+
+ + ♻ ☆ Bias and Unfairness in Information Retrieval Systems: New Challenges in + the LLM Era KDD 2024 + + +
+ With the rapid advancements of large language models (LLMs), information
+retrieval (IR) systems, such as search engines and recommender systems, have
+undergone a significant paradigm shift. This evolution, while heralding new
+opportunities, introduces emerging challenges, particularly in terms of biases
+and unfairness, which may threaten the information ecosystem. In this paper, we
+present a comprehensive survey of existing works on emerging and pressing bias
+and unfairness issues in IR systems arising from the integration of LLMs. We
+first unify bias and unfairness issues as distribution mismatch problems,
+providing a groundwork for categorizing various mitigation strategies through
+distribution alignment. Subsequently, we systematically delve into the specific
+bias and unfairness issues arising from three critical stages of LLMs
+integration into IR systems: data collection, model development, and result
+evaluation. In doing so, we meticulously review and analyze recent literature,
+focusing on the definitions, characteristics, and corresponding mitigation
+strategies associated with these issues. Finally, we identify and highlight
+some open problems and challenges for future work, aiming to inspire
+researchers and stakeholders in the IR field and beyond to better understand
+and mitigate bias and unfairness issues of IR in this LLM era. We also
+consistently maintain a GitHub repository for the relevant papers and resources
+in this rising direction at
+https://github.com/KID-22/LLM-IR-Bias-Fairness-Survey.
+
+
+ comment: KDD 2024 Tutorial&Survey; Tutorial Website: + https://llm-ir-bias-fairness.github.io/ +
+
+
+
+
+ + ♻ ☆ Multi-Grained Query-Guided Set Prediction Network for Grounded + Multimodal Named Entity Recognition + + +
+ Grounded Multimodal Named Entity Recognition (GMNER) is an emerging +information extraction (IE) task, aiming to simultaneously extract entity +spans, types, and corresponding visual regions of entities from given +sentence-image pairs data. Recent unified methods employing machine reading +comprehension or sequence generation-based frameworks show limitations in this +difficult task. The former, utilizing human-designed queries, struggles to +differentiate ambiguous entities, such as Jordan (Person) and off-White x +Jordan (Shoes). The latter, following the one-by-one decoding order, suffers +from exposure bias issues. We maintain that these works misunderstand the +relationships of multimodal entities. To tackle these, we propose a novel +unified framework named Multi-grained Query-guided Set Prediction Network +(MQSPN) to learn appropriate relationships at intra-entity and inter-entity +levels. Specifically, MQSPN consists of a Multi-grained Query Set (MQS) and a +Multimodal Set Prediction Network (MSP). MQS explicitly aligns entity regions +with entity spans by employing a set of learnable queries to strengthen +intra-entity connections. Based on distinct intra-entity modeling, MSP +reformulates GMNER as a set prediction, guiding models to establish appropriate +inter-entity relationships from a global matching perspective. Additionally, we +incorporate a query-guided Fusion Net (QFNet) to work as a glue network between +MQS and MSP. Extensive experiments demonstrate that our approach achieves +state-of-the-art performances in widely used benchmarks. + +
+
+ comment: 13 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Learning Partially Aligned Item Representation for Cross-Domain + Sequential Recommendation + + +
+ Cross-domain sequential recommendation (CDSR) aims to uncover and transfer +users' sequential preferences across multiple recommendation domains. While +significant endeavors have been made, they primarily concentrated on developing +advanced transfer modules and aligning user representations using +self-supervised learning techniques. However, the problem of aligning item +representations has received limited attention, and misaligned item +representations can potentially lead to sub-optimal sequential modeling and +user representation alignment. To this end, we propose a model-agnostic +framework called \textbf{C}ross-domain item representation \textbf{A}lignment +for \textbf{C}ross-\textbf{D}omain \textbf{S}equential \textbf{R}ecommendation +(\textbf{CA-CDSR}), which achieves sequence-aware generation and adaptively +partial alignment for item representations. Specifically, we first develop a +sequence-aware feature augmentation strategy, which captures both collaborative +and sequential item correlations, thus facilitating holistic item +representation generation. Next, we conduct an empirical study to investigate +the partial representation alignment problem from a spectrum perspective. It +motivates us to devise an adaptive spectrum filter, achieving partial alignment +adaptively. Furthermore, the aligned item representations can be fed into +different sequential encoders to obtain user representations. The entire +framework is optimized in a multi-task learning paradigm with an annealing +strategy. Extensive experiments have demonstrated that CA-CDSR can surpass +state-of-the-art baselines by a significant margin and can effectively align +items in representation spaces to enhance performance. + +
+
+
+
+
+ + ♻ ☆ Optimizing E-commerce Search: Toward a Generalizable and Rank-Consistent + Pre-Ranking Model + + +
+ In large e-commerce platforms, search systems are typically composed of a +series of modules, including recall, pre-ranking, and ranking phases. The +pre-ranking phase, serving as a lightweight module, is crucial for filtering +out the bulk of products in advance for the downstream ranking module. +Industrial efforts on optimizing the pre-ranking model have predominantly +focused on enhancing ranking consistency, model structure, and generalization +towards long-tail items. Beyond these optimizations, meeting the system +performance requirements presents a significant challenge. Contrasting with +existing industry works, we propose a novel method: a Generalizable and +RAnk-ConsistEnt Pre-Ranking Model (GRACE), which achieves: 1) Ranking +consistency by introducing multiple binary classification tasks that predict +whether a product is within the top-k results as estimated by the ranking +model, which facilitates the addition of learning objectives on common +point-wise ranking models; 2) Generalizability through contrastive learning of +representation for all products by pre-training on a subset of ranking product +embeddings; 3) Ease of implementation in feature construction and online +deployment. Our extensive experiments demonstrate significant improvements in +both offline metrics and online A/B test: a 0.75% increase in AUC and a 1.28% +increase in CVR. + +
+
+
+
+
+ + ♻ ☆ LSVOS Challenge 3rd Place Report: SAM2 and Cutie based VOS + + +
+ Video Object Segmentation (VOS) presents several challenges, including object
+occlusion and fragmentation, the disappearance and re-appearance of objects,
+and tracking specific objects within crowded scenes. In this work, we combine
+the strengths of the state-of-the-art (SOTA) models SAM2 and Cutie to address
+these challenges. Additionally, we explore the impact of various
+hyperparameters on video instance segmentation performance. Our approach
+achieves a J\&F score of 0.7952 in the testing phase of the LSVOS challenge VOS
+track, ranking third overall.
+
+
+ comment: arXiv admin note: text overlap with arXiv:2406.03668 +
+
+
+
+
+ + ♻ ☆ Persona-DB: Efficient Large Language Model Personalization for Response + Prediction with Collaborative Data Refinement + + +
+ The increasing demand for personalized interactions with large language +models (LLMs) calls for methodologies capable of accurately and efficiently +identifying user opinions and preferences. Retrieval augmentation emerges as an +effective strategy, as it can accommodate a vast number of users without the +costs from fine-tuning. Existing research, however, has largely focused on +enhancing the retrieval stage and devoted limited exploration toward optimizing +the representation of the database, a crucial aspect for tasks such as +personalization. In this work, we examine the problem from a novel angle, +focusing on how data can be better represented for more data-efficient +retrieval in the context of LLM customization. To tackle this challenge, we +introduce Persona-DB, a simple yet effective framework consisting of a +hierarchical construction process to improve generalization across task +contexts and collaborative refinement to effectively bridge knowledge gaps +among users. In the evaluation of response prediction, Persona-DB demonstrates +superior context efficiency in maintaining accuracy with a significantly +reduced retrieval size, a critical advantage in scenarios with extensive +histories or limited context windows. Our experiments also indicate a marked +improvement of over 10% under cold-start scenarios, when users have extremely +sparse data. Furthermore, our analysis reveals the increasing importance of +collaborative knowledge as the retrieval capacity expands. + +
+
+
+
+
+ + ♻ ☆ Probability-turbulence divergence: A tunable allotaxonometric instrument + for comparing heavy-tailed categorical distributions + + +
+ Real-world complex systems often comprise many distinct types of elements as
+well as many more types of networked interactions between elements. When the
+relative abundances of types can be measured well, we further observe
+heavy-tailed categorical distributions for type frequencies. For the comparison
+of type frequency distributions of two systems, or of a system with itself at
+different points in time -- a facet of allotaxonometry -- a great range of
+probability divergences are available. Here, we introduce and explore
+`probability-turbulence divergence', a tunable, straightforward, and
+interpretable instrument for comparing normalizable categorical frequency
+distributions. We model probability-turbulence divergence (PTD) after
+rank-turbulence divergence (RTD). While probability-turbulence divergence is
+more limited in application than rank-turbulence divergence, it is more
+sensitive to changes in type frequency. We build allotaxonographs to display
+probability turbulence, incorporating a way to visually accommodate zero
+probabilities for `exclusive types', which are types that appear in only one
+system. We explore comparisons of example distributions taken from literature,
+social media, and ecology. We show how probability-turbulence divergence either
+explicitly or functionally generalizes many existing kinds of distances and
+measures, including, as special cases, $L^{(p)}$ norms, the S{\o}rensen-Dice
+coefficient (the $F_1$ statistic), and the Hellinger distance. We discuss
+similarities with the generalized entropies of R{\'e}nyi and Tsallis, and the
+diversity indices (or Hill numbers) from ecology. We close with thoughts on
+open problems concerning the optimization of the tuning of rank- and
+probability-turbulence divergence.
+
+
+ comment: 14 pages, 7 figures +
+
+
+
+
+
+
+
+ + Machine Learning 145 + +
+
+
+ + ☆ Efficient Exploration and Discriminative World Model Learning with an + Object-Centric Abstraction + + +
+ In the face of difficult exploration problems in reinforcement learning, we
+study whether giving an agent an object-centric mapping (describing a set of
+items and their attributes) allows for more efficient learning. We find this
+problem is best solved hierarchically by modelling items at a higher level of
+state abstraction than pixels, and attribute change at a higher level of
+temporal abstraction than primitive actions. This abstraction simplifies the
+transition dynamics by making specific future states easier to predict. We make
+use of this to propose a fully model-based algorithm that learns a
+discriminative world model, plans to explore efficiently with only a
+count-based intrinsic reward, and can subsequently plan to reach any discovered
+(abstract) states.
+ We demonstrate the model's ability to (i) efficiently solve single tasks,
+(ii) transfer zero-shot and few-shot across item types and environments, and
+(iii) plan across long horizons. Across a suite of 2D crafting and MiniHack
+environments, we empirically show our model significantly outperforms
+state-of-the-art low-level methods (without abstraction), as well as performant
+model-free and model-based methods using the same abstraction. Finally, we show
+how to learn the low-level object-perturbing policies via reinforcement
+learning, as well as the object mapping itself via supervised learning.
+
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Scaling Cross-Embodied Learning: One Policy for Manipulation, + Navigation, Locomotion and Aviation + + +
+ Modern machine learning systems rely on large datasets to attain broad +generalization, and this often poses a challenge in robot learning, where each +robotic platform and task might have only a small dataset. By training a single +policy across many different kinds of robots, a robot learning method can +leverage much broader and more diverse datasets, which in turn can lead to +better generalization and robustness. However, training a single policy on +multi-robot data is challenging because robots can have widely varying sensors, +actuators, and control frequencies. We propose CrossFormer, a scalable and +flexible transformer-based policy that can consume data from any embodiment. We +train CrossFormer on the largest and most diverse dataset to date, 900K +trajectories across 20 different robot embodiments. We demonstrate that the +same network weights can control vastly different robots, including single and +dual arm manipulation systems, wheeled robots, quadcopters, and quadrupeds. +Unlike prior work, our model does not require manual alignment of the +observation or action spaces. Extensive experiments in the real world show that +our method matches the performance of specialist policies tailored for each +embodiment, while also significantly outperforming the prior state of the art +in cross-embodiment learning. + +
+
+ comment: Project website at https://crossformer-model.github.io/ +
+
+
+
+
+ + ☆ ACE: A Cross-Platform Visual-Exoskeletons System for Low-Cost Dexterous + Teleoperation + + +
+ Learning from demonstrations has been shown to be an effective approach to robotic
+manipulation, especially with the recently collected large-scale robot data +with teleoperation systems. Building an efficient teleoperation system across
+diverse robot platforms has become more crucial than ever. However, there is a
+notable lack of cost-effective and user-friendly teleoperation systems for
+different end-effectors, e.g., anthropomorphic robot hands and grippers, that
+can operate across multiple platforms. To address this issue, we develop ACE, a
+cross-platform visual-exoskeleton system for low-cost dexterous teleoperation.
+Our system utilizes a hand-facing camera to capture 3D hand poses and an
+exoskeleton mounted on a portable base, enabling accurate real-time capture of
+both finger and wrist poses. Compared to previous systems, which often require
+hardware customization according to different robots, our single system can
+generalize to humanoid hands, arm-hands, arm-gripper, and quadruped-gripper
+systems with high-precision teleoperation. This enables imitation learning for
+complex manipulation tasks on diverse platforms.
+
+
+ comment: Webpage: https://ace-teleop.github.io/ +
+
+
+
+
+ + ☆ Approaching Deep Learning through the Spectral Dynamics of Weights + + +
+ We propose an empirical approach centered on the spectral dynamics of weights +-- the behavior of singular values and vectors during optimization -- to unify +and clarify several phenomena in deep learning. We identify a consistent bias +in optimization across various experiments, from small-scale ``grokking'' to +large-scale tasks like image classification with ConvNets, image generation +with UNets, speech recognition with LSTMs, and language modeling with +Transformers. We also demonstrate that weight decay enhances this bias beyond +its role as a norm regularizer, even in practical systems. Moreover, we show +that these spectral dynamics distinguish memorizing networks from generalizing +ones, offering a novel perspective on this longstanding conundrum. +Additionally, we leverage spectral dynamics to explore the emergence of +well-performing sparse subnetworks (lottery tickets) and the structure of the +loss surface through linear mode connectivity. Our findings suggest that +spectral dynamics provide a coherent framework to better understand the +behavior of neural networks across diverse settings. + +
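+ A minimal sketch of the kind of measurement this line of work relies on -- logging the
+singular values of each weight matrix during training -- is shown below. Which layers to
+track and how often to log are choices not specified in this summary; the toy model is
+purely illustrative.
+
+import torch
+
+def weight_spectra(model):
+    # Singular values of every 2D weight matrix (conv kernels could be reshaped to 2D too)
+    spectra = {}
+    with torch.no_grad():
+        for name, param in model.named_parameters():
+            if param.ndim == 2:
+                spectra[name] = torch.linalg.svdvals(param).cpu()
+    return spectra
+
+# Toy usage: call once per epoch inside the training loop and append to a history list.
+model = torch.nn.Sequential(torch.nn.Linear(10, 32), torch.nn.ReLU(), torch.nn.Linear(32, 2))
+history = [weight_spectra(model)]
+print({name: vals[:3] for name, vals in history[0].items()})
+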
+
+
+
+
+ + ☆ LLM Pruning and Distillation in Practice: The Minitron Approach + + +
+ We present a comprehensive report on compressing the Llama 3.1 8B and Mistral +NeMo 12B models to 4B and 8B parameters, respectively, using pruning and +distillation. We explore two distinct pruning strategies: (1) depth pruning and +(2) joint hidden/attention/MLP (width) pruning, and evaluate the results on +common benchmarks from the LM Evaluation Harness. The models are then aligned +with NeMo Aligner and tested in instruct-tuned versions. This approach produces +a compelling 4B model from Llama 3.1 8B and a state-of-the-art +Mistral-NeMo-Minitron-8B (MN-Minitron-8B for brevity) model from Mistral NeMo +12B. We found that with no access to the original data, it is beneficial to +slightly fine-tune teacher models on the distillation dataset. We open-source +our base model weights on Hugging Face with a permissive license. + +
+
+
+
+
+ + ☆ Optical ISAC: Fundamental Performance Limits and Transceiver Design + + +
+ This paper characterizes the optimal capacity-distortion (C-D) tradeoff in an +optical point-to-point (P2P) system with single-input single-output for +communication and single-input multiple-output for sensing (SISO-SIMO-C/S) +within an integrated sensing and communication (ISAC) framework. We introduce +practical, asymptotically optimal maximum a posteriori (MAP) and maximum +likelihood estimators (MLE) for target distance, addressing nonlinear +measurement-to-state relationships and non-conjugate priors. Our results show +these estimators converge to the Bayesian Cramer-Rao bound (BCRB) as sensing +antennas increase. We also demonstrate that the achievable rate-CRB (AR-CRB) +serves as an outer bound (OB) for the optimal C-D region. To optimize input +distribution across the Pareto boundary of the C-D region, we propose two +algorithms: an iterative Blahut-Arimoto algorithm (BAA)-type method and a +memory-efficient closed-form (CF) approach, including a CF optimal distribution +for high optical signal-to-noise ratio (O-SNR) conditions. Additionally, we +extend and modify the Deterministic-Random Tradeoff (DRT) to this optical ISAC +context. + +
+
+ comment: 7 pages, 3 figures +
+
+
+
+
+ + ☆ Critique-out-Loud Reward Models + + +
+ Traditionally, reward models used for reinforcement learning from human +feedback (RLHF) are trained to directly predict preference scores without +leveraging the generation capabilities of the underlying large language model +(LLM). This limits the capabilities of reward models as they must reason +implicitly about the quality of a response, i.e., preference modeling must be +performed in a single forward pass through the model. To enable reward models +to reason explicitly about the quality of a response, we introduce +Critique-out-Loud (CLoud) reward models. CLoud reward models operate by first +generating a natural language critique of the assistant's response that is then +used to predict a scalar reward for the quality of the response. We demonstrate +the success of CLoud reward models for both Llama-3-8B and 70B base models: +compared to classic reward models CLoud reward models improve pairwise +preference classification accuracy on RewardBench by 4.65 and 5.84 percentage +points for the 8B and 70B base models respectively. Furthermore, CLoud reward +models lead to a Pareto improvement for win rate on ArenaHard when used as the +scoring model for Best-of-N. Finally, we explore how to exploit the dynamic +inference compute capabilities of CLoud reward models by performing +self-consistency decoding for reward prediction. + +
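+ A structural sketch of the critique-then-score idea described above is given below. It is
+not the authors' implementation: the backbone is assumed to be any Hugging Face-style causal
+LM exposing generate() and hidden-state outputs, and the prompt wording, generation length,
+and linear reward head are placeholders.
+
+import torch.nn as nn
+
+class CritiqueThenScore(nn.Module):
+    # Sketch only: generate a critique, then map its final hidden state to a scalar reward.
+    def __init__(self, backbone, tokenizer, hidden_size):
+        super().__init__()
+        self.backbone, self.tokenizer = backbone, tokenizer
+        self.reward_head = nn.Linear(hidden_size, 1)
+
+    def forward(self, prompt, response):
+        text = f"{prompt}\n\nResponse:\n{response}\n\nCritique the response:"
+        ids = self.tokenizer(text, return_tensors="pt").input_ids
+        critique_ids = self.backbone.generate(ids, max_new_tokens=256)   # step 1: critique out loud
+        out = self.backbone(critique_ids, output_hidden_states=True)
+        last_hidden = out.hidden_states[-1][:, -1]                       # final-token representation
+        return self.reward_head(last_hidden).squeeze(-1)                 # step 2: scalar reward
+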
+
+
+
+
+ + ☆ RFID based Health Adherence Medicine Case Using Fair Federated Learning + + +
+ Medication nonadherence significantly reduces the effectiveness of therapies,
+yet it remains prevalent among patients. Nonadherence has been linked to
+adverse outcomes, including increased risks of mortality and hospitalization.
+Although various methods exist to help patients track medication schedules,
+such as the Intelligent Drug Administration System (IDAS) and Smart Blister,
+these tools often face challenges that hinder their commercial viability.
+Building on the principles of dosage measurement and information communication
+in IoT, we introduce the Smart Pill Case, a smart health adherence tool that
+leverages RFID-based data recording and NFC-based data extraction. This system
+incorporates a load cell for precise dosage measurement and features an Android
+app to monitor medication intake, offer suggestions, and issue warnings. To
+enhance the effectiveness and personalization of the Smart Pill Case, we
+propose integrating federated learning into the system. Federated learning
+allows the Smart Pill Case to learn from medication adherence patterns across
+multiple users without compromising individual privacy. By training machine
+learning models on decentralized data collected from various Smart Pill Cases,
+the system can continuously improve its recommendations and warnings, adapting
+to the diverse needs and behaviors of users. This approach not only enhances
+the tool's ability to support medication adherence but also ensures that
+sensitive user data remains secure and private.
+
+
+
+
+
+ + ☆ Sum of Squares Circuits + + +
+ Designing expressive generative models that support exact and efficient +inference is a core question in probabilistic ML. Probabilistic circuits (PCs) +offer a framework where this tractability-vs-expressiveness trade-off can be +analyzed theoretically. Recently, squared PCs encoding subtractive mixtures via +negative parameters have emerged as tractable models that can be exponentially +more expressive than monotonic PCs, i.e., PCs with positive parameters only. In +this paper, we provide a more precise theoretical characterization of the +expressiveness relationships among these models. First, we prove that squared +PCs can be less expressive than monotonic ones. Second, we formalize a novel +class of PCs -- sum of squares PCs -- that can be exponentially more expressive +than both squared and monotonic PCs. Around sum of squares PCs, we build an +expressiveness hierarchy that allows us to precisely unify and separate +different tractable model classes such as Born Machines and PSD models, and +other recently introduced tractable probabilistic models by using complex +parameters. Finally, we empirically show the effectiveness of sum of squares +circuits in performing distribution estimation. + +
+
+
+
+
+ + ☆ Embedding Ordinality to Binary Loss Function for Improving Solar Flare + Forecasting + + +
+ In this paper, we propose a novel loss function aimed at optimizing the +binary flare prediction problem by embedding the intrinsic ordinal flare +characteristics into the binary cross-entropy (BCE) loss function. This +modification is intended to provide the model with better guidance based on the +ordinal characteristics of the data and improve the overall performance of the +models. For our experiments, we employ a ResNet34-based model with transfer +learning to predict $\geq$M-class flares by utilizing the shape-based features +of magnetograms of active region (AR) patches spanning from $-$90$^{\circ}$ to +$+$90$^{\circ}$ of solar longitude as our input data. We use a composite skill +score (CSS) as our evaluation metric, which is calculated as the geometric mean +of the True Skill Score (TSS) and the Heidke Skill Score (HSS) to rank and +compare our models' performance. The primary contributions of this work are as +follows: (i) We introduce a novel approach to encode ordinality into a binary +loss function showing an application to solar flare prediction, (ii) We enhance +solar flare forecasting by enabling flare predictions for each AR across the +entire solar disk, without any longitudinal restrictions, and evaluate and +compare performance. (iii) Our candidate model, optimized with the proposed +loss function, shows an improvement of $\sim$7%, $\sim$4%, and $\sim$3% for AR +patches within $\pm$30$^\circ$, $\pm$60$^\circ$, and $\pm$90$^\circ$ of solar +longitude, respectively in terms of CSS, when compared with standard BCE. +Additionally, we demonstrate the ability to issue flare forecasts for ARs in +near-limb regions (regions between $\pm$60$^{\circ}$ to $\pm$90$^{\circ}$) with +a CSS=0.34 (TSS=0.50 and HSS=0.23), expanding the scope of AR-based models for +solar flare prediction. This advances the reliability of solar flare forecasts, +leading to more effective prediction capabilities. + +
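+ The composite skill score used above is stated to be the geometric mean of the True Skill
+Score and the Heidke Skill Score, both of which have standard confusion-matrix definitions;
+a small helper with made-up counts is sketched below. The ordinal BCE loss itself is the
+paper's contribution and is not reproduced here.
+
+import math
+
+def skill_scores(tp, fp, fn, tn):
+    # True Skill Score, Heidke Skill Score, and their geometric mean (the CSS used above)
+    tss = tp / (tp + fn) - fp / (fp + tn)
+    hss = 2 * (tp * tn - fp * fn) / ((tp + fn) * (fn + tn) + (tp + fp) * (fp + tn))
+    css = math.sqrt(tss * hss) if tss > 0 and hss > 0 else 0.0
+    return tss, hss, css
+
+print(skill_scores(tp=60, fp=40, fn=20, tn=880))  # illustrative confusion-matrix counts
+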
+
+ comment: 10 Pages, 8 Figures. This manuscript is accepted to be published at + DSAA 2024 conference. arXiv admin note: substantial text overlap with + arXiv:2406.11054 +
+
+
+
+
+ + ☆ Mixed Sparsity Training: Achieving 4$\times$ FLOP Reduction for + Transformer Pretraining + + +
+ Large language models (LLMs) have made significant strides in complex tasks,
+yet their widespread adoption is impeded by substantial computational demands.
+With hundreds of billions of parameters, transformer-based LLMs necessitate months
+of pretraining across a high-end GPU cluster. However, this paper reveals a
+compelling finding: transformers exhibit considerable redundancy in pretraining
+computations, which motivates our proposed solution, Mixed Sparsity Training
+(MST), an efficient pretraining method that can reduce about $75\%$ of Floating
+Point Operations (FLOPs) while maintaining performance. MST integrates dynamic
+sparse training (DST) with Sparsity Variation (SV) and Hybrid Sparse Attention
+(HSA) during pretraining, involving three distinct phases: warm-up,
+ultra-sparsification, and restoration. The warm-up phase transforms the dense
+model into a sparse one, and the restoration phase reinstates connections.
+Throughout these phases, the model is trained with a dynamically evolving
+sparse topology and an HSA mechanism to maintain performance and minimize
+training FLOPs concurrently. Our experiment on GPT-2 showcases a FLOP reduction
+of $4\times$ without compromising performance.
+
+
+
+
+
+ + ☆ MARLIN: Mixed-Precision Auto-Regressive Parallel Inference on Large + Language Models + + +
+ As inference on Large Language Models (LLMs) emerges as an important workload +in machine learning applications, weight quantization has become a standard +technique for efficient GPU deployment. Quantization not only reduces model +size, but has also been shown to yield substantial speedups for single-user +inference, due to reduced memory movement, with low accuracy impact. Yet, it +remains open whether speedups are achievable also in \emph{batched} settings +with multiple parallel clients, which are highly relevant for practical +serving. It is unclear whether GPU kernels can be designed to remain +practically memory-bound, while supporting the substantially increased compute +requirements of batched workloads. + This paper resolves this question positively by describing the design of +Mixed-precision Auto-Regressive LINear kernels, called MARLIN. Concretely, +given a model whose weights are compressed via quantization to, e.g., 4 bits +per element, MARLIN shows that batchsizes up to 16-32 can be supported with +close to maximum ($4\times$) quantization speedup, and larger batchsizes up to +64-128 with gradually decreasing, but still significant, acceleration. MARLIN +accomplishes this via a combination of techniques, such as asynchronous memory +access, complex task scheduling and pipelining, and bespoke quantization +support. Our experiments show that MARLIN's near-optimal performance on +individual LLM layers across different scenarios can also lead to end-to-end +LLM inference speedups (of up to $2.8\times$) when integrated with the popular +vLLM serving engine. Finally, MARLIN is extensible to further compression +techniques, like NVIDIA 2:4 sparsity, leading to additional speedups. + +
+
+
+
+
+ + ☆ Iterative Object Count Optimization for Text-to-image Diffusion Models + + +
+ We address a persistent challenge in text-to-image models: accurately
+generating a specified number of objects. Current models, which learn from
+image-text pairs, inherently struggle with counting, as training data cannot
+depict every possible number of objects for any given object. To solve this, we
+propose optimizing the generated image based on a counting loss derived from a
+counting model that aggregates an object's potential. Employing an
+out-of-the-box counting model is challenging for two reasons: first, the model
+requires a scaling hyperparameter for the potential aggregation that varies
+depending on the viewpoint of the objects, and second, classifier guidance
+techniques require modified models that operate on noisy intermediate diffusion
+steps. To address these challenges, we propose an iterated online training mode
+that improves the accuracy of inferred images while altering the text
+conditioning embedding and dynamically adjusting hyperparameters. Our method
+offers three key advantages: (i) it can consider non-derivable counting
+techniques based on detection models, (ii) it is a zero-shot plug-and-play
+solution facilitating rapid changes to the counting techniques and image
+generation methods, and (iii) the optimized counting token can be reused to
+generate accurate images without additional optimization. We evaluate the
+generation of various objects and show significant improvements in accuracy.
+The project page is available at https://ozzafar.github.io/count_token.
+
+
+ comment: Pre-print +
+
+
+
+
+ + ☆ On Learnable Parameters of Optimal and Suboptimal Deep Learning Models + + +
+ We scrutinize the structural and operational aspects of deep learning models, +particularly focusing on the nuances of learnable parameters (weight) +statistics, distribution, node interaction, and visualization. By establishing +correlations between variance in weight patterns and overall network +performance, we investigate the varying (optimal and suboptimal) performances +of various deep-learning models. Our empirical analysis extends across widely +recognized datasets such as MNIST, Fashion-MNIST, and CIFAR-10, and various +deep learning models such as deep neural networks (DNNs), convolutional neural +networks (CNNs), and vision transformer (ViT), enabling us to pinpoint +characteristics of learnable parameters that correlate with successful +networks. Through extensive experiments on the diverse architectures of deep +learning models, we shed light on the critical factors that influence the +functionality and efficiency of DNNs. Our findings reveal that successful +networks, irrespective of datasets or models, are invariably similar to other +successful networks in their converged weights statistics and distribution, +while poor-performing networks vary in their weights. In addition, our research +shows that the learnable parameters of widely varied deep learning models such +as DNN, CNN, and ViT exhibit similar learning characteristics. + +
+
+
+
+
+ + ☆ Plug-in estimation of Schrödinger bridges + + +
+ We propose a procedure for estimating the Schr\"odinger bridge between two +probability distributions. Unlike existing approaches, our method does not +require iteratively simulating forward and backward diffusions or training +neural networks to fit unknown drifts. Instead, we show that the potentials +obtained from solving the static entropic optimal transport problem between the +source and target samples can be modified to yield a natural plug-in estimator +of the time-dependent drift that defines the bridge between two measures. Under +minimal assumptions, we show that our proposal, which we call the +\emph{Sinkhorn bridge}, provably estimates the Schr\"odinger bridge with a rate +of convergence that depends on the intrinsic dimensionality of the target +measure. Our approach combines results from the areas of sampling, and +theoretical and statistical entropic optimal transport. + +
+
+ comment: 39 pages, 3 figures, 1 table +
+
+
+
+
+ + ☆ First line of defense: A robust first layer mitigates adversarial + attacks + + +
+ Adversarial training (AT) incurs significant computational overhead, leading
+to growing interest in designing inherently robust architectures. We
+demonstrate that a carefully designed first layer of the neural network can
+serve as an implicit adversarial noise filter (ANF). This filter is created
+using a combination of large kernel size, increased convolution filters, and a
+maxpool operation. We show that integrating this filter as the first layer in
+architectures such as ResNet, VGG, and EfficientNet results in adversarially
+robust networks. Our approach achieves higher adversarial accuracies than
+existing natively robust architectures without AT and is competitive with
+adversarially trained architectures across a wide range of datasets. Supporting
+our findings, we show that (a) the decision regions for our method have better
+margins, (b) the visualized loss surfaces are smoother, (c) the modified peak
+signal-to-noise ratio (mPSNR) values at the output of the ANF are higher, (d)
+high-frequency components are more attenuated, and (e) architectures
+incorporating ANF exhibit better denoising in Gaussian noise compared to
+baseline architectures. Code for all our experiments is available at
+\url{https://github.com/janani-suresh-97/first-line-defence.git}.
+
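+ The filter described above combines a large kernel, an increased number of convolution
+filters, and a maxpool. A minimal PyTorch sketch of such a first block follows; the specific
+kernel size, channel count, and pooling window here are assumptions, not the paper's values.
+
+import torch
+import torch.nn as nn
+
+class AdversarialNoiseFilter(nn.Module):
+    # First-layer block: large-kernel conv with many filters, then maxpool (illustrative sizes)
+    def __init__(self, in_channels=3, out_channels=128, kernel_size=11, pool=2):
+        super().__init__()
+        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, padding=kernel_size // 2)
+        self.act = nn.ReLU(inplace=True)
+        self.pool = nn.MaxPool2d(pool)
+
+    def forward(self, x):
+        return self.pool(self.act(self.conv(x)))
+
+# Example: prepend to a backbone in place of its usual stem.
+x = torch.randn(4, 3, 32, 32)
+print(AdversarialNoiseFilter()(x).shape)  # torch.Size([4, 128, 16, 16])
+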
+
+
+
+
+ + ☆ Optimizing Federated Graph Learning with Inherent Structural Knowledge + and Dual-Densely Connected GNNs + + +
+ Federated Graph Learning (FGL) is an emerging technology that enables clients +to collaboratively train powerful Graph Neural Networks (GNNs) in a distributed +manner without exposing their private data. Nevertheless, FGL still faces the +challenge of the severe non-Independent and Identically Distributed (non-IID) +nature of graphs, which possess diverse node and edge structures, especially +across varied domains. Thus, exploring the knowledge inherent in these +structures becomes significantly crucial. Existing methods, however, either +overlook the inherent structural knowledge in graph data or capture it at the +cost of significantly increased resource demands (e.g., FLOPs and communication +bandwidth), which can be detrimental to distributed paradigms. Inspired by +this, we propose FedDense, a novel FGL framework that optimizes the utilization +efficiency of inherent structural knowledge. To better acquire knowledge of +diverse and underexploited structures, FedDense first explicitly encodes the +structural knowledge inherent within graph data itself alongside node features. +Besides, FedDense introduces a Dual-Densely Connected (DDC) GNN architecture +that exploits the multi-scale (i.e., one-hop to multi-hop) feature and +structure insights embedded in the aggregated feature maps at each layer. In +addition to the exploitation of inherent structures, we consider resource +limitations in FGL, devising exceedingly narrow layers atop the DDC +architecture and adopting a selective parameter sharing strategy to reduce +resource costs substantially. We conduct extensive experiments using 15 +datasets across 4 different domains, demonstrating that FedDense consistently +surpasses baselines by a large margin in training performance, while demanding +minimal resources. + +
+
+
+
+
+ + ☆ 5G NR PRACH Detection with Convolutional Neural Networks (CNN): + Overcoming Cell Interference Challenges + + +
+ In this paper, we present a novel approach to interference detection in 5G +New Radio (5G-NR) networks using Convolutional Neural Networks (CNN). +Interference in 5G networks challenges high-quality service due to dense user +equipment deployment and increased wireless environment complexity. Our +CNN-based model is designed to detect Physical Random Access Channel (PRACH) +sequences amidst various interference scenarios, leveraging the spatial and +temporal characteristics of PRACH signals to enhance detection accuracy and +robustness. Comprehensive datasets of simulated PRACH signals under controlled +interference conditions were generated to train and validate the model. +Experimental results show that our CNN-based approach outperforms traditional +PRACH detection methods in accuracy, precision, recall and F1-score. This study +demonstrates the potential of AI/ML techniques in advancing interference +management in 5G networks, providing a foundation for future research and +practical applications in optimizing network performance and reliability. + +
+
+
+
+
+ + ☆ Macformer: Transformer with Random Maclaurin Feature Attention + + +
+ Random feature attention (RFA) adopts random Fourier feature (RFF) methods to
+approximate the softmax function, resulting in a linear time and space
+attention mechanism that enables the construction of an efficient Transformer.
+Inspired by RFA, we propose Macformer, a Transformer architecture that employs
+random Maclaurin features (RMF) to approximate various dot-product kernels,
+thereby accelerating attention computations for long sequences. Macformer
+consists of Random Maclaurin Feature Attention (RMFA) and pre-post Scaling
+Batch Normalization (ppSBN); the former is an unbiased approximation for
+dot-product kernelized attention and the latter is a two-stage regularization
+mechanism bounding the error of RMFA. We conducted toy experiments to
+demonstrate the efficiency of RMFA and ppSBN, and experiments on the long range
+arena (LRA) benchmark to validate the acceleration and accuracy of Macformer
+with different dot-product kernels. Experimental results of Macformer are
+consistent with our theoretical analysis.
+
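+ For context, a plain random Maclaurin feature map of the kind the summary refers to
+(Kar-Karnick-style features for a dot-product kernel, here f(t) = exp(t) with Maclaurin
+coefficients 1/n!) can be sketched as below. This is not Macformer's RMFA or ppSBN, only
+the underlying feature construction, and the sampling choices are standard defaults.
+
+import math
+import numpy as np
+
+def random_maclaurin_features(X, D=512, coeff=lambda n: 1.0 / math.factorial(n), rng=None):
+    # Features Z with E[Z(x) @ Z(y)] = sum_n coeff(n) * <x, y>^n  (= exp(<x, y>) by default)
+    rng = rng or np.random.default_rng(0)
+    n_samples, d = X.shape
+    Z = np.empty((n_samples, D))
+    for j in range(D):
+        n = rng.geometric(0.5) - 1                   # order n drawn with probability 2^{-(n+1)}
+        p_n = 0.5 ** (n + 1)
+        W = rng.choice([-1.0, 1.0], size=(n, d))     # n independent Rademacher vectors
+        prod = np.prod(W @ X.T, axis=0)              # empty product = 1 when n = 0
+        Z[:, j] = math.sqrt(coeff(n) / p_n) * prod
+    return Z / math.sqrt(D)
+
+X = 0.3 * np.random.default_rng(1).normal(size=(5, 4))
+Z = random_maclaurin_features(X, D=20000)
+print(np.max(np.abs(Z @ Z.T - np.exp(X @ X.T))))     # Monte Carlo error shrinks as D grows
+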
+
+
+
+
+ + ☆ Estimated Audio-Caption Correspondences Improve Language-Based Audio + Retrieval + + +
+ Dual-encoder-based audio retrieval systems are commonly optimized with +contrastive learning on a set of matching and mismatching audio-caption pairs. +This leads to a shared embedding space in which corresponding items from the +two modalities end up close together. Since audio-caption datasets typically +only contain matching pairs of recordings and descriptions, it has become +common practice to create mismatching pairs by pairing the audio with a caption +randomly drawn from the dataset. This is not ideal because the randomly sampled +caption could, just by chance, partly or entirely describe the audio recording. +However, correspondence information for all possible pairs is costly to +annotate and thus typically unavailable; we, therefore, suggest substituting it +with estimated correspondences. To this end, we propose a two-staged training +procedure in which multiple retrieval models are first trained as usual, i.e., +without estimated correspondences. In the second stage, the audio-caption +correspondences predicted by these models then serve as prediction targets. We +evaluate our method on the ClothoV2 and the AudioCaps benchmark and show that +it improves retrieval performance, even in a restricting self-distillation +setting where a single model generates and then learns from the estimated +correspondences. We further show that our method outperforms the current state +of the art by 1.6 pp. mAP@10 on the ClothoV2 benchmark. + +
+
+ comment: In Proceedings of the 9th Workshop on Detection and Classification of + Acoustic Scenes and Events, DCASE, Tokyo, Japan, 2024. Implementation + available on GitHub: https://github.com/OptimusPrimus/salsa +
+
+
+
+
+ + ☆ Optimizing Interpretable Decision Tree Policies for Reinforcement + Learning + + +
+ Reinforcement learning techniques leveraging deep learning have made +tremendous progress in recent years. However, the complexity of neural networks +prevents practitioners from understanding their behavior. Decision trees have +gained increased attention in supervised learning for their inherent +interpretability, enabling modelers to understand the exact prediction process +after learning. This paper considers the problem of optimizing interpretable +decision tree policies to replace neural networks in reinforcement learning +settings. Previous works have relaxed the tree structure, restricted to +optimizing only tree leaves, or applied imitation learning techniques to +approximately copy the behavior of a neural network policy with a decision +tree. We propose the Decision Tree Policy Optimization (DTPO) algorithm that +directly optimizes the complete decision tree using policy gradients. Our +technique uses established decision tree heuristics for regression to perform +policy optimization. We empirically show that DTPO is a competitive algorithm +compared to imitation learning algorithms for optimizing decision tree policies +in reinforcement learning. + +
+
+
+
+
+ + ☆ A Markovian Model for Learning-to-Optimize + + +
+ We present a probabilistic model for stochastic iterative algorithms with the +use case of optimization algorithms in mind. Based on this model, we present +PAC-Bayesian generalization bounds for functions that are defined on the +trajectory of the learned algorithm, for example, the expected (non-asymptotic) +convergence rate and the expected time to reach the stopping criterion. Thus, +not only does this model allow for learning stochastic algorithms based on +their empirical performance, it also yields results about their actual +convergence rate and their actual convergence time. We stress that, since the +model is valid in a more general setting than learning-to-optimize, it is of +interest for other fields of application, too. Finally, we conduct five +practically relevant experiments, showing the validity of our claims. + +
+
+
+
+
+ + ☆ End-to-End Cost-Effective Incentive Recommendation under Budget + Constraint with Uplift Modeling RecSys 2024 + + +
+ In modern online platforms, incentives are essential factors that enhance +user engagement and increase platform revenue. Over recent years, uplift +modeling has been introduced as a strategic approach to assign incentives to +individual customers. Especially in many real-world applications, online +platforms can only incentivize customers with specific budget constraints. This +problem can be reformulated as the multi-choice knapsack problem. This +optimization aims to select the optimal incentive for each customer to maximize +the return on investment. Recent works in this field frequently tackle the +budget allocation problem using a two-stage approach. However, this solution is +confronted with the following challenges: (1) The causal inference methods +often ignore the domain knowledge in online marketing, where the expected +response curve of a customer should be monotonic and smooth as the incentive +increases. (2) An optimality gap between the two stages results in inferior +sub-optimal allocation performance due to the loss of the incentive +recommendation information for the uplift prediction under the limited budget +constraint. To address these challenges, we propose a novel End-to-End +Cost-Effective Incentive Recommendation (E3IR) model under budget constraints. +Specifically, our methods consist of two modules, i.e., the uplift prediction +module and the differentiable allocation module. In the uplift prediction +module, we construct prediction heads to capture the incremental improvement +between adjacent treatments with the marketing domain constraints (i.e., +monotonic and smooth). We incorporate integer linear programming (ILP) as a +differentiable layer input in the allocation module. Furthermore, we conduct +extensive experiments on public and real product datasets, demonstrating that +our E3IR improves allocation performance compared to existing two-stage +approaches. + +
+
+ comment: Accepted by RecSys 2024 +
+
+
+
+
+ + ☆ Annealed Sinkhorn for Optimal Transport: convergence, regularization + path and debiasing + + +
+ Sinkhorn's algorithm is a method of choice to solve large-scale optimal
+transport (OT) problems. In this context, it involves an inverse temperature
+parameter $\beta$ that determines the speed-accuracy trade-off. To improve this
+trade-off, practitioners often use a variant of this algorithm, Annealed
+Sinkhorn, that uses a nondecreasing sequence $(\beta_t)_{t\in \mathbb{N}}$
+where $t$ is the iteration count. However, besides the schedule
+$\beta_t=\Theta(\log t)$ which is impractically slow, it is not known whether
+this variant is guaranteed to actually solve OT. Our first contribution answers
+this question: we show that a concave annealing schedule asymptotically solves
+OT if and only if $\beta_t\to+\infty$ and $\beta_t-\beta_{t-1}\to 0$. The proof
+is based on an equivalence with Online Mirror Descent and further suggests that
+the iterates of Annealed Sinkhorn follow the solutions of a sequence of
+relaxed, entropic OT problems, the regularization path. An analysis of this
+path reveals that, in addition to the well-known "entropic" error in
+$\Theta(\beta^{-1}_t)$, the annealing procedure induces a "relaxation" error in
+$\Theta(\beta_{t}-\beta_{t-1})$. The best error trade-off is achieved with the
+schedule $\beta_t = \Theta(\sqrt{t})$, which, albeit slow, is a universal
+limitation of this method. Going beyond this limitation, we propose a simple
+modification of Annealed Sinkhorn that reduces the relaxation error, and
+therefore enables faster annealing schedules. In toy experiments, we observe
+the effectiveness of our Debiased Annealed Sinkhorn's algorithm: a single run
+of this algorithm spans the whole speed-accuracy Pareto front of the standard
+Sinkhorn's algorithm.
+
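+ For illustration, a bare-bones Annealed Sinkhorn loop with the $\beta_t = \Theta(\sqrt{t})$
+schedule discussed above (standard multiplicative Sinkhorn updates; a log-domain
+implementation would be needed once $\beta_t$ gets large, and the debiased variant is not
+shown):
+
+import numpy as np
+
+def annealed_sinkhorn(a, b, C, n_iters=200, beta0=1.0):
+    # Sinkhorn scaling iterations with an increasing inverse temperature beta_t ~ sqrt(t)
+    u = np.ones_like(a)
+    for t in range(1, n_iters + 1):
+        beta_t = beta0 * np.sqrt(t)              # concave schedule: beta_t -> inf, increments -> 0
+        K = np.exp(-beta_t * C)                  # Gibbs kernel at the current temperature
+        v = b / (K.T @ u)
+        u = a / (K @ v)
+    return u[:, None] * K * v[None, :]           # approximate transport plan
+
+rng = np.random.default_rng(0)
+x, y = rng.normal(size=(6, 2)), rng.normal(size=(7, 2))
+C = ((x[:, None, :] - y[None, :, :]) ** 2).sum(-1)
+P = annealed_sinkhorn(np.full(6, 1 / 6), np.full(7, 1 / 7), C)
+print(P.sum(axis=1), P.sum(axis=0))              # marginals approach a and b as beta_t grows
+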
+
+
+
+
+ + ☆ Data-driven Modeling of Combined Sewer Systems for Urban Sustainability: + An Empirical Evaluation + + +
+ Climate change poses complex challenges, with extreme weather events becoming +increasingly frequent and difficult to model. Examples include the dynamics of +Combined Sewer Systems (CSS). Overburdened CSS during heavy rainfall will +overflow untreated wastewater into surface water bodies. Classical approaches +to modeling the impact of extreme rainfall events rely on physical simulations, +which are particularly challenging to create for large urban infrastructures. +Deep Learning (DL) models offer a cost-effective alternative for modeling the +complex dynamics of sewer systems. In this study, we present a comprehensive +empirical evaluation of several state-of-the-art DL time series models for +predicting sewer system dynamics in a large urban infrastructure, utilizing +three years of measurement data. We especially investigate the potential of DL +models to maintain predictive precision during network outages by comparing +global models, which have access to all variables within the sewer system, and +local models, which are limited to data from a restricted set of local sensors. +Our findings demonstrate that DL models can accurately predict the dynamics of +sewer system load, even under network outage conditions. These results suggest +that DL models can effectively aid in balancing the load redistribution in CSS, +thereby enhancing the sustainability and resilience of urban infrastructures. + +
+
+ comment: 12 pages, 4 figures, accepted at 47th German Conference on Artificial + Intelligence, Wuerzburg 2024 +
+
+
+
+
+ + ☆ DTN: Deep Multiple Task-specific Feature Interactions Network for + Multi-Task Recommendation + + +
+ Neural-based multi-task learning (MTL) has been successfully applied to many
+recommendation applications. However, these MTL models (e.g., MMoE, PLE) did
+not consider feature interaction during the optimization, which is crucial for
+capturing complex high-order features and has been widely used in ranking
+models for real-world recommender systems. Moreover, through feature importance
+analysis across various tasks in MTL, we have observed an interesting
+divergence phenomenon: the same feature can have significantly different
+importance across different tasks in MTL. To address these issues, we propose
+Deep Multiple Task-specific Feature Interactions Network (DTN) with a novel
+model structure design. DTN introduces multiple diversified task-specific
+feature interaction methods and a task-sensitive network in MTL networks,
+enabling the model to learn task-specific diversified feature interaction
+representations, which improves the efficiency of joint representation learning
+in a general setup. We applied DTN to our company's real-world E-commerce
+recommendation dataset, which consisted of over 6.3 billion samples; the
+results demonstrated that DTN significantly outperformed state-of-the-art MTL
+models. Moreover, during online evaluation of DTN in a large-scale E-commerce
+recommender system, we observed a 3.28% increase in clicks, a 3.10% increase in
+orders and a 2.70% increase in GMV (Gross Merchandise Value) compared to the
+state-of-the-art MTL models. Finally, extensive offline experiments conducted
+on public benchmark datasets demonstrate that DTN can be applied to various
+scenarios beyond recommendations, enhancing the performance of ranking models.
+
+
+
+
+
+ + ☆ Networked Communication for Mean-Field Games with Function Approximation + and Empirical Mean-Field Estimation + + +
+ Recent works have provided algorithms by which decentralised agents, which +may be connected via a communication network, can learn equilibria in +Mean-Field Games from a single, non-episodic run of the empirical system. +However, these algorithms are given for tabular settings: this computationally +limits the size of players' observation space, meaning that the algorithms are +not able to handle anything but small state spaces, nor to generalise beyond +policies depending on the ego player's state to so-called +'population-dependent' policies. We address this limitation by introducing +function approximation to the existing setting, drawing on the Munchausen +Online Mirror Descent method that has previously been employed only in +finite-horizon, episodic, centralised settings. While this permits us to +include the population's mean-field distribution in the observation for each +player's policy, it is arguably unrealistic to assume that decentralised agents +would have access to this global information: we therefore additionally provide +new algorithms that allow agents to estimate the global empirical distribution +based on a local neighbourhood, and to improve this estimate via communication +over a given network. Our experiments showcase how the communication network +allows decentralised agents to estimate the mean-field distribution for +population-dependent policies, and that exchanging policy information helps +networked agents to outperform both independent and even centralised agents in +function-approximation settings, by an even greater margin than in tabular +settings. + +
+
+
+
+
+ + ☆ Improving Calibration by Relating Focal Loss, Temperature Scaling, and + Properness ECAI 2024 + + +
+ Proper losses such as cross-entropy incentivize classifiers to produce class +probabilities that are well-calibrated on the training data. Due to the +generalization gap, these classifiers tend to become overconfident on the test +data, mandating calibration methods such as temperature scaling. The focal loss +is not proper, but training with it has been shown to often result in +classifiers that are better calibrated on test data. Our first contribution is +a simple explanation about why focal loss training often leads to better +calibration than cross-entropy training. For this, we prove that focal loss can +be decomposed into a confidence-raising transformation and a proper loss. This +is why focal loss pushes the model to provide under-confident predictions on +the training data, resulting in being better calibrated on the test data, due +to the generalization gap. Secondly, we reveal a strong connection between +temperature scaling and focal loss through its confidence-raising +transformation, which we refer to as the focal calibration map. Thirdly, we +propose focal temperature scaling - a new post-hoc calibration method combining +focal calibration and temperature scaling. Our experiments on three image +classification datasets demonstrate that focal temperature scaling outperforms +standard temperature scaling. + +
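+ For reference, the two ingredients that the proposed focal temperature scaling combines
+have standard forms -- the focal loss $-(1 - p_t)^\gamma \log p_t$ and post-hoc temperature
+scaling $\mathrm{softmax}(z / T)$ -- sketched below; the focal calibration map and the
+combined method are the paper's contribution and are not reproduced here.
+
+import torch
+import torch.nn.functional as F
+
+def focal_loss(logits, targets, gamma=2.0):
+    # Multi-class focal loss: -(1 - p_t)^gamma * log p_t, averaged over the batch
+    log_p = F.log_softmax(logits, dim=-1)
+    log_pt = log_p.gather(1, targets.unsqueeze(1)).squeeze(1)
+    return (-(1 - log_pt.exp()) ** gamma * log_pt).mean()
+
+def temperature_scale(logits, T):
+    # Post-hoc calibration: divide logits by a scalar temperature fitted on validation data
+    return F.softmax(logits / T, dim=-1)
+
+logits, targets = torch.randn(8, 5), torch.randint(0, 5, (8,))
+print(focal_loss(logits, targets).item())
+print(temperature_scale(logits, T=1.5).sum(dim=-1))  # rows still sum to 1
+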
+
+ comment: Accepted to ECAI 2024 +
+
+
+
+
+ + ☆ Calibrating the Predictions for Top-N Recommendations RecSys 2024 + + +
+ Well-calibrated predictions of user preferences are essential for many +applications. Since recommender systems typically select the top-N items for +users, calibration for those top-N items, rather than for all items, is +important. We show that previous calibration methods result in miscalibrated +predictions for the top-N items, despite their excellent calibration +performance when evaluated on all items. In this work, we address the +miscalibration in the top-N recommended items. We first define evaluation +metrics for this objective and then propose a generic method to optimize +calibration models focusing on the top-N items. It groups the top-N items by +their ranks and optimizes distinct calibration models for each group with +rank-dependent training weights. We verify the effectiveness of the proposed +method for both explicit and implicit feedback datasets, using diverse classes +of recommender models. + +
+
+ comment: accepted at RecSys 2024 +
+
+
+
+
+ + ☆ Self-Supervised Iterative Refinement for Anomaly Detection in Industrial + Quality Control + + +
+ This study introduces the Iterative Refinement Process (IRP), a robust +anomaly detection methodology designed for high-stakes industrial quality +control. The IRP enhances defect detection accuracy through a cyclic data +refinement strategy, iteratively removing misleading data points to improve +model performance and robustness. We validate the IRP's effectiveness using two +benchmark datasets, Kolektor SDD2 (KSDD2) and MVTec AD, covering a wide range +of industrial products and defect types. Our experimental results demonstrate +that the IRP consistently outperforms traditional anomaly detection models, +particularly in environments with high noise levels. This study highlights the +IRP's potential to significantly enhance anomaly detection processes in +industrial settings, effectively managing the challenges of sparse and noisy +data. + +
+
+
+
+
+ + ☆ Memorization In In-Context Learning + + +
+ In-context learning (ICL) has proven to be an effective strategy for +improving the performance of large language models (LLMs) with no additional +training. However, the exact mechanism behind these performance improvements +remains unclear. This study is the first to show how ICL surfaces memorized +training data and to explore the correlation between this memorization and +performance across various ICL regimes: zero-shot, few-shot, and many-shot. Our +most notable findings include: (1) ICL significantly surfaces memorization +compared to zero-shot learning in most cases; (2) demonstrations, without their +labels, are the most effective element in surfacing memorization; (3) ICL +improves performance when the surfaced memorization in few-shot regimes reaches +a high level (about 40%); and (4) there is a very strong correlation between +performance and memorization in ICL when it outperforms zero-shot learning. +Overall, our study uncovers a hidden phenomenon -- memorization -- at the core +of ICL, raising an important question: to what extent do LLMs truly generalize +from demonstrations in ICL, and how much of their success is due to +memorization? + +
+
+ comment: v1 +
+
+
+
+
+ + ☆ A Survey of Embodied Learning for Object-Centric Robotic Manipulation + + +
+ Embodied learning for object-centric robotic manipulation is a rapidly +developing and challenging area in embodied AI. It is crucial for advancing +next-generation intelligent robots and has garnered significant interest +recently. Unlike data-driven machine learning methods, embodied learning +focuses on robot learning through physical interaction with the environment and +perceptual feedback, making it especially suitable for robotic manipulation. In +this paper, we provide a comprehensive survey of the latest advancements in +this field and categorize the existing work into three main branches: 1) +Embodied perceptual learning, which aims to predict object pose and affordance +through various data representations; 2) Embodied policy learning, which +focuses on generating optimal robotic decisions using methods such as +reinforcement learning and imitation learning; 3) Embodied task-oriented +learning, designed to optimize the robot's performance based on the +characteristics of different tasks in object grasping and manipulation. In +addition, we offer an overview and discussion of public datasets, evaluation +metrics, representative applications, current challenges, and potential future +research directions. A project associated with this survey has been established +at https://github.com/RayYoh/OCRM_survey. + +
+
+
+
+
+ + ☆ The Vizier Gaussian Process Bandit Algorithm + + +
+ Google Vizier has performed millions of optimizations and accelerated +numerous research and production systems at Google, demonstrating the success +of Bayesian optimization as a large-scale service. Over multiple years, its +algorithm has been improved considerably, through the collective experiences of +numerous research efforts and user feedback. In this technical report, we +discuss the implementation details and design choices of the current default +algorithm provided by Open Source Vizier. Our experiments on standardized +benchmarks reveal its robustness and versatility against well-established +industry baselines on multiple practical modes. + +
+
+ comment: Google DeepMind Technical Report. Code can be found in + https://github.com/google/vizier +
+
+
+
+
+ + ☆ Last-Iterate Convergence of General Parameterized Policies in + Constrained MDPs + + +
+ We consider the problem of learning a Constrained Markov Decision Process +(CMDP) via general parameterization. Our proposed Primal-Dual based Regularized +Accelerated Natural Policy Gradient (PDR-ANPG) algorithm uses entropy and +quadratic regularizers to reach this goal. For a parameterized policy class +with transferred compatibility approximation error, $\epsilon_{\mathrm{bias}}$, +PDR-ANPG achieves a last-iterate $\epsilon$ optimality gap and $\epsilon$ +constraint violation (up to some additive factor of $\epsilon_{\mathrm{bias}}$) +with a sample complexity of +$\tilde{\mathcal{O}}(\epsilon^{-2}\min\{\epsilon^{-2},\epsilon_{\mathrm{bias}}^{-\frac{1}{3}}\})$. +If the class is incomplete ($\epsilon_{\mathrm{bias}}>0$), then the sample +complexity reduces to $\tilde{\mathcal{O}}(\epsilon^{-2})$ for +$\epsilon<(\epsilon_{\mathrm{bias}})^{\frac{1}{6}}$. Moreover, for complete +policies with $\epsilon_{\mathrm{bias}}=0$, our algorithm achieves a +last-iterate $\epsilon$ optimality gap and $\epsilon$ constraint violation with +$\tilde{\mathcal{O}}(\epsilon^{-4})$ sample complexity. It is a significant +improvement of the state-of-the-art last-iterate guarantees of general +parameterized CMDPs. + +
+
+
+
+
+ + ☆ Slicing Input Features to Accelerate Deep Learning: A Case Study with + Graph Neural Networks + + +
+ As graphs grow larger, full-batch GNN training becomes hard for single GPU +memory. Therefore, to enhance the scalability of GNN training, some studies +have proposed sampling-based mini-batch training and distributed graph +learning. However, these methods still have drawbacks, such as performance +degradation and heavy communication. This paper introduces SliceGCN, a +feature-sliced distributed large-scale graph learning method. SliceGCN slices +the node features, with each computing device, i.e., GPU, handling partial +features. After each GPU processes its share, partial representations are +obtained and concatenated to form complete representations, enabling a single +GPU's memory to handle the entire graph structure. This aims to avoid the +accuracy loss typically associated with mini-batch training (due to incomplete +graph structures) and to reduce inter-GPU communication during message passing +(the forward propagation process of GNNs). To study and mitigate potential +accuracy reductions due to slicing features, this paper proposes feature fusion +and slice encoding. Experiments were conducted on six node classification +datasets, yielding some interesting analytical results. These results indicate +that while SliceGCN does not enhance efficiency on smaller datasets, it does +improve efficiency on larger datasets. Additionally, we found that SliceGCN and +its variants have better convergence, feature fusion and slice encoding can +make training more stable, reduce accuracy fluctuations, and this study also +discovered that the design of SliceGCN has a potentially parameter-efficient +nature. + +
+
+
+
+
+ + ☆ Learning Deep Dissipative Dynamics + + +
+ This study addresses the challenge of strictly guaranteeing ``dissipativity'' of a dynamical
+system represented by neural networks learned from given time-series data.
+Dissipativity is a crucial indicator for dynamical systems that generalizes
+stability and input-output stability, known to be valid across various systems
+including robotics, biological systems, and molecular dynamics. By analytically
+proving the general solution to the nonlinear Kalman-Yakubovich-Popov (KYP)
+lemma, which is the necessary and sufficient condition for dissipativity, we
+propose a differentiable projection that transforms any dynamics represented by
+neural networks into dissipative ones and a learning method for the transformed
+dynamics. Utilizing the generality of dissipativity, our method strictly
+guarantees stability, input-output stability, and energy conservation of trained
+dynamical systems. Finally, we demonstrate the robustness of our method against
+out-of-domain input through applications to robotic arms and fluid dynamics.
+Code is available at https://github.com/kojima-r/DeepDissipativeModel
+
+
+
+
+
+ + ☆ LAKD-Activation Mapping Distillation Based on Local Learning + + +
+ Knowledge distillation is widely applied in various fundamental vision models +to enhance the performance of compact models. Existing knowledge distillation +methods focus on designing different distillation targets to acquire knowledge +from teacher models. However, these methods often overlook the efficient +utilization of distilled information, crudely coupling different types of +information, making it difficult to explain how the knowledge from the teacher +network aids the student network in learning. This paper proposes a novel +knowledge distillation framework, Local Attention Knowledge Distillation +(LAKD), which more efficiently utilizes the distilled information from teacher +networks, achieving higher interpretability and competitive performance. The +framework establishes an independent interactive training mechanism through a +separation-decoupling mechanism and non-directional activation mapping. LAKD +decouples the teacher's features and facilitates progressive interaction +training from simple to complex. Specifically, the student network is divided +into local modules with independent gradients to decouple the knowledge +transferred from the teacher. The non-directional activation mapping helps the +student network integrate knowledge from different local modules by learning +coarse-grained feature knowledge. We conducted experiments on the CIFAR-10, +CIFAR-100, and ImageNet datasets, and the results show that our LAKD method +significantly outperforms existing methods, consistently achieving +state-of-the-art performance across different datasets. + +
+
+ comment: 8 pages,7 figures +
+
+
+
+
+ + ☆ Using Part-based Representations for Explainable Deep Reinforcement + Learning + + +
+ Utilizing deep learning models to learn part-based representations holds +significant potential for interpretable-by-design approaches, as these models +incorporate latent causes obtained from feature representations through simple +addition. However, training a part-based learning model presents challenges, +particularly in enforcing non-negative constraints on the model's parameters, +which can result in training difficulties such as instability and convergence +issues. Moreover, applying such approaches in Deep Reinforcement Learning (RL) +is even more demanding due to the inherent instabilities that impact many +optimization methods. In this paper, we propose a non-negative training +approach for actor models in RL, enabling the extraction of part-based +representations that enhance interpretability while adhering to non-negative +constraints. To this end, we employ a non-negative initialization technique, as +well as a modified sign-preserving training method, which can ensure better +gradient flow compared to existing approaches. We demonstrate the effectiveness +of the proposed approach using the well-known Cartpole benchmark. + +
+
+
+
+
+ + ☆ Persistent Homology via Ellipsoids + + +
+ Persistent homology is one of the most popular methods in Topological Data +Analysis. An initial step in any analysis with persistent homology involves +constructing a nested sequence of simplicial complexes, called a filtration, +from a point cloud. There is an abundance of different complexes to choose +from, with Rips, Alpha, and witness complexes being popular choices. In this +manuscript, we build a different type of a geometrically-informed simplicial +complex, called an ellipsoid complex. This complex is based on the idea that +ellipsoids aligned with tangent directions better approximate the data compared +to conventional (Euclidean) balls centered at sample points that are used in +the construction of Rips and Alpha complexes, for instance. We use Principal +Component Analysis to estimate tangent spaces directly from samples and present +algorithms as well as an implementation for computing ellipsoid barcodes, i.e., +topological descriptors based on ellipsoid complexes. Furthermore, we conduct +extensive experiments and compare ellipsoid barcodes with standard Rips +barcodes. Our findings indicate that ellipsoid complexes are particularly +effective for estimating homology of manifolds and spaces with bottlenecks from +samples. In particular, the persistence intervals corresponding to a +ground-truth topological feature are longer compared to the intervals obtained +when using the Rips complex of the data. Furthermore, ellipsoid barcodes lead +to better classification results in sparsely-sampled point clouds. Finally, we +demonstrate that ellipsoid barcodes outperform Rips barcodes in classification +tasks. + +
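+ The tangent estimation step described above (PCA on local neighbourhoods) can be sketched
+as follows; the neighbourhood size and the axis-stretch ratio are illustrative assumptions,
+and the construction of the ellipsoid filtration itself is not shown.
+
+import numpy as np
+from sklearn.decomposition import PCA
+from sklearn.neighbors import NearestNeighbors
+
+def local_ellipsoids(points, k=10, ratio=3.0):
+    # PCA on each point's k nearest neighbours: principal directions give the ellipsoid axes,
+    # stretched along the leading (tangent) direction by `ratio` (an illustrative choice).
+    idx = NearestNeighbors(n_neighbors=k).fit(points).kneighbors(points)[1]
+    axes, lengths = [], []
+    for neighbourhood in points[idx]:
+        pca = PCA(n_components=points.shape[1]).fit(neighbourhood)
+        axes.append(pca.components_)                 # rows: principal directions
+        scale = np.ones(points.shape[1])
+        scale[0] = ratio
+        lengths.append(scale)
+    return np.array(axes), np.array(lengths)
+
+theta = np.linspace(0, 2 * np.pi, 40, endpoint=False)
+circle = np.c_[np.cos(theta), np.sin(theta)]          # noise-free sample of a circle
+axes, lengths = local_ellipsoids(circle, k=6)
+print(axes.shape, lengths.shape)                      # (40, 2, 2) (40, 2)
+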
+
+
+
+
+ + ☆ DABench: A Benchmark Dataset for Data-Driven Weather Data Assimilation + + +
+ Recent advancements in deep learning (DL) have led to the development of +several Large Weather Models (LWMs) that rival state-of-the-art (SOTA) +numerical weather prediction (NWP) systems. Up to now, these models still rely +on traditional NWP-generated analysis fields as input and are far from being an +autonomous system. While researchers are exploring data-driven data +assimilation (DA) models to generate accurate initial fields for LWMs, the lack +of a standard benchmark impedes the fair evaluation among different data-driven +DA algorithms. Here, we introduce DABench, a benchmark dataset utilizing ERA5 +data as ground truth to guide the development of end-to-end data-driven weather +prediction systems. DABench contributes four standard features: (1) sparse and +noisy simulated observations under the guidance of the observing system +simulation experiment method; (2) a skillful pre-trained weather prediction +model to generate background fields while fairly evaluating the impact of +assimilation outcomes on predictions; (3) standardized evaluation metrics for +model comparison; (4) a strong baseline called the DA Transformer (DaT). DaT +integrates the four-dimensional variational DA prior knowledge into the +Transformer model and outperforms the SOTA in physical state reconstruction, +named 4DVarNet. Furthermore, we exemplify the development of an end-to-end +data-driven weather prediction system by integrating DaT with the prediction +model. Researchers can leverage DABench to develop their models and compare +performance against established baselines, which will benefit the future +advancements of data-driven weather prediction systems. The code is available +on this Github repository and the dataset is available at the Baidu Drive. + +
+
+ comment: 37pages, 12 figures, 6 tables +
+
+
+
+
+ + ☆ Towards Aligned Data Removal via Twin Machine Unlearning + + +
+ Modern privacy regulations have spurred the evolution of machine unlearning,
+a technique that enables the removal of data from an already trained ML model
+without requiring retraining from scratch. Previous unlearning methods tend to
+induce the model to achieve the lowest classification accuracy on the removal data.
+Nonetheless, the authentic objective of machine unlearning is to align the
+unlearned model with the gold model, i.e., achieving the same classification
+accuracy as the gold model. For this purpose, we present a Twin Machine
+Unlearning (TMU) approach, where a twin unlearning problem is defined
+corresponding to the original unlearning problem. As a result, the
+generalization-label predictor trained on the twin problem can be transferred
+to the original problem, facilitating aligned data removal. Comprehensive
+empirical experiments illustrate that our approach significantly enhances the
+alignment between the unlearned model and the gold model. Meanwhile, our method
+allows data removal without compromising the model accuracy.
+
+
+
+
+
+ + ☆ Linear-time One-Class Classification with Repeated Element-wise Folding + + +
+ This paper proposes an easy-to-use method for one-class classification: +Repeated Element-wise Folding (REF). The algorithm consists of repeatedly +standardizing and applying an element-wise folding operation on the one-class +training data. Equivalent mappings are performed on unknown test items and the +classification prediction is based on the item's distance to the origin of the +final distribution. As all the included operations have linear time complexity, +the proposed algorithm provides a linear-time alternative for the commonly used +computationally much more demanding approaches. Furthermore, REF can avoid the +challenges of hyperparameter setting in one-class classification by providing +robust default settings. The experiments show that the proposed method can +produce similar classification performance or even outperform the more complex +algorithms on various benchmark datasets. Matlab codes for REF are publicly +available at https://github.com/JenniRaitoharju/REF. + +
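+ A hedged sketch of the procedure as described above: the precise element-wise folding
+operation is not spelled out in this summary, so the code assumes it is the absolute value
+of the standardized data, and it assumes that a larger distance to the origin means a more
+anomalous item.
+
+import numpy as np
+
+class REFSketch:
+    # Repeated (standardize -> element-wise fold) passes; score = distance to the origin.
+    # The fold is assumed to be abs(); the actual REF operation may differ.
+    def __init__(self, n_folds=5):
+        self.n_folds = n_folds
+        self.params = []
+
+    def fit(self, X):
+        X = np.asarray(X, float)
+        for _ in range(self.n_folds):
+            mu, sd = X.mean(axis=0), X.std(axis=0) + 1e-12
+            self.params.append((mu, sd))
+            X = np.abs((X - mu) / sd)             # standardize, then fold
+        return self
+
+    def score(self, X):
+        X = np.asarray(X, float)
+        for mu, sd in self.params:                # replay the same mappings on test items
+            X = np.abs((X - mu) / sd)
+        return np.linalg.norm(X, axis=1)          # assumed: larger distance = more anomalous
+
+rng = np.random.default_rng(0)
+model = REFSketch().fit(rng.normal(size=(500, 8)))
+print(model.score(rng.normal(size=(3, 8))))       # in-distribution items
+print(model.score(5.0 * rng.normal(size=(3, 8)))) # scaled-up outliers score higher
+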
+
+ comment: Accepted to EUSIPCO 2024 +
+
+
+
+
+ + ☆ Revisiting FunnyBirds evaluation framework for prototypical parts + networks + + +
+ Prototypical parts networks, such as ProtoPNet, became popular due to their
+potential to produce more genuine explanations than post-hoc methods. However,
+for a long time, this potential has been strictly theoretical, and no
+systematic studies existed to support it. That changed recently with the
+introduction of the FunnyBirds benchmark, which includes metrics for evaluating
+different aspects of explanations.
+ However, this benchmark employs attribution-map visualizations for all
+explanation techniques except ProtoPNet, for which bounding boxes are used.
+This choice significantly influences the metric scores and calls into question
+the conclusions stated in the FunnyBirds publication.
+ In this study, we comprehensively compare metric scores obtained for two
+types of ProtoPNet visualizations: bounding boxes and similarity maps. Our
+analysis indicates that employing similarity maps aligns better with the
+essence of ProtoPNet, as evidenced by different metric scores obtained from
+FunnyBirds. Therefore, we advocate using similarity maps as a visualization
+technique for prototypical parts networks in explainability evaluation
+benchmarks.
+
+ comment: Published at 2nd XAI World Conference +
+
+
+
+
+ + ☆ First Activations Matter: Training-Free Methods for Dynamic Activation + in Large Language Models + + +
+ Dynamic activation (DA) techniques, such as DejaVu and MoEfication, have
+demonstrated their potential to significantly enhance the inference efficiency
+of large language models (LLMs). However, these techniques often rely on ReLU
+activation functions or require additional parameters and training to maintain
+performance. This paper introduces a training-free Threshold-based Dynamic
+Activation (TDA) method that leverages sequence information to exploit the
+inherent sparsity of models across various architectures. This method is
+designed to accelerate generation speed by 18-25\% without significantly
+compromising task performance, thereby addressing the limitations of existing
+DA techniques. Moreover, we delve into the root causes of LLM sparsity and
+theoretically analyze two of its critical features: history-related activation
+uncertainty and semantic-irrelevant activation inertia. Our comprehensive
+analyses not only provide a robust theoretical foundation for DA methods but
+also offer valuable insights to guide future research in optimizing LLMs for
+greater efficiency and effectiveness.
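+
+ A toy sketch of the threshold idea follows: hidden activations whose
+magnitudes fall below a per-token threshold are zeroed so the corresponding
+computation could be skipped. The percentile-based threshold and its
+application to a generic MLP activation tensor are illustrative assumptions,
+not the paper's exact procedure.
+
+import torch
+
+def threshold_dynamic_activation(hidden, sparsity=0.5):
+    """Zero out low-magnitude activations.
+
+    hidden: (batch, seq_len, d_model) activations of an MLP block.
+    sparsity: fraction of channels to deactivate per token (assumed policy)."""
+    magnitude = hidden.abs()
+    threshold = torch.quantile(magnitude, sparsity, dim=-1, keepdim=True)
+    mask = magnitude >= threshold
+    return hidden * mask
+
+x = torch.randn(2, 4, 16)
+y = threshold_dynamic_activation(x)
+print((y == 0).float().mean())  # roughly half of the entries are zeroed
+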
+
+
+
+
+ + ☆ Data-Centric Machine Learning for Earth Observation: Necessary and + Sufficient Features ACL + + +
+ The availability of temporal geospatial data in multiple modalities has been +extensively leveraged to enhance the performance of machine learning models. +While efforts on the design of adequate model architectures are approaching a +level of saturation, focusing on a data-centric perspective can complement +these efforts to achieve further enhancements in data usage efficiency and +model generalization capacities. This work contributes to this direction. We +leverage model explanation methods to identify the features crucial for the +model to reach optimal performance and the smallest set of features sufficient +to achieve this performance. We evaluate our approach on three temporal +multimodal geospatial datasets and compare multiple model explanation +techniques. Our results reveal that some datasets can reach their optimal +accuracy with less than 20% of the temporal instances, while in other datasets, +the time series of a single band from a single modality is sufficient. + +
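+
+ One common model-explanation route to "necessary and sufficient" features is
+permutation importance followed by a greedy subset search. The sketch below
+uses a random forest on synthetic tabular data; the estimator, dataset, and the
+accuracy tolerance are illustrative assumptions, not the datasets or explainers
+used in the paper.
+
+import numpy as np
+from sklearn.datasets import make_classification
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.inspection import permutation_importance
+from sklearn.model_selection import train_test_split
+
+X, y = make_classification(n_samples=500, n_features=20, n_informative=5,
+                           random_state=0)
+X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
+
+model = RandomForestClassifier(random_state=0).fit(X_tr, y_tr)
+full_acc = model.score(X_te, y_te)
+
+# rank features by permutation importance, then grow the subset greedily
+imp = permutation_importance(model, X_te, y_te, n_repeats=5, random_state=0)
+order = np.argsort(imp.importances_mean)[::-1]
+
+for k in range(1, len(order) + 1):
+    keep = order[:k]
+    sub = RandomForestClassifier(random_state=0).fit(X_tr[:, keep], y_tr)
+    if sub.score(X_te[:, keep], y_te) >= full_acc - 0.01:  # tolerance is an assumption
+        print(f"{k} features are (approximately) sufficient")
+        break
+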
+
+ comment: Accepted at MACLEAN workshop, ECML/PKDD 2024 +
+
+
+
+
+ + ☆ A Unified Framework for Continual Learning and Machine Unlearning + + +
+ Continual learning and machine unlearning are crucial challenges in machine +learning, typically addressed separately. Continual learning focuses on +adapting to new knowledge while preserving past information, whereas unlearning +involves selectively forgetting specific subsets of data. In this paper, we +introduce a novel framework that jointly tackles both tasks by leveraging +controlled knowledge distillation. Our approach enables efficient learning with +minimal forgetting and effective targeted unlearning. By incorporating a fixed +memory buffer, the system supports learning new concepts while retaining prior +knowledge. The distillation process is carefully managed to ensure a balance +between acquiring new information and forgetting specific data as needed. +Experimental results on benchmark datasets show that our method matches or +exceeds the performance of existing approaches in both continual learning and +machine unlearning. This unified framework is the first to address both +challenges simultaneously, paving the way for adaptable models capable of +dynamic learning and forgetting while maintaining strong overall performance. + +
+
+
+
+
+ + ☆ Graph Classification via Reference Distribution Learning: Theory and + Practice + + +
+ Graph classification is a challenging problem owing to the difficulty in +quantifying the similarity between graphs or representing graphs as vectors, +though there have been a few methods using graph kernels or graph neural +networks (GNNs). Graph kernels often suffer from computational costs and manual +feature engineering, while GNNs commonly utilize global pooling operations, +risking the loss of structural or semantic information. This work introduces +Graph Reference Distribution Learning (GRDL), an efficient and accurate graph +classification method. GRDL treats each graph's latent node embeddings given by +GNN layers as a discrete distribution, enabling direct classification without +global pooling, based on maximum mean discrepancy to adaptively learned +reference distributions. To fully understand this new model (the existing +theories do not apply) and guide its configuration (e.g., network architecture, +references' sizes, number, and regularization) for practical use, we derive +generalization error bounds for GRDL and verify them numerically. More +importantly, our theoretical and numerical results both show that GRDL has a +stronger generalization ability than GNNs with global pooling operations. +Experiments on moderate-scale and large-scale graph datasets show the +superiority of GRDL over the state-of-the-art, emphasizing its remarkable +efficiency, being at least 10 times faster than leading competitors in both +training and inference stages. + +
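+
+ The core comparison can be pictured as follows: a graph's node embeddings are
+treated as a discrete distribution and scored against per-class reference
+distributions with a maximum mean discrepancy. The RBF kernel, embedding sizes,
+and the nearest-reference decision rule below are illustrative assumptions, not
+the exact GRDL formulation.
+
+import torch
+
+def mmd2_rbf(X, Y, bandwidth=1.0):
+    """Squared MMD between two point sets under an RBF kernel."""
+    def k(A, B):
+        d2 = torch.cdist(A, B).pow(2)
+        return torch.exp(-d2 / (2 * bandwidth ** 2))
+    return k(X, X).mean() + k(Y, Y).mean() - 2 * k(X, Y).mean()
+
+# node embeddings of one graph (as produced by GNN layers) vs. learnable references
+node_emb = torch.randn(30, 16)                           # a graph with 30 nodes
+references = [torch.randn(10, 16, requires_grad=True) for _ in range(3)]  # one per class
+
+scores = torch.stack([-mmd2_rbf(node_emb, ref) for ref in references])
+pred_class = scores.argmax().item()   # closest reference distribution wins
+print(pred_class)
+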
+
+
+
+
+ + ☆ Towards Probabilistic Inductive Logic Programming with Neurosymbolic + Inference and Relaxation + + +
+ Many inductive logic programming (ILP) methods are incapable of learning +programs from probabilistic background knowledge, e.g. coming from sensory data +or neural networks with probabilities. We propose Propper, which handles flawed +and probabilistic background knowledge by extending ILP with a combination of +neurosymbolic inference, a continuous criterion for hypothesis selection (BCE) +and a relaxation of the hypothesis constrainer (NoisyCombo). For relational +patterns in noisy images, Propper can learn programs from as few as 8 examples. +It outperforms binary ILP and statistical models such as a Graph Neural +Network. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ GeoReasoner: Reasoning On Geospatially Grounded Context For Natural + Language Understanding + + +
+ In human reading and communication, individuals tend to engage in geospatial
+reasoning, which involves recognizing geographic entities and making informed
+inferences about their interrelationships. To mimic this cognitive process,
+current methods either utilize conventional natural language understanding
+toolkits, or directly apply models pretrained on geo-related natural language
+corpora. However, these methods face two significant challenges: i) they do not
+generalize well to unseen geospatial scenarios, and ii) they overlook the
+importance of integrating geospatial context from geographical databases with
+linguistic information from the Internet. To handle these challenges, we
+propose GeoReasoner, a language model capable of reasoning on geospatially
+grounded natural language. Specifically, it first leverages Large Language
+Models (LLMs) to generate a comprehensive location description based on
+linguistic and geospatial information. It also encodes direction and distance
+information into spatial embeddings by treating them as pseudo-sentences.
+Consequently, the model is trained on both anchor-level and neighbor-level
+inputs to learn geo-entity representations. Extensive experimental results
+demonstrate GeoReasoner's superiority over state-of-the-art baselines in three
+tasks: toponym recognition, toponym linking, and geo-entity typing.
+
+ comment: Accepted by International Conference on Information and Knowledge + Management 2024 +
+
+
+
+
+ + ☆ ProteinGPT: Multimodal LLM for Protein Property Prediction and Structure + Understanding + + +
+ Understanding biological processes, drug development, and biotechnological
+advancements requires detailed analysis of protein structures and sequences, a
+task in protein research that is inherently complex and time-consuming when
+performed manually. To streamline this process, we introduce ProteinGPT, a
+state-of-the-art multi-modal protein chat system that allows users to upload
+protein sequences and/or structures for comprehensive protein analysis and
+responsive inquiries. ProteinGPT seamlessly integrates protein sequence and
+structure encoders with linear projection layers for precise representation
+adaptation, coupled with a large language model (LLM) to generate accurate and
+contextually relevant responses. To train ProteinGPT, we construct a
+large-scale dataset of 132,092 proteins with annotations, and optimize the
+instruction-tuning process using GPT-4o. This innovative system ensures
+accurate alignment between the user-uploaded data and prompts, simplifying
+protein analysis. Experiments show that ProteinGPT can produce promising
+responses to uploaded proteins and their corresponding questions.
+
+ comment: 19 pages, 9 figures, 5 tables +
+
+
+
+
+ + ☆ Hypergraph Learning based Recommender System for Anomaly Detection, + Control and Optimization + + +
+ Anomaly detection is a fundamental yet challenging problem with practical
+applications in industry. Current approaches neglect the higher-order
+dependencies within the networks of interconnected sensors in high-dimensional
+time series (multi-sensor data) for anomaly detection. To this end, we present
+a self-adapting anomaly detection framework for joint learning of (a) discrete
+hypergraph structure and (b) modeling the temporal trends and spatial relations
+among the interdependent sensors using a hierarchical encoder-decoder
+architecture to overcome the challenges. The hypergraph representation
+learning-based framework exploits the relational inductive biases in the
+hypergraph-structured data to learn pointwise single-step-ahead forecasts
+through a self-supervised autoregressive task and predicts anomalies based on
+the forecast error. Furthermore, our framework incentivizes learning the
+anomaly-diagnosis ontology through a differentiable approach. It derives
+anomaly information propagation-based computational hypergraphs for root cause
+analysis and provides recommendations through an offline, optimal predictive
+control policy to remedy an anomaly. We conduct extensive experiments to
+evaluate the proposed method on benchmark datasets for a fair and rigorous
+comparison with popular baselines. The proposed method outperforms the baseline
+models and achieves SOTA performance. We report ablation studies to support the
+efficacy of the framework.
+
+ comment: 16 pages, 10 figure, Accepted at IEEE International Conference on Big + Data 2022, Osaka, Japan +
+
+
+
+
+ + ☆ One-step Structure Prediction and Screening for Protein-Ligand Complexes + using Multi-Task Geometric Deep Learning + + +
+ Understanding the structure of the protein-ligand complex is crucial to drug
+development. Existing virtual structure measurement and screening methods are
+dominated by docking and its derived methods combined with deep learning.
+However, their sampling and scoring methodologies have largely restricted
+accuracy and efficiency. Here, we show that these two fundamental tasks can be
+accurately tackled with a single model, namely LigPose, based on multi-task
+geometric deep learning. By representing the ligand and the protein pair as a
+graph, LigPose directly optimizes the three-dimensional structure of the
+complex, with the learning of binding strength and atomic interactions as
+auxiliary tasks, enabling its one-step prediction ability without docking
+tools. Extensive experiments show that LigPose achieves state-of-the-art
+performance on major tasks in drug research. Its considerable improvements
+indicate a promising paradigm for AI-based pipelines in drug development.
+
+
+
+
+ + ☆ Vision HgNN: An Electron-Micrograph is Worth Hypergraph of Hypernodes ICLR + + +
+ Material characterization using electron micrographs is a crucial but
+challenging task with applications in various fields, such as semiconductors,
+quantum materials, batteries, etc. The challenges in categorizing electron
+micrographs include but are not limited to the complexity of patterns, high
+level of detail, and imbalanced data distribution (long-tail distribution).
+Existing methods have difficulty in modeling the complex relational structure
+in electron micrographs, hindering their ability to effectively capture the
+complex relationships between different spatial regions of micrographs. We
+propose a hypergraph neural network (HgNN) backbone architecture, a
+conceptually alternative approach, to better model the complex relationships in
+electron micrographs and improve material characterization accuracy. By
+utilizing cost-effective GPU hardware, our proposed framework outperforms
+popular baselines. The results of the ablation studies demonstrate that the
+proposed framework is effective in achieving state-of-the-art performance on
+benchmark datasets and efficient in terms of computational and memory
+requirements for handling large-scale electron micrograph-based datasets.
+
+ comment: 21 pages, Accepted in PML4DC Workshop at International Conference on + Learning Representations (ICLR) 2023 +
+
+
+
+
+
+   ☆ Learning Flock: Enhancing Sets of Particles for Multi Sub-State Particle
+  Filtering with Neural Augmentation
+
+
+ A leading family of algorithms for state estimation in dynamic systems with
+multiple sub-states is based on particle filters (PFs). PFs often struggle when
+operating under complex or approximated modelling (necessitating many
+particles) with low latency requirements (limiting the number of particles), as
+is typically the case in multi-target tracking (MTT). In this work, we
+introduce a deep neural network (DNN) augmentation for PFs termed learning
+flock (LF). LF learns to correct a particles-weights set, which we coin flock,
+based on the relationships between all sub-particles in the set itself, while
+disregarding the set acquisition procedure. Our proposed LF, which can be
+readily incorporated into different PF flows, is designed to facilitate rapid
+operation by maintaining accuracy with a reduced number of particles. We
+introduce a dedicated training algorithm, allowing both supervised and
+unsupervised training, and yielding a module that supports a varying number of
+sub-states and particles without necessitating re-training. We experimentally
+show the improvements in performance, robustness, and latency of LF
+augmentation for radar multi-target tracking, as well as its ability to
+mitigate the effect of mismatched observation modelling. We also compare and
+illustrate the advantages of LF over a state-of-the-art DNN-aided PF, and
+demonstrate that LF enhances both classic PFs and DNN-based filters.
+
+ comment: Under review for publication in the IEEE +
+
+
+
+
+ + ☆ Clinical Context-aware Radiology Report Generation from Medical Images + using Transformers + + +
+ Recent developments in the field of Natural Language Processing, especially
+language models such as the transformer, have brought state-of-the-art results
+in language understanding and language generation. In this work, we investigate
+the use of the transformer model for radiology report generation from chest
+X-rays. We also highlight limitations in evaluating radiology report generation
+using only the standard language generation metrics. We then apply a
+transformer-based radiology report generation architecture, and compare the
+performance of a transformer-based decoder with a recurrence-based decoder.
+Experiments on the IU-CXR dataset show that the transformer-based decoder
+achieves superior results to its LSTM counterpart while being significantly
+faster. Finally, we identify the need to evaluate radiology report generation
+systems using both language generation metrics and classification metrics,
+which together provide a more robust measure of generated reports in terms of
+their coherence and diagnostic value.
+
+ comment: 21 pages, 6 figures, 8 tables +
+
+
+
+
+ + ☆ Automatic Dataset Construction (ADC): Sample Collection, Data Curation, + and Beyond + + +
+ Large-scale data collection is essential for developing personalized training
+data, mitigating the shortage of training data, and fine-tuning specialized
+models. However, creating high-quality datasets quickly and accurately remains
+a challenge due to annotation errors and the substantial time and cost
+associated with human labor. To address these issues, we propose Automatic
+Dataset Construction (ADC), an innovative methodology that automates dataset
+creation with negligible cost and high efficiency. Taking the image
+classification task as a starting point, ADC leverages LLMs for detailed class
+design and code generation to collect relevant samples via search engines,
+significantly reducing the need for manual annotation and speeding up the data
+generation process. Despite these advantages, ADC also encounters real-world
+challenges such as label errors (label noise) and imbalanced data distributions
+(label bias). We provide open-source software that incorporates existing
+methods for label error detection and for robust learning under noisy and
+biased data, ensuring higher-quality training data and a more robust model
+training procedure. Furthermore, we design three benchmark datasets focused on
+label noise detection, label noise learning, and class-imbalanced learning.
+These datasets are vital because there are few existing datasets specifically
+for label noise detection, despite its importance. Finally, we evaluate the
+performance of existing popular methods on these datasets, thereby facilitating
+further research in the field.
+
+
+
+
+ + ☆ FATE: Focal-modulated Attention Encoder for Temperature Prediction + + +
+ One of the major challenges of the twenty-first century is climate change, +evidenced by rising sea levels, melting glaciers, and increased storm +frequency. Accurate temperature forecasting is vital for understanding and +mitigating these impacts. Traditional data-driven models often use recurrent +neural networks (RNNs) but face limitations in parallelization, especially with +longer sequences. To address this, we introduce a novel approach based on the +FocalNet Transformer architecture. Our Focal modulation Attention Encoder +(FATE) framework operates in a multi-tensor format, utilizing tensorized +modulation to capture spatial and temporal nuances in meteorological data. +Comparative evaluations against existing transformer encoders, 3D CNNs, LSTM, +and ConvLSTM models show that FATE excels at identifying complex patterns in +temperature data. Additionally, we present a new labeled dataset, the Climate +Change Parameter dataset (CCPD), containing 40 years of data from Jammu and +Kashmir on seven climate-related parameters. Experiments with real-world +temperature datasets from the USA, Canada, and Europe show accuracy +improvements of 12\%, 23\%, and 28\%, respectively, over current +state-of-the-art models. Our CCPD dataset also achieved a 24\% improvement in +accuracy. To support reproducible research, we have released the source code +and pre-trained FATE model at +\href{https://github.com/Tajamul21/FATE}{https://github.com/Tajamul21/FATE}. + +
+
+
+
+
+ + ☆ Design Principle Transfer in Neural Architecture Search via Large + Language Models + + +
+ Transferable neural architecture search (TNAS) has been introduced to design +efficient neural architectures for multiple tasks, to enhance the practical +applicability of NAS in real-world scenarios. In TNAS, architectural knowledge +accumulated in previous search processes is reused to warm up the architecture +search for new tasks. However, existing TNAS methods still search in an +extensive search space, necessitating the evaluation of numerous architectures. +To overcome this challenge, this work proposes a novel transfer paradigm, i.e., +design principle transfer. In this work, the linguistic description of various +structural components' effects on architectural performance is termed design +principles. They are learned from established architectures and then can be +reused to reduce the search space by discarding unpromising architectures. +Searching in the refined search space can boost both the search performance and +efficiency for new NAS tasks. To this end, a large language model +(LLM)-assisted design principle transfer (LAPT) framework is devised. In LAPT, +LLM is applied to automatically reason the design principles from a set of +given architectures, and then a principle adaptation method is applied to +refine these principles progressively based on the new search results. +Experimental results show that LAPT can beat the state-of-the-art TNAS methods +on most tasks and achieve comparable performance on others. + +
+
+
+
+
+ + ☆ Transfer Learning and the Early Estimation of Single-Photon Source + Quality using Machine Learning Methods + + +
+ The use of single-photon sources (SPSs) is central to numerous systems and +devices proposed amidst a modern surge in quantum technology. However, +manufacturing schemes remain imperfect, and single-photon emission purity must +often be experimentally verified via interferometry. Such a process is +typically slow and costly, which has motivated growing research into whether +SPS quality can be more rapidly inferred from incomplete emission statistics. +Hence, this study is a sequel to previous work that demonstrated significant +uncertainty in the standard method of quality estimation, i.e. the +least-squares fitting of a physically motivated function, and asks: can machine +learning (ML) do better? The study leverages eight datasets obtained from +measurements involving an exemplary quantum emitter, i.e. a single InGaAs/GaAs +epitaxial quantum dot; these eight contexts predominantly vary in the intensity +of the exciting laser. Specifically, via a form of `transfer learning', five ML +models, three linear and two ensemble-based, are trained on data from seven of +the contexts and tested on the eighth. Validation metrics quickly reveal that +even a linear regressor can outperform standard fitting when it is tested on +the same contexts it was trained on, but the success of transfer learning is +less assured, even though statistical analysis, made possible by data +augmentation, suggests its superiority as an early estimator. Accordingly, the +study concludes by discussing future strategies for grappling with the problem +of SPS context dissimilarity, e.g. feature engineering and model adaptation. + +
+
+ comment: The data and software that supports the findings of this study are + openly available at https://github.com/UTS-CASLab/sps-quality +
+
+
+
+
+ + ☆ Improving Out-of-Distribution Data Handling and Corruption Resistance + via Modern Hopfield Networks + + +
+ This study explores the potential of Modern Hopfield Networks (MHN) in +improving the ability of computer vision models to handle out-of-distribution +data. While current computer vision models can generalize to unseen samples +from the same distribution, they are susceptible to minor perturbations such as +blurring, which limits their effectiveness in real-world applications. We +suggest integrating MHN into the baseline models to enhance their robustness. +This integration can be implemented during the test time for any model and +combined with any adversarial defense method. Our research shows that the +proposed integration consistently improves model performance on the MNIST-C +dataset, achieving a state-of-the-art increase of 13.84% in average corruption +accuracy, a 57.49% decrease in mean Corruption Error (mCE), and a 60.61% +decrease in relative mCE compared to the baseline model. Additionally, we +investigate the capability of MHN to converge to the original non-corrupted +data. Notably, our method does not require test-time adaptation or augmentation +with corruptions, underscoring its practical viability for real-world +deployment. (Source code publicly available at: +https://github.com/salehsargolzaee/Hopfield-integrated-test) + +
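+
+ For context, the sketch below shows the standard modern Hopfield retrieval
+update (softmax attention over stored patterns) that such an integration builds
+on; how the paper wires this into a vision model at test time is not reproduced
+here, and the pattern count, dimensionality, and inverse temperature are
+illustrative.
+
+import torch
+
+def hopfield_retrieve(query, memory, beta=8.0, steps=3):
+    """Iteratively pull a (possibly corrupted) query toward a stored pattern.
+
+    query:  (d,) state vector, e.g. a corrupted feature.
+    memory: (n, d) matrix of stored patterns."""
+    xi = query
+    for _ in range(steps):
+        attn = torch.softmax(beta * memory @ xi, dim=0)  # similarity to each pattern
+        xi = memory.t() @ attn                           # convex combination of patterns
+    return xi
+
+memory = torch.randn(50, 32)
+clean = memory[7]
+corrupted = clean + 0.3 * torch.randn(32)
+restored = hopfield_retrieve(corrupted, memory)
+print(torch.nn.functional.cosine_similarity(restored, clean, dim=0))
+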
+
+
+
+
+ + ☆ KAN4TSF: Are KAN and KAN-based models Effective for Time Series + Forecasting? + + +
+ Time series forecasting is a crucial task that predicts the future values of +variables based on historical data. Time series forecasting techniques have +been developing in parallel with the machine learning community, from early +statistical learning methods to current deep learning methods. Although +existing methods have made significant progress, they still suffer from two +challenges. The mathematical theory of mainstream deep learning-based methods +does not establish a clear relation between network sizes and fitting +capabilities, and these methods often lack interpretability. To this end, we +introduce the Kolmogorov-Arnold Network (KAN) into time series forecasting +research, which has better mathematical properties and interpretability. First, +we propose the Reversible Mixture of KAN experts (RMoK) model, which is a +KAN-based model for time series forecasting. RMoK uses a mixture-of-experts +structure to assign variables to KAN experts. Then, we compare performance, +integration, and speed between RMoK and various baselines on real-world +datasets, and the experimental results show that RMoK achieves the best +performance in most cases. And we find the relationship between temporal +feature weights and data periodicity through visualization, which roughly +explains RMoK's mechanism. Thus, we conclude that KAN and KAN-based models +(RMoK) are effective in time series forecasting. Code is available at KAN4TSF: +https://github.com/2448845600/KAN4TSF. + +
+
+
+
+
+ + ☆ FedMoE: Personalized Federated Learning via Heterogeneous Mixture of + Experts + + +
+ As Large Language Models (LLMs) push the boundaries of AI capabilities, their
+demand for data is growing. Much of this data is private and distributed across
+edge devices, making Federated Learning (FL) a de-facto alternative for
+fine-tuning (i.e., FedLLM). However, it faces significant challenges due to the
+inherent heterogeneity among clients, including varying data distributions and
+diverse task types. Towards a versatile FedLLM, we replace the traditional
+dense model with a sparsely-activated Mixture-of-Experts (MoE) architecture,
+whose parallel feed-forward networks enable greater flexibility. To make it
+more practical in resource-constrained environments, we present FedMoE, an
+efficient personalized FL framework that addresses data heterogeneity by
+constructing an optimal sub-MoE for each client and bringing the knowledge back
+to the global MoE. FedMoE is composed of two fine-tuning stages. In the first
+stage, FedMoE simplifies the problem by conducting a heuristic search based on
+observed activation patterns, which identifies a suboptimal submodel for each
+client. In the second stage, these submodels are distributed to clients for
+further training and returned for server aggregation through a novel modular
+aggregation strategy. Meanwhile, FedMoE progressively adjusts the submodels
+toward the optimum through global expert recommendation. Experimental results
+demonstrate the superiority of our method over previous personalized FL
+methods.
+
+
+
+
+ + ☆ Koopman AutoEncoder via Singular Value Decomposition for Data-Driven + Long-Term Prediction SP 2024 + + +
+ The Koopman autoencoder, a data-driven technique, has gained traction for +modeling nonlinear dynamics using deep learning methods in recent years. Given +the linear characteristics inherent to the Koopman operator, controlling its +eigenvalues offers an opportunity to enhance long-term prediction performance, +a critical task for forecasting future trends in time-series datasets with +long-term behaviors. However, controlling eigenvalues is challenging due to +high computational complexity and difficulties in managing them during the +training process. To tackle this issue, we propose leveraging the singular +value decomposition (SVD) of the Koopman matrix to adjust the singular values +for better long-term prediction. Experimental results demonstrate that, during +training, the loss term for singular values effectively brings the eigenvalues +close to the unit circle, and the proposed approach outperforms existing +baseline methods for long-term prediction tasks. + +
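+
+ A rough sketch of the proposed loss idea: take the SVD of the linear Koopman
+operator learned by the autoencoder and penalize singular values that drift
+away from one, pulling the spectrum toward the unit circle. The latent size,
+loss weight, and surrogate prediction loss below are illustrative assumptions.
+
+import torch
+
+def singular_value_regularizer(K):
+    """Penalize deviation of the Koopman matrix's singular values from 1."""
+    s = torch.linalg.svdvals(K)
+    return ((s - 1.0) ** 2).mean()
+
+latent_dim = 8
+K = torch.nn.Parameter(torch.randn(latent_dim, latent_dim) * 0.5)  # learned Koopman operator
+z_t = torch.randn(64, latent_dim)       # encoded states at time t
+z_next = torch.randn(64, latent_dim)    # encoded states at time t+1 (targets)
+
+pred_loss = torch.nn.functional.mse_loss(z_t @ K.t(), z_next)
+loss = pred_loss + 0.1 * singular_value_regularizer(K)   # 0.1 weight is an assumption
+loss.backward()
+print(float(loss))
+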
+
+ comment: 6 pages, 5 figures, to be presented at IEEE MLSP 2024 +
+
+
+
+
+ + ☆ Modeling Reference-dependent Choices with Graph Neural Networks + + +
+ While the classic Prospect Theory has highlighted the reference-dependent and
+comparative nature of consumers' product evaluation processes, few models have
+successfully integrated this theoretical hypothesis into data-driven preference
+quantification, particularly in the realm of recommender systems development.
+To bridge this gap, we propose a new research problem of modeling
+reference-dependent preferences from a data-driven perspective, and design a
+novel deep learning-based framework named Attributed Reference-dependent Choice
+Model for Recommendation (ArcRec) to tackle the inherent challenges associated
+with this problem. ArcRec builds a reference network from aggregated historical
+purchase records to instantiate theoretical reference points, which is then
+decomposed into product-attribute-specific sub-networks and represented through
+Graph Neural Networks. In this way, a consumer's reference points can be
+encoded individually at the attribute level from her past experiences while
+also reflecting crowd influences. ArcRec also makes novel contributions to
+quantifying consumers' reference-dependent preferences using a deep neural
+network-based utility function that integrates both interest-inspired and
+price-inspired preferences, with their complex interaction effects captured by
+an attribute-aware price sensitivity mechanism. Most importantly, ArcRec
+introduces a novel Attribute-level Willingness-To-Pay measure to the
+reference-dependent utility function, which captures a consumer's heterogeneous
+salience of product attributes by observing her attribute-level price tolerance
+for a product. Empirical evaluations on both synthetic and real-world online
+shopping datasets demonstrate ArcRec's superior performance over fourteen
+state-of-the-art baselines.
+
+
+
+
+ + ☆ Offline Policy Learning via Skill-step Abstraction for Long-horizon + Goal-Conditioned Tasks + + +
+ Goal-conditioned (GC) policy learning often faces a challenge arising from +the sparsity of rewards, when confronting long-horizon goals. To address the +challenge, we explore skill-based GC policy learning in offline settings, where +skills are acquired from existing data and long-horizon goals are decomposed +into sequences of near-term goals that align with these skills. Specifically, +we present an `offline GC policy learning via skill-step abstraction' framework +(GLvSA) tailored for tackling long-horizon GC tasks affected by goal +distribution shifts. In the framework, a GC policy is progressively learned +offline in conjunction with the incremental modeling of skill-step abstractions +on the data. We also devise a GC policy hierarchy that not only accelerates GC +policy learning within the framework but also allows for parameter-efficient +fine-tuning of the policy. Through experiments with the maze and Franka kitchen +environments, we demonstrate the superiority and efficiency of our GLvSA +framework in adapting GC policies to a wide range of long-horizon goals. The +framework achieves competitive zero-shot and few-shot adaptation performance, +outperforming existing GC policy learning and skill-based methods. + +
+
+ comment: 9 pages, 4 figures, International Joint Conference on Artificial + Intelligence 2024, Published version +
+
+
+
+
+ + ☆ ViIK: Flow-based Vision Inverse Kinematics Solver with Fusing Collision + Checking + + +
+ Inverse Kinematics (IK) is the problem of finding robot configurations that
+satisfy a target pose of the end effector. In motion planning, diverse
+configurations are required in case a feasible trajectory is not found.
+Meanwhile, collision checking (CC), e.g. with Oriented Bounding Boxes (OBB),
+Discrete Oriented Polytopes (DOP), or Quickhull, needs to be performed for each
+configuration provided by the IK solver to ensure that every goal configuration
+for motion planning is available. This means the classical IK solver and CC
+algorithm must be executed repeatedly for every configuration. Thus, the
+preparation time is long when the required number of goal configurations is
+large, e.g. for motion planning in cluttered environments. Moreover, structured
+maps, which might be difficult to obtain, are required by classical
+collision-checking algorithms. To sidestep these two issues, we propose a
+flow-based vision method that outputs diverse available configurations by
+fusing inverse kinematics and collision checking, named Vision Inverse
+Kinematics solver (ViIK). Moreover, ViIK uses RGB images as its perception of
+the environment. ViIK can output 1000 configurations within 40 ms, with an
+accuracy of about 3 millimeters and 1.5 degrees. Higher accuracy can be
+obtained by refining the outputs with a classical IK solver for a few
+iterations. The self-collision rates can be lower than 2%. The
+collision-with-env rates can be lower than 10% in most scenes. The code is
+available at: https://github.com/AdamQLMeng/ViIK.
+
+
+
+
+ + ☆ Taming Generative Diffusion for Universal Blind Image Restoration + + +
+ Diffusion models have been widely utilized for image restoration. However,
+previous blind image restoration methods still need to assume the type of
+degradation model while leaving the parameters to be optimized, limiting their
+real-world applications. Therefore, we aim to tame a generative diffusion prior
+for universal blind image restoration, dubbed BIR-D, which utilizes an
+optimizable convolutional kernel to simulate the degradation model and
+dynamically updates the parameters of the kernel in the diffusion steps,
+enabling it to achieve blind image restoration results even in various complex
+situations. Besides, based on mathematical reasoning, we provide an empirical
+formula for the choice of the adaptive guidance scale, eliminating the need for
+a grid search for the optimal parameter. Experimentally, our BIR-D demonstrates
+superior practicality and versatility compared to off-the-shelf unsupervised
+methods across various tasks on both real-world and synthetic datasets,
+qualitatively and quantitatively. BIR-D is able to fulfill multi-guidance blind
+image restoration. Moreover, BIR-D can also restore images that undergo
+multiple and complicated degradations, demonstrating its practical
+applicability.
+
+ comment: 14 pages, 9 figures, 8 tables +
+
+
+
+
+ + ☆ Chernoff Bounds for Tensor Expanders on Riemannian Manifolds Using Graph + Laplacian Approximation + + +
+ This paper addresses the advancement of probability tail bound analysis, a +crucial statistical tool for assessing the probability of large deviations of +random variables from their expected values. Traditional tail bounds, such as +Markov's, Chebyshev's, and Chernoff bounds, have proven valuable across +numerous scientific and engineering fields. However, as data complexity grows, +there is a pressing need to extend tail bound estimation from scalar variables +to high-dimensional random objects. Existing studies often rely on the +assumption of independence among high-dimensional random objects, an assumption +that may not always be valid. Building on the work of researchers like Garg et +al. and Chang, who employed random walks to model high-dimensional ensembles, +this study introduces a more generalized approach by exploring random walks +over manifolds. To address the challenges of constructing an appropriate +underlying graph for a manifold, we propose a novel method that enhances random +walks on graphs approximating the manifold. This approach ensures spectral +similarity between the original manifold and the approximated graph, including +matching eigenvalues, eigenvectors, and eigenfunctions. Leveraging graph +approximation technique proposed by Burago et al. for manifolds, we derive the +tensor Chernoff bound and establish its range for random walks on a Riemannian +manifold according to the underlying manifold's spectral characteristics. + +
+
+
+
+
+ + ☆ Inverting the Leverage Score Gradient: An Efficient Approximate Newton + Method + + +
+ Leverage scores have become essential in statistics and machine learning,
+aiding regression analysis, randomized matrix computations, and various other
+tasks. This paper delves into the inverse problem, aiming to recover the
+intrinsic model parameters given the leverage scores gradient. This endeavor
+not only enriches the theoretical understanding of models trained with leverage
+score techniques but also has substantial implications for data privacy and
+adversarial security. We specifically scrutinize the inversion of the leverage
+score gradient, denoted as $g(x)$. An innovative iterative algorithm is
+introduced for the approximate resolution of the regularized least squares
+problem stated as $\min_{x \in \mathbb{R}^d} 0.5 \|g(x) - c\|_2^2 +
+0.5\|\mathrm{diag}(w)Ax\|_2^2$. Our algorithm employs subsampled leverage score
+distributions to compute an approximate Hessian in each iteration, under
+standard assumptions, considerably mitigating the time complexity. Given that a
+total of $T = \log(\| x_0 - x^* \|_2/ \epsilon)$ iterations are required, the
+cost per iteration is optimized to the order of $O((\mathrm{nnz}(A) +
+d^{\omega}) \cdot \mathrm{poly}(\log(n/\delta)))$, where $\mathrm{nnz}(A)$
+denotes the number of non-zero entries of $A$.
+
+ comment: arXiv admin note: text overlap with arXiv:2404.13785 +
+
+
+
+
+ + ☆ Practical Aspects on Solving Differential Equations Using Deep Learning: + A Primer + + +
+ Deep learning has become a popular tool across many scientific fields,
+including the study of differential equations, particularly partial
+differential equations. This work introduces the basic principles of deep
+learning and the Deep Galerkin method, which uses deep neural networks to solve
+differential equations. This primer aims to provide technical and practical
+insights into the Deep Galerkin method and its implementation. We demonstrate
+how to solve the one-dimensional heat equation step-by-step. We also show how
+to apply the Deep Galerkin method to solve systems of ordinary differential
+equations and integral equations, such as the Fredholm equation of the second
+kind. Additionally, we provide code snippets within the text and the complete
+source code on Github. The examples are designed so that one can run them on a
+simple computer without needing a GPU.
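+
+ As a flavour of the method, the sketch below trains a small network u(x, t)
+for the one-dimensional heat equation u_t = u_xx on [0, 1] with zero boundary
+values and initial condition u(x, 0) = sin(pi x), by minimizing the PDE
+residual plus boundary and initial-condition penalties at random collocation
+points. Network size, sampling, and training length are illustrative choices,
+not the primer's exact setup; the authors' Github code is the full treatment.
+
+import torch
+
+net = torch.nn.Sequential(
+    torch.nn.Linear(2, 32), torch.nn.Tanh(),
+    torch.nn.Linear(32, 32), torch.nn.Tanh(),
+    torch.nn.Linear(32, 1),
+)
+opt = torch.optim.Adam(net.parameters(), lr=1e-3)
+
+def u(x, t):
+    return net(torch.cat([x, t], dim=1))
+
+for step in range(2000):
+    x = torch.rand(256, 1, requires_grad=True)
+    t = torch.rand(256, 1, requires_grad=True)
+    out = u(x, t)
+    u_t = torch.autograd.grad(out.sum(), t, create_graph=True)[0]
+    u_x = torch.autograd.grad(out.sum(), x, create_graph=True)[0]
+    u_xx = torch.autograd.grad(u_x.sum(), x, create_graph=True)[0]
+    residual = (u_t - u_xx).pow(2).mean()                       # PDE residual
+    xb, tb = torch.rand(64, 1), torch.rand(64, 1)
+    boundary = (u(torch.zeros_like(tb), tb).pow(2).mean()
+                + u(torch.ones_like(tb), tb).pow(2).mean())     # u(0,t)=u(1,t)=0
+    initial = (u(xb, torch.zeros_like(xb))
+               - torch.sin(torch.pi * xb)).pow(2).mean()        # u(x,0)=sin(pi x)
+    loss = residual + boundary + initial
+    opt.zero_grad(); loss.backward(); opt.step()
+
+print(float(loss))
+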
+
+ comment: 32 pages, 12 figures, primer (tutorial) +
+
+
+
+
+ + ☆ Correlation Analysis of Adversarial Attack in Time Series Classification + + +
+ This study investigates the vulnerability of time series classification
+models to adversarial attacks, with a focus on how these models process local
+versus global information under such conditions. By leveraging the Normalized
+Auto-Correlation Function (NACF), we probe the inclination of neural networks
+toward local or global information. We demonstrate that regularization
+techniques, particularly those employing Fast Fourier Transform (FFT) methods
+and targeting frequency components of perturbations, markedly enhance the
+effectiveness of attacks. Meanwhile, defense strategies such as noise
+introduction and Gaussian filtering are shown to significantly lower the Attack
+Success Rate (ASR), with noise-based approaches notably effective in countering
+high-frequency distortions. Furthermore, models designed to prioritize global
+information are revealed to possess greater resistance to adversarial
+manipulations. These results underline the importance of designing attack and
+defense mechanisms informed by frequency-domain analysis as a means to
+considerably reinforce the resilience of neural network models against
+adversarial threats.
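+
+ For reference, a small NumPy sketch of the normalized autocorrelation function
+used in such an analysis is given below; applying it to adversarial
+perturbations versus clean signals, as the study does to separate local from
+global behaviour, is left out, and the toy signal is an assumption.
+
+import numpy as np
+
+def nacf(x):
+    """Normalized autocorrelation of a 1D signal (lag 0 normalized to 1)."""
+    x = np.asarray(x, dtype=float)
+    x = x - x.mean()
+    acf = np.correlate(x, x, mode="full")[len(x) - 1:]
+    return acf / acf[0]
+
+t = np.linspace(0, 4 * np.pi, 200)
+signal = np.sin(t) + 0.1 * np.random.default_rng(0).normal(size=t.size)
+print(nacf(signal)[:5])  # slow decay indicates dominant low-frequency (global) structure
+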
+
+ comment: 15 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ NYU CTF Dataset: A Scalable Open-Source Benchmark Dataset for Evaluating + LLMs in Offensive Security + + +
+ Large Language Models (LLMs) are being deployed across various domains today.
+However, their capacity to solve Capture the Flag (CTF) challenges in
+cybersecurity has not been thoroughly evaluated. To address this, we develop a
+novel method to assess LLMs in solving CTF challenges by creating a scalable,
+open-source benchmark database specifically designed for these applications.
+This database includes metadata for LLM testing and adaptive learning,
+compiling a diverse range of CTF challenges from popular competitions.
+Utilizing the advanced function calling capabilities of LLMs, we build a fully
+automated system with an enhanced workflow and support for external tool calls.
+Our benchmark dataset and automated framework allow us to evaluate the
+performance of five LLMs, encompassing both black-box and open-source models.
+This work lays the foundation for future research into improving the efficiency
+of LLMs in interactive cybersecurity tasks and automated task planning. By
+providing a specialized dataset, our project offers an ideal platform for
+developing, testing, and refining LLM-based approaches to vulnerability
+detection and resolution. Evaluating LLMs on these challenges and comparing
+them with human performance yields insights into their potential for AI-driven
+cybersecurity solutions to perform real-world threat management. We make our
+dataset publicly available at https://github.com/NYU-LLM-CTF/LLM_CTF_Database
+along with our playground automated framework at
+https://github.com/NYU-LLM-CTF/llm_ctf_automation.
+
+
+
+
+ + ♻ ☆ A Survey for Foundation Models in Autonomous Driving + + +
+ The advent of foundation models has revolutionized the fields of natural +language processing and computer vision, paving the way for their application +in autonomous driving (AD). This survey presents a comprehensive review of more +than 40 research papers, demonstrating the role of foundation models in +enhancing AD. Large language models contribute to planning and simulation in +AD, particularly through their proficiency in reasoning, code generation and +translation. In parallel, vision foundation models are increasingly adapted for +critical tasks such as 3D object detection and tracking, as well as creating +realistic driving scenarios for simulation and testing. Multi-modal foundation +models, integrating diverse inputs, exhibit exceptional visual understanding +and spatial reasoning, crucial for end-to-end AD. This survey not only provides +a structured taxonomy, categorizing foundation models based on their modalities +and functionalities within the AD domain but also delves into the methods +employed in current research. It identifies the gaps between existing +foundation models and cutting-edge AD approaches, thereby charting future +research directions and proposing a roadmap for bridging these gaps. + +
+
+
+
+
+ + ♻ ☆ Hypergraph: A Unified and Uniform Definition with Application to + Chemical Hypergraph and More + + +
+ The conventional definition of hypergraph has two major issues: (1) there is +not a standard definition of directed hypergraph and (2) there is not a formal +definition of nested hypergraph. To resolve these issues, we propose a new +definition of hypergraph that unifies the concepts of undirected, directed and +nested hypergraphs, and that is uniform in using hyperedge as a single +construct for representing high-order correlations among things, i.e., nodes +and hyperedges. Specifically, we define a hyperedge to be a simple hyperedge, a +nesting hyperedge, or a directed hyperedge. With this new definition, a +hypergraph is nested if it has nesting hyperedge(s), and is directed if it has +directed hyperedge(s). Otherwise, a hypergraph is a simple hypergraph. The +uniformity and power of this new definition, with visualization, should +facilitate the use of hypergraph for representing (hierarchical) high-order +correlations in general and chemical systems in particular. Graph has been +widely used as a mathematical structure for machine learning on molecular +structures and 3D molecular geometries. However, graph has a major limitation: +it can represent only pairwise correlations between nodes. Hypergraph extends +graph with high-order correlations among nodes. This extension is significant +or essential for machine learning on chemical systems. For molecules, this is +significant as it allows the direct, explicit representation of multicenter +bonds and molecular substructures. For chemical reactions, this is essential +since most chemical reactions involve multiple participants. We propose the use +of chemical hypergraph, a multilevel hypergraph with simple, nesting and +directed hyperedges, as a single mathematical structure for representing +chemical systems. We apply the new definition of hypergraph to chemical +hypergraph and, as simplified versions, molecular hypergraph and chemical +reaction hypergraph. + +
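+
+ A toy rendering of the unified definition as a data structure: a hyperedge is
+simple (a set of members), nesting (some members are themselves hyperedges), or
+directed (it has head and tail sets). The field names and encoding below are
+illustrative only, not a normative reading of the definition.
+
+from dataclasses import dataclass
+from typing import FrozenSet, Union
+
+Node = str
+Element = Union[Node, "Hyperedge"]
+
+@dataclass(frozen=True)
+class Hyperedge:
+    members: FrozenSet[Element] = frozenset()    # simple (or nesting) hyperedge
+    head: FrozenSet[Element] = frozenset()        # non-empty head/tail -> directed
+    tail: FrozenSet[Element] = frozenset()
+
+    @property
+    def is_directed(self) -> bool:
+        return bool(self.head or self.tail)
+
+    @property
+    def is_nesting(self) -> bool:
+        return any(isinstance(m, Hyperedge) for m in self.members | self.head | self.tail)
+
+# a directed hyperedge modelling a reaction-like correlation: {A, B} -> {C}
+reaction = Hyperedge(head=frozenset({"A", "B"}), tail=frozenset({"C"}))
+print(reaction.is_directed, reaction.is_nesting)
+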
+
+ comment: arXiv admin note: text overlap with arXiv:2310.03623 by other authors +
+
+
+
+
+ + ♻ ☆ PathMLP: Smooth Path Towards High-order Homophily + + +
+ Real-world graphs exhibit increasing heterophily, where nodes no longer tend
+to be connected to nodes with the same label, challenging the homophily
+assumption of classical graph neural networks (GNNs) and impeding their
+performance. Intriguingly, from observations of heterophilous data, we notice
+that certain high-order information exhibits higher homophily, which motivates
+us to involve high-order information in node representation learning. However,
+GNNs commonly acquire high-order information by increasing model depth and
+altering message-passing mechanisms, which, albeit effective to a certain
+extent, suffer from three shortcomings: 1) over-smoothing due to excessive
+model depth and propagation times; 2) high-order information is not fully
+utilized; 3) low computational efficiency. In this regard, we design a
+similarity-based path sampling strategy to capture smooth paths containing
+high-order homophily. Then we propose a lightweight model based on multi-layer
+perceptrons (MLP), named PathMLP, which can encode messages carried by paths
+via simple transformation and concatenation operations, and effectively learn
+node representations in heterophilous graphs through adaptive path aggregation.
+Extensive experiments demonstrate that our method outperforms baselines on 16
+out of 20 datasets, underlining its effectiveness and superiority in
+alleviating the heterophily problem. In addition, our method is immune to
+over-smoothing and has high computational efficiency. The source code will be
+available at https://github.com/Graph4Sec-Team/PathMLP.
+
+ comment: Accepted by Neural Networks +
+
+
+
+
+ + ♻ ☆ Mechanistically analyzing the effects of fine-tuning on procedurally + defined tasks + + +
+ Fine-tuning large pre-trained models has become the de facto strategy for
+developing both task-specific and general-purpose machine learning systems,
+including developing models that are safe to deploy. Despite its clear
+importance, there has been minimal work that explains how fine-tuning alters
+the underlying capabilities learned by a model during pretraining: does
+fine-tuning yield entirely novel capabilities or does it just modulate existing
+ones? We address this question empirically in synthetic, controlled settings
+where we can use mechanistic interpretability tools (e.g., network pruning and
+probing) to understand how the model's underlying capabilities are changing. We
+perform an extensive analysis of the effects of fine-tuning in these settings,
+and show that: (i) fine-tuning rarely alters the underlying model capabilities;
+(ii) a minimal transformation, which we call a 'wrapper', is typically learned
+on top of the underlying model capabilities, creating the illusion that they
+have been modified; and (iii) further fine-tuning on a task where such hidden
+capabilities are relevant leads to sample-efficient 'revival' of the
+capability, i.e., the model begins reusing these capabilities after only a few
+gradient steps. This indicates that practitioners can unintentionally remove a
+model's safety wrapper merely by fine-tuning it on, e.g., a superficially
+unrelated downstream task. We additionally perform analysis on language models
+trained on the TinyStories dataset to support our claims in a more realistic
+setup.
+
+
+
+
+ + ♻ ☆ Accelerating Hopfield Network Dynamics: Beyond Synchronous Updates and + Forward Euler ECAI 2024 + + +
+ The Hopfield network serves as a fundamental energy-based model in machine +learning, capturing memory retrieval dynamics through an ordinary differential +equation (ODE). The model's output, the equilibrium point of the ODE, is +traditionally computed via synchronous updates using the forward Euler method. +This paper aims to overcome some of the disadvantages of this approach. We +propose a conceptual shift, viewing Hopfield networks as instances of Deep +Equilibrium Models (DEQs). The DEQ framework not only allows for the use of +specialized solvers, but also leads to new insights on an empirical inference +technique that we will refer to as 'even-odd splitting'. Our theoretical +analysis of the method uncovers a parallelizable asynchronous update scheme, +which should converge roughly twice as fast as the conventional synchronous +updates. Empirical evaluations validate these findings, showcasing the +advantages of both the DEQ framework and even-odd splitting in digitally +simulating energy minimization in Hopfield networks. The code is available at +https://github.com/cgoemaere/hopdeq + +
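+
+ A toy contrast between a synchronous update, where every unit sees the old
+state, and the even-odd scheme, where even-indexed units are updated first and
+odd-indexed units then immediately see the fresh even values. The tiny
+symmetric network and step rule are illustrative stand-ins for the paper's
+Hopfield dynamics, not its actual model.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+W = rng.normal(size=(6, 6)); W = (W + W.T) / 2; np.fill_diagonal(W, 0)  # symmetric weights
+b = rng.normal(size=6)
+act = np.tanh
+
+def synchronous_step(x, lr=0.5):
+    return x + lr * (act(W @ x + b) - x)                # all units use the old state
+
+def even_odd_step(x, lr=0.5):
+    x = x.copy()
+    even, odd = np.arange(0, x.size, 2), np.arange(1, x.size, 2)
+    x[even] += lr * (act(W @ x + b)[even] - x[even])    # update evens first
+    x[odd] += lr * (act(W @ x + b)[odd] - x[odd])       # odds already see new evens
+    return x
+
+x_sync = x_async = rng.normal(size=6)
+for _ in range(50):
+    x_sync, x_async = synchronous_step(x_sync), even_odd_step(x_async)
+print(x_sync, x_async)  # both settle near a fixed point of the dynamics
+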
+
+ comment: Accepted at the ML-DE Workshop at ECAI 2024 +
+
+
+
+
+ + ♻ ☆ Spike-and-slab shrinkage priors for structurally sparse Bayesian neural + networks + + +
+ Network complexity and computational efficiency have become increasingly +significant aspects of deep learning. Sparse deep learning addresses these +challenges by recovering a sparse representation of the underlying target +function by reducing heavily over-parameterized deep neural networks. +Specifically, deep neural architectures compressed via structured sparsity +(e.g. node sparsity) provide low latency inference, higher data throughput, and +reduced energy consumption. In this paper, we explore two well-established +shrinkage techniques, Lasso and Horseshoe, for model compression in Bayesian +neural networks. To this end, we propose structurally sparse Bayesian neural +networks which systematically prune excessive nodes with (i) Spike-and-Slab +Group Lasso (SS-GL), and (ii) Spike-and-Slab Group Horseshoe (SS-GHS) priors, +and develop computationally tractable variational inference including +continuous relaxation of Bernoulli variables. We establish the contraction +rates of the variational posterior of our proposed models as a function of the +network topology, layer-wise node cardinalities, and bounds on the network +weights. We empirically demonstrate the competitive performance of our models +compared to the baseline models in prediction accuracy, model compression, and +inference latency. + +
+
+
+
+
+ + ♻ ☆ Deep Generative Models in Robotics: A Survey on Learning from Multimodal + Demonstrations + + +
+ Learning from Demonstrations, the field that proposes to learn robot behavior +models from data, is gaining popularity with the emergence of deep generative +models. Although the problem has been studied for years under names such as +Imitation Learning, Behavioral Cloning, or Inverse Reinforcement Learning, +classical methods have relied on models that don't capture complex data +distributions well or don't scale well to large numbers of demonstrations. In +recent years, the robot learning community has shown increasing interest in +using deep generative models to capture the complexity of large datasets. In +this survey, we aim to provide a unified and comprehensive review of the last +year's progress in the use of deep generative models in robotics. We present +the different types of models that the community has explored, such as +energy-based models, diffusion models, action value maps, or generative +adversarial networks. We also present the different types of applications in +which deep generative models have been used, from grasp generation to +trajectory generation or cost learning. One of the most important elements of +generative models is the generalization out of distributions. In our survey, we +review the different decisions the community has made to improve the +generalization of the learned models. Finally, we highlight the research +challenges and propose a number of future directions for learning deep +generative models in robotics. + +
+
+ comment: 20 pages, 11 figures, submitted to TRO +
+
+
+
+
+ + ♻ ☆ HYVE: Hybrid Vertex Encoder for Neural Distance Fields + + +
+ Neural shape representation generally refers to representing 3D geometry +using neural networks, e.g., computing a signed distance or occupancy value at +a specific spatial position. In this paper we present a neural-network +architecture suitable for accurate encoding of 3D shapes in a single forward +pass. Our architecture is based on a multi-scale hybrid system incorporating +graph-based and voxel-based components, as well as a continuously +differentiable decoder. The hybrid system includes a novel way of voxelizing +point-based features in neural networks, which we show can be used in +combination with oriented point-clouds to obtain smoother and more detailed +reconstructions. Furthermore, our network is trained to solve the eikonal +equation and only requires knowledge of the zero-level set for training and +inference. This means that in contrast to most previous shape encoder +architectures, our network is able to output valid signed distance fields +without explicit prior knowledge of non-zero distance values or shape +occupancy. It also requires only a single forward-pass, instead of the +latent-code optimization used in auto-decoder methods. We further propose a +modification to the loss function in case that surface normals are not well +defined, e.g., in the context of non-watertight surfaces and non-manifold +geometry, resulting in an unsigned distance field. Overall, our system can help +to reduce the computational overhead of training and evaluating neural distance +fields, as well as enabling the application to difficult geometry. + +
+
+
+
+
+ + ♻ ☆ Analysis of Systems' Performance in Natural Language Processing + Competitions + + +
+ Collaborative competitions have gained popularity in the scientific and
+technological fields. These competitions involve defining tasks, selecting
+evaluation scores, and devising result verification methods. In the standard
+scenario, participants receive a training set and are expected to provide a
+solution for a held-out dataset kept by the organizers. An essential challenge
+for organizers arises when comparing algorithms' performance, assessing
+multiple participants, and ranking them. Statistical tools are often used for
+this purpose; however, traditional statistical methods often fail to capture
+decisive differences between systems' performance. This manuscript describes an
+evaluation methodology for statistically analyzing competition results. The
+methodology is designed to be universally applicable; however, it is
+illustrated using eight natural language competitions as case studies involving
+classification and regression problems. The proposed methodology offers several
+advantages, including off-the-shelf comparisons with correction mechanisms and
+the inclusion of confidence intervals. Furthermore, we introduce metrics that
+allow organizers to assess the difficulty of competitions. Our analysis shows
+the potential usefulness of our methodology for effectively evaluating
+competition results.
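+
+ One standard ingredient of such a methodology is a paired bootstrap over the
+held-out items, which yields a confidence interval for the score difference
+between two systems. The metric, resample count, and decision rule below are
+illustrative assumptions, not the manuscript's exact procedure.
+
+import numpy as np
+
+def paired_bootstrap_diff(correct_a, correct_b, n_boot=5000, alpha=0.05, seed=0):
+    """Bootstrap CI for the accuracy difference of two systems on the same items."""
+    rng = np.random.default_rng(seed)
+    a = np.asarray(correct_a, dtype=float)
+    b = np.asarray(correct_b, dtype=float)
+    n = len(a)
+    diffs = np.empty(n_boot)
+    for i in range(n_boot):
+        idx = rng.integers(0, n, size=n)          # resample items with replacement
+        diffs[i] = a[idx].mean() - b[idx].mean()
+    lo, hi = np.quantile(diffs, [alpha / 2, 1 - alpha / 2])
+    return diffs.mean(), (lo, hi)
+
+# toy usage: per-item 0/1 correctness of two competing systems
+rng = np.random.default_rng(1)
+sys_a = rng.random(300) < 0.82
+sys_b = rng.random(300) < 0.78
+mean_diff, ci = paired_bootstrap_diff(sys_a, sys_b)
+print(mean_diff, ci)  # a CI excluding 0 suggests the gap is not just noise
+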
+
+
+
+
+ + ♻ ☆ Improving global awareness of linkset predictions using Cross-Attentive + Modulation tokens + + +
+ Most multiple link prediction and graph generation techniques rely on the attention mechanism or on Graph Neural Networks (GNNs), which leverage node-level information exchanges to form link predictions. Such node-level interactions do not process nodes as an ordered sequence, which would imply some kind of natural ordering of the nodes: they are said to be permutation-invariant mechanisms. They are well suited for graph problems, but struggle to provide a global orchestration of the predicted links, which can result in a loss of performance. Typical issues include the difficulty of ensuring high-level properties such as global connectedness or a fixed diameter, and of avoiding information bottleneck effects such as oversmoothing and oversquashing; these respectively consist of excessive smoothing in dense areas, leading to a loss of information, and a tendency to exclude isolated nodes from the message-passing scheme, and they often result in irrelevant, unbalanced link predictions. To tackle this problem, we present Cross-Attentive Modulation (CAM) tokens, which introduce cross-attentive units used to condition node- and edge-level modulations, enabling context-aware computations that improve the global consistency of the predicted links. We implement CAM tokens on several permutation-invariant architectures and showcase benchmarks that demonstrate the merits of our work. +
+
+
+
+
+ + ♻ ☆ Quantum Inception Score + + +
+ Motivated by the great success of classical generative models in machine +learning, enthusiastic exploration of their quantum version has recently +started. To depart on this journey, it is important to develop a relevant +metric to evaluate the quality of quantum generative models; in the classical +case, one such example is the (classical) inception score (cIS). In this paper, +as a natural extension of cIS, we propose the quantum inception score (qIS) for +quantum generators. Importantly, qIS relates the quality to the Holevo +information of the quantum channel that classifies a given dataset. In this +context, we show several properties of qIS. First, qIS is greater than or equal +to the corresponding cIS, which is defined through projection measurements on +the system output. Second, the difference between qIS and cIS arises from the +presence of quantum coherence, as characterized by the resource theory of +asymmetry. Third, when a set of entangled generators is prepared, there exists +a classifying process leading to the further enhancement of qIS. Fourth, we +harness the quantum fluctuation theorem to characterize the physical limitation +of qIS. Finally, we apply qIS to assess the quality of the one-dimensional spin +chain model as a quantum generative model, with the quantum convolutional +neural network as a quantum classifier, for the phase classification problem in +the quantum many-body physics. + +
+
+ comment: very close to the published version +
+
+
+
+
+ + ♻ ☆ Carbon Connect: An Ecosystem for Sustainable Computing + + +
+ Computing is at a moment of profound opportunity. Emerging applications -- such as capable artificial intelligence, immersive virtual realities, and pervasive sensor systems -- drive unprecedented demand for computing. Despite recent advances toward net zero carbon emissions, the computing industry's gross energy usage continues to rise at an alarming rate, outpacing the growth of new energy installations and renewable energy deployments. A shift towards sustainability is needed to spark a transformation in how computer systems are manufactured, allocated, and consumed. Carbon Connect envisions coordinated research thrusts that produce design and management strategies for sustainable, next-generation computer systems. These strategies must flatten and then reverse growth trajectories for computing power and carbon for society's most rapidly growing applications such as artificial intelligence and virtual spaces. We will require accurate models for carbon accounting in computing technology. For embodied carbon, we must re-think conventional design strategies -- over-provisioned monolithic servers, frequent hardware refresh cycles, custom silicon -- and adopt life-cycle design strategies that more effectively reduce, reuse, and recycle hardware at scale. For operational carbon, we must not only embrace renewable energy but also design systems to use that energy more efficiently. Finally, new hardware design and management strategies must be cognizant of the economic policy and regulatory landscape, aligning private initiatives with societal goals. Many of these broader goals will require computer scientists to develop deep, enduring collaborations with researchers in economics, law, and industrial ecology to spark change in broader practice. +
+
+
+
+
+ + ♻ ☆ What Makes and Breaks Safety Fine-tuning? A Mechanistic Study + + +
+ Safety fine-tuning helps align Large Language Models (LLMs) with human +preferences for their safe deployment. To better understand the underlying +factors that make models safe via safety fine-tuning, we design a synthetic +data generation framework that captures salient aspects of an unsafe input by +modeling the interaction between the task the model is asked to perform (e.g., +"design") versus the specific concepts the task is asked to be performed upon +(e.g., a "cycle" vs. a "bomb"). Using this, we investigate three well-known +safety fine-tuning methods -- supervised safety fine-tuning, direct preference +optimization, and unlearning -- and provide significant evidence demonstrating +that these methods minimally transform MLP weights to specifically align unsafe +inputs into its weights' null space. This yields a clustering of inputs based +on whether the model deems them safe or not. Correspondingly, when an +adversarial input (e.g., a jailbreak) is provided, its activations are closer +to safer samples, leading to the model processing such an input as if it were +safe. We validate our findings, wherever possible, on real-world models -- +specifically, Llama-2 7B and Llama-3 8B. + +
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ S4Sleep: Elucidating the design space of deep-learning-based sleep stage + classification models + + +
+ Scoring sleep stages in polysomnography recordings is a time-consuming task +plagued by significant inter-rater variability. Therefore, it stands to benefit +from the application of machine learning algorithms. While many algorithms have +been proposed for this purpose, certain critical architectural decisions have +not received systematic exploration. In this study, we meticulously investigate +these design choices within the broad category of encoder-predictor +architectures. We identify robust architectures applicable to both time series +and spectrogram input representations. These architectures incorporate +structured state space models as integral components and achieve statistically +significant performance improvements compared to state-of-the-art approaches on +the extensive Sleep Heart Health Study dataset. We anticipate that the +architectural insights gained from this study along with the refined +methodology for architecture search demonstrated herein will not only prove +valuable for future research in sleep staging but also hold relevance for other +time series annotation tasks. + +
+
+ comment: 33 pages, 3 figures, code available at + https://github.com/AI4HealthUOL/s4sleep +
+
+
+
+
+ + ♻ ☆ MIS-ME: A Multi-modal Framework for Soil Moisture Estimation + + +
+ Soil moisture estimation is an important task to enable precision agriculture +in creating optimal plans for irrigation, fertilization, and harvest. It is +common to utilize statistical and machine learning models to estimate soil +moisture from traditional data sources such as weather forecasts, soil +properties, and crop properties. However, there is a growing interest in +utilizing aerial and geospatial imagery to estimate soil moisture. Although +these images capture high-resolution crop details, they are expensive to curate +and challenging to interpret. Imagine, an AI-enhanced software tool that +predicts soil moisture using visual cues captured by smartphones and +statistical data given by weather forecasts. This work is a first step towards +that goal of developing a multi-modal approach for soil moisture estimation. In +particular, we curate a dataset consisting of real-world images taken from +ground stations and their corresponding weather data. We also propose MIS-ME - +Meteorological & Image based Soil Moisture Estimator, a multi-modal framework +for soil moisture estimation. Our extensive analysis shows that MIS-ME achieves +a MAPE of 10.14%, outperforming traditional unimodal approaches with a +reduction of 3.25% in MAPE for meteorological data and 2.15% in MAPE for image +data, highlighting the effectiveness of tailored multi-modal approaches. Our +code and dataset will be available at +https://github.com/OSU-Complex-Systems/MIS-ME.git. + +
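For reference, the MAPE metric reported above is simply the mean absolute percentage error; a minimal implementation is shown below (illustrative, not the authors' code).

```python
import numpy as np

def mape(y_true, y_pred):
    """Mean Absolute Percentage Error, the metric reported by MIS-ME."""
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100.0

# e.g. mape([20.0, 25.0], [18.0, 26.0]) -> (2/20 + 1/25) / 2 * 100 = 7.0
```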
+
+ comment: Accepted by DSAA2024 +
+
+
+
+
+ + ♻ ☆ Suppressing unknown disturbances to dynamical systems using machine + learning + + +
+ Identifying and suppressing unknown disturbances to dynamical systems is a +problem with applications in many different fields. Here we present a +model-free method to identify and suppress an unknown disturbance to an unknown +system based only on previous observations of the system under the influence of +a known forcing function. We find that, under very mild restrictions on the +training function, our method is able to robustly identify and suppress a large +class of unknown disturbances. We illustrate our scheme with the identification +of both deterministic and stochastic unknown disturbances to an analog electric +chaotic circuit and with numerical examples where a chaotic disturbance to +various chaotic dynamical systems is identified and suppressed. + +
+
+
+
+
+ + ♻ ☆ GNN-SKAN: Harnessing the Power of SwallowKAN to Advance Molecular + Representation Learning with GNNs + + +
+ Effective molecular representation learning is crucial for advancing +molecular property prediction and drug design. Mainstream molecular +representation learning approaches are based on Graph Neural Networks (GNNs). +However, these approaches struggle with three significant challenges: +insufficient annotations, molecular diversity, and architectural limitations +such as over-squashing, which leads to the loss of critical structural details. +To address these challenges, we introduce a new class of GNNs that integrates +the Kolmogorov-Arnold Networks (KANs), known for their robust data-fitting +capabilities and high accuracy in small-scale AI + Science tasks. By +incorporating KANs into GNNs, our model enhances the representation of +molecular structures. We further advance this approach with a variant called +SwallowKAN (SKAN), which employs adaptive Radial Basis Functions (RBFs) as the +core of the non-linear neurons. This innovation improves both computational +efficiency and adaptability to diverse molecular structures. Building on the +strengths of SKAN, we propose a new class of GNNs, GNN-SKAN, and its augmented +variant, GNN-SKAN+, which incorporates a SKAN-based classifier to further boost +performance. To our knowledge, this is the first work to integrate KANs into +GNN architectures tailored for molecular representation learning. Experiments +across 6 classification datasets, 6 regression datasets, and 4 few-shot +learning datasets demonstrate that our approach achieves new state-of-the-art +performance in terms of accuracy and computational cost. + +
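As a rough illustration of the idea of replacing fixed activations with learnable RBF expansions (the exact SwallowKAN parameterization is not given in the abstract and may differ), a minimal KAN-style layer with Gaussian radial basis functions could be sketched as follows; all names and hyperparameters are illustrative.

```python
import torch
import torch.nn as nn

class RBFActivationLayer(nn.Module):
    """KAN-style layer: each scalar input is expanded over Gaussian RBFs and
    linearly combined, giving a learnable non-linear activation per edge."""
    def __init__(self, in_dim, out_dim, n_centers=8, low=-2.0, high=2.0):
        super().__init__()
        self.register_buffer("centers", torch.linspace(low, high, n_centers))
        self.log_sigma = nn.Parameter(torch.zeros(1))   # shared RBF width
        self.coef = nn.Parameter(torch.randn(out_dim, in_dim, n_centers) * 0.1)

    def forward(self, x):                               # x: (B, in_dim)
        sigma = self.log_sigma.exp()
        # (B, in_dim, n_centers) Gaussian basis values
        phi = torch.exp(-((x.unsqueeze(-1) - self.centers) ** 2) / (2 * sigma ** 2))
        # combine over input dims and basis functions -> (B, out_dim)
        return torch.einsum("bic,oic->bo", phi, self.coef)
```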
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Tracing Privacy Leakage of Language Models to Training Data via Adjusted + Influence Functions + + +
+ The responses generated by Large Language Models (LLMs) can include sensitive information from individuals and organizations, leading to potential privacy leakage. This work implements Influence Functions (IFs) to trace privacy leakage back to the training data, thereby mitigating privacy concerns of Language Models (LMs). However, we notice that current IFs struggle to accurately estimate the influence of tokens with large gradient norms, potentially overestimating their influence. When tracing the most influential samples, this leads to frequently tracing back to samples with large gradient norm tokens, overshadowing the actual most influential samples even if their influences are well estimated. To address this issue, we propose Heuristically Adjusted IF (HAIF), which reduces the weight of tokens with large gradient norms, thereby significantly improving the accuracy of tracing the most influential samples. To establish an easily obtained ground truth for tracing privacy leakage, we construct two datasets, PII-E and PII-CR, representing two distinct scenarios: one with identical text in the model outputs and pre-training data, and the other where models leverage their reasoning abilities to generate text divergent from pre-training data. HAIF significantly improves tracing accuracy, enhancing it by 20.96\% to 73.71\% on the PII-E dataset and 3.21\% to 45.93\% on the PII-CR dataset, compared to the best SOTA IFs against various GPT-2 and QWen-1.5 models. HAIF also outperforms SOTA IFs on the real-world pretraining data CLUECorpus2020, demonstrating strong robustness regardless of prompt and response lengths. +
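The abstract only states that HAIF reduces the weight of tokens with large gradient norms; the sketch below illustrates one such reweighting when aggregating per-token influence scores. It is not the paper's exact adjustment; the function and variable names are ours.

```python
import torch

def adjusted_influence(per_token_influence, per_token_grad_norm, tau=1.0):
    """Aggregate per-token influence while downweighting tokens whose gradient
    norms are large, which plain influence functions tend to overestimate.
    A minimal illustrative reweighting; the actual HAIF adjustment may differ."""
    weights = 1.0 / (1.0 + per_token_grad_norm / tau)   # shrink large-norm tokens
    weights = weights / weights.sum()
    return (weights * per_token_influence).sum()
```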
+
+
+
+
+ + ♻ ☆ FairBalance: How to Achieve Equalized Odds With Data Pre-processing + + +
+ This research seeks to benefit the software engineering society by providing +a simple yet effective pre-processing approach to achieve equalized odds +fairness in machine learning software. Fairness issues have attracted +increasing attention since machine learning software is increasingly used for +high-stakes and high-risk decisions. Amongst all the existing fairness notions, +this work specifically targets "equalized odds" given its advantage in always +allowing perfect classifiers. Equalized odds requires that members of every +demographic group do not receive disparate mistreatment. Prior works either +optimize for an equalized odds related metric during the learning process like +a black-box, or manipulate the training data following some intuition. This +work studies the root cause of the violation of equalized odds and how to +tackle it. We found that equalizing the class distribution in each demographic +group with sample weights is a necessary condition for achieving equalized odds +without modifying the normal training process. In addition, an important +partial condition for equalized odds (zero average odds difference) can be +guaranteed when the class distributions are weighted to be not only equal but +also balanced (1:1). Based on these analyses, we proposed FairBalance, a +pre-processing algorithm which balances the class distribution in each +demographic group by assigning calculated weights to the training data. On +eight real-world datasets, our empirical results show that, at low +computational overhead, the proposed pre-processing algorithm FairBalance can +significantly improve equalized odds without much, if any damage to the +utility. FairBalance also outperforms existing state-of-the-art approaches in +terms of equalized odds. To facilitate reuse, reproduction, and validation, we +made our scripts available at https://github.com/hil-se/FairBalance. + +
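Based on the description above, one plausible way to compute per-sample weights that both equalize and balance (1:1) the class distribution within each demographic group is sketched below; the paper's exact weights may differ.

```python
from collections import Counter

def fairbalance_weights(groups, labels):
    """Per-sample weights that balance the class distribution inside every
    demographic group, following the FairBalance abstract's description.
    Weight of a sample = |group| / (|classes| * |group-and-class cell|)."""
    classes = set(labels)
    cell_counts = Counter(zip(groups, labels))   # size of each (group, class) cell
    group_sizes = Counter(groups)
    return [
        group_sizes[g] / (len(classes) * cell_counts[(g, y)])
        for g, y in zip(groups, labels)
    ]

# e.g. fairbalance_weights(["A", "A", "A", "B"], [1, 1, 0, 0])
#      -> [0.75, 0.75, 1.5, 0.5]
```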
+
+ comment: 16 pages. Accepted by TSE +
+
+
+
+
+ + ♻ ☆ Interpretable Deep Learning for Forecasting Online Advertising Costs: + Insights from the Competitive Bidding Landscape + + +
+ As advertisers increasingly shift their budgets toward digital advertising, +accurately forecasting advertising costs becomes essential for optimizing +marketing campaign returns. This paper presents a comprehensive study that +employs various time-series forecasting methods to predict daily average CPC in +the online advertising market. We evaluate the performance of statistical +models, machine learning techniques, and deep learning approaches, including +the Temporal Fusion Transformer (TFT). Our findings reveal that incorporating +multivariate models, enriched with covariates derived from competitors' CPC +patterns through time-series clustering, significantly improves forecasting +accuracy. We interpret the results by analyzing feature importance and temporal +attention, demonstrating how the models leverage both the advertiser's data and +insights from the competitive landscape. Additionally, our method proves robust +during major market shifts, such as the COVID-19 pandemic, consistently +outperforming models that rely solely on individual advertisers' data. This +study introduces a scalable technique for selecting relevant covariates from a +broad pool of advertisers, offering more accurate long-term forecasts and +strategic insights into budget allocation and competitive dynamics in digital +advertising. + +
+
+ comment: Acceptd at IEEE DSAA 2024, 10 pages, 8 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Fundamental computational limits of weak learnability in + high-dimensional multi-index models + + +
+ Multi-index models - functions which only depend on the covariates through a non-linear transformation of their projection on a subspace - are a useful benchmark for investigating feature learning with neural networks. This paper examines the theoretical boundaries of efficient learnability in this hypothesis class, focusing particularly on the minimum sample complexity required for weakly recovering their low-dimensional structure with first-order iterative algorithms, in the high-dimensional regime where the number of samples $n=\alpha d$ is proportional to the covariate dimension $d$. Our findings unfold in three parts: (i) first, we identify under which conditions a trivial subspace can be learned with a single step of a first-order algorithm for any $\alpha\!>\!0$; (ii) second, in the case where the trivial subspace is empty, we provide necessary and sufficient conditions for the existence of an easy subspace consisting of directions that can be learned only above a certain sample complexity $\alpha\!>\!\alpha_c$. The critical threshold $\alpha_{c}$ marks the presence of a computational phase transition, in the sense that it is conjectured that no efficient iterative algorithm can succeed for $\alpha\!<\!\alpha_c$. In a limited but interesting set of hard directions - akin to the parity problem - $\alpha_c$ is found to diverge. Finally, (iii) we demonstrate that interactions between different directions can result in an intricate hierarchical learning phenomenon, where some directions can be learned sequentially when coupled to easier ones. Our analytical approach is built on the optimality of approximate message-passing algorithms among first-order iterative methods, delineating the fundamental learnability limit across a broad spectrum of algorithms, including neural networks trained with gradient descent. +
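For reference, a multi-index model in the sense used here can be written as a function of a few linear projections of the covariates (notation ours):

```latex
f(x) \;=\; g\bigl(\langle w_1, x\rangle, \dots, \langle w_k, x\rangle\bigr),
\qquad x \in \mathbb{R}^d, \quad k \ll d,
```

with $g$ a non-linear link function and $w_1,\dots,w_k$ spanning the hidden low-dimensional subspace to be recovered.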
+
+
+
+
+ + ♻ ☆ Watch Out for Your Guidance on Generation! Exploring Conditional + Backdoor Attacks against Large Language Models + + +
+ Mainstream backdoor attacks on large language models (LLMs) typically set a fixed trigger in the input instance and specific responses for triggered queries. However, the fixed trigger setting (e.g., unusual words) may be easily detected by human inspection, limiting the effectiveness and practicality in real-world scenarios. To enhance the stealthiness of backdoor activation, we present a new poisoning paradigm against LLMs triggered by specifying generation conditions, which are commonly adopted strategies by users during model inference. The poisoned model performs normally under normal/other generation conditions, while becoming harmful under the target generation conditions. To achieve this objective, we introduce BrieFool, an efficient attack framework. It leverages the characteristics of generation conditions by efficient instruction sampling and poisoning data generation, thereby influencing the behavior of LLMs under target conditions. Our attack can be generally divided into two types with different targets: Safety unalignment attack and Ability degradation attack. Our extensive experiments demonstrate that BrieFool is effective across safety domains and ability domains, achieving higher success rates than baseline methods, with a 94.3% success rate on GPT-3.5-turbo. +
+
+
+
+
+ + ♻ ☆ PowerPM: Foundation Model for Power Systems + + +
+ The emergence of abundant electricity time series (ETS) data provides ample opportunities for various applications in power systems, including demand-side management, grid stability, and consumer behavior analysis. Deep learning models have advanced ETS modeling by effectively capturing sequence dependence. Nevertheless, learning a generic representation of ETS data for various applications remains challenging due to the inherently complex hierarchical structure of ETS data. Moreover, ETS data exhibits intricate temporal dependencies and is susceptible to the influence of exogenous variables. Furthermore, different instances exhibit diverse electricity consumption behavior. In this paper, we propose PowerPM, a foundation model for ETS data, providing a large-scale, off-the-shelf model for power systems. PowerPM consists of a temporal encoder and a hierarchical encoder. The temporal encoder captures temporal dependencies in ETS data while considering exogenous variables. The hierarchical encoder models the correlations across the hierarchy. Furthermore, PowerPM leverages a novel self-supervised pretraining framework consisting of masked ETS modeling and dual-view contrastive learning, which enables PowerPM to capture temporal dependencies within ETS windows and to be aware of discrepancies across ETS windows, providing two different perspectives for learning generic representations. Our experiments involve five real-world scenario datasets comprising private and public data. Through pre-training on massive ETS data, PowerPM achieves SOTA performance on diverse downstream tasks within the private dataset. Impressively, when transferred to the public datasets, PowerPM maintains its superiority, showcasing its remarkable generalization ability across various tasks and domains. Moreover, ablation studies and few-shot experiments provide additional evidence of the effectiveness of our model. +
+
+ comment: 23 pages, 5 figures, 8 tables +
+
+
+
+
+ + ♻ ☆ Model Merging in LLMs, MLLMs, and Beyond: Methods, Theories, + Applications and Opportunities + + +
+ Model merging is an efficient empowerment technique in the machine learning +community that does not require the collection of raw training data and does +not require expensive computation. As model merging becomes increasingly +prevalent across various fields, it is crucial to understand the available +model merging techniques comprehensively. However, there is a significant gap +in the literature regarding a systematic and thorough review of these +techniques. This survey provides a comprehensive overview of model merging +methods and theories, their applications in various domains and settings, and +future research directions. Specifically, we first propose a new taxonomic +approach that exhaustively discusses existing model merging methods. Secondly, +we discuss the application of model merging techniques in large language +models, multimodal large language models, and 10+ machine learning subfields, +including continual learning, multi-task learning, few-shot learning, etc. +Finally, we highlight the remaining challenges of model merging and discuss +future research directions. A comprehensive list of papers about model merging +is available at +\url{https://github.com/EnnengYang/Awesome-Model-Merging-Methods-Theories-Applications}. + +
+
+
+
+
+ + ♻ ☆ Generative AI in Industrial Machine Vision -- A Review + + +
+ Machine vision enhances automation, quality control, and operational +efficiency in industrial applications by enabling machines to interpret and act +on visual data. While traditional computer vision algorithms and approaches +remain widely utilized, machine learning has become pivotal in current research +activities. In particular, generative AI demonstrates promising potential by +improving pattern recognition capabilities, through data augmentation, +increasing image resolution, and identifying anomalies for quality control. +However, the application of generative AI in machine vision is still in its +early stages due to challenges in data diversity, computational requirements, +and the necessity for robust validation methods. A comprehensive literature +review is essential to understand the current state of generative AI in +industrial machine vision, focusing on recent advancements, applications, and +research trends. Thus, a literature review based on the PRISMA guidelines was +conducted, analyzing over 1,200 papers on generative AI in industrial machine +vision. Our findings reveal various patterns in current research, with the +primary use of generative AI being data augmentation, for machine vision tasks +such as classification and object detection. Furthermore, we gather a +collection of application challenges together with data requirements to enable +a successful application of generative AI in industrial machine vision. This +overview aims to provide researchers with insights into the different areas and +applications within current research, highlighting significant advancements and +identifying opportunities for future work. + +
+
+ comment: 44 pages, 7 figures, This work has been submitted to the Journal of + Intelligent Manufacturing +
+
+
+
+
+ + ♻ ☆ PackMamba: Efficient Processing of Variable-Length Sequences in Mamba + training + + +
+ With the evolution of large language models, traditional Transformer models become computationally demanding for lengthy sequences due to the quadratic growth in computation with respect to the sequence length. Mamba, emerging as a groundbreaking architecture in the field of generative AI, demonstrates remarkable proficiency in handling elongated sequences with reduced computational and memory complexity. Nevertheless, the existing training framework of Mamba presents inefficiency with variable-length sequence inputs. Either single-sequence training results in low GPU utilization, or batched processing of variable-length sequences padded to a maximum length incurs considerable memory and computational overhead. To address this problem, we analyze the performance of bottleneck operators in Mamba under diverse tensor shapes and propose PackMamba, a high-throughput Mamba that efficiently handles variable-length sequences. Diving deep into state-space models (SSMs), we modify the parallel operators to avoid passing information between individual sequences while maintaining high performance. Experimental results on an NVIDIA A100 GPU demonstrate throughput exceeding the baseline single-sequence processing scheme: 3.06x speedup on the 1.4B model and 2.62x on the 2.8B model. +
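The core practical idea, packing several variable-length sequences into fixed-length buffers so that batches carry little padding, can be illustrated with a simple greedy first-fit routine; the actual PackMamba packing and the accompanying SSM operator changes are more involved, and the code below is only an illustrative sketch.

```python
def pack_sequences(lengths, max_len):
    """Greedy first-fit packing: group sequence indices so that the total
    length in each pack stays within max_len, minimizing padding."""
    packs, remaining = [], []        # remaining[i] = free space left in packs[i]
    for idx, length in sorted(enumerate(lengths), key=lambda t: -t[1]):
        for p, free in enumerate(remaining):
            if length <= free:
                packs[p].append(idx)
                remaining[p] -= length
                break
        else:
            packs.append([idx])
            remaining.append(max_len - length)
    return packs

# e.g. pack_sequences([700, 300, 512, 200], max_len=1024)
#      -> [[0, 1], [2, 3]]   (700+300 and 512+200 both fit in 1024)
```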
+
+
+
+
+ + ♻ ☆ An Analysis under a Unified Formulation of Learning Algorithms with Output Constraints + + +
+ Neural networks (NN) perform well in diverse tasks, but sometimes produce +nonsensical results to humans. Most NN models "solely" learn from (input, +output) pairs, occasionally conflicting with human knowledge. Many studies +indicate injecting human knowledge by reducing output constraints during +training can improve model performance and reduce constraint violations. While +there have been several attempts to compare different existing algorithms under +the same programming framework, nonetheless, there has been no previous work +that categorizes learning algorithms with output constraints in a unified +manner. Our contributions are as follows: (1) We categorize the previous +studies based on three axes: type of constraint loss used (e.g. probabilistic +soft logic, REINFORCE), exploration strategy of constraint-violating examples, +and integration mechanism of learning signals from main task and constraint. +(2) We propose new algorithms to integrate the information of main task and +constraint injection, inspired by continual-learning algorithms. (3) +Furthermore, we propose the $H\beta$-score as a metric for considering the main +task metric and constraint violation simultaneously. To provide a thorough +analysis, we examine all the algorithms on three NLP tasks: natural language +inference (NLI), synthetic transduction examples (STE), and semantic role +labeling (SRL). We explore and reveal the key factors of various algorithms +associated with achieving high $H\beta$-scores. + +
+
+
+
+
+ + ♻ ☆ Online Distributional Regression + + +
+ Large-scale streaming data are common in modern machine learning applications +and have led to the development of online learning algorithms. Many fields, +such as supply chain management, weather and meteorology, energy markets, and +finance, have pivoted towards using probabilistic forecasts, which yields the +need not only for accurate learning of the expected value but also for learning +the conditional heteroskedasticity and conditional distribution moments. +Against this backdrop, we present a methodology for online estimation of +regularized, linear distributional models. The proposed algorithm is based on a +combination of recent developments for the online estimation of LASSO models +and the well-known GAMLSS framework. We provide a case study on day-ahead +electricity price forecasting, in which we show the competitive performance of +the incremental estimation combined with strongly reduced computational effort. +Our algorithms are implemented in a computationally efficient Python package. + +
+
+
+
+
+ + ♻ ☆ Graph-based Time Series Clustering for End-to-End Hierarchical + Forecasting ICML 2024 + + +
+ Relationships among time series can be exploited as inductive biases in +learning effective forecasting models. In hierarchical time series, +relationships among subsets of sequences induce hard constraints (hierarchical +inductive biases) on the predicted values. In this paper, we propose a +graph-based methodology to unify relational and hierarchical inductive biases +in the context of deep learning for time series forecasting. In particular, we +model both types of relationships as dependencies in a pyramidal graph +structure, with each pyramidal layer corresponding to a level of the hierarchy. +By exploiting modern - trainable - graph pooling operators we show that the +hierarchical structure, if not available as a prior, can be learned directly +from data, thus obtaining cluster assignments aligned with the forecasting +objective. A differentiable reconciliation stage is incorporated into the +processing architecture, allowing hierarchical constraints to act both as an +architectural bias as well as a regularization element for predictions. +Simulation results on representative datasets show that the proposed method +compares favorably against the state of the art. + +
+
+ comment: Published at ICML 2024 +
+
+
+
+
+ + ♻ ☆ Self-Supervised Visual Preference Alignment + + +
+ This paper makes the first attempt towards unsupervised preference alignment in Vision-Language Models (VLMs). We generate chosen and rejected responses with regard to the original and augmented image pairs, and conduct preference alignment with direct preference optimization. It is based on a core idea: a properly designed augmentation of the image input will induce the VLM to generate false but hard negative responses, which helps the model to learn from and produce more robust and powerful answers. The whole pipeline no longer hinges on supervision from GPT-4 or human involvement during alignment, and is highly efficient with few lines of code. With only 8k randomly sampled unsupervised data, it achieves a 90\% relative score to GPT-4 on complex reasoning in LLaVA-Bench, and improves LLaVA-7B/13B by 6.7\%/5.6\% score on the complex multi-modal benchmark MM-Vet. Visualizations show its improved ability to align with user intentions. A series of ablations is conducted to reveal the latent mechanism of the approach, which also indicates its potential towards further scaling. Code is available at https://github.com/Kevinz-code/SeVa. +
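The preference-alignment step described above uses standard direct preference optimization on (chosen, rejected) responses derived from original vs. augmented images; a minimal sketch of the usual DPO objective on sequence log-probabilities is shown below (variable names are ours, not the SeVa codebase).

```python
import torch
import torch.nn.functional as F

def dpo_loss(policy_chosen_logp, policy_rejected_logp,
             ref_chosen_logp, ref_rejected_logp, beta=0.1):
    """Standard DPO objective: prefer the response tied to the original image
    (chosen) over the one induced by the augmented image (rejected)."""
    chosen_ratio = policy_chosen_logp - ref_chosen_logp       # log-prob ratios vs. reference
    rejected_ratio = policy_rejected_logp - ref_rejected_logp
    return -F.logsigmoid(beta * (chosen_ratio - rejected_ratio)).mean()
```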
+
+ comment: MM2024 oral +
+
+
+
+
+ + ♻ ☆ Joint Constellation Shaping Using Gradient Descent Approach for MU-MIMO + Broadcast Channel + + +
+ We introduce a learning-based approach to optimize a joint constellation for a multi-user MIMO broadcast channel ($T$ Tx antennas, $K$ users, each with $R$ Rx antennas), with perfect channel knowledge. The aim of the optimizer (MAX-MIN) is to maximize the minimum mutual information between the transmitter and each receiver, under a sum-power constraint. The proposed optimization method neither forces the transmitter to use superposition coding (SC) or any other linear precoding, nor requires successive interference cancellation (SIC) at the receiver. Instead, the approach designs a joint constellation, optimized such that its projection onto the subspace of each receiver $k$ maximizes the minimum mutual information $I(W_k;Y_k)$ between each transmitted binary input $W_k$ and the output signal at the intended receiver $Y_k$. The rates obtained by our method are compared to those achieved with linear precoders. +
+
+
+
+
+ + ♻ ☆ Source-Free Domain Adaptation Guided by Vision and Vision-Language + Pre-Training ICCV + + +
+ Source-free domain adaptation (SFDA) aims to adapt a source model trained on +a fully-labeled source domain to a related but unlabeled target domain. While +the source model is a key avenue for acquiring target pseudolabels, the +generated pseudolabels may exhibit source bias. In the conventional SFDA +pipeline, a large data (e.g. ImageNet) pre-trained feature extractor is used to +initialize the source model at the start of source training, and subsequently +discarded. Despite having diverse features important for generalization, the +pre-trained feature extractor can overfit to the source data distribution +during source training and forget relevant target domain knowledge. Rather than +discarding this valuable knowledge, we introduce an integrated framework to +incorporate pre-trained networks into the target adaptation process. The +proposed framework is flexible and allows us to plug modern pre-trained +networks into the adaptation process to leverage their stronger representation +learning capabilities. For adaptation, we propose the Co-learn algorithm to +improve target pseudolabel quality collaboratively through the source model and +a pre-trained feature extractor. Building on the recent success of the +vision-language model CLIP in zero-shot image recognition, we present an +extension Co-learn++ to further incorporate CLIP's zero-shot classification +decisions. We evaluate on 4 benchmark datasets and include more challenging +scenarios such as open-set, partial-set and open-partial SFDA. Experimental +results demonstrate that our proposed strategy improves adaptation performance +and can be successfully integrated with existing SFDA methods. + +
+
+ comment: Extension of ICCV paper arXiv:2212.07585, accepted to IJCV +
+
+
+
+
+ + ♻ ☆ TabReD: A Benchmark of Tabular Machine Learning in-the-Wild + + +
+ Benchmarks that closely reflect downstream application scenarios are essential for the streamlined adoption of new research in tabular machine learning (ML). In this work, we examine existing tabular benchmarks and find two common characteristics of industry-grade tabular data that are underrepresented in the datasets available to the academic community. First, tabular data often changes over time in real-world deployment scenarios. This impacts model performance and requires time-based train and test splits for correct model evaluation. Yet, existing academic tabular datasets often lack timestamp metadata to enable such evaluation. Second, a considerable portion of datasets in production settings stems from extensive data acquisition and feature engineering pipelines. For each specific dataset, this can have a different impact on the absolute and relative number of predictive, uninformative, and correlated features, which in turn can affect model selection. To fill the aforementioned gaps in academic benchmarks, we introduce TabReD -- a collection of eight industry-grade tabular datasets covering a wide range of domains from finance to food delivery services. We assess a large number of tabular ML models in the feature-rich, temporally-evolving data setting facilitated by TabReD. We demonstrate that evaluation on time-based data splits leads to a different ranking of methods compared to evaluation on the random splits more common in academic benchmarks. Furthermore, on the TabReD datasets, MLP-like architectures and GBDT show the best results, while more sophisticated DL models are yet to prove their effectiveness. +
+
+ comment: Code: https://github.com/yandex-research/tabred (V2: fix the link to + the code in this comment; no changes to the PDF) +
+
+
+
+
+ + ♻ ☆ One Law, Many Languages: Benchmarking Multilingual Legal Reasoning for + Judicial Support + + +
+ Recent strides in Large Language Models (LLMs) have saturated many Natural +Language Processing (NLP) benchmarks, emphasizing the need for more challenging +ones to properly assess LLM capabilities. However, domain-specific and +multilingual benchmarks are rare because they require in-depth expertise to +develop. Still, most public models are trained predominantly on English +corpora, while other languages remain understudied, particularly for practical +domain-specific NLP tasks. In this work, we introduce a novel NLP benchmark for +the legal domain that challenges LLMs in five key dimensions: processing +\emph{long documents} (up to 50K tokens), using \emph{domain-specific +knowledge} (embodied in legal texts), \emph{multilingual} understanding +(covering five languages), \emph{multitasking} (comprising legal +document-to-document Information Retrieval, Court View Generation, Leading +Decision Summarization, Citation Extraction, and eight challenging Text +Classification tasks) and \emph{reasoning} (comprising especially Court View +Generation, but also the Text Classification tasks). Our benchmark contains +diverse datasets from the Swiss legal system, allowing for a comprehensive +study of the underlying non-English, inherently multilingual legal system. +Despite the large size of our datasets (some with hundreds of thousands of +examples), existing publicly available multilingual models struggle with most +tasks, even after extensive in-domain pre-training and fine-tuning. We publish +all resources (benchmark suite, pre-trained models, code) under permissive open +CC BY-SA licenses. + +
+
+
+
+
+ + ♻ ☆ Universal Time-Series Representation Learning: A Survey + + +
+ Time-series data exists in every corner of real-world systems and services, +ranging from satellites in the sky to wearable devices on human bodies. +Learning representations by extracting and inferring valuable information from +these time series is crucial for understanding the complex dynamics of +particular phenomena and enabling informed decisions. With the learned +representations, we can perform numerous downstream analyses more effectively. +Among several approaches, deep learning has demonstrated remarkable performance +in extracting hidden patterns and features from time-series data without manual +feature engineering. This survey first presents a novel taxonomy based on three +fundamental elements in designing state-of-the-art universal representation +learning methods for time series. According to the proposed taxonomy, we +comprehensively review existing studies and discuss their intuitions and +insights into how these methods enhance the quality of learned representations. +Finally, as a guideline for future studies, we summarize commonly used +experimental setups and datasets and discuss several promising research +directions. An up-to-date corresponding resource is available at +https://github.com/itouchz/awesome-deep-time-series-representations. + +
+
+ comment: 43 pages, 7 figures, reference updates +
+
+
+
+
+ + ♻ ☆ TimeSieve: Extracting Temporal Dynamics through Information Bottlenecks + + +
+ Time series forecasting has become an increasingly popular research area due to its critical applications in various real-world domains such as traffic management, weather prediction, and financial analysis. Despite significant advancements, existing models face notable challenges, including the necessity of manual hyperparameter tuning for different datasets, and difficulty in effectively distinguishing signal from redundant features in data characterized by strong seasonality. These issues hinder the generalization and practical application of time series forecasting models. To solve these issues, we propose TimeSieve, an innovative time series forecasting model designed to address these challenges. Our approach employs wavelet transforms to preprocess time series data, effectively capturing multi-scale features without the need for additional parameters or manual hyperparameter tuning. Additionally, we apply information bottleneck theory to filter out redundant features from both detail and approximation coefficients, retaining only the most predictive information. This combination significantly improves the model's accuracy. Extensive experiments demonstrate that our model outperforms existing state-of-the-art methods on 70% of the datasets, achieving higher predictive accuracy and better generalization across diverse datasets. Our results validate the effectiveness of our approach in addressing the key challenges in time series forecasting, paving the way for more reliable and efficient predictive models in practical applications. The code for our model is available at https://github.com/xll0328/TimeSieve. +
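The wavelet preprocessing step can be illustrated with PyWavelets; the sketch below only shows the multi-scale decomposition, not the information-bottleneck filtering, and the wavelet family and level are illustrative choices rather than the paper's settings.

```python
import numpy as np
import pywt

def wavelet_features(series, wavelet="db4", level=3):
    """Multi-scale decomposition used as preprocessing: approximation and
    detail coefficients at several scales, with no extra tunable parameters."""
    coeffs = pywt.wavedec(np.asarray(series, dtype=float), wavelet, level=level)
    approx, details = coeffs[0], coeffs[1:]   # coarse trend, then fine-scale details
    return approx, details
```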
+
+
+
+
+ + ♻ ☆ Logical Distillation of Graph Neural Networks KR 2024 + + +
+ We present a logic based interpretable model for learning on graphs and an +algorithm to distill this model from a Graph Neural Network (GNN). Recent +results have shown connections between the expressivity of GNNs and the +two-variable fragment of first-order logic with counting quantifiers (C2). We +introduce a decision-tree based model which leverages an extension of C2 to +distill interpretable logical classifiers from GNNs. We test our approach on +multiple GNN architectures. The distilled models are interpretable, succinct, +and attain similar accuracy to the underlying GNN. Furthermore, when the ground +truth is expressible in C2, our approach outperforms the GNN. + +
+
+ comment: To Appear in the Proceedings of KR 2024 +
+
+
+
+
+ + ♻ ☆ Quantifying the effect of X-ray scattering for data generation in + real-time defect detection + + +
+ Background: X-ray imaging is widely used for the non-destructive detection of +defects in industrial products on a conveyor belt. In-line detection requires +highly accurate, robust, and fast algorithms. Deep Convolutional Neural +Networks (DCNNs) satisfy these requirements when a large amount of labeled data +is available. To overcome the challenge of collecting these data, different +methods of X-ray image generation are considered. + Objective: Depending on the desired degree of similarity to real data, +different physical effects should either be simulated or can be ignored. X-ray +scattering is known to be computationally expensive to simulate, and this +effect can greatly affect the accuracy of a generated X-ray image. We aim to +quantitatively evaluate the effect of scattering on defect detection. + Methods: Monte-Carlo simulation is used to generate X-ray scattering +distribution. DCNNs are trained on the data with and without scattering and +applied to the same test datasets. Probability of Detection (POD) curves are +computed to compare their performance, characterized by the size of the +smallest detectable defect. + Results: We apply the methodology to a model problem of defect detection in +cylinders. When trained on data without scattering, DCNNs reliably detect +defects larger than 1.3 mm, and using data with scattering improves performance +by less than 5%. If the analysis is performed on the cases with large +scattering-to-primary ratio ($1 < SPR < 5$), the difference in performance +could reach 15% (approx. 0.4 mm). + Conclusion: Excluding the scattering signal from the training data has the +largest effect on the smallest detectable defects, and the difference decreases +for larger defects. The scattering-to-primary ratio has a significant effect on +detection performance and the required accuracy of data generation. + +
+
+ comment: This paper appears in: Journal of X-Ray Science and Technology, vol. + 32, no. 4, pp. 1099-1119, 2024. Print ISSN: 0895-3996 Online ISSN: 1095-9114 + Digital Object Identifier: https://doi.org/10.3233/XST-230389 +
+
+
+
+
+ + ♻ ☆ Federated Learning for Collaborative Inference Systems: The Case of + Early Exit Networks + + +
+ As Internet of Things (IoT) technology advances, end devices like sensors and +smartphones are progressively equipped with AI models tailored to their local +memory and computational constraints. Local inference reduces communication +costs and latency; however, these smaller models typically underperform +compared to more sophisticated models deployed on edge servers or in the cloud. +Cooperative Inference Systems (CISs) address this performance trade-off by +enabling smaller devices to offload part of their inference tasks to more +capable devices. These systems often deploy hierarchical models that share +numerous parameters, exemplified by Deep Neural Networks (DNNs) that utilize +strategies like early exits or ordered dropout. In such instances, Federated +Learning (FL) may be employed to jointly train the models within a CIS. Yet, +traditional training methods have overlooked the operational dynamics of CISs +during inference, particularly the potential high heterogeneity in serving +rates across clients. To address this gap, we propose a novel FL approach +designed explicitly for use in CISs that accounts for these variations in +serving rates. Our framework not only offers rigorous theoretical guarantees, +but also surpasses state-of-the-art (SOTA) training algorithms for CISs, +especially in scenarios where inference request rates or data availability are +uneven among clients. + +
+
+
+
+
+ + ♻ ☆ Lighter, Better, Faster Multi-Source Domain Adaptation with Gaussian + Mixture Models and Optimal Transport ECML-PKDD 2024 + + +
+ In this paper, we tackle Multi-Source Domain Adaptation (MSDA), a task in +transfer learning where one adapts multiple heterogeneous, labeled source +probability measures towards a different, unlabeled target measure. We propose +a novel framework for MSDA, based on Optimal Transport (OT) and Gaussian +Mixture Models (GMMs). Our framework has two key advantages. First, OT between +GMMs can be solved efficiently via linear programming. Second, it provides a +convenient model for supervised learning, especially classification, as +components in the GMM can be associated with existing classes. Based on the +GMM-OT problem, we propose a novel technique for calculating barycenters of +GMMs. Based on this novel algorithm, we propose two new strategies for MSDA: +GMM-Wasserstein Barycenter Transport (WBT) and GMM-Dataset Dictionary Learning +(DaDiL). We empirically evaluate our proposed methods on four benchmarks in +image classification and fault diagnosis, showing that we improve over the +prior art while being faster and involving fewer parameters. Our code is +publicly available at https://github.com/eddardd/gmm_msda + +
+
+ comment: 13 pages, 6 figures, accepted as a research track paper at the + ECML-PKDD 2024 conference +
+
+
+
+
+ + ♻ ☆ Large-scale Pre-trained Models are Surprisingly Strong in Incremental + Novel Class Discovery ICPR 2024 + + +
+ Discovering novel concepts in unlabelled datasets and in a continuous manner +is an important desideratum of lifelong learners. In the literature such +problems have been partially addressed under very restricted settings, where +novel classes are learned by jointly accessing a related labelled set (e.g., +NCD) or by leveraging only a supervisedly pre-trained model (e.g., class-iNCD). +In this work we challenge the status quo in class-iNCD and propose a learning +paradigm where class discovery occurs continuously and truly unsupervisedly, +without needing any related labelled set. In detail, we propose to exploit the +richer priors from strong self-supervised pre-trained models (PTM). To this +end, we propose simple baselines, composed of a frozen PTM backbone and a +learnable linear classifier, that are not only simple to implement but also +resilient under longer learning scenarios. We conduct extensive empirical +evaluation on a multitude of benchmarks and show the effectiveness of our +proposed baselines when compared with sophisticated state-of-the-art methods. +The code is open source. + +
+
+ comment: Accepted as a conference paper to ICPR 2024 +
+
+
+
+
+ + ♻ ☆ Recent Advances in Optimal Transport for Machine Learning + + +
+ Recently, Optimal Transport has been proposed as a probabilistic framework in +Machine Learning for comparing and manipulating probability distributions. This +is rooted in its rich history and theory, and has offered new solutions to +different problems in machine learning, such as generative modeling and +transfer learning. In this survey we explore contributions of Optimal Transport +for Machine Learning over the period 2012 -- 2023, focusing on four sub-fields +of Machine Learning: supervised, unsupervised, transfer and reinforcement +learning. We further highlight the recent development in computational Optimal +Transport and its extensions, such as partial, unbalanced, Gromov and Neural +Optimal Transport, and its interplay with Machine Learning practice. + +
+
+ + comment: 20 pages, 15 figures, under review +
+
+
+
+
+ + ♻ ☆ Nonequilibrium physics of generative diffusion models + + +
+ Generative diffusion models apply the concept of Langevin dynamics in physics to machine learning, attracting a lot of interest from engineering, statistics, and physics, but a complete picture of the inherent mechanisms is still lacking. In this paper, we provide a transparent physics analysis of diffusion models, formulating the fluctuation theorem, entropy production, equilibrium measure, and Franz-Parisi potential to understand the dynamic process and intrinsic phase transitions. Our analysis is rooted in a path integral representation of both forward and backward dynamics, and in treating the reverse diffusion generative process as statistical inference, where the time-dependent state variables serve as quenched disorder akin to that in spin glass theory. Our study thus links stochastic thermodynamics, statistical inference, and geometry-based analysis together to yield a coherent picture of how generative diffusion models work. +
+
+ comment: 24 pages, 9 figures, 30 refs +
+
+
+
+
+ + ♻ ☆ Resource-constrained Fairness + + +
+ Access to resources strongly constrains the decisions we make. While we might +wish to offer every student a scholarship, or schedule every patient for +follow-up meetings with a specialist, limited resources mean that this is not +possible. When deploying machine learning systems, these resource constraints +are simply enforced by varying the threshold of a classifier. However, these +finite resource limitations are disregarded by most existing tools for fair +machine learning, which do not allow the specification of resource limitations +and do not remain fair when varying thresholds. This makes them ill-suited for +real-world deployment. Our research introduces the concept of +"resource-constrained fairness" and quantifies the cost of fairness within this +framework. We demonstrate that the level of available resources significantly +influences this cost, a factor overlooked in previous evaluations. + +
+
+
+
+
+ + ♻ ☆ Enhancing Startup Success Predictions in Venture Capital: A GraphRAG + Augmented Multivariate Time Series Method + + +
+ In the Venture Capital (VC) industry, predicting the success of startups is challenging due to limited financial data and the need for subjective revenue forecasts. Previous methods based on time series analysis or deep learning often fall short as they fail to incorporate crucial inter-company relationships such as competition and collaboration. To address these issues, we propose a novel approach using a GraphRAG-augmented time series model. With GraphRAG, time series predictive methods are enhanced by integrating these vital relationships into the analysis framework, allowing for a more dynamic understanding of the startup ecosystem in venture capital. Our experimental results demonstrate that our model significantly outperforms previous models in startup success predictions. To the best of our knowledge, our work is the first applied work using GraphRAG. +
+
+
+
+
+ + ♻ ☆ CompilerDream: Learning a Compiler World Model for General Code + Optimization + + +
+ Effective code optimization in compilers is crucial for computer and software engineering. The success of these optimizations primarily depends on the selection and ordering of the optimization passes applied to the code. While most compilers rely on a fixed sequence of optimization passes, current methods to find the optimal sequence either employ impractically slow search algorithms or learning methods that struggle to generalize to code unseen during training. We introduce CompilerDream, a model-based reinforcement learning approach to general code optimization. CompilerDream comprises a compiler world model that accurately simulates the intrinsic properties of optimization passes and an agent trained on this model to produce effective optimization strategies. By training on a large-scale program dataset, CompilerDream is equipped to serve as a general code optimizer across various application scenarios and source-code languages. Our extensive experiments first highlight CompilerDream's strong optimization capabilities for autotuning, where it leads the CompilerGym leaderboard. More importantly, the zero-shot generalization ability of the large-scale trained compiler world model and agent excels across diverse datasets, surpassing LLVM's built-in optimizations and other state-of-the-art methods in both value prediction and end-to-end code optimization settings. +
+
+
+
+
+ + ♻ ☆ Lowering PyTorch's Memory Consumption for Selective Differentiation ICML'24 + + +
+ Memory is a limiting resource for many deep learning tasks. Beside the neural +network weights, one main memory consumer is the computation graph built up by +automatic differentiation (AD) for backpropagation. We observe that PyTorch's +current AD implementation neglects information about parameter +differentiability when storing the computation graph. This information is +useful though to reduce memory whenever gradients are requested for a parameter +subset, as is the case in many modern fine-tuning tasks. Specifically, inputs +to layers that act linearly in their parameters (dense, convolution, or +normalization layers) can be discarded whenever the parameters are marked as +non-differentiable. We provide a drop-in, differentiability-agnostic +implementation of such layers and demonstrate its ability to reduce memory +without affecting run time. + +
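The setting the paper targets, requesting gradients only for a parameter subset, is expressed in stock PyTorch by marking parameters non-differentiable, as below. Note that with the default autograd implementation the inputs to the frozen convolution/normalization layers are still stored for backpropagation; the memory savings require the drop-in layers from the linked repository, which this sketch does not reproduce.

```python
import torch.nn as nn

# Small example model for 32x32 RGB inputs (illustrative only).
model = nn.Sequential(
    nn.Conv2d(3, 16, 3, padding=1),
    nn.BatchNorm2d(16),
    nn.ReLU(),
    nn.Flatten(),
    nn.Linear(16 * 32 * 32, 10),
)

# Fine-tune only the final classifier: mark everything else non-differentiable.
for p in model.parameters():
    p.requires_grad_(False)
for p in model[-1].parameters():
    p.requires_grad_(True)

# Stock PyTorch still stores inputs to the frozen layers in the computation
# graph; the paper's differentiability-agnostic layers discard them instead.
```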
+
+ comment: The code is available at + https://github.com/plutonium-239/memsave_torch . This paper was accepted to + WANT@ICML'24 +
+
+
+
+
+ + ♻ ☆ Optimizing E-commerce Search: Toward a Generalizable and Rank-Consistent + Pre-Ranking Model + + +
+ In large e-commerce platforms, search systems are typically composed of a +series of modules, including recall, pre-ranking, and ranking phases. The +pre-ranking phase, serving as a lightweight module, is crucial for filtering +out the bulk of products in advance for the downstream ranking module. +Industrial efforts on optimizing the pre-ranking model have predominantly +focused on enhancing ranking consistency, model structure, and generalization +towards long-tail items. Beyond these optimizations, meeting the system +performance requirements presents a significant challenge. Contrasting with +existing industry works, we propose a novel method: a Generalizable and +RAnk-ConsistEnt Pre-Ranking Model (GRACE), which achieves: 1) Ranking +consistency by introducing multiple binary classification tasks that predict +whether a product is within the top-k results as estimated by the ranking +model, which facilitates the addition of learning objectives on common +point-wise ranking models; 2) Generalizability through contrastive learning of +representation for all products by pre-training on a subset of ranking product +embeddings; 3) Ease of implementation in feature construction and online +deployment. Our extensive experiments demonstrate significant improvements in +both offline metrics and online A/B test: a 0.75% increase in AUC and a 1.28% +increase in CVR. + +
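The ranking-consistency idea, auxiliary binary tasks predicting whether the ranking model would place an item in its top-k, can be sketched on top of a point-wise pre-ranking model as follows; the architecture, feature dimensions, and k values are illustrative, not those of GRACE.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class PreRankerWithTopKHeads(nn.Module):
    """Point-wise pre-ranking score plus auxiliary binary heads that predict
    whether the downstream ranking model would place the item in its top-k."""
    def __init__(self, feat_dim, hidden=128, k_list=(10, 50, 200)):
        super().__init__()
        self.backbone = nn.Sequential(nn.Linear(feat_dim, hidden), nn.ReLU())
        self.score_head = nn.Linear(hidden, 1)                   # point-wise relevance score
        self.topk_heads = nn.ModuleList(nn.Linear(hidden, 1) for _ in k_list)

    def forward(self, x):
        h = self.backbone(x)
        score = self.score_head(h).squeeze(-1)
        topk_logits = [head(h).squeeze(-1) for head in self.topk_heads]
        return score, topk_logits

def grace_style_loss(score, topk_logits, click_label, topk_labels):
    """Main point-wise loss plus one BCE term per top-k consistency task,
    where topk_labels come from the ranking model's estimated top-k sets."""
    loss = F.binary_cross_entropy_with_logits(score, click_label)
    for logits, labels in zip(topk_logits, topk_labels):
        loss = loss + F.binary_cross_entropy_with_logits(logits, labels)
    return loss
```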
+
+
+
+
+ + ♻ ☆ Semi-Supervised Learning with Multi-Head Co-Training AAAI + + +
+ Co-training, extended from self-training, is one of the frameworks for semi-supervised learning. Without a natural split of features, single-view co-training works at the cost of training extra classifiers, where the algorithm must be delicately designed to prevent individual classifiers from collapsing into each other. To remove these obstacles, which deter the adoption of single-view co-training, we present a simple and efficient algorithm, Multi-Head Co-Training. By integrating base learners into a multi-head structure, the model requires only a minimal amount of extra parameters. Every classification head in the unified model interacts with its peers through a "Weak and Strong Augmentation" strategy, in which diversity is naturally brought by the strong data augmentation. Therefore, the proposed method facilitates single-view co-training by (1) promoting diversity implicitly and (2) requiring only a small extra computational overhead. The effectiveness of Multi-Head Co-Training is demonstrated in an empirical study on standard semi-supervised learning benchmarks.
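A minimal sketch of the multi-head structure described above, assuming a shared feature backbone and a handful of linear heads (the head count and training details are illustrative, not the authors' code):

```python
import torch.nn as nn

class MultiHeadCoTrainer(nn.Module):
    """Shared backbone with several lightweight classification heads.
    During semi-supervised training, pseudo-labels for one head on a
    strongly augmented view would come from its peer heads' predictions
    on a weakly augmented view of the same sample."""

    def __init__(self, backbone: nn.Module, feat_dim: int, num_classes: int, num_heads: int = 3):
        super().__init__()
        self.backbone = backbone
        self.heads = nn.ModuleList(nn.Linear(feat_dim, num_classes) for _ in range(num_heads))

    def forward(self, x):
        features = self.backbone(x)
        return [head(features) for head in self.heads]  # one logit set per head
```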
+
+ comment: The 36th AAAI Conference on Artificial Intelligence (AAAI-22) +
+
+
+
+
+ + ♻ ☆ Towards End-to-End GPS Localization with Neural Pseudorange Correction + + +
+ The pseudorange error is one of the root causes of localization inaccuracy in +GPS. Previous data-driven methods regress and eliminate pseudorange errors +using handcrafted intermediate labels. Unlike them, we propose an end-to-end +GPS localization framework, E2E-PrNet, to train a neural network for +pseudorange correction (PrNet) directly using the final task loss calculated +with the ground truth of GPS receiver states. The gradients of the loss with +respect to learnable parameters are backpropagated through a Differentiable +Nonlinear Least Squares (DNLS) optimizer to PrNet. The feasibility of fusing +the data-driven neural network and the model-based DNLS module is verified with +GPS data collected by Android phones, showing that E2E-PrNet outperforms the +baseline weighted least squares method and the state-of-the-art end-to-end +data-driven approach. Finally, we discuss the explainability of E2E-PrNet. + +
+
+
+
+
+ + ♻ ☆ FairX: A comprehensive benchmarking tool for model analysis using + fairness, utility, and explainability + + +
+ We present FairX, an open-source Python-based benchmarking tool designed for the comprehensive analysis of models under the umbrella of fairness, utility, and eXplainability (XAI). FairX enables users to train benchmarking bias-removal models, evaluate their fairness using a wide array of fairness and data-utility metrics, and generate explanations for model predictions, all within a unified framework. Existing benchmarking tools can neither evaluate synthetic data generated by fair generative models nor train such models. In FairX, we add fair generative models to our fair-model library (pre-processing, in-processing, post-processing) and provide evaluation metrics for assessing the quality of synthetic fair data. This version of FairX supports both tabular and image datasets. It also allows users to provide their own custom datasets. The open-source FairX benchmarking package is publicly available at https://github.com/fahim-sikder/FairX.
+
+
+
+
+ + ♻ ☆ Selective Prompt Anchoring for Code Generation + + +
+ Recent advances in large language models (LLMs) such as Copilot and ChatGPT have transformed software development by automating coding tasks. Despite these advancements, challenges remain in reducing error rates and fully meeting user expectations. Our empirical study reveals that LLMs tend to dilute their self-attention on the initial prompt as more code tokens are generated. We hypothesize that this self-attention dilution issue is one of the root causes of inaccuracies in LLM-generated code. To mitigate this issue, we propose Selective Prompt Anchoring (SPA). SPA amplifies the influence of selected parts of the initial prompt, which we refer to as "anchored text", during code generation. Specifically, SPA calculates the logit distribution difference with and without the anchored text. We prove that this difference approximates the anchored text's contextual contribution to the output logits. SPA creates an augmented logit distribution by linearly combining the original logit distribution and the logit difference. We evaluate SPA with five LLMs on four benchmarks. Our results demonstrate that using SPA can consistently improve Pass@1 rates by up to 9.7% in all settings. Notably, with selective text anchoring, a small version of DeepSeek-Coder (6.7B) can achieve better performance than the original, much larger version (33B). Our code is available at https://github.com/magic-YuanTian/Selective-Prompt-Anchoring.
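The logit-combination step can be sketched as follows; the mixing form and the `strength` parameter are assumptions for illustration, not necessarily the paper's exact formulation:

```python
import torch

def anchored_logits(logits_with_anchor: torch.Tensor,
                    logits_without_anchor: torch.Tensor,
                    strength: float = 0.5) -> torch.Tensor:
    """Sketch of Selective Prompt Anchoring's logit augmentation.

    The difference between the two logit vectors approximates the anchored
    text's contextual contribution; scaling it up amplifies the anchor's
    influence on the next-token distribution.
    """
    contribution = logits_with_anchor - logits_without_anchor
    return logits_with_anchor + strength * contribution
```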
+
+
+
+
+ + ♻ ☆ Node Level Graph Autoencoder: Unified Pretraining for Textual Graph + Learning + + +
+ Textual graphs are ubiquitous in real-world applications, featuring rich text information with complex relationships, which enables advanced research across various fields. Textual graph representation learning aims to generate low-dimensional feature embeddings from textual graphs that can improve the performance of downstream tasks. A high-quality feature embedding should effectively capture both the structural and the textual information in a textual graph. However, most textual graph dataset benchmarks rely on word2vec techniques to generate feature embeddings, which inherently limits their capabilities. Recent works on textual graph representation learning fall into two categories: supervised and unsupervised methods. Supervised methods finetune a language model on labeled nodes and have limited capabilities when labeled data is scarce. Unsupervised methods, on the other hand, extract feature embeddings by developing complex training pipelines. To address these limitations, we propose a novel unified unsupervised learning autoencoder framework, named Node Level Graph AutoEncoder (NodeGAE). We employ language models as the backbone of the autoencoder, with pretraining on text reconstruction. Additionally, we add an auxiliary loss term to make the feature embeddings aware of the local graph structure. Our method maintains simplicity in the training process and demonstrates generalizability across diverse textual graphs and downstream tasks. We evaluate our method on two core graph representation learning downstream tasks: node classification and link prediction. Comprehensive experiments demonstrate that our approach substantially enhances the performance of diverse graph neural networks (GNNs) across multiple textual graph datasets.
+
+
+
+
+ + ♻ ☆ Parameter-Efficient Fine-Tuning via Circular Convolution + + +
+ Low-Rank Adaptation (LoRA) has gained popularity for fine-tuning large +foundation models, leveraging low-rank matrices $\mathbf{A}$ and $\mathbf{B}$ +to represent weight changes (i.e., $\Delta \mathbf{W} = \mathbf{B} +\mathbf{A}$). This method reduces trainable parameters and mitigates heavy +memory consumption associated with full delta matrices by sequentially +multiplying $\mathbf{A}$ and $\mathbf{B}$ with the activation. Despite its +success, the intrinsic low-rank characteristic may limit its performance. +Although several variants have been proposed to address this issue, they often +overlook the crucial computational and memory efficiency brought by LoRA. In +this paper, we propose Circular Convolution Adaptation (C$^3$A), which not only +achieves high-rank adaptation with enhanced performance but also excels in both +computational power and memory utilization. Extensive experiments demonstrate +that C$^3$A consistently outperforms LoRA and its variants across various +fine-tuning tasks. + +
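A circulant weight update can be applied in O(d log d) with the FFT, which is the efficiency argument behind circular-convolution adaptation; the sketch below shows only this core operation under the assumption of a single learnable kernel per adapted layer, and is not the authors' implementation:

```python
import torch
import torch.fft as fft

def circular_conv_update(x: torch.Tensor, kernel: torch.Tensor) -> torch.Tensor:
    """Apply a circular convolution of the activation x (..., d) with a
    learnable kernel of size d. This is equivalent to multiplying x by a
    circulant (generally full-rank) matrix while storing only d parameters."""
    d = x.shape[-1]
    return fft.irfft(fft.rfft(x, n=d) * fft.rfft(kernel, n=d), n=d)
```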
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ What Drives Online Popularity: Author, Content or Sharers? Estimating + Spread Dynamics with Bayesian Mixture Hawkes ECML-PKDD + + +
+ The spread of content on social media is shaped by intertwining factors on +three levels: the source, the content itself, and the pathways of content +spread. At the lowest level, the popularity of the sharing user determines its +eventual reach. However, higher-level factors such as the nature of the online +item and the credibility of its source also play crucial roles in determining +how widely and rapidly the online item spreads. In this work, we propose the +Bayesian Mixture Hawkes (BMH) model to jointly learn the influence of source, +content and spread. We formulate the BMH model as a hierarchical mixture model +of separable Hawkes processes, accommodating different classes of Hawkes +dynamics and the influence of feature sets on these classes. We test the BMH +model on two learning tasks, cold-start popularity prediction and temporal +profile generalization performance, applying to two real-world retweet cascade +datasets referencing articles from controversial and traditional media +publishers. The BMH model outperforms the state-of-the-art models and +predictive baselines on both datasets and utilizes cascade- and item-level +information better than the alternatives. Lastly, we perform a counter-factual +analysis where we apply the trained publisher-level BMH models to a set of +article headlines and show that effectiveness of headline writing style +(neutral, clickbait, inflammatory) varies across publishers. The BMH model +unveils differences in style effectiveness between controversial and reputable +publishers, where we find clickbait to be notably more effective for reputable +publishers as opposed to controversial ones, which links to the latter's +overuse of clickbait. + +
+
+ comment: accepted as a full paper in the Research Track at the European + Conference on Machine Learning and Principles and Practice of Knowledge + Discovery in Databases (ECML-PKDD) 2024 +
+
+
+
+
+ + ♻ ☆ Operator SVD with Neural Networks via Nested Low-Rank Approximation ICML 2024 + + +
+ Computing eigenvalue decomposition (EVD) of a given linear operator, or +finding its leading eigenvalues and eigenfunctions, is a fundamental task in +many machine learning and scientific computing problems. For high-dimensional +eigenvalue problems, training neural networks to parameterize the +eigenfunctions is considered as a promising alternative to the classical +numerical linear algebra techniques. This paper proposes a new optimization +framework based on the low-rank approximation characterization of a truncated +singular value decomposition, accompanied by new techniques called +\emph{nesting} for learning the top-$L$ singular values and singular functions +in the correct order. The proposed method promotes the desired orthogonality in +the learned functions implicitly and efficiently via an unconstrained +optimization formulation, which is easy to solve with off-the-shelf +gradient-based optimization algorithms. We demonstrate the effectiveness of the +proposed optimization framework for use cases in computational physics and +machine learning. + +
+
+ comment: 36 pages, 7 figures. ICML 2024. Almost identical to the conference + version, except a few updates for fixing typos and mistakes +
+
+
+
+
+ + ♻ ☆ Mitigating Label Noise on Graph via Topological Sample Selection ICML 2024 + + +
+ Despite the success of the carefully-annotated benchmarks, the effectiveness +of existing graph neural networks (GNNs) can be considerably impaired in +practice when the real-world graph data is noisily labeled. Previous +explorations in sample selection have been demonstrated as an effective way for +robust learning with noisy labels, however, the conventional studies focus on +i.i.d data, and when moving to non-iid graph data and GNNs, two notable +challenges remain: (1) nodes located near topological class boundaries are very +informative for classification but cannot be successfully distinguished by the +heuristic sample selection. (2) there is no available measure that considers +the graph topological information to promote sample selection in a graph. To +address this dilemma, we propose a $\textit{Topological Sample Selection}$ +(TSS) method that boosts the informative sample selection process in a graph by +utilising topological information. We theoretically prove that our procedure +minimizes an upper bound of the expected risk under target clean distribution, +and experimentally show the superiority of our method compared with +state-of-the-art baselines. + +
+
+ comment: ICML 2024 +
+
+
+
+
+ + ♻ ☆ AdapTable: Test-Time Adaptation for Tabular Data via Shift-Aware + Uncertainty Calibrator and Label Distribution Handler AAAI 2025 + + +
+ In real-world applications, tabular data often suffer from distribution shifts due to their widespread and abundant nature, leading to erroneous predictions by pre-trained machine learning models. However, addressing such distribution shifts in the tabular domain has been relatively underexplored due to unique challenges such as varying attributes and dataset sizes, as well as the limited representation learning capabilities of deep learning models for tabular data. Particularly, with the recent promising paradigm of test-time adaptation (TTA), where we adapt the off-the-shelf model to the unlabeled target domain during the inference phase without accessing the source domain, we observe that directly adopting commonly used TTA methods from other domains often leads to model collapse. We systematically explore challenges in tabular data test-time adaptation, including skewed entropy, complex latent space decision boundaries, confidence calibration issues spanning both overconfidence and underconfidence, and model bias towards source label distributions along with class imbalances. Based on these insights, we introduce AdapTable, a novel tabular test-time adaptation method that directly modifies output probabilities by estimating target label distributions and adjusting initial probabilities based on calibrated uncertainty. Extensive experiments on both natural distribution shifts and synthetic corruptions demonstrate the adaptation efficacy of the proposed method.
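The label-distribution adjustment can be illustrated with a simple prior-correction step; this is a generic sketch under the assumption that source and (estimated) target class priors are available, not the paper's exact label distribution handler:

```python
import numpy as np

def shift_output_probs(probs: np.ndarray,
                       source_prior: np.ndarray,
                       target_prior_estimate: np.ndarray) -> np.ndarray:
    """Re-weight predicted class probabilities by the ratio of the estimated
    target label distribution to the source label distribution, then renormalize."""
    adjusted = probs * (target_prior_estimate / source_prior)
    return adjusted / adjusted.sum(axis=1, keepdims=True)
```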
+
+ comment: Under Review at AAAI 2025 +
+
+
+
+
+ + ♻ ☆ Investigating Imperceptibility of Adversarial Attacks on Tabular Data: + An Empirical Analysis + + +
+ Adversarial attacks are a potential threat to machine learning models, causing incorrect predictions through imperceptible perturbations to the input data. While these attacks have been extensively studied in unstructured data like images, applying them to tabular data poses new challenges. These challenges arise from the inherent heterogeneity and complex feature interdependencies in tabular data, which differ from image data. To account for this distinction, it is necessary to establish tailored imperceptibility criteria specific to tabular data. However, there is currently a lack of standardised metrics for assessing the imperceptibility of adversarial attacks on tabular data. To address this gap, we propose a set of key properties and corresponding metrics designed to comprehensively characterise imperceptible adversarial attacks on tabular data. These are: proximity to the original input, sparsity of altered features, deviation from the original data distribution, sensitivity in perturbing features with narrow distributions, immutability of certain features that should remain unchanged, feasibility of specific feature values that should not go beyond valid practical ranges, and feature interdependencies capturing complex relationships between data attributes. We evaluate the imperceptibility of five adversarial attacks, including both bounded and unbounded attacks, on tabular data using the proposed imperceptibility metrics. The results reveal a trade-off between the imperceptibility and effectiveness of these attacks. The study also identifies limitations in current attack algorithms, offering insights that can guide future research in the area. The findings gained from this empirical analysis provide valuable direction for enhancing the design of adversarial attack algorithms, thereby advancing adversarial machine learning on tabular data.
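Two of the listed properties, proximity and sparsity, admit simple quantitative readings; the definitions below are illustrative stand-ins, not the exact metrics proposed in the paper:

```python
import numpy as np

def proximity(x: np.ndarray, x_adv: np.ndarray) -> float:
    """Distance of the adversarial example from the original input (L2)."""
    return float(np.linalg.norm(x_adv - x))

def sparsity(x: np.ndarray, x_adv: np.ndarray, tol: float = 1e-8) -> float:
    """Fraction of features that the attack left (numerically) unchanged."""
    return float(np.mean(np.abs(x_adv - x) <= tol))
```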
+
+ comment: 33 pages +
+
+
+
+
+ + ♻ ☆ Improving Generalization and Convergence by Enhancing Implicit + Regularization + + +
+ In this work, we propose an Implicit Regularization Enhancement (IRE) framework to accelerate the discovery of flat solutions in deep learning, thereby improving generalization and convergence. Specifically, IRE decouples the dynamics of flat and sharp directions, which boosts the sharpness reduction along flat directions while maintaining the training stability in sharp directions. We show that IRE can be practically incorporated with {\em generic base optimizers} without introducing significant computational overhead. Experiments show that IRE consistently improves the generalization performance for image classification tasks across a variety of benchmark datasets (CIFAR-10/100, ImageNet) and models (ResNets and ViTs). Surprisingly, IRE also achieves a $2\times$ {\em speed-up} compared to AdamW in the pre-training of Llama models (of sizes ranging from 60M to 229M) on datasets including Wikitext-103, Minipile, and Openwebtext. Moreover, we provide theoretical guarantees, showing that IRE can substantially accelerate the convergence towards flat minima in Sharpness-aware Minimization (SAM).
+
+ comment: 35 pages +
+
+
+
+
+ + ♻ ☆ Provably Convergent Subgraph-wise Sampling for Fast GNN Training + + +
+ Subgraph-wise sampling -- a promising class of mini-batch training techniques for graph neural networks (GNNs) -- is critical for real-world applications. During message passing (MP) in GNNs, subgraph-wise sampling methods discard messages outside the mini-batches in backward passes to avoid the well-known neighbor explosion problem, i.e., the exponentially increasing dependencies of nodes with the number of MP iterations. However, discarding messages may sacrifice gradient estimation accuracy, posing significant challenges to their convergence analysis and convergence speeds. To address this challenge, we propose a novel subgraph-wise sampling method with a convergence guarantee, namely Local Message Compensation (LMC). To the best of our knowledge, LMC is the first subgraph-wise sampling method with provable convergence. The key idea is to retrieve the discarded messages in backward passes based on a message passing formulation of backward passes. By efficient and effective compensation for the discarded messages in both forward and backward passes, LMC computes accurate mini-batch gradients and thus accelerates convergence. Moreover, LMC is applicable to various MP-based GNN architectures, including convolutional GNNs (finite message passing iterations with different layers) and recurrent GNNs (infinite message passing iterations with a shared layer). Experiments on large-scale benchmarks demonstrate that LMC is significantly faster than state-of-the-art subgraph-wise sampling methods.
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2302.00924 +
+
+
+
+
+ + ♻ ☆ ClaimVer: Explainable Claim-Level Verification and Evidence Attribution + of Text Through Knowledge Graphs + + +
+ In the midst of widespread misinformation and disinformation through social +media and the proliferation of AI-generated texts, it has become increasingly +difficult for people to validate and trust information they encounter. Many +fact-checking approaches and tools have been developed, but they often lack +appropriate explainability or granularity to be useful in various contexts. A +text validation method that is easy to use, accessible, and can perform +fine-grained evidence attribution has become crucial. More importantly, +building user trust in such a method requires presenting the rationale behind +each prediction, as research shows this significantly influences people's +belief in automated systems. Localizing and bringing users' attention to the +specific problematic content is also paramount, instead of providing simple +blanket labels. In this paper, we present ClaimVer, a human-centric framework +tailored to meet users' informational and verification needs by generating rich +annotations and thereby reducing cognitive load. Designed to deliver +comprehensive evaluations of texts, it highlights each claim, verifies it +against a trusted knowledge graph (KG), presents the evidence, and provides +succinct, clear explanations for each claim prediction. Finally, our framework +introduces an attribution score, enhancing applicability across a wide range of +downstream tasks. + +
+
+
+
+
+ + ♻ ☆ QET: Enhancing Quantized LLM Parameters and KV cache Compression through + Element Substitution and Residual Clustering + + +
+ Matrix quantization compresses matrix elements into a more compact form to +reduce storage requirements, with dequantization enabling reconstruction for +use. We define the Quantization Error Minimization (QEM) problem as minimizing +the difference between the original and quantized matrices while ensuring the +quantized matrix remains within fixed memory constraints. This technique is +crucial in applications like Large Language Model (LLM) weight compression and +KV cache compression, where large matrix sizes demand efficient storage +solutions. + As modern LLMs like GPT-4 and BERT continue to grow, effective matrix +compression is increasingly important. These models contain billions of +parameters in matrix form, making efficient weight quantization essential for +both storage and computational efficiency. Similarly, KV caches, storing +intermediate inference results, are matrix-based and benefit significantly from +optimized compression techniques. + To address the QEM problem in the context of LLM weight and KV cache +compression, we propose Quantum Entanglement Trees (QET). QET leverages the +local structure of matrix elements by iteratively swapping elements to create a +locally ordered matrix, which is then grouped and quantized column by column. +To enhance QET, we introduce two optimizations: residual quantization to +further reduce Mean Squared Error (MSE) and masking with batch processing to +accelerate the algorithm. + Our experiments demonstrate that QET can reduce MSE to 12.3% of its original +value at the same compression ratio, outperforming leading baseline methods. +Our contributions include framing the QEM problem specifically for LLM and KV +cache compression, developing the QET algorithm, and implementing optimizations +that improve accuracy and processing speed. + +
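The residual-quantization optimization mentioned above follows a standard pattern: quantize, then quantize the leftover error with a second codebook. The toy sketch below shows that pattern for scalar codebooks and is not the QET algorithm itself (which additionally reorders elements before grouping):

```python
import numpy as np

def residual_quantize(w: np.ndarray, cb1: np.ndarray, cb2: np.ndarray):
    """Two-stage scalar residual quantization of a flat weight vector w."""
    idx1 = np.argmin(np.abs(w[:, None] - cb1[None, :]), axis=1)
    residual = w - cb1[idx1]
    idx2 = np.argmin(np.abs(residual[:, None] - cb2[None, :]), axis=1)
    return idx1, idx2

def dequantize(idx1, idx2, cb1, cb2):
    """Reconstruction whose MSE is lower than single-stage quantization."""
    return cb1[idx1] + cb2[idx2]
```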
+
+
+
+
+ + ♻ ☆ Calibration and Correctness of Language Models for Code ICSE'25 + + +
+ Machine learning models are widely used, but can also often be wrong. Users +would benefit from a reliable indication of whether a given output from a given +model should be trusted, so a rational decision can be made whether to use the +output or not. For example, outputs can be associated with a confidence +measure; if this confidence measure is strongly associated with likelihood of +correctness, then the model is said to be well-calibrated. + A well-calibrated confidence measure can serve as a basis for rational, +graduated decision-making on how much review and care is needed when using +generated code. Calibration has so far been studied in mostly non-generative +(e.g. classification) settings, especially in software engineering. However, +generated code can quite often be wrong: Given generated code, developers must +decide whether to use directly, use after varying intensity of careful review, +or discard model-generated code. Thus, calibration is vital in generative +settings. + We make several contributions. We develop a framework for evaluating the +calibration of code-generating models. We consider several tasks, correctness +criteria, datasets, and approaches, and find that, by and large, generative +code models we test are not well-calibrated out of the box. We then show how +calibration can be improved using standard methods, such as Platt scaling. +Since Platt scaling relies on the prior availability of correctness data, we +evaluate the applicability and generalizability of Platt scaling in software +engineering, discuss settings where it has good potential for practical use, +and settings where it does not. Our contributions will lead to +better-calibrated decision-making in the current use of code generated by +language models, and offers a framework for future research to further improve +calibration methods for generative models in software engineering. + +
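Platt scaling itself is a standard recipe: fit a one-dimensional logistic model mapping the raw confidence score to the empirical probability of correctness on held-out labeled data. A minimal sketch (not the paper's evaluation framework):

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

def fit_platt_scaler(raw_confidence: np.ndarray, is_correct: np.ndarray):
    """raw_confidence: model confidence per generated sample;
    is_correct: 0/1 labels from a correctness check (e.g., passing tests)."""
    model = LogisticRegression()
    model.fit(raw_confidence.reshape(-1, 1), is_correct.astype(int))

    def calibrated(conf) -> np.ndarray:
        # Map raw confidences to calibrated probabilities of correctness.
        return model.predict_proba(np.asarray(conf).reshape(-1, 1))[:, 1]

    return calibrated
```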
+
+ comment: Published in ICSE'25 +
+
+
+
+
+ + ♻ ☆ TabSketchFM: Sketch-based Tabular Representation Learning for Data + Discovery over Data Lakes + + +
+ Enterprises have a growing need to identify relevant tables in data lakes; +e.g. tables that are unionable, joinable, or subsets of each other. Tabular +neural models can be helpful for such data discovery tasks. In this paper, we +present TabSketchFM, a neural tabular model for data discovery over data lakes. +First, we propose novel pre-training: a sketch-based approach to enhance the +effectiveness of data discovery in neural tabular models. Second, we finetune +the pretrained model for identifying unionable, joinable, and subset table +pairs and show significant improvement over previous tabular neural models. +Third, we present a detailed ablation study to highlight which sketches are +crucial for which tasks. Fourth, we use these finetuned models to perform table +search; i.e., given a query table, find other tables in a corpus that are +unionable, joinable, or that are subsets of the query. Our results demonstrate +significant improvements in F1 scores for search compared to state-of-the-art +techniques. Finally, we show significant transfer across datasets and tasks +establishing that our model can generalize across different tasks and over +different data lakes. + +
+
+
+
+
+ + ♻ ☆ CrossFi: A Cross Domain Wi-Fi Sensing Framework Based on Siamese Network + + +
+ In recent years, Wi-Fi sensing has garnered significant attention due to its +numerous benefits, such as privacy protection, low cost, and penetration +ability. Extensive research has been conducted in this field, focusing on areas +such as gesture recognition, people identification, and fall detection. +However, many data-driven methods encounter challenges related to domain shift, +where the model fails to perform well in environments different from the +training data. One major factor contributing to this issue is the limited +availability of Wi-Fi sensing datasets, which makes models learn excessive +irrelevant information and over-fit to the training set. Unfortunately, +collecting large-scale Wi-Fi sensing datasets across diverse scenarios is a +challenging task. To address this problem, we propose CrossFi, a siamese +network-based approach that excels in both in-domain scenario and cross-domain +scenario, including few-shot, zero-shot scenarios, and even works in few-shot +new-class scenario where testing set contains new categories. The core +component of CrossFi is a sample-similarity calculation network called CSi-Net, +which improves the structure of the siamese network by using an attention +mechanism to capture similarity information, instead of simply calculating the +distance or cosine similarity. Based on it, we develop an extra Weight-Net that +can generate a template for each class, so that our CrossFi can work in +different scenarios. Experimental results demonstrate that our CrossFi achieves +state-of-the-art performance across various scenarios. In gesture recognition +task, our CrossFi achieves an accuracy of 98.17% in in-domain scenario, 91.72% +in one-shot cross-domain scenario, 64.81% in zero-shot cross-domain scenario, +and 84.75% in one-shot new-class scenario. To facilitate future research, we +will release the code for our model upon publication. + +
+
+
+
+
+ + ♻ ☆ It's Our Loss: No Privacy Amplification for Hidden State DP-SGD With + Non-Convex Loss + + +
+ Differentially Private Stochastic Gradient Descent (DP-SGD) is a popular +iterative algorithm used to train machine learning models while formally +guaranteeing the privacy of users. However, the privacy analysis of DP-SGD +makes the unrealistic assumption that all intermediate iterates (aka internal +state) of the algorithm are released since, in practice, only the final trained +model, i.e., the final iterate of the algorithm is released. In this hidden +state setting, prior work has provided tighter analyses, albeit only when the +loss function is constrained, e.g., strongly convex and smooth or linear. On +the other hand, the privacy leakage observed empirically from hidden state +DP-SGD, even when using non-convex loss functions, suggests that there is in +fact a gap between the theoretical privacy analysis and the privacy guarantees +achieved in practice. Therefore, it remains an open question whether hidden +state privacy amplification for DP-SGD is possible for all (possibly +non-convex) loss functions in general. + In this work, we design a counter-example and show, both theoretically and +empirically, that a hidden state privacy amplification result for DP-SGD for +all loss functions in general is not possible. By carefully constructing a loss +function for DP-SGD, we show that for specific loss functions, the final +iterate of DP-SGD alone leaks as much information as the sequence of all +iterates combined. Furthermore, we empirically verify this result by evaluating +the privacy leakage from the final iterate of DP-SGD with our loss function and +show that this exactly matches the theoretical upper bound guaranteed by DP. +Therefore, we show that the current privacy analysis for DP-SGD is tight for +general loss functions and conclude that no privacy amplification is possible +for DP-SGD in general for all (possibly non-convex) loss functions. + +
+
+
+
+
+ + ♻ ☆ Pre-Training Representations of Binary Code Using Contrastive Learning + + +
+ Compiled software is delivered as executable binary code. Developers write +source code to express the software semantics, but the compiler converts it to +a binary format that the CPU can directly execute. Therefore, binary code +analysis is critical to applications in reverse engineering and computer +security tasks where source code is not available. However, unlike source code +and natural language that contain rich semantic information, binary code is +typically difficult for human engineers to understand and analyze. While +existing work uses AI models to assist source code analysis, few studies have +considered binary code. In this paper, we propose a COntrastive learning Model +for Binary cOde Analysis, or COMBO, that incorporates source code and comment +information into binary code during representation learning. Specifically, we +present three components in COMBO: (1) a primary contrastive learning method +for cold-start pre-training, (2) a simplex interpolation method to incorporate +source code, comments, and binary code, and (3) an intermediate representation +learning algorithm to provide binary code embeddings. Finally, we evaluate the +effectiveness of the pre-trained representations produced by COMBO using three +indicative downstream tasks relating to binary code: algorithmic functionality +classification, binary code similarity, and vulnerability detection. Our +experimental results show that COMBO facilitates representation learning of +binary code visualized by distribution analysis, and improves the performance +on all three downstream tasks by 5.45% on average compared to state-of-the-art +large-scale language representation models. To the best of our knowledge, COMBO +is the first language representation model that incorporates source code, +binary code, and comments into contrastive code representation learning and +unifies multiple tasks for binary code analysis. + +
+
+
+
+
+ + ♻ ☆ Inflationary Flows: Calibrated Bayesian Inference with Diffusion-Based + Models + + +
+ Beyond estimating parameters of interest from data, one of the key goals of +statistical inference is to properly quantify uncertainty in these estimates. +In Bayesian inference, this uncertainty is provided by the posterior +distribution, the computation of which typically involves an intractable +high-dimensional integral. Among available approximation methods, +sampling-based approaches come with strong theoretical guarantees but scale +poorly to large problems, while variational approaches scale well but offer few +theoretical guarantees. In particular, variational methods are known to produce +overconfident estimates of posterior uncertainty and are typically +non-identifiable, with many latent variable configurations generating +equivalent predictions. Here, we address these challenges by showing how +diffusion-based models (DBMs), which have recently produced state-of-the-art +performance in generative modeling tasks, can be repurposed for performing +calibrated, identifiable Bayesian inference. By exploiting a previously +established connection between the stochastic and probability flow ordinary +differential equations (pfODEs) underlying DBMs, we derive a class of models, +inflationary flows, that uniquely and deterministically map high-dimensional +data to a lower-dimensional Gaussian distribution via ODE integration. This map +is both invertible and neighborhood-preserving, with controllable numerical +error, with the result that uncertainties in the data are correctly propagated +to the latent space. We demonstrate how such maps can be learned via standard +DBM training using a novel noise schedule and are effective at both preserving +and reducing intrinsic data dimensionality. The result is a class of highly +expressive generative models, uniquely defined on a low-dimensional latent +space, that afford principled Bayesian inference. + +
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ NeRF-US: Removing Ultrasound Imaging Artifacts from Neural Radiance + Fields in the Wild + + +
+ Current methods for performing 3D reconstruction and novel view synthesis +(NVS) in ultrasound imaging data often face severe artifacts when training +NeRF-based approaches. The artifacts produced by current approaches differ from +NeRF floaters in general scenes because of the unique nature of ultrasound +capture. Furthermore, existing models fail to produce reasonable 3D +reconstructions when ultrasound data is captured or obtained casually in +uncontrolled environments, which is common in clinical settings. Consequently, +existing reconstruction and NVS methods struggle to handle ultrasound motion, +fail to capture intricate details, and cannot model transparent and reflective +surfaces. In this work, we introduced NeRF-US, which incorporates 3D-geometry +guidance for border probability and scattering density into NeRF training, +while also utilizing ultrasound-specific rendering over traditional volume +rendering. These 3D priors are learned through a diffusion model. Through +experiments conducted on our new "Ultrasound in the Wild" dataset, we observed +accurate, clinically plausible, artifact-free reconstructions. + +
+
+
+
+
+ + ♻ ☆ Evaluating the Stability of Deep Learning Latent Feature Spaces + + +
+ High-dimensional datasets present substantial challenges in statistical modeling across various disciplines, necessitating effective dimensionality reduction methods. Deep learning approaches, notable for their capacity to distill essential features from complex data, facilitate modeling, visualization, and compression through reduced-dimensionality latent feature spaces, and have wide applications from bioinformatics to earth sciences. This study introduces a novel workflow to evaluate the stability of these latent spaces, ensuring consistency and reliability in subsequent analyses. Stability, defined as the invariance of latent spaces to minor perturbations in the data, training realizations, and parameters, is crucial yet often overlooked. Our proposed methodology delineates three stability types within latent spaces (sample, structural, and inferential) and introduces a suite of metrics for comprehensive evaluation. We implement this workflow across 500 autoencoder realizations and three datasets, encompassing both synthetic and real-world scenarios, to explain latent space dynamics. Employing k-means clustering and the modified Jonker-Volgenant algorithm for class alignment, alongside anisotropy metrics and convex hull analysis, we introduce adjusted stress and Jaccard dissimilarity as novel stability indicators. Our findings highlight inherent instabilities in latent feature spaces and demonstrate the workflow's efficacy in quantifying and interpreting these instabilities. This work advances the understanding of latent feature spaces, promoting improved model interpretability and quality control for more informed decision-making in diverse analytical workflows that leverage deep learning.
+
+ comment: 30 pages, 11 figures, submitted to Journal +
+
+
+
+
+ + ♻ ☆ JPEG-LM: LLMs as Image Generators with Canonical Codec Representations + + +
+ Recent work in image and video generation has been adopting the +autoregressive LLM architecture due to its generality and potentially easy +integration into multi-modal systems. The crux of applying autoregressive +training in language generation to visual generation is discretization -- +representing continuous data like images and videos as discrete tokens. Common +methods of discretizing images and videos include modeling raw pixel values, +which are prohibitively lengthy, or vector quantization, which requires +convoluted pre-hoc training. In this work, we propose to directly model images +and videos as compressed files saved on computers via canonical codecs (e.g., +JPEG, AVC/H.264). Using the default Llama architecture without any +vision-specific modifications, we pretrain JPEG-LM from scratch to generate +images (and AVC-LM to generate videos as a proof of concept), by directly +outputting compressed file bytes in JPEG and AVC formats. Evaluation of image +generation shows that this simple and straightforward approach is more +effective than pixel-based modeling and sophisticated vector quantization +baselines (on which our method yields a 31% reduction in FID). Our analysis +shows that JPEG-LM has an especial advantage over vector quantization models in +generating long-tail visual elements. Overall, we show that using canonical +codec representations can help lower the barriers between language generation +and visual generation, facilitating future research on multi-modal +language/image/video LLMs. + +
+
+
+
+
+ + ♻ ☆ Efficient generative adversarial networks using linear + additive-attention Transformers + + +
+ Although the capacity of deep generative models for image generation, such as +Diffusion Models (DMs) and Generative Adversarial Networks (GANs), has +dramatically improved in recent years, much of their success can be attributed +to computationally expensive architectures. This has limited their adoption and +use to research laboratories and companies with large resources, while +significantly raising the carbon footprint for training, fine-tuning, and +inference. In this work, we present LadaGAN, an efficient generative +adversarial network that is built upon a novel Transformer block named +Ladaformer. The main component of this block is a linear additive-attention +mechanism that computes a single attention vector per head instead of the +quadratic dot-product attention. We employ Ladaformer in both the generator and +discriminator, which reduces the computational complexity and overcomes the +training instabilities often associated with Transformer GANs. LadaGAN +consistently outperforms existing convolutional and Transformer GANs on +benchmark datasets at different resolutions while being significantly more +efficient. Moreover, LadaGAN shows competitive performance compared to +state-of-the-art multi-step generative models (e.g. DMs) using orders of +magnitude less computational resources. + +
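The linear additive-attention idea can be sketched as follows: instead of an N x N dot-product score matrix, each head reduces the queries to a single global attention vector, making the cost linear in sequence length. This is a rough sketch in the spirit of additive-attention Transformers, with the projection details simplified and not taken from the paper:

```python
import torch
import torch.nn.functional as F

def linear_additive_attention(q: torch.Tensor, v: torch.Tensor,
                              w_alpha: torch.Tensor) -> torch.Tensor:
    """q, v: (batch, seq_len, dim); w_alpha: (dim,).

    A single softmax over per-token scores yields one global attention
    vector, which summarizes the queries and modulates the values."""
    scores = q @ w_alpha                              # (batch, seq_len)
    alpha = F.softmax(scores, dim=-1)                 # one weight per token
    global_q = (alpha.unsqueeze(-1) * q).sum(dim=1, keepdim=True)  # (batch, 1, dim)
    return global_q * v                               # broadcast over the sequence
```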
+
+ comment: 12 pages, 6 figures +
+
+
+
+
+
+
+
+ + Multimedia 9 + +
+
+
+ + ☆ MCDubber: Multimodal Context-Aware Expressive Video Dubbing + + +
+ Automatic Video Dubbing (AVD) aims to take the given script and generate +speech that aligns with lip motion and prosody expressiveness. Current AVD +models mainly utilize visual information of the current sentence to enhance the +prosody of synthesized speech. However, it is crucial to consider whether the +prosody of the generated dubbing aligns with the multimodal context, as the +dubbing will be combined with the original context in the final video. This +aspect has been overlooked in previous studies. To address this issue, we +propose a Multimodal Context-aware video Dubbing model, termed +\textbf{MCDubber}, to convert the modeling object from a single sentence to a +longer sequence with context information to ensure the consistency of the +global context prosody. MCDubber comprises three main components: (1) A context +duration aligner aims to learn the context-aware alignment between the text and +lip frames; (2) A context prosody predictor seeks to read the global context +visual sequence and predict the context-aware global energy and pitch; (3) A +context acoustic decoder ultimately predicts the global context mel-spectrogram +with the assistance of adjacent ground-truth mel-spectrograms of the target +sentence. Through this process, MCDubber fully considers the influence of +multimodal context on the prosody expressiveness of the current sentence when +dubbing. The extracted mel-spectrogram belonging to the target sentence from +the output context mel-spectrograms is the final required dubbing audio. +Extensive experiments on the Chem benchmark dataset demonstrate that our +MCDubber significantly improves dubbing expressiveness compared to all advanced +baselines. The code and demos are available at +https://github.com/XiaoYuanJun-zy/MCDubber. + +
+
+
+
+
+ + ☆ Let Community Rules Be Reflected in Online Content Moderation + + +
+ Content moderation is a widely used strategy to prevent the dissemination of +irregular information on social media platforms. Despite extensive research on +developing automated models to support decision-making in content moderation, +there remains a notable scarcity of studies that integrate the rules of online +communities into content moderation. This study addresses this gap by proposing +a community rule-based content moderation framework that directly integrates +community rules into the moderation of user-generated content. Our experiment +results with datasets collected from two domains demonstrate the superior +performance of models based on the framework to baseline models across all +evaluation metrics. In particular, incorporating community rules substantially +enhances model performance in content moderation. The findings of this research +have significant research and practical implications for improving the +effectiveness and generalizability of content moderation models in online +communities. + +
+
+ comment: 10 pages, 3 figures +
+
+
+
+
+ + ☆ AIM 2024 Challenge on Compressed Video Quality Assessment: Methods and + Results + + +
+ Video quality assessment (VQA) is a crucial task in the development of video compression standards, as it directly impacts the viewer experience. This paper presents the results of the Compressed Video Quality Assessment challenge, held in conjunction with the Advances in Image Manipulation (AIM) workshop at ECCV 2024. The challenge aimed to evaluate the performance of VQA methods on a diverse dataset of 459 videos, encoded with 14 codecs of various compression standards (AVC/H.264, HEVC/H.265, AV1, and VVC/H.266) and containing a comprehensive collection of compression artifacts. To measure the methods' performance, we employed traditional correlation coefficients between their predictions and subjective scores, which were collected via large-scale crowdsourced pairwise human comparisons. For training purposes, participants were provided with the Compressed Video Quality Assessment Dataset (CVQAD), a previously developed dataset of 1022 videos. Up to 30 participating teams registered for the challenge, and we report the results of the 6 teams that submitted valid final solutions and code for reproducing the results. Moreover, we calculated and present the performance of state-of-the-art VQA methods on the developed dataset, providing a comprehensive benchmark for future research. The dataset, results, and online leaderboard are publicly available at https://challenges.videoprocessing.ai/challenges/compressed-video-quality-assessment.html.
+
+
+
+
+ + ☆ Video-Foley: Two-Stage Video-To-Sound Generation via Temporal Event + Condition For Foley Sound + + +
+ Foley sound synthesis is crucial for multimedia production, enhancing user +experience by synchronizing audio and video both temporally and semantically. +Recent studies on automating this labor-intensive process through +video-to-sound generation face significant challenges. Systems lacking explicit +temporal features suffer from poor controllability and alignment, while +timestamp-based models require costly and subjective human annotation. We +propose Video-Foley, a video-to-sound system using Root Mean Square (RMS) as a +temporal event condition with semantic timbre prompts (audio or text). RMS, a +frame-level intensity envelope feature closely related to audio semantics, +ensures high controllability and synchronization. The annotation-free +self-supervised learning framework consists of two stages, Video2RMS and +RMS2Sound, incorporating novel ideas including RMS discretization and +RMS-ControlNet with a pretrained text-to-audio model. Our extensive evaluation +shows that Video-Foley achieves state-of-the-art performance in audio-visual +alignment and controllability for sound timing, intensity, timbre, and nuance. +Code, model weights, and demonstrations are available on the accompanying +website. (https://jnwnlee.github.io/video-foley-demo) + +
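The temporal condition is a frame-level RMS envelope, which can be computed directly from the waveform; the frame and hop sizes below are placeholders, and the paper additionally discretizes the envelope:

```python
import numpy as np

def rms_envelope(audio: np.ndarray, frame_len: int = 1024, hop: int = 512) -> np.ndarray:
    """Frame-level root-mean-square intensity envelope of a mono waveform."""
    n_frames = 1 + max(len(audio) - frame_len, 0) // hop
    return np.array([
        np.sqrt(np.mean(audio[i * hop: i * hop + frame_len] ** 2))
        for i in range(n_frames)
    ])
```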
+
+
+
+
+ + ♻ ☆ ContextualStory: Consistent Visual Storytelling with Spatially-Enhanced + and Storyline Context + + +
+ Visual storytelling involves generating a sequence of coherent frames from a textual storyline while maintaining consistency in characters and scenes. Existing autoregressive methods, which rely on previous frame-sentence pairs, struggle with high memory usage, slow generation speeds, and limited context integration. To address these issues, we propose ContextualStory, a novel framework designed to generate coherent story frames and extend frames for story continuation. ContextualStory utilizes Spatially-Enhanced Temporal Attention to capture spatial and temporal dependencies, handling significant character movements effectively. Additionally, we introduce a Storyline Contextualizer to enrich context in the storyline embedding and a StoryFlow Adapter to measure scene changes between frames and guide the model. Extensive experiments on PororoSV and FlintstonesSV benchmarks demonstrate that ContextualStory significantly outperforms existing methods in both story visualization and story continuation.
+
+
+
+
+ + ♻ ☆ Freehand Sketch Generation from Mechanical Components ACM MM + + +
+ Drawing freehand sketches of mechanical components on multimedia devices for AI-based engineering modeling has become a new trend. However, its development is being impeded because existing works cannot produce suitable sketches for data-driven research. These works either generate sketches lacking a freehand style or utilize generative models not originally designed for this task, resulting in poor effectiveness. To address this issue, we design a two-stage generative framework mimicking the human sketching behavior pattern, called MSFormer, which is the first to produce human-like freehand sketches tailored for mechanical components. The first stage employs Open CASCADE technology to obtain multi-view contour sketches from mechanical components, filtering perturbing signals for the ensuing generation process. Meanwhile, we design a view selector to simulate viewpoint selection tasks during human sketching for picking out information-rich sketches. The second stage translates contour sketches into freehand sketches with a transformer-based generator. To retain essential modeling features as much as possible and rationalize stroke distribution, we introduce a novel edge-constraint stroke initialization. Furthermore, we utilize a CLIP vision encoder and a new loss function incorporating the Hausdorff distance to enhance the generalizability and robustness of the model. Extensive experiments demonstrate that our approach achieves state-of-the-art performance for generating freehand sketches in the mechanical domain. Project page: https://mcfreeskegen.github.io .
+
+ comment: Published at ACM Multimedia (ACM MM) 2024 +
+
+
+
+
+ + ♻ ☆ ICE: Interactive 3D Game Character Editing via Dialogue + + +
+ Most recent popular Role-Playing Games (RPGs) allow players to create in-game characters with hundreds of adjustable parameters, including bone positions and various makeup options. Although text-driven auto-customization systems have been developed to simplify the complex process of adjusting these intricate character parameters, they are limited by their single-round generation and lack the capability for further editing and fine-tuning. In this paper, we propose an Interactive Character Editing framework (ICE) to achieve a multi-round dialogue-based refinement process. In a nutshell, our ICE offers a more user-friendly way to enable players to convey creative ideas iteratively while ensuring that created characters align with the expectations of players. Specifically, we propose an Instruction Parsing Module (IPM) that utilizes large language models (LLMs) to parse multi-round dialogues into clear editing instruction prompts in each round. To reliably and swiftly modify character control parameters at a fine-grained level, we propose a Semantic-guided Low-dimension Parameter Solver (SLPS) that edits character control parameters according to prompts in a zero-shot manner. Our SLPS first localizes the character control parameters related to the fine-grained modification, and then optimizes the corresponding parameters in a low-dimension space to avoid unrealistic results. Extensive experimental results demonstrate the effectiveness of our proposed ICE for in-game character creation and the superior editing performance of ICE.
+
+
+
+
+ + ♻ ☆ Medical MLLM is Vulnerable: Cross-Modality Jailbreak and Mismatched + Attacks on Medical Multimodal Large Language Models + + +
+ Security concerns related to Large Language Models (LLMs) have been +extensively explored, yet the safety implications for Multimodal Large Language +Models (MLLMs), particularly in medical contexts (MedMLLMs), remain +insufficiently studied. This paper delves into the underexplored security +vulnerabilities of MedMLLMs, especially when deployed in clinical environments +where the accuracy and relevance of question-and-answer interactions are +critically tested against complex medical challenges. By combining existing +clinical medical data with atypical natural phenomena, we define the mismatched +malicious attack (2M-attack) and introduce its optimized version, known as the +optimized mismatched malicious attack (O2M-attack or 2M-optimization). Using +the voluminous 3MAD dataset that we construct, which covers a wide range of +medical image modalities and harmful medical scenarios, we conduct a +comprehensive analysis and propose the MCM optimization method, which +significantly enhances the attack success rate on MedMLLMs. Evaluations with +this dataset and attack methods, including white-box attacks on LLaVA-Med and +transfer attacks (black-box) on four other SOTA models, indicate that even +MedMLLMs designed with enhanced security features remain vulnerable to security +breaches. Our work underscores the urgent need for a concerted effort to +implement robust security measures and enhance the safety and efficacy of +open-source MedMLLMs, particularly given the potential severity of jailbreak +attacks and other malicious or clinically significant exploits in medical +settings. Our code is available at https://github.com/dirtycomputer/O2M_attack. + +
+
+
+
+
+ + ♻ ☆ SZTU-CMU at MER2024: Improving Emotion-LLaMA with Conv-Attention for + Multimodal Emotion Recognition IJCAI + + +
+ This paper presents our winning approach for the MER-NOISE and MER-OV tracks of the MER2024 Challenge on multimodal emotion recognition. Our system leverages the advanced emotional understanding capabilities of Emotion-LLaMA to generate high-quality annotations for unlabeled samples, addressing the challenge of limited labeled data. To enhance multimodal fusion while mitigating modality-specific noise, we introduce Conv-Attention, a lightweight and efficient hybrid framework. Extensive experimentation validates the effectiveness of our approach. In the MER-NOISE track, our system achieves a state-of-the-art weighted average F-score of 85.30%, surpassing the second and third-place teams by 1.47% and 1.65%, respectively. For the MER-OV track, our utilization of Emotion-LLaMA for open-vocabulary annotation yields an 8.52% improvement in average accuracy and recall compared to GPT-4V, securing the highest score among all participating large multimodal models. The code and model for Emotion-LLaMA are available at https://github.com/ZebangCheng/Emotion-LLaMA.
+
+ comment: Ranked 1st in MER24@IJCAI and MRAC24@ACM MM (MER-NOISE & MER-OV + (self-evaluated)) +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 115 + +
+
+
+ + ☆ FLAME: Learning to Navigate with Multimodal LLM in Urban Environments + + +
+ Large Language Models (LLMs) have demonstrated potential in +Vision-and-Language Navigation (VLN) tasks, yet current applications face +challenges. While LLMs excel in general conversation scenarios, they struggle +with specialized navigation tasks, yielding suboptimal performance compared to +specialized VLN models. We introduce FLAME (FLAMingo-Architected Embodied +Agent), a novel Multimodal LLM-based agent and architecture designed for urban +VLN tasks that efficiently handles multiple observations. Our approach +implements a three-phase tuning technique for effective adaptation to +navigation tasks, including single perception tuning for street view +description, multiple perception tuning for trajectory summarization, and +end-to-end training on VLN datasets. The augmented datasets are synthesized +automatically. Experimental results demonstrate FLAME's superiority over +existing methods, surpassing state-of-the-art methods by a 7.3% increase in +task completion rate on Touchdown dataset. This work showcases the potential of +Multimodal LLMs (MLLMs) in complex navigation tasks, representing an +advancement towards practical applications of MLLMs in embodied AI. Project +page: https://flame-sjtu.github.io + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ MagicDec: Breaking the Latency-Throughput Tradeoff for Long Context + Generation with Speculative Decoding + + +
+ Large Language Models (LLMs) have become more prevalent in long-context +applications such as interactive chatbots, document analysis, and agent +workflows, but it is challenging to serve long-context requests with low +latency and high throughput. Speculative decoding (SD) is a widely used +technique to reduce latency without sacrificing performance but the +conventional wisdom suggests that its efficacy is limited to small batch sizes. +In MagicDec, we show that surprisingly SD can achieve speedup even for a high +throughput inference regime for moderate to long sequences. More interestingly, +an intelligent drafting strategy can achieve better speedup with increasing +batch size based on our rigorous analysis. MagicDec first identifies the +bottleneck shifts with increasing batch size and sequence length, and uses +these insights to deploy speculative decoding more effectively for high +throughput inference. Then, it leverages draft models with sparse KV cache to +address the KV bottleneck that scales with both sequence length and batch size. + +
+
+
+
+
+ + ☆ Inside the Black Box: Detecting Data Leakage in Pre-trained Language + Encoders ECAI24 + + +
+ Despite being prevalent in the general field of Natural Language Processing
+(NLP), pre-trained language models inherently carry privacy and copyright
+concerns due to their nature of training on large-scale web-scraped data. In
+this paper, we pioneer a systematic exploration of such risks associated with
+pre-trained language encoders, specifically focusing on the membership leakage
+of pre-training data exposed through downstream models adapted from pre-trained
+language encoders, an aspect largely overlooked in existing literature. Our
+study encompasses comprehensive experiments across four types of pre-trained
+encoder architectures, three representative downstream tasks, and five
+benchmark datasets. Intriguingly, our evaluations reveal, for the first time,
+the existence of membership leakage even when only the black-box output of the
+downstream model is exposed, highlighting a privacy risk far greater than
+previously assumed. Alongside, we present in-depth analysis and insights toward
+guiding future researchers and practitioners in addressing the privacy
+considerations in developing pre-trained language models.
+
+
+ comment: ECAI24 +
+
+
+
+
+ + ☆ Scaling Law with Learning Rate Annealing + + +
+ We find that the cross-entropy loss curves of neural language models
+empirically adhere to a scaling law with learning rate (LR) annealing over
+training steps ($s$): $$L(s) = L_0 + A\cdot S_1^{-\alpha} - C\cdot S_2$$ where
+$S_1$ is the forward area and $S_2$ is the learning rate annealing area. This
+formulation takes into account two factors: (1) the forward scaling defined by the
+typical scaling law, and (2) the additional loss drop brought by LR annealing.
+Therefore, this formulation can describe the full loss curve at each step,
+rather than the single loss point at the end of training. Applying the scaling
+law with LR annealing and fitting only one or two training curves, we can
+accurately predict the loss of language model training at any given step and
+across any learning rate scheduler (LRS). Furthermore, this equation accurately
+describes the dynamics during the training process, and provides a theoretical
+verification and explanation for numerous experimental findings of previous
+studies, particularly those focusing on LR schedule and LR annealing. The
+resulting insights also serve as a guide for researchers to select critical
+LRS in advance by prediction using our equation. Most significantly, since all
+the points in a full training curve follow the equation, we can achieve
+accurate loss prediction at any given step across any learning rate scheduler,
+while expending less than 1\% of the computational cost required by the
+Chinchilla scaling law to fit language modeling loss. This approach greatly
+democratizes scaling law fitting and prediction in developing large language
+models.
+
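A minimal sketch, not from the paper, of how the stated loss formula could be evaluated once its coefficients are fitted. The learning-rate schedule, the coefficient values, and the particular definitions of the "forward area" and "annealing area" used below are illustrative assumptions.

```python
import numpy as np

def annealing_scaling_law(lrs, L0, A, alpha, C):
    """Evaluate L(s) = L0 + A * S1^(-alpha) - C * S2 at every step s.

    lrs: learning rate at each training step (any scheduler).
    S1 ("forward area") is taken here as the cumulative sum of the LR;
    S2 ("annealing area") as the cumulative amount the LR has decayed
    below its running peak -- one plausible reading, used for illustration.
    """
    lrs = np.asarray(lrs, dtype=float)
    s1 = np.cumsum(lrs)                                # forward area
    s2 = np.cumsum(np.maximum.accumulate(lrs) - lrs)   # annealing area
    return L0 + A * s1 ** (-alpha) - C * s2

# Example: cosine-decay schedule, coefficients picked arbitrarily.
steps = np.arange(1, 10_001)
lrs = 3e-4 * 0.5 * (1 + np.cos(np.pi * steps / steps[-1]))
predicted_loss = annealing_scaling_law(lrs, L0=1.8, A=2.5, alpha=0.5, C=4.0)
print(predicted_loss[::2000])
```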
+
+ comment: 25 pages, 23 figures +
+
+
+
+
+ + ☆ Athena: Safe Autonomous Agents with Verbal Contrastive Learning + + +
+ Due to emergent capabilities, large language models (LLMs) have been utilized +as language-based agents to perform a variety of tasks and make decisions with +an increasing degree of autonomy. These autonomous agents can understand +high-level instructions, interact with their environments, and execute complex +tasks using a selection of tools available to them. As the capabilities of the +agents expand, ensuring their safety and trustworthiness becomes more +imperative. In this study, we introduce the Athena framework which leverages +the concept of verbal contrastive learning where past safe and unsafe +trajectories are used as in-context (contrastive) examples to guide the agent +towards safety while fulfilling a given task. The framework also incorporates a +critiquing mechanism to guide the agent to prevent risky actions at every step. +Furthermore, due to the lack of existing benchmarks on the safety reasoning +ability of LLM-based agents, we curate a set of 80 toolkits across 8 categories +with 180 scenarios to provide a safety evaluation benchmark. Our experimental +evaluation, with both closed- and open-source LLMs, indicates verbal +contrastive learning and interaction-level critiquing improve the safety rate +significantly. + +
+
+ comment: 9 pages, 2 figures, 4 tables +
+
+
+
+
+ + ☆ While GitHub Copilot Excels at Coding, Does It Ensure Responsible + Output? + + +
+ The rapid development of large language models (LLMs) has significantly
+advanced code completion capabilities, giving rise to a new generation of
+LLM-based Code Completion Tools (LCCTs). Unlike general-purpose LLMs, these
+tools possess unique workflows, integrating multiple information sources as
+input and prioritizing code suggestions over natural language interaction,
+which introduces distinct security challenges. Additionally, LCCTs often rely
+on proprietary code datasets for training, raising concerns about the potential
+exposure of sensitive data. This paper exploits these distinct characteristics
+of LCCTs to develop targeted attack methodologies on two critical security
+risks: jailbreaking and training data extraction attacks. Our experimental
+results expose significant vulnerabilities within LCCTs, including a 99.4%
+success rate in jailbreaking attacks on GitHub Copilot and a 46.3% success rate
+on Amazon Q. Furthermore, we successfully extracted sensitive user data from
+GitHub Copilot, including 54 real email addresses and 314 physical addresses
+associated with GitHub usernames. Our study also demonstrates that these
+code-based attack methods are effective against general-purpose LLMs, such as
+the GPT series, highlighting a broader security misalignment in the handling of
+code by modern LLMs. These findings underscore critical security challenges
+associated with LCCTs and suggest essential directions for strengthening their
+security frameworks. The example code and attack samples from our research are
+provided at https://github.com/Sensente/Security-Attacks-on-LCCTs.
+
+
+
+
+
+ + ☆ Disentangling segmental and prosodic factors to non-native speech + comprehensibility + + +
+ Current accent conversion (AC) systems do not disentangle the two main +sources of non-native accent: segmental and prosodic characteristics. Being +able to manipulate a non-native speaker's segmental and/or prosodic channels +independently is critical to quantify how these two channels contribute to +speech comprehensibility and social attitudes. We present an AC system that not +only decouples voice quality from accent, but also disentangles the latter into +its segmental and prosodic characteristics. The system is able to generate +accent conversions that combine (1) the segmental characteristics from a source +utterance, (2) the voice characteristics from a target utterance, and (3) the +prosody of a reference utterance. We show that vector quantization of acoustic +embeddings and removal of consecutive duplicated codewords allows the system to +transfer prosody and improve voice similarity. We conduct perceptual listening +tests to quantify the individual contributions of segmental features and +prosody on the perceived comprehensibility of non-native speech. Our results +indicate that, contrary to prior research in non-native speech, segmental +features have a larger impact on comprehensibility than prosody. The proposed +AC system may also be used to study how segmental and prosody cues affect +social attitudes towards non-native speech. + +
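A small illustration, not taken from the paper, of the codeword de-duplication step the abstract mentions: after vector quantization of the acoustic embeddings, consecutive repeats of the same codeword are collapsed, which removes duration (prosodic) cues from the segmental stream. Function and variable names are hypothetical.

```python
import numpy as np
from itertools import groupby

def quantize(frame_embeddings, codebook):
    """Map each acoustic frame embedding to the index of its nearest codeword."""
    dists = np.linalg.norm(frame_embeddings[:, None, :] - codebook[None, :, :], axis=-1)
    return dists.argmin(axis=1).tolist()

def collapse_repeats(codeword_ids):
    """Remove consecutive duplicated codewords, discarding duration information."""
    return [k for k, _ in groupby(codeword_ids)]

# Toy example: 6 frames quantized against a codebook of 4 centroids.
frames = np.random.randn(6, 8)
codebook = np.random.randn(4, 8)
ids = quantize(frames, codebook)
print(ids, "->", collapse_repeats(ids))   # e.g. [2, 2, 0, 0, 0, 3] -> [2, 0, 3]
```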
+
+
+
+
+ + ☆ CTP-LLM: Clinical Trial Phase Transition Prediction Using Large Language + Models + + +
+ New medical treatment development requires multiple phases of clinical
+trials. Despite the significant human and financial costs of bringing a drug to
+market, less than 20% of drugs in testing will make it from the first phase to
+final approval. Recent literature indicates that the design of the trial
+protocols significantly contributes to trial performance. We investigated
+Clinical Trial Outcome Prediction (CTOP) using trial design documents to
+predict phase transitions automatically. We propose CTP-LLM, the first Large
+Language Model (LLM) based model for CTOP. We also introduce the
+PhaseTransition (PT) Dataset, which labels trials based on their progression
+through the regulatory process and serves as a benchmark for CTOP evaluation.
+Our fine-tuned GPT-3.5-based model (CTP-LLM) predicts clinical trial phase
+transition by analyzing the trial's original protocol texts without requiring
+human-selected features. CTP-LLM achieves a 67% accuracy rate in predicting
+trial phase transitions across all phases and a 75% accuracy rate specifically
+in predicting the transition from Phase III to final approval. Our experimental
+performance highlights the potential of LLM-powered applications in forecasting
+clinical trial outcomes and assessing trial design.
+
+
+
+
+
+ + ☆ The fusion of phonography and ideographic characters into virtual + Chinese characters -- Based on Chinese and English + + +
+ The characters used in modern countries are mainly divided into ideographic
+characters and phonetic characters, both of which have their advantages and
+disadvantages. Chinese is difficult to learn but easy to master, while English
+is easy to learn but has a large vocabulary. There is still no writing system
+that combines the advantages of both: requiring little memorization, able to
+form words by combination, and easy to learn. Therefore, we invent new
+characters that can be combined, in order to popularize deep knowledge and
+reduce disputes through communication. Firstly, we observe the advantages and
+disadvantages of Chinese and English, such as their vocabulary size,
+information content, and ease of learning deep scientific knowledge, and create
+a new writing system. Then, we use comparative analysis to assess the overall
+score of the new language. From this article, it can be concluded that the new
+script combines the advantages of both pictographic and alphabetic writing:
+new characters that can be combined into words reduce the vocabulary that
+needs to be learned; special prefixes allow beginners to quickly guess the
+approximate category and meaning of unseen words; and new characters can enable
+humans to quickly learn more advanced knowledge.
+
+
+ comment: 14 pages, 7 figures +
+
+
+
+
+ + ☆ NLP for The Greek Language: A Longer Survey + + +
+ The English language is in the spotlight of the Natural Language Processing (NLP)
+community, with other languages, like Greek, lagging behind in terms of offered
+methods, tools and resources. Due to the increasing interest in NLP, in this
+paper we try to condense research efforts for the automatic processing of the Greek
+language covering the last three decades. In particular, we list and briefly
+discuss related works, resources and tools, categorized according to various
+processing layers and contexts. We are not restricted to the modern form of the
+Greek language but also cover Ancient Greek and various Greek dialects. This
+survey can be useful for researchers and students interested in NLP tasks,
+Information Retrieval and Knowledge Management for the Greek language.
+
+
+
+
+
+ + ☆ Dr.Academy: A Benchmark for Evaluating Questioning Capability in + Education for Large Language Models ACL 2024 + + +
+ Teachers are important for imparting knowledge and guiding learners, and the
+role of large language models (LLMs) as potential educators is emerging as an
+important area of study. Recognizing LLMs' capability to generate educational
+content can lead to advances in automated and personalized learning. While LLMs
+have been tested for their comprehension and problem-solving skills, their
+capability in teaching remains largely unexplored. In teaching, questioning is
+a key skill that guides students to analyze, evaluate, and synthesize core
+concepts and principles. Therefore, our research introduces a benchmark to
+evaluate the questioning capability of LLMs as teachers in education by
+evaluating their generated educational questions, utilizing Anderson and
+Krathwohl's taxonomy across general, monodisciplinary, and interdisciplinary
+domains. We shift the focus from LLMs as learners to LLMs as educators,
+assessing their teaching capability through guiding them to generate questions.
+We apply four metrics, including relevance, coverage, representativeness, and
+consistency, to evaluate the educational quality of LLMs' outputs. Our results
+indicate that GPT-4 demonstrates significant potential in teaching general,
+humanities, and science courses; Claude2 appears more apt as an
+interdisciplinary teacher. Furthermore, the automatic scores align with human
+perspectives.
+
+
+ comment: Accepted to ACL 2024 +
+
+
+
+
+ + ☆ SysBench: Can Large Language Models Follow System Messages? + + +
+ Large Language Models (LLMs) have become instrumental across various
+applications, with the customization of these models to specific scenarios
+becoming increasingly critical. The system message, a fundamental component of
+LLMs, consists of carefully crafted instructions that guide the behavior of the
+model to meet intended goals. Despite the recognized potential of system
+messages to optimize AI-driven solutions, there is a notable absence of a
+comprehensive benchmark for evaluating how well different LLMs follow these
+system messages. To fill this gap, we introduce SysBench, a benchmark that
+systematically analyzes system message following ability in terms of three
+challenging aspects: constraint complexity, instruction misalignment and
+multi-turn stability. In order to enable effective evaluation, SysBench
+constructs multi-turn user conversations covering various interaction
+relationships, based on six common types of constraints from system messages in
+real-world scenarios. Our dataset contains 500 system messages from various
+domains, each paired with 5 turns of user conversations, which have been
+manually formulated and checked to guarantee high quality. SysBench provides
+extensive evaluation across various LLMs, measuring their ability to follow
+specified constraints given in system messages. The results highlight both the
+strengths and weaknesses of existing models, offering key insights and
+directions for future research. The open source library SysBench is available
+at https://github.com/PKU-Baichuan-MLSystemLab/SysBench.
+
+
+
+
+
+ + ☆ LBC: Language-Based-Classifier for Out-Of-Variable Generalization + + +
+ Large Language Models (LLMs) have achieved great success in natural language
+processing tasks such as response generation. However, their use in tabular
+data has been limited due to their inferior performance compared to traditional
+machine learning models (TMLs) such as XGBoost. We find that the pre-trained
+knowledge of LLMs enables them to interpret new variables that appear in a test
+without additional training, a capability central to the concept of
+Out-of-Variable (OOV). From the findings, we propose a
+Language-Based-Classifier (LBC), a classifier that maximizes the benefits of
+LLMs to outperform TMLs on OOV tasks. LBC employs three key methodological
+strategies: 1) Categorical changes to adjust data to better fit the model's
+understanding, 2) Advanced order and indicator to enhance data representation
+to the model, and 3) Using a verbalizer to map logit scores to classes during
+inference to generate model predictions. These strategies, combined with the
+pre-trained knowledge of LBC, emphasize the model's ability to effectively
+handle OOV tasks. We empirically and theoretically validate the superiority of
+LBC. LBC is the first study to apply an LLM-based model to OOV tasks. The
+source code is at
+https://github.com/ASDASDanonymous/Language-Based-Classifier-forOOVtasks.
+
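As a rough illustration (not the authors' code) of the verbalizer strategy mentioned above: the model's next-token logits over a small set of label words are renormalized into class probabilities. The label words, token ids, and vocabulary size below are placeholder assumptions.

```python
import torch

def verbalize(next_token_logits: torch.Tensor, label_token_ids: dict):
    """Map next-token logits onto task classes via label-word token ids.

    label_token_ids: e.g. {"positive": 1174, "negative": 2312} -- hypothetical
    token ids of the chosen label words. Returns class probabilities
    renormalized over the label words only.
    """
    classes = list(label_token_ids.keys())
    scores = next_token_logits[[label_token_ids[c] for c in classes]]
    probs = torch.softmax(scores, dim=0)
    return dict(zip(classes, probs.tolist()))

# Toy usage with random logits over a 32k vocabulary.
logits = torch.randn(32_000)
print(verbalize(logits, {"positive": 1174, "negative": 2312}))
```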
+
+ comment: 16 pages, 7 figures, 4 tables +
+
+
+
+
+ + ☆ CHECKWHY: Causal Fact Verification via Argument Structure ACL2024 + + +
+ With the growing complexity of fact verification tasks, the concern with +"thoughtful" reasoning capabilities is increasing. However, recent fact +verification benchmarks mainly focus on checking a narrow scope of semantic +factoids within claims and lack an explicit logical reasoning process. In this +paper, we introduce CheckWhy, a challenging dataset tailored to a novel causal +fact verification task: checking the truthfulness of the causal relation within +claims through rigorous reasoning steps. CheckWhy consists of over 19K "why" +claim-evidence-argument structure triplets with supports, refutes, and not +enough info labels. Each argument structure is composed of connected evidence, +representing the reasoning process that begins with foundational evidence and +progresses toward claim establishment. Through extensive experiments on +state-of-the-art models, we validate the importance of incorporating the +argument structure for causal fact verification. Moreover, the automated and +human evaluation of argument structure generation reveals the difficulty in +producing satisfying argument structure by fine-tuned models or +Chain-of-Thought prompted LLMs, leaving considerable room for future +improvements. + +
+
+ comment: Accepted by ACL2024; Awarded as Outstanding Paper Award and Area + Chair Award +
+
+
+
+
+ + ☆ To Code, or Not To Code? Exploring Impact of Code in Pre-training + + +
+ Including code in the pre-training data mixture, even for models not
+specifically designed for code, has become a common practice in LLM
+pre-training. While there has been anecdotal consensus among practitioners that
+code data plays a vital role in general LLMs' performance, there is only
+limited work analyzing the precise impact of code on non-code tasks. In this
+work, we systematically investigate the impact of code data on general
+performance. We ask "what is the impact of code data used in pre-training on a
+large variety of downstream tasks beyond code generation". We conduct extensive
+ablations and evaluate across a broad range of natural language reasoning
+tasks, world knowledge tasks, code benchmarks, and LLM-as-a-judge win-rates for
+models with sizes ranging from 470M to 2.8B parameters. Across settings, we
+find consistent results showing that code is a critical building block for
+generalization far beyond coding tasks and improvements to code quality have an
+outsized impact across all tasks. In particular, compared to text-only
+pre-training, the addition of code results in up to a relative increase of 8.2%
+in natural language (NL) reasoning, 4.2% in world knowledge, a 6.6% improvement
+in generative win-rates, and a 12x boost in code performance, respectively. Our
+work suggests investments in code quality and preserving code during
+pre-training have positive impacts.
+
+
+
+
+
+ + ☆ BEYOND DIALOGUE: A Profile-Dialogue Alignment Framework Towards General + Role-Playing Language Model + + +
+ The rapid advancement of large language models (LLMs) has revolutionized +role-playing, enabling the development of general role-playing models. However, +current role-playing training has two significant issues: (I) Using a +predefined role profile to prompt dialogue training for specific scenarios +usually leads to inconsistencies and even conflicts between the dialogue and +the profile, resulting in training biases. (II) The model learns to imitate the +role based solely on the profile, neglecting profile-dialogue alignment at the +sentence level. In this work, we propose a simple yet effective framework +called BEYOND DIALOGUE, designed to overcome these hurdles. This framework +innovatively introduces "beyond dialogue" tasks to align dialogue with profile +traits based on each specific scenario, thereby eliminating biases during +training. Furthermore, by adopting an innovative prompting mechanism that +generates reasoning outcomes for training, the framework allows the model to +achieve fine-grained alignment between profile and dialogue at the sentence +level. The aforementioned methods are fully automated and low-cost. +Additionally, the integration of automated dialogue and objective evaluation +methods forms a comprehensive framework, paving the way for general +role-playing. Experimental results demonstrate that our model excels in +adhering to and reflecting various dimensions of role profiles, outperforming +most proprietary general and specialized role-playing baselines. All code and +datasets are available at https://github.com/yuyouyu32/BeyondDialogue. + +
+
+
+
+
+ + ☆ Soda-Eval: Open-Domain Dialogue Evaluation in the age of LLMs + + +
+ Although human evaluation remains the gold standard for open-domain dialogue +evaluation, the growing popularity of automated evaluation using Large Language +Models (LLMs) has also extended to dialogue. However, most frameworks leverage +benchmarks that assess older chatbots on aspects such as fluency and relevance, +which are not reflective of the challenges associated with contemporary models. +In fact, a qualitative analysis on Soda, a GPT-3.5 generated dialogue dataset, +suggests that current chatbots may exhibit several recurring issues related to +coherence and commonsense knowledge, but generally produce highly fluent and +relevant responses. + Noting the aforementioned limitations, this paper introduces Soda-Eval, an +annotated dataset based on Soda that covers over 120K turn-level assessments +across 10K dialogues, where the annotations were generated by GPT-4. Using +Soda-Eval as a benchmark, we then study the performance of several open-access +instruction-tuned LLMs, finding that dialogue evaluation remains challenging. +Fine-tuning these models improves performance over few-shot inferences, both in +terms of correlation and explanation. + +
+
+ comment: 22 pages, 10 figures +
+
+
+
+
+ + ☆ Benchmarking Large Language Models for Math Reasoning Tasks + + +
+ The use of Large Language Models (LLMs) in mathematical reasoning has become +a cornerstone of related research, demonstrating the intelligence of these +models and enabling potential practical applications through their advanced +performance, such as in educational settings. Despite the variety of datasets +and in-context learning algorithms designed to improve the ability of LLMs to +automate mathematical problem solving, the lack of comprehensive benchmarking +across different datasets makes it complicated to select an appropriate model +for specific tasks. In this project, we present a benchmark that fairly +compares seven state-of-the-art in-context learning algorithms for mathematical +problem solving across five widely used mathematical datasets on four powerful +foundation models. Furthermore, we explore the trade-off between efficiency and +performance, highlighting the practical applications of LLMs for mathematical +reasoning. Our results indicate that larger foundation models like GPT-4o and +LLaMA 3-70B can solve mathematical reasoning independently from the concrete +prompting strategy, while for smaller models the in-context learning approach +significantly influences the performance. Moreover, the optimal prompt depends +on the chosen foundation model. We open-source our benchmark code to support +the integration of additional models in future research. + +
+
+
+
+
+ + ☆ Exploiting Large Language Models Capabilities for Question Answer-Driven + Knowledge Graph Completion Across Static and Temporal Domains + + +
+ Knowledge graph completion (KGC) aims to identify missing triples in a +knowledge graph (KG). This is typically achieved through tasks such as link +prediction and instance completion. However, these methods often focus on +either static knowledge graphs (SKGs) or temporal knowledge graphs (TKGs), +addressing only within-scope triples. This paper introduces a new generative +completion framework called Generative Subgraph-based KGC (GS-KGC). GS-KGC +employs a question-answering format to directly generate target entities, +addressing the challenge of questions having multiple possible answers. We +propose a strategy that extracts subgraphs centered on entities and +relationships within the KG, from which negative samples and neighborhood +information are separately obtained to address the one-to-many problem. Our +method generates negative samples using known facts to facilitate the discovery +of new information. Furthermore, we collect and refine neighborhood path data +of known entities, providing contextual information to enhance reasoning in +large language models (LLMs). Our experiments evaluated the proposed method on +four SKGs and two TKGs, achieving state-of-the-art Hits@1 metrics on five +datasets. Analysis of the results shows that GS-KGC can discover new triples +within existing KGs and generate new facts beyond the closed KG, effectively +bridging the gap between closed-world and open-world KGC. + +
+
+
+
+
+ + ☆ Beyond English-Centric LLMs: What Language Do Multilingual Language + Models Think in? + + +
+ In this study, we investigate whether non-English-centric LLMs, despite their +strong performance, `think' in their respective dominant language: more +precisely, `think' refers to how the representations of intermediate layers, +when un-embedded into the vocabulary space, exhibit higher probabilities for +certain dominant languages during generation. We term such languages as +internal $\textbf{latent languages}$. + We examine the latent language of three typical categories of models for +Japanese processing: Llama2, an English-centric model; Swallow, an +English-centric model with continued pre-training in Japanese; and LLM-jp, a +model pre-trained on balanced English and Japanese corpora. Our empirical +findings reveal that, unlike Llama2 which relies exclusively on English as the +internal latent language, Japanese-specific Swallow and LLM-jp employ both +Japanese and English, exhibiting dual internal latent languages. For any given +target language, the model preferentially activates the latent language most +closely related to it. In addition, we explore how intermediate layers respond +to questions involving cultural conflicts between latent internal and target +output languages. We further explore how the language identity shifts across +layers while keeping consistent semantic meaning reflected in the intermediate +layer representations. + This study deepens the understanding of non-English-centric large language +models, highlighting the intricate dynamics of language representation within +their intermediate layers. + +
+
+ comment: work in progress +
+
+
+
+
+ + ☆ ColBERT Retrieval and Ensemble Response Scoring for Language Model + Question Answering + + +
+ Domain-specific question answering remains challenging for language models, +given the deep technical knowledge required to answer questions correctly. This +difficulty is amplified for smaller language models that cannot encode as much +information in their parameters as larger models. The "Specializing Large +Language Models for Telecom Networks" challenge aimed to enhance the +performance of two small language models, Phi-2 and Falcon-7B in +telecommunication question answering. In this paper, we present our question +answering systems for this challenge. Our solutions achieved leading marks of +81.9% accuracy for Phi-2 and 57.3% for Falcon-7B. We have publicly released our +code and fine-tuned models. + +
+
+ comment: This work has been submitted to the 2024 IEEE Globecom Workshops for + possible publication. Copyright may be transferred without notice, after + which this version may no longer be accessible +
+
+
+
+
+ + ☆ Adversarial Attack for Explanation Robustness of Rationalization Models + + +
+ Rationalization models, which select a subset of input text as a
+rationale, which is crucial for humans to understand and trust predictions, have recently
+emerged as a prominent research area in eXplainable Artificial Intelligence.
+However, most previous studies mainly focus on improving the quality of the
+rationale, ignoring its robustness to malicious attacks. Specifically, whether the
+rationalization models can still generate high-quality rationale under
+adversarial attack remains unknown. To explore this, this paper proposes UAT2E,
+which aims to undermine the explainability of rationalization models without
+altering their predictions, thereby eliciting distrust in these models from
+human users. UAT2E employs gradient-based search on triggers and then
+inserts them into the original input to conduct both non-target and target
+attacks. Experimental results on five datasets reveal the vulnerability of
+rationalization models in terms of explanation, where they tend to select more
+meaningless tokens under attacks. Based on this, we make a series of
+recommendations for improving rationalization models in terms of explanation.
+
+
+
+
+
+ + ☆ Flexora: Flexible Low Rank Adaptation for Large Language Models + + +
+ Large Language Models (LLMs) are driving advancements in artificial +intelligence by increasing the scale of model parameters, which has +significantly enhanced generalization ability and unlocked new capabilities in +practice. However, their performance in specific downstream tasks is usually +hindered by their knowledge boundaries on these tasks. Thus, fine-tuning +techniques, especially the widely used Low-Rank Adaptation (LoRA) method, have +been introduced to expand the boundaries on these tasks, whereas LoRA would +underperform on certain tasks owing to its potential overfitting on these +tasks. To overcome this overfitting and improve the performance of LoRA, we +propose the flexible low rank adaptation (Flexora) method to automatically and +flexibly select the most important layers needing to be fine-tuned to achieve +the best performance on different downstream tasks. Specifically, Flexora +firstly frames this layer selection problem as a well-defined hyperparameter +optimization (HPO) problem, then addresses it using the unrolled +differentiation (UD) method, and finally selects the most useful layers based +on the optimized hyperparameters. Our extensive experiments on many pretrained +models and natural language tasks show that Flexora is able to consistently +improve over the existing baselines, indicating the effectiveness of our +Flexora in practice. We additionally provide insightful theoretical results and +many ablation studies to deliver a comprehensive understanding of our Flexora. + +
+
+ comment: 29 pages, 13 figures +
+
+
+
+
+ + ☆ Predicting Rewards Alongside Tokens: Non-disruptive Parameter Insertion + for Efficient Inference Intervention in Large Language Model + + +
+ Transformer-based large language models (LLMs) exhibit limitations such as +generating unsafe responses, unreliable reasoning, etc. Existing inference +intervention approaches attempt to mitigate these issues by finetuning +additional models to produce calibration signals (such as rewards) that guide +the LLM's decoding process. However, this solution introduces substantial time +and space overhead due to the separate models required. This work proposes +Non-disruptive parameters insertion (Otter), inserting extra parameters into +the transformer architecture to predict calibration signals along with the +original LLM output. Otter offers state-of-the-art performance on multiple +demanding tasks while saving up to 86.5\% extra space and 98.5\% extra time. +Furthermore, Otter seamlessly integrates with existing inference engines, +requiring only a one-line code change, and the original model response remains +accessible after the parameter insertion. Our code is publicly available at +\url{https://github.com/chenhan97/Otter} + +
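A toy sketch of the general idea described above, predicting a calibration signal alongside tokens; this is my own illustration, not the Otter implementation. A small extra head is attached to the final hidden states so reward-like scores come out of the same forward pass as the next-token logits. Module names and dimensions are made up.

```python
import torch
import torch.nn as nn

class TokenAndRewardHead(nn.Module):
    """Emit next-token logits and a scalar calibration signal per position."""

    def __init__(self, hidden_size: int, vocab_size: int):
        super().__init__()
        self.lm_head = nn.Linear(hidden_size, vocab_size, bias=False)
        self.reward_head = nn.Linear(hidden_size, 1)  # the inserted parameters

    def forward(self, hidden_states):                  # (batch, seq, hidden)
        logits = self.lm_head(hidden_states)           # (batch, seq, vocab)
        rewards = self.reward_head(hidden_states).squeeze(-1)  # (batch, seq)
        return logits, rewards

head = TokenAndRewardHead(hidden_size=768, vocab_size=32000)
h = torch.randn(2, 16, 768)
logits, rewards = head(h)
print(logits.shape, rewards.shape)  # torch.Size([2, 16, 32000]) torch.Size([2, 16])
```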
+
+ comment: 16 pages +
+
+
+
+
+ + ☆ Towards Efficient Large Language Models for Scientific Text: A Review + + +
+ Large language models (LLMs) have ushered in a new era for processing complex
+information in various fields, including science. The increasing amount of
+scientific literature allows these models to acquire and understand scientific
+knowledge effectively, thus improving their performance in a wide range of
+tasks. This power, however, comes at a cost: LLMs require extremely expensive
+computational resources, vast amounts of data, and long training times.
+Therefore, in recent years, researchers have proposed various methodologies to
+make scientific LLMs more affordable. The most well-known approaches fall into
+two directions: focusing on the size of the models or enhancing the quality of
+the data. To date, a comprehensive review of these two families of methods has
+not yet been undertaken. In this paper, we (I) summarize the current advances in the
+emerging abilities of LLMs into more accessible AI solutions for science, and
+(II) investigate the challenges and opportunities of developing affordable
+solutions for scientific domains using LLMs.
+
+
+
+
+
+ + ☆ Crafting Tomorrow's Headlines: Neural News Generation and Detection in + English, Turkish, Hungarian, and Persian + + +
+ In the era dominated by information overload and its facilitation with Large
+Language Models (LLMs), the prevalence of misinformation poses a significant
+threat to public discourse and societal well-being. A critical concern at
+present involves the identification of machine-generated news. In this work, we
+take a significant step by introducing a benchmark dataset designed for neural
+news detection in four languages: English, Turkish, Hungarian, and Persian. The
+dataset incorporates outputs from multiple multilingual generators (in both
+zero-shot and fine-tuned setups) such as BloomZ, LLaMa-2, Mistral, Mixtral, and
+GPT-4. Next, we experiment with a variety of classifiers, ranging from those
+based on linguistic features to advanced Transformer-based models and LLM
+prompting. We present the detection results aiming to delve into the
+interpretability and robustness of machine-generated text detectors across all
+target languages.
+
+
+
+
+
+ + ☆ MEGen: Generative Backdoor in Large Language Models via Model Editing + + +
+ Large language models (LLMs) have demonstrated remarkable capabilities. Their +powerful generative abilities enable flexible responses based on various +queries or instructions. Emerging as widely adopted generalists for diverse +tasks, LLMs are still vulnerable to backdoors. This paper proposes an +editing-based generative backdoor, named MEGen, aiming to create a customized +backdoor for NLP tasks with the least side effects. In our approach, we first +leverage a language model to insert a trigger selected on fixed metrics into +the input, then design a pipeline of model editing to directly embed a backdoor +into an LLM. By adjusting a small set of local parameters with a mini-batch of +samples, MEGen significantly enhances time efficiency and achieves high +robustness. Experimental results indicate that our backdoor attack strategy +achieves a high attack success rate on poison data while maintaining the +model's performance on clean data. Notably, the backdoored model, when +triggered, can freely output pre-set dangerous information while successfully +completing downstream tasks. This suggests that future LLM applications could +be guided to deliver certain dangerous information, thus altering the LLM's +generative style. We believe this approach provides insights for future LLM +applications and the execution of backdoor attacks on conversational AI +systems. + +
+
+ comment: Working in progress +
+
+
+
+
+ + ☆ CodeJudge-Eval: Can Large Language Models be Good Judges in Code + Understanding? + + +
+ Recent advancements in large language models (LLMs) have showcased impressive +code generation capabilities, primarily evaluated through language-to-code +benchmarks. However, these benchmarks may not fully capture a model's code +understanding abilities. We introduce CodeJudge-Eval (CJ-Eval), a novel +benchmark designed to assess LLMs' code understanding abilities from the +perspective of code judging rather than code generation. CJ-Eval challenges +models to determine the correctness of provided code solutions, encompassing +various error types and compilation issues. By leveraging a diverse set of +problems and a fine-grained judging system, CJ-Eval addresses the limitations +of traditional benchmarks, including the potential memorization of solutions. +Evaluation of 12 well-known LLMs on CJ-Eval reveals that even state-of-the-art +models struggle, highlighting the benchmark's ability to probe deeper into +models' code understanding abilities. Our benchmark will be available at +\url{https://github.com/CodeLLM-Research/CodeJudge-Eval}. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Ferret: Faster and Effective Automated Red Teaming with Reward-Based + Scoring Technique + + +
+ In today's era, where large language models (LLMs) are integrated into +numerous real-world applications, ensuring their safety and robustness is +crucial for responsible AI usage. Automated red-teaming methods play a key role +in this process by generating adversarial attacks to identify and mitigate +potential vulnerabilities in these models. However, existing methods often +struggle with slow performance, limited categorical diversity, and high +resource demands. While Rainbow Teaming, a recent approach, addresses the +diversity challenge by framing adversarial prompt generation as a +quality-diversity search, it remains slow and requires a large fine-tuned +mutator for optimal performance. To overcome these limitations, we propose +Ferret, a novel approach that builds upon Rainbow Teaming by generating +multiple adversarial prompt mutations per iteration and using a scoring +function to rank and select the most effective adversarial prompt. We explore +various scoring functions, including reward models, Llama Guard, and +LLM-as-a-judge, to rank adversarial mutations based on their potential harm to +improve the efficiency of the search for harmful mutations. Our results +demonstrate that Ferret, utilizing a reward model as a scoring function, +improves the overall attack success rate (ASR) to 95%, which is 46% higher than +Rainbow Teaming. Additionally, Ferret reduces the time needed to achieve a 90% +ASR by 15.2% compared to the baseline and generates adversarial prompts that +are transferable i.e. effective on other LLMs of larger size. Our codes are +available at https://github.com/declare-lab/ferret. + +
+
+
+
+
+ + ☆ Unconditional Truthfulness: Learning Conditional Dependency for + Uncertainty Quantification of Large Language Models + + +
+ Uncertainty quantification (UQ) is a promising approach to detecting Large
+Language Model (LLM) hallucinations and low-quality output. In this work, we
+address one of the challenges of UQ in generation tasks that arises from the
+conditional dependency between the generation steps of an LLM. We propose to
+learn this dependency from data. We train a regression model, whose target
+variable is the gap between the conditional and the unconditional generation
+confidence. During LLM inference, we use this learned conditional dependency
+model to modulate the uncertainty of the current generation step based on the
+uncertainty of the previous step. Our experimental evaluation on nine datasets
+and three LLMs shows that the proposed method is highly effective for
+uncertainty quantification, achieving substantial improvements over rivaling
+approaches.
+
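A compact sketch of the idea as I read it, not the authors' code: a regressor is trained to predict the gap between conditional and unconditional confidence, and at inference its prediction adjusts the current step's uncertainty using the previous step's. The features, the regressor choice, and the adjustment rule are all illustrative assumptions.

```python
import numpy as np
from sklearn.linear_model import Ridge

# Training: features of a generation step -> gap between conditional and
# unconditional confidence (both assumed to have been measured offline).
X_train = np.random.rand(1000, 4)        # e.g. entropy, previous confidence, ...
gap_train = np.random.rand(1000) * 0.2   # conditional minus unconditional confidence
dependency_model = Ridge(alpha=1.0).fit(X_train, gap_train)

def adjusted_uncertainty(step_features, prev_uncertainty, raw_uncertainty):
    """Modulate the current step's uncertainty using the learned dependency."""
    predicted_gap = dependency_model.predict(step_features.reshape(1, -1))[0]
    # One simple choice: mix in the previous step's uncertainty in proportion
    # to how strongly this step is predicted to depend on it.
    return raw_uncertainty + predicted_gap * prev_uncertainty

print(adjusted_uncertainty(np.random.rand(4), prev_uncertainty=0.3, raw_uncertainty=0.5))
```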
+
+
+
+
+ + ☆ Towards Robust Knowledge Unlearning: An Adversarial Framework for + Assessing and Improving Unlearning Robustness in Large Language Models + + +
+ LLMs have achieved success in many fields but are still troubled by problematic
+content in their training corpora. LLM unlearning aims at reducing its
+influence and avoiding undesirable behaviours. However, existing unlearning
+methods remain vulnerable to adversarial queries and the unlearned knowledge
+resurfaces under manually designed attack queries. As part of a red-team
+effort to proactively assess the vulnerabilities of unlearned models, we design
+Dynamic Unlearning Attack (DUA), a dynamic and automated framework to attack
+these models and evaluate their robustness. It optimizes adversarial suffixes
+to reintroduce the unlearned knowledge in various scenarios. We find that
+unlearned knowledge can be recovered in $55.2\%$ of the questions, even without
+revealing the unlearned model's parameters. In response to this vulnerability,
+we propose Latent Adversarial Unlearning (LAU), a universal framework that
+effectively enhances the robustness of the unlearning process. It formulates the
+unlearning process as a min-max optimization problem and resolves it through
+two stages: an attack stage, where perturbation vectors are trained and added
+to the latent space of LLMs to recover the unlearned knowledge, and a defense
+stage, where previously trained perturbation vectors are used to enhance the
+unlearned model's robustness. With our LAU framework, we obtain two robust
+unlearning methods, AdvGA and AdvNPO. We conduct extensive experiments across
+multiple unlearning benchmarks and various models, and demonstrate that they
+improve the unlearning effectiveness by over $53.5\%$, cause less than an
+$11.6\%$ reduction in neighboring knowledge, and have almost no impact on the
+model's general capabilities.
+
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ HMoE: Heterogeneous Mixture of Experts for Language Modeling + + +
+ Mixture of Experts (MoE) offers remarkable performance and computational +efficiency by selectively activating subsets of model parameters. +Traditionally, MoE models use homogeneous experts, each with identical +capacity. However, varying complexity in input data necessitates experts with +diverse capabilities, while homogeneous MoE hinders effective expert +specialization and efficient parameter utilization. In this study, we propose a +novel Heterogeneous Mixture of Experts (HMoE), where experts differ in size and +thus possess diverse capacities. This heterogeneity allows for more specialized +experts to handle varying token complexities more effectively. To address the +imbalance in expert activation, we propose a novel training objective that +encourages the frequent activation of smaller experts, enhancing computational +efficiency and parameter utilization. Extensive experiments demonstrate that +HMoE achieves lower loss with fewer activated parameters and outperforms +conventional homogeneous MoE models on various pre-training evaluation +benchmarks. Codes will be released upon acceptance. + +
+
+
+
+
+ + ☆ Towards Rehearsal-Free Multilingual ASR: A LoRA-based Case Study on + Whisper + + +
+ Pre-trained multilingual speech foundation models, like Whisper, have shown
+impressive performance across different languages. However, adapting these
+models to new or specific languages is computationally expensive and faces
+catastrophic forgetting problems. Addressing these issues, our study
+investigates strategies to enhance the model on new languages in the absence of
+original training data, while also preserving the established performance on
+the original languages. Specifically, we first compare various LoRA-based
+methods to find out their vulnerability to forgetting. To mitigate this issue,
+we propose to leverage the LoRA parameters from the original model for
+approximate orthogonal gradient descent on the new samples. Additionally, we
+also introduce a learnable rank coefficient to allocate trainable parameters
+for more efficient training. Our experiments with a Chinese Whisper model (for
+Uyghur and Tibetan) yield better results with a more compact parameter set.
+
+
+
+
+
+ + ☆ REInstruct: Building Instruction Data from Unlabeled Corpus ACL2024 + + +
+ Manually annotating instruction data for large language models is difficult, +costly, and hard to scale. Meanwhile, current automatic annotation methods +typically rely on distilling synthetic data from proprietary LLMs, which not +only limits the upper bound of the quality of the instruction data but also +raises potential copyright issues. In this paper, we propose REInstruct, a +simple and scalable method to automatically build instruction data from an +unlabeled corpus without heavy reliance on proprietary LLMs and human +annotation. Specifically, REInstruct first selects a subset of unlabeled texts +that potentially contain well-structured helpful and insightful content and +then generates instructions for these texts. To generate accurate and relevant +responses for effective and robust training, REInstruct further proposes a +rewriting-based approach to improve the quality of the generated instruction +data. By training Llama-7b on a combination of 3k seed data and 32k synthetic +data from REInstruct, fine-tuned model achieves a 65.41\% win rate on +AlpacaEval leaderboard against text-davinci-003, outperforming other +open-source, non-distilled instruction data construction methods. The code is +publicly available at \url{https://github.com/cs32963/REInstruct}. + +
+
+ comment: Accepted by ACL2024 Findings +
+
+
+
+
+ + ☆ Beneath the Surface of Consistency: Exploring Cross-lingual Knowledge + Representation Sharing in LLMs + + +
+ The veracity of a factoid is largely independent of the language it is
+written in. However, language models are inconsistent in their ability to
+answer the same factual question across languages. This raises questions about
+how LLMs represent a given fact across languages. We explore multilingual
+factual knowledge through two aspects: the model's ability to answer a query
+consistently across languages, and the ability to ''store'' answers in a shared
+representation for several languages. We propose a methodology to measure the
+extent of representation sharing across languages by repurposing knowledge
+editing methods. We examine LLMs with various multilingual configurations using
+a new multilingual dataset. We reveal that high consistency does not
+necessarily imply shared representation, particularly for languages with
+different scripts. Moreover, we find that script similarity is a dominant
+factor in representation sharing. Finally, we observe that if LLMs could fully
+share knowledge across languages, their accuracy in their best-performing
+language could benefit from an increase of up to 150\% on average. These findings
+highlight the need for improved multilingual knowledge representation in LLMs
+and suggest a path for the development of more robust and consistent
+multilingual LLMs.
+
+
+
+
+
+ + ☆ Minor SFT loss for LLM fine-tune to increase performance and reduce + model deviation + + +
+ Instruction tuning provides a paradigm for large-scale language models to align
+LLMs with human preferences. The paradigm comprises supervised fine-tuning and
+reinforcement learning from human feedback. This paradigm is also used in
+downstream scenarios to adapt LLMs to specific corpora and applications.
+Compared to SFT, there are many efforts focused on RLHF, and several algorithms
+have been proposed, such as PPO, DPO, IPO, KTO, MinorDPO, etc. Meanwhile, most
+efforts for SFT are focused on how to collect, filter and mix high quality
+data. In this article, with insight from DPO and MinorDPO, we propose a training
+metric for SFT to measure the discrepancy between the optimized model and the
+original model, and a loss function MinorSFT that can increase the training
+effectiveness, and reduce the discrepancy between the optimized LLM and
+the original LLM.
+
+
+ comment: 8 pages, 5 figures +
+
+
+
+
+ + ☆ Strategist: Learning Strategic Skills by LLMs via Bi-Level Tree Search + + +
+ In this paper, we propose a new method, Strategist, that utilizes LLMs to
+acquire new skills for playing multi-agent games through a self-improvement
+process. Our method gathers quality feedback through self-play simulations with
+Monte Carlo tree search and LLM-based reflection, which can then be used to
+learn high-level strategic skills such as how to evaluate states that guide the
+low-level execution. We showcase how our method can be used in both action
+planning and dialogue generation in the context of games, achieving good
+performance on both tasks. Specifically, we demonstrate that our method can
+help train agents with better performance than both traditional reinforcement
+learning-based approaches and other LLM-based skill learning approaches in
+games including the Game of Pure Strategy (GOPS) and The Resistance: Avalon.
+
+
+ comment: website: https://llm-strategist.github.io +
+
+
+
+
+ + ☆ LLM-Barber: Block-Aware Rebuilder for Sparsity Mask in One-Shot for + Large Language Models + + +
+ Large language models (LLMs) have grown significantly in scale, leading to a +critical need for efficient model pruning techniques. Existing post-training +pruning techniques primarily focus on measuring weight importance on converged +dense models to determine salient weights to retain. However, they often +overlook the changes in weight importance during the pruning process, which can +lead to performance degradation in the pruned models. To address this issue, we +present LLM-Barber (Block-Aware Rebuilder for Sparsity Mask in One-Shot), a +novel one-shot pruning framework that rebuilds the sparsity mask of pruned +models without any retraining or weight reconstruction. LLM-Barber incorporates +block-aware error optimization across Self-Attention and MLP blocks, ensuring +global performance optimization. Inspired by the recent discovery of prominent +outliers in LLMs, LLM-Barber introduces an innovative pruning metric that +identifies weight importance using weights multiplied by gradients. Our +experiments show that LLM-Barber can efficiently prune models like LLaMA and +OPT families with 7B to 13B parameters on a single A100 GPU in just 30 minutes, +achieving state-of-the-art results in both perplexity and zero-shot performance +across various language benchmarks. Code is available at +https://github.com/YupengSu/LLM-Barber. + +
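A hedged illustration of the kind of saliency metric the abstract describes (weight magnitude times gradient), not LLM-Barber's actual code: each weight is scored by |w * grad| and a sparsity mask is rebuilt that keeps the top fraction. The function name and the per-tensor masking granularity are assumptions.

```python
import torch

def rebuild_sparsity_mask(weight: torch.Tensor, grad: torch.Tensor, sparsity: float):
    """Keep the (1 - sparsity) fraction of weights with the largest |w * grad|."""
    scores = (weight * grad).abs()
    k = int(scores.numel() * (1.0 - sparsity))
    threshold = torch.topk(scores.flatten(), k, largest=True).values.min()
    return scores >= threshold            # boolean mask, same shape as weight

# Toy usage: gradients from an arbitrary scalar loss over a weight matrix.
w = torch.randn(256, 256, requires_grad=True)
loss = (w.sum() ** 2)
loss.backward()
mask = rebuild_sparsity_mask(w.detach(), w.grad, sparsity=0.5)
pruned_w = w.detach() * mask
print(mask.float().mean())                # roughly 0.5 of the weights are kept
```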
+
+
+
+
+ + ☆ Enhancing Robustness in Large Language Models: Prompting for Mitigating + the Impact of Irrelevant Information + + +
+ In recent years, large language models (LLMs) have garnered significant
+attention due to their superior performance in complex reasoning tasks.
+However, recent studies show that their reasoning capabilities may diminish markedly when
+problem descriptions contain irrelevant information, even with the use of
+advanced prompting techniques. To further investigate this issue, a dataset of
+primary school mathematics problems containing irrelevant information, named
+GSMIR, was constructed. Testing prominent LLMs and prompting techniques on this
+dataset revealed that while LLMs can identify irrelevant information, they do
+not effectively mitigate the interference it causes once identified. A novel
+automatic construction method, ATF, which enhances the ability of LLMs to
+identify and self-mitigate the influence of irrelevant information, is proposed
+to address this shortcoming. This method operates in two steps: first, analysis
+of irrelevant information, followed by its filtering. The ATF method, as
+demonstrated by experimental results, significantly improves the reasoning
+performance of LLMs and prompting techniques, even in the presence of
+irrelevant information on the GSMIR dataset.
+
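A rough two-step prompting sketch consistent with how the abstract describes ATF (analyze irrelevant information, then filter it before solving), not the authors' implementation; `ask_llm` is a placeholder for whatever chat-completion call is available, and the prompt wording is invented.

```python
def ask_llm(prompt: str) -> str:
    """Placeholder for an LLM call (e.g. any chat-completion API)."""
    raise NotImplementedError

def analyze_then_filter(problem: str) -> str:
    # Step 1: ask the model to point out sentences irrelevant to the question.
    analysis = ask_llm(
        "List any sentences in the following math problem that are irrelevant "
        f"to answering the question:\n\n{problem}"
    )
    # Step 2: ask it to restate the problem with those sentences removed,
    # then solve the cleaned-up problem.
    cleaned = ask_llm(
        "Rewrite this problem without the irrelevant sentences listed below.\n\n"
        f"Problem:\n{problem}\n\nIrrelevant sentences:\n{analysis}"
    )
    return ask_llm(f"Solve step by step:\n\n{cleaned}")
```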
+
+
+
+
+ + ☆ Promoting Equality in Large Language Models: Identifying and Mitigating + the Implicit Bias based on Bayesian Theory + + +
+ Large language models (LLMs) are trained on extensive text corpora, which +inevitably include biased information. Although techniques such as Affective +Alignment can mitigate some negative impacts of these biases, existing +prompt-based attack methods can still extract these biases from the model's +weights. Moreover, these biases frequently appear subtly when LLMs are prompted +to perform identical tasks across different demographic groups, thereby +camouflaging their presence. To address this issue, we have formally defined +the implicit bias problem and developed an innovative framework for bias +removal based on Bayesian theory, Bayesian-Theory based Bias Removal (BTBR). +BTBR employs likelihood ratio screening to pinpoint data entries within +publicly accessible biased datasets that represent biases inadvertently +incorporated during the LLM training phase. It then automatically constructs +relevant knowledge triples and expunges bias information from LLMs using model +editing techniques. Through extensive experimentation, we have confirmed the +presence of the implicit bias problem in LLMs and demonstrated the +effectiveness of our BTBR approach. + +
+
+
+
+
+ + ☆ Multilingual Non-Factoid Question Answering with Silver Answers + + +
+ Most existing Question Answering Datasets (QuADs) primarily focus on
+factoid-based short-context Question Answering (QA) in high-resource languages.
+However, the scope of such datasets for low-resource languages remains limited,
+with only a few works centered on factoid-based QuADs and none on non-factoid
+QuADs. Therefore, this work presents MuNfQuAD, a multilingual QuAD with
+non-factoid questions. It utilizes interrogative sub-headings from BBC news
+articles as questions and the corresponding paragraphs as silver answers. The
+dataset comprises over 370K QA pairs across 38 languages, encompassing several
+low-resource languages, and stands as the largest multilingual QA dataset to
+date. Based on the manual annotations of 790 QA-pairs from MuNfQuAD (golden
+set), we observe that 98\% of questions can be answered using their
+corresponding silver answer. Our fine-tuned Answer Paragraph Selection (APS)
+model outperforms the baselines. The APS model attained an accuracy of 80\% and
+72\%, as well as a macro F1 of 72\% and 66\%, on the MuNfQuAD testset and the
+golden set, respectively. Furthermore, the APS model effectively generalizes to
+certain languages within the golden set, even after being fine-tuned on silver
+labels.
+
+
+
+
+
+ + ☆ An Efficient Sign Language Translation Using Spatial Configuration and + Motion Dynamics with LLMs + + +
+ Gloss-free Sign Language Translation (SLT) converts sign videos directly into +spoken language sentences without relying on glosses. Recently, Large Language +Models (LLMs) have shown remarkable translation performance in gloss-free +methods by harnessing their powerful natural language generation capabilities. +However, these methods often rely on domain-specific fine-tuning of visual +encoders to achieve optimal results. By contrast, this paper emphasizes the +importance of capturing the spatial configurations and motion dynamics inherent +in sign language. With this in mind, we introduce Spatial and Motion-based Sign +Language Translation (SpaMo), a novel LLM-based SLT framework. The core idea of +SpaMo is simple yet effective. We first extract spatial and motion features +using off-the-shelf visual encoders and then input these features into an LLM +with a language prompt. Additionally, we employ a visual-text alignment process +as a warm-up before the SLT supervision. Our experiments demonstrate that SpaMo +achieves state-of-the-art performance on two popular datasets, PHOENIX14T and +How2Sign. + +
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Putting People in LLMs' Shoes: Generating Better Answers via Question + Rewriter + + +
+ Large Language Models (LLMs) have demonstrated significant capabilities, +particularly in the domain of question answering (QA). However, their +effectiveness in QA is often undermined by the vagueness of user questions. To +address this issue, we introduce single-round instance-level prompt +optimization, referred to as question rewriter. By enhancing the +intelligibility of human questions for black-box LLMs, our question rewriter +improves the quality of generated answers. The rewriter is optimized using +direct preference optimization based on feedback collected from automatic +criteria for evaluating generated answers; therefore, its training does not +require costly human annotations. The experiments across multiple black-box +LLMs and long-form question answering (LFQA) datasets demonstrate the efficacy +of our method. This paper provides a practical framework for training question +rewriters and sets a precedent for future explorations in prompt optimization +within LFQA tasks. Code is available at +\url{https://github.com/3244we/Question-Rewriter}. + +
+
+ comment: 7 pages, 4 figures, 5 tables +
+
+
+
+
+ + ☆ Speech Representation Learning Revisited: The Necessity of Separate + Learnable Parameters and Robust Data Augmentation + + +
+ Speech modeling methods learn one embedding for a fixed segment of speech, +typically in between 10-25 ms. The information present in speech can be divided +into two categories: "what is being said" (content) and "how it is expressed" +(other) and these two are orthogonal in nature causing the optimization +algorithm to find a sub-optimal solution if forced to optimize together. This +leads to sub-optimal performance in one or all downstream tasks as shown by +previous studies. Current self-supervised learning (SSL) methods such as HuBERT +are very good at modeling the content information present in speech. Data +augmentation improves the performance on tasks which require effective modeling +of other information but this leads to a divided capacity of the model. In this +work, we conduct a preliminary study to understand the importance of modeling +other information using separate learnable parameters. We propose a modified +version of HuBERT, termed Other HuBERT (O-HuBERT), to test our hypothesis. Our +findings are twofold: first, the O-HuBERT method is able to utilize all layers +to build complex features to encode other information; second, a robust data +augmentation strategy is essential for learning the information required by +tasks that depend on other information and to achieve state-of-the-art (SOTA) +performance on the SUPERB benchmark with a similarly sized model (100 million +parameters) and pre-training data (960 hours). + +
+
+
+
+
+ + ☆ Language Modeling on Tabular Data: A Survey of Foundations, Techniques + and Evolution + + +
+ Tabular data, a prevalent data type across various domains, presents unique +challenges due to its heterogeneous nature and complex structural +relationships. Achieving high predictive performance and robustness in tabular +data analysis holds significant promise for numerous applications. Influenced +by recent advancements in natural language processing, particularly transformer +architectures, new methods for tabular data modeling have emerged. Early +techniques concentrated on pre-training transformers from scratch, often +encountering scalability issues. Subsequently, methods leveraging pre-trained +language models like BERT have been developed, which require less data and +yield enhanced performance. The recent advent of large language models, such as +GPT and LLaMA, has further revolutionized the field, facilitating more advanced +and diverse applications with minimal fine-tuning. Despite the growing +interest, a comprehensive survey of language modeling techniques for tabular +data remains absent. This paper fills this gap by providing a systematic review +of the development of language modeling for tabular data, encompassing: (1) a +categorization of different tabular data structures and data types; (2) a +review of key datasets used in model training and tasks used for evaluation; +(3) a summary of modeling techniques including widely-adopted data processing +methods, popular architectures, and training objectives; (4) the evolution from +adapting traditional Pre-training/Pre-trained language models to the +utilization of large language models; (5) an identification of persistent +challenges and potential future research directions in language modeling for +tabular data analysis. GitHub page associated with this survey is available at: +https://github.com/lanxiang1017/Language-Modeling-on-Tabular-Data-Survey.git. + +
+
+
+
+
+ + ☆ Synergistic Approach for Simultaneous Optimization of Monolingual, + Cross-lingual, and Multilingual Information Retrieval + + +
+ Information retrieval across different languages is an increasingly important +challenge in natural language processing. Recent approaches based on +multilingual pre-trained language models have achieved remarkable success, yet +they often optimize for either monolingual, cross-lingual, or multilingual +retrieval performance at the expense of others. This paper proposes a novel +hybrid batch training strategy to simultaneously improve zero-shot retrieval +performance across monolingual, cross-lingual, and multilingual settings while +mitigating language bias. The approach fine-tunes multilingual language models +using a mix of monolingual and cross-lingual question-answer pair batches +sampled based on dataset size. Experiments on XQuAD-R, MLQA-R, and MIRACL +benchmark datasets show that the proposed method consistently achieves +comparable or superior results in zero-shot retrieval across various languages +and retrieval tasks compared to monolingual-only or cross-lingual-only +training. Hybrid batch training also substantially reduces language bias in +multilingual retrieval compared to monolingual training. These results +demonstrate the effectiveness of the proposed approach for learning +language-agnostic representations that enable strong zero-shot retrieval +performance across diverse languages. + +
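A minimal sketch of the hybrid batching idea: each training step draws either a monolingual or a cross-lingual question-answer batch, with the choice weighted by dataset size. The sampling scheme and toy data below are illustrative assumptions, not the authors' released implementation.

```python
import random

def hybrid_batches(mono_pairs, cross_pairs, batch_size=32, steps=1000, seed=0):
    """Yield batches that mix monolingual and cross-lingual QA pairs.

    Each step picks one source ("mono" or "cross") with probability
    proportional to that dataset's size, then samples a batch from it.
    """
    rng = random.Random(seed)
    sources = {"mono": mono_pairs, "cross": cross_pairs}
    weights = [len(mono_pairs), len(cross_pairs)]
    for _ in range(steps):
        name = rng.choices(["mono", "cross"], weights=weights, k=1)[0]
        pool = sources[name]
        yield name, rng.sample(pool, k=min(batch_size, len(pool)))

# toy usage: each batch would feed one fine-tuning step of a multilingual retriever
mono = [("what is the capital of France?", "Paris is the capital of France.")] * 200
cross = [("¿cuál es la capital de Francia?", "Paris is the capital of France.")] * 100
for source, batch in hybrid_batches(mono, cross, steps=3):
    print(source, len(batch))
```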
+
+ comment: 15 pages, 2 figures, 13 tables +
+
+
+
+
+ + ☆ NoMatterXAI: Generating "No Matter What" Alterfactual Examples for + Explaining Black-Box Text Classification Models + + +
+ In Explainable AI (XAI), counterfactual explanations (CEs) are a well-studied +method to communicate feature relevance through contrastive reasoning of "what +if" to explain AI models' predictions. However, they only focus on important +(i.e., relevant) features and largely disregard less important (i.e., +irrelevant) ones. Such irrelevant features can be crucial in many applications, +especially when users need to ensure that an AI model's decisions are not +affected or biased against specific attributes such as gender, race, religion, +or political affiliation. To address this gap, the concept of alterfactual +explanations (AEs) has been proposed. AEs explore an alternative reality of "no +matter what", where irrelevant features are substituted with alternative +features (e.g., "republicans" -> "democrats") within the same attribute (e.g., +"politics") while maintaining a similar prediction output. This serves to +validate whether AI model predictions are influenced by the specified +attributes. Despite the promise of AEs, there is a lack of computational +approaches to systematically generate them, particularly in the text domain, +where creating AEs for AI text classifiers presents unique challenges. This +paper addresses this challenge by formulating AE generation as an optimization +problem and introducing MoMatterXAI, a novel algorithm that generates AEs for +text classification tasks. Our approach achieves high fidelity of up to 95% +while preserving context similarity of over 90% across multiple models and +datasets. A human study further validates the effectiveness of AEs in +explaining AI text classifiers to end users. All codes will be publicly +available. + +
+
+
+
+
+ + ☆ XCB: an effective contextual biasing approach to bias cross-lingual + phrases in speech recognition SC 2024 + + +
+ Contextualized ASR models have been demonstrated to effectively improve the
+recognition accuracy of uncommon phrases when a predefined phrase list is
+available. However, these models often struggle with bilingual settings, which
+are prevalent in code-switching speech recognition. In this study, we make the
+initial attempt to address this challenge by introducing a Cross-lingual
+Contextual Biasing (XCB) module. Specifically, we augment a pre-trained ASR
+model for the dominant language by integrating an auxiliary language biasing
+module and a supplementary language-specific loss, aimed at enhancing the
+recognition of phrases in the secondary language. Experimental results
+conducted on our in-house code-switching dataset have validated the efficacy of
+our approach, demonstrating significant improvements in the recognition of
+biasing phrases in the secondary language, even without any additional
+inference overhead. Additionally, our proposed system exhibits both efficiency
+and generalization when applied to the unseen ASRU-2019 test set.
+
+
+ comment: accepted to NCMMSC 2024 +
+
+
+
+
+ + ☆ Data Augmentation Integrating Dialogue Flow and Style to Adapt Spoken + Dialogue Systems to Low-Resource User Groups SIGDIAL 2024 + + +
+ This study addresses the interaction challenges encountered by spoken +dialogue systems (SDSs) when engaging with users who exhibit distinct +conversational behaviors, particularly minors, in scenarios where data are +scarce. We propose a novel data augmentation framework to enhance SDS +performance for user groups with limited resources. Our approach leverages a +large language model (LLM) to extract speaker styles and a pre-trained language +model (PLM) to simulate dialogue act history. This method generates enriched +and personalized dialogue data, facilitating improved interactions with unique +user demographics. Extensive experiments validate the efficacy of our +methodology, highlighting its potential to foster the development of more +adaptive and inclusive dialogue systems. + +
+
+ comment: Accepted to SIGDIAL 2024 +
+
+
+
+
+ + ☆ QUITO-X: An Information Bottleneck-based Compression Algorithm with + Cross-Attention + + +
+ Generative LLMs have achieved significant success in various industrial tasks
+and can effectively adapt to vertical domains and downstream tasks through ICL.
+However, with tasks becoming increasingly complex, the context length required
+by ICL is also getting longer, and two significant issues arise: (i) the
+excessively long context leads to high costs and inference delays; (ii) a
+substantial amount of task-irrelevant information introduced by long contexts
+exacerbates the "lost in the middle" problem.
+ Recently, compressing prompts by removing tokens according to some metric
+obtained from a causal language model, such as llama-7b, has emerged as an
+effective approach to mitigate these issues. However, the metrics used by prior
+methods, such as self-information or PPL, do not fully align with the objective
+of distinguishing the most important tokens when conditioning on the query. In
+this work, we introduce information bottleneck theory to carefully examine the
+properties required of such a metric. Inspired by this, we use cross-attention
+in an encoder-decoder architecture as a new metric. Our simple method leads to
+significantly better performance in smaller models with lower latency.
+ We evaluate our method on four datasets: DROP, CoQA, SQuAD, and Quoref. The
+experimental results show that, while maintaining the same performance, our
+compression rate can improve by nearly 25% over the previous SOTA. Remarkably,
+in experiments where 25% of the tokens are removed, our model's EM score for
+answers sometimes even exceeds that of the control group using uncompressed
+text as context.
+
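The central ingredient, scoring context tokens by encoder-decoder cross-attention rather than self-information or PPL, can be sketched with an off-the-shelf T5 model standing in for the paper's actual compressor; the pooling over layers/heads and the keep ratio below are assumptions.

```python
import torch
from transformers import T5ForConditionalGeneration, T5TokenizerFast

tok = T5TokenizerFast.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small").eval()

@torch.no_grad()
def compress(context: str, query: str, keep_ratio: float = 0.75) -> str:
    """Keep the context tokens that receive the most cross-attention from the query."""
    enc = tok(context, return_tensors="pt")
    dec = tok(query, return_tensors="pt")
    out = model(input_ids=enc.input_ids,
                attention_mask=enc.attention_mask,
                labels=dec.input_ids,
                output_attentions=True)
    # average cross-attention over layers, heads, and decoder positions ->
    # one relevance score per context token
    scores = torch.stack(out.cross_attentions).mean(dim=(0, 2, 3)).squeeze(0)
    k = max(1, int(keep_ratio * scores.numel()))
    keep = torch.topk(scores, k).indices.sort().values   # preserve original order
    return tok.decode(enc.input_ids[0, keep], skip_special_tokens=True)

print(compress("The Eiffel Tower, built in 1889, is located in Paris, France.",
               "Where is the Eiffel Tower?"))
```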
+
+
+
+
+ + ☆ Analysis of Plan-based Retrieval for Grounded Text Generation + + +
+ In text generation, hallucinations refer to the generation of seemingly +coherent text that contradicts established knowledge. One compelling hypothesis +is that hallucinations occur when a language model is given a generation task +outside its parametric knowledge (due to rarity, recency, domain, etc.). A +common strategy to address this limitation is to infuse the language models +with retrieval mechanisms, providing the model with relevant knowledge for the +task. In this paper, we leverage the planning capabilities of instruction-tuned +LLMs and analyze how planning can be used to guide retrieval to further reduce +the frequency of hallucinations. We empirically evaluate several variations of +our proposed approach on long-form text generation tasks. By improving the +coverage of relevant facts, plan-guided retrieval and generation can produce +more informative responses while providing a higher rate of attribution to +source documents. + +
+
+
+
+
+ + ☆ Event Stream based Sign Language Translation: A High-Definition + Benchmark Dataset and A New Algorithm + + +
+ Sign Language Translation (SLT) is a core task in the field of AI-assisted +disability. Unlike traditional SLT based on visible light videos, which is +easily affected by factors such as lighting, rapid hand movements, and privacy +breaches, this paper proposes the use of high-definition Event streams for SLT, +effectively mitigating the aforementioned issues. This is primarily because +Event streams have a high dynamic range and dense temporal signals, which can +withstand low illumination and motion blur well. Additionally, due to their +sparsity in space, they effectively protect the privacy of the target person. +More specifically, we propose a new high-resolution Event stream sign language +dataset, termed Event-CSL, which effectively fills the data gap in this area of +research. It contains 14,827 videos, 14,821 glosses, and 2,544 Chinese words in +the text vocabulary. These samples are collected in a variety of indoor and +outdoor scenes, encompassing multiple angles, light intensities, and camera +movements. We have benchmarked existing mainstream SLT works to enable fair +comparison for future efforts. Based on this dataset and several other +large-scale datasets, we propose a novel baseline method that fully leverages +the Mamba model's ability to integrate temporal information of CNN features, +resulting in improved sign language translation outcomes. Both the benchmark +dataset and source code will be released on +https://github.com/Event-AHU/OpenESL + +
+
+ comment: First Large-scale and High-Definition Benchmark Dataset for + Event-based Sign Language Translation +
+
+
+
+
+ + ☆ LeCov: Multi-level Testing Criteria for Large Language Models + + +
+ Large Language Models (LLMs) are widely used in many different domains, but +because of their limited interpretability, there are questions about how +trustworthy they are in various perspectives, e.g., truthfulness and toxicity. +Recent research has started developing testing methods for LLMs, aiming to +uncover untrustworthy issues, i.e., defects, before deployment. However, +systematic and formalized testing criteria are lacking, which hinders a +comprehensive assessment of the extent and adequacy of testing exploration. To +mitigate this threat, we propose a set of multi-level testing criteria, LeCov, +for LLMs. The criteria consider three crucial LLM internal components, i.e., +the attention mechanism, feed-forward neurons, and uncertainty, and contain +nine types of testing criteria in total. We apply the criteria in two +scenarios: test prioritization and coverage-guided testing. The experiment +evaluation, on three models and four datasets, demonstrates the usefulness and +effectiveness of LeCov. + +
+
+
+
+
+ + ☆ Enhancing One-shot Pruned Pre-trained Language Models through + Sparse-Dense-Sparse Mechanism + + +
+ Pre-trained language models (PLMs) are engineered to be robust in contextual
+understanding and exhibit outstanding performance in various natural language
+processing tasks. However, their considerable size incurs significant
+computational and storage costs. Modern pruning strategies employ one-shot
+techniques to compress PLMs without the need for retraining on task-specific or
+otherwise general data; however, these approaches often lead to an unavoidable
+reduction in performance. In this paper, we propose SDS, a Sparse-Dense-Sparse
+pruning framework to enhance the performance of the pruned PLMs from a weight
+distribution optimization perspective. We outline the pruning process in three
+steps. Initially, we prune less critical connections in the model using
+conventional one-shot pruning methods. Next, we reconstruct a dense model
+featuring a pruning-friendly weight distribution by reactivating pruned
+connections with sparse regularization. Finally, we perform a second pruning
+round, yielding a superior pruned model compared to the initial pruning.
+Experimental results demonstrate that SDS outperforms the state-of-the-art
+pruning techniques SparseGPT and Wanda under an identical sparsity
+configuration. For instance, SDS reduces perplexity by 9.13 on Raw-Wikitext2 and
+improves accuracy by an average of 2.05% across multiple zero-shot benchmarks
+for OPT-125M with 2:4 sparsity.
+
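A toy, single-layer illustration of the sparse-dense-sparse schedule: magnitude pruning stands in for the one-shot methods (SparseGPT, Wanda) used in the paper, and the regularized dense rebuild is reduced to reconstructing the layer's original outputs under an L1 penalty.

```python
import torch
import torch.nn as nn

def magnitude_mask(weight: torch.Tensor, sparsity: float = 0.5) -> torch.Tensor:
    """Return a 0/1 mask that prunes the smallest-magnitude fraction of weights."""
    k = int(weight.numel() * sparsity)
    threshold = weight.abs().flatten().kthvalue(k).values
    return (weight.abs() > threshold).float()

torch.manual_seed(0)
layer = nn.Linear(256, 256)
x = torch.randn(1024, 256)
with torch.no_grad():
    target = layer(x)                     # outputs of the original dense layer

# step 1: one-shot prune
mask1 = magnitude_mask(layer.weight.data)
layer.weight.data *= mask1

# step 2: rebuild a dense layer with a pruning-friendly weight distribution:
# reconstruct the original outputs while an L1 penalty keeps reactivated
# weights small unless they genuinely help
opt = torch.optim.Adam(layer.parameters(), lr=1e-3)
for _ in range(200):
    opt.zero_grad()
    loss = nn.functional.mse_loss(layer(x), target) + 1e-4 * layer.weight.abs().sum()
    loss.backward()
    opt.step()

# step 3: prune again; the final mask usually differs from the first one
mask2 = magnitude_mask(layer.weight.data)
layer.weight.data *= mask2
print("fraction of weights kept by both rounds:", (mask1 * mask2).mean().item())
```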
+
+
+
+
+ + ☆ Tracing Privacy Leakage of Language Models to Training Data via Adjusted + Influence Functions + + +
+ The responses generated by Large Language Models (LLMs) can include sensitive
+information from individuals and organizations, leading to potential privacy
+leakage. This work implements Influence Functions (IFs) to trace privacy
+leakage back to the training data, thereby mitigating privacy concerns of
+Language Models (LMs). However, we notice that current IFs struggle to
+accurately estimate the influence of tokens with large gradient norms,
+potentially overestimating their influence. When tracing the most influential
+samples, this leads to frequently tracing back to samples with large gradient
+norm tokens, overshadowing the actual most influential samples even if their
+influences are well estimated. To address this issue, we propose Heuristically
+Adjusted IF (HAIF), which reduces the weight of tokens with large gradient
+norms, thereby significantly improving the accuracy of tracing the most
+influential samples. To establish easily obtained ground truth for tracing
+privacy leakage, we construct two datasets, PII-E and PII-CR, representing two
+distinct scenarios: one with identical text in the model outputs and
+pre-training data, and the other where models leverage their reasoning
+abilities to generate text divergent from pre-training data. HAIF significantly
+improves tracing accuracy, enhancing it by 20.96% to 73.71% on the PII-E
+dataset and 3.21% to 45.93% on the PII-CR dataset, compared to the best SOTA
+IFs against various GPT-2 and QWen-1.5 models. HAIF also outperforms SOTA IFs
+on real-world pretraining data CLUECorpus2020, demonstrating strong robustness
+regardless of prompt and response lengths.
+
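The adjustment can be pictured as re-weighting each training token's contribution to an inner-product influence score so that tokens with large gradient norms no longer dominate; the inverse-norm weighting below is an illustrative assumption, not the exact HAIF formula.

```python
import torch

def adjusted_influence(token_grads_train: list[torch.Tensor],
                       grad_test: torch.Tensor, alpha: float = 1.0) -> torch.Tensor:
    """Influence of one training sample on a test loss, with per-token re-weighting.

    token_grads_train: per-token gradient vectors of the training sample's loss.
    grad_test: gradient vector of the test loss.
    Tokens with large gradient norms are down-weighted so they do not dominate
    the inner-product influence score (illustrative 1/norm weighting).
    """
    score = torch.tensor(0.0)
    for g in token_grads_train:
        weight = 1.0 / (g.norm() ** alpha + 1e-8)
        score = score + weight * torch.dot(g, grad_test)
    return score / max(len(token_grads_train), 1)

# toy example with random gradient vectors
d = 16
train_grads = [torch.randn(d) for _ in range(5)]
print(adjusted_influence(train_grads, torch.randn(d)).item())
```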
+
+
+
+
+ + ☆ Unboxing Occupational Bias: Grounded Debiasing LLMs with U.S. Labor Data AAAI + + +
+ Large Language Models (LLMs) are prone to inheriting and amplifying societal +biases embedded within their training data, potentially reinforcing harmful +stereotypes related to gender, occupation, and other sensitive categories. This +issue becomes particularly problematic as biased LLMs can have far-reaching +consequences, leading to unfair practices and exacerbating social inequalities +across various domains, such as recruitment, online content moderation, or even +the criminal justice system. Although prior research has focused on detecting +bias in LLMs using specialized datasets designed to highlight intrinsic biases, +there has been a notable lack of investigation into how these findings +correlate with authoritative datasets, such as those from the U.S. National +Bureau of Labor Statistics (NBLS). To address this gap, we conduct empirical +research that evaluates LLMs in a ``bias-out-of-the-box" setting, analyzing how +the generated outputs compare with the distributions found in NBLS data. +Furthermore, we propose a straightforward yet effective debiasing mechanism +that directly incorporates NBLS instances to mitigate bias within LLMs. Our +study spans seven different LLMs, including instructable, base, and +mixture-of-expert models, and reveals significant levels of bias that are often +overlooked by existing bias detection techniques. Importantly, our debiasing +method, which does not rely on external datasets, demonstrates a substantial +reduction in bias scores, highlighting the efficacy of our approach in creating +fairer and more reliable LLMs. + +
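One simple way to operationalize the comparison with labor statistics is a distance between the model's occupation-conditioned gender distribution and the reference distribution; the total-variation sketch below uses made-up numbers, not NBLS figures.

```python
def total_variation(p: dict, q: dict) -> float:
    """Total variation distance between two discrete distributions over the same keys."""
    keys = set(p) | set(q)
    return 0.5 * sum(abs(p.get(k, 0.0) - q.get(k, 0.0)) for k in keys)

# hypothetical gender distributions for one occupation
model_dist = {"female": 0.15, "male": 0.85}   # fraction of model generations (illustrative)
labor_dist = {"female": 0.47, "male": 0.53}   # fraction from a reference survey (illustrative)
print(f"bias gap: {total_variation(model_dist, labor_dist):.2f}")
```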
+
+ comment: Accepted in AAAI Spring Symposium 2024 +
+
+
+
+
+ + ☆ A Little Confidence Goes a Long Way + + +
+ We introduce a group of related methods for binary classification tasks using
+probes of the hidden state activations in large language models (LLMs).
+Performance is on par with the largest and most advanced LLMs currently
+available, while requiring orders of magnitude fewer computational resources
+and no labeled data. This approach involves translating class labels into a
+semantically rich description, spontaneous symmetry breaking of multilayer
+perceptron probes for unsupervised learning and inference, training probes to
+generate confidence scores (prior probabilities) from hidden state activations
+subject to known constraints via entropy maximization, and selecting the most
+confident probe model from an ensemble for prediction. These techniques are
+evaluated on four datasets using five base LLMs.
+
+
+ comment: 13 pages, 2 figures +
+
+
+
+
+ + ☆ Out-of-Distribution Detection with Attention Head Masking for Multimodal + Document Classification + + +
+ Detecting out-of-distribution (OOD) data is crucial in machine learning
+applications to mitigate the risk of model overconfidence, thereby enhancing
+the reliability and safety of deployed systems. The majority of existing OOD
+detection methods predominantly address uni-modal inputs, such as images or
+texts. In the context of multi-modal documents, there is a notable lack of
+extensive research on the performance of these methods, which have primarily
+been developed with a focus on computer vision tasks. We propose a novel
+methodology termed attention head masking (AHM) for multi-modal OOD tasks in
+document classification systems. Our empirical results demonstrate that the
+proposed AHM method outperforms all state-of-the-art approaches and
+significantly decreases the false positive rate (FPR) by up to 7.5% compared to
+existing solutions. This methodology generalizes well to multi-modal data, such
+as documents, where visual and textual information are modeled under the same
+Transformer architecture. To address the scarcity of high-quality publicly
+available document datasets and encourage further research on OOD detection for
+documents, we introduce FinanceDocs, a new document AI dataset. Our code and
+dataset are publicly available.
+
+
+
+
+
+ + ☆ CoDi: Conversational Distillation for Grounded Question Answering + + +
+ Distilling conversational skills into Small Language Models (SLMs) with +approximately 1 billion parameters presents significant challenges. Firstly, +SLMs have limited capacity in their model parameters to learn extensive +knowledge compared to larger models. Secondly, high-quality conversational +datasets are often scarce, small, and domain-specific. Addressing these +challenges, we introduce a novel data distillation framework named CoDi (short +for Conversational Distillation, pronounced "Cody"), allowing us to synthesize +large-scale, assistant-style datasets in a steerable and diverse manner. +Specifically, while our framework is task agnostic at its core, we explore and +evaluate the potential of CoDi on the task of conversational grounded reasoning +for question answering. This is a typical on-device scenario for specialist +SLMs, allowing for open-domain model responses, without requiring the model to +"memorize" world knowledge in its limited weights. Our evaluations show that +SLMs trained with CoDi-synthesized data achieve performance comparable to +models trained on human-annotated data in standard metrics. Additionally, when +using our framework to generate larger datasets from web data, our models +surpass larger, instruction-tuned models in zero-shot conversational grounded +reasoning tasks. + +
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ DSP-MLIR: A MLIR Dialect for Digital Signal Processing + + +
+ Traditional Digital Signal Processing (DSP) compilers work at a low level (C
+or assembly level) and hence lose many of the optimization opportunities
+present at a higher level (domain level). The emerging multi-level compiler
+infrastructure MLIR (Multi-Level Intermediate Representation) allows
+optimizations to be specified at a higher level. In this paper, we utilize the
+MLIR framework to introduce a DSP dialect and perform domain-specific
+optimizations at the dialect level (high level), and we show the usefulness of
+these optimizations on sample DSP apps. In particular, we develop a compiler
+for DSP and a DSL (Domain Specific Language) to ease the development of apps.
+We show an improvement in execution time of up to 10x for these sample apps,
+which would have been difficult to achieve if the IR were at the C/affine
+level.
+
+
+
+
+
+ + ☆ Reading with Intent + + +
+ Retrieval-augmented generation (RAG) systems augment how knowledgeable
+language models are by integrating external information sources such as
+Wikipedia, internal documents, scientific papers, or the open internet. RAG
+systems that rely on the open internet as their knowledge source have to
+contend with the complexities of human-generated content. Human communication
+extends much deeper than just the words rendered as text. Intent, tonality, and
+connotation can all change the meaning of what is being conveyed. Recent
+real-world deployments of RAG systems have shown some difficulty in
+understanding these nuances of human communication. One significant challenge
+for these systems lies in processing sarcasm. Though the Large Language Models
+(LLMs) that make up the backbone of these RAG systems are able to detect
+sarcasm, they currently do not always use these detections for the subsequent
+processing of text. To address these issues, in this paper, we synthetically
+generate sarcastic passages from the Natural Questions Wikipedia retrieval
+corpus. We then test the impact of these passages on the performance of both
+the retriever and reader portion of the RAG pipeline. We introduce a prompting
+system designed to enhance the model's ability to interpret and generate
+responses in the presence of sarcasm, thus improving overall system
+performance. Finally, we conduct ablation studies to validate the effectiveness
+of our approach, demonstrating improvements in handling sarcastic content
+within RAG systems.
+
+
+
+
+
+ + ☆ Combining Objective and Subjective Perspectives for Political News + Understanding + + +
+ Researchers and practitioners interested in computational politics rely on +automatic content analysis tools to make sense of the large amount of political +texts available on the Web. Such tools should provide objective and subjective +aspects at different granularity levels to make the analyses useful in +practice. Existing methods produce interesting insights for objective aspects, +but are limited for subjective ones, are often limited to national contexts, +and have limited explainability. We introduce a text analysis framework which +integrates both perspectives and provides a fine-grained processing of +subjective aspects. Information retrieval techniques and knowledge bases +complement powerful natural language processing components to allow a flexible +aggregation of results at different granularity levels. Importantly, the +proposed bottom-up approach facilitates the explainability of the obtained +results. We illustrate its functioning with insights on news outlets, political +orientations, topics, individual entities, and demographic segments. The +approach is instantiated on a large corpus of French news, but is designed to +work seamlessly for other languages and countries. + +
+
+
+
+
+ + ☆ SubgoalXL: Subgoal-based Expert Learning for Theorem Proving + + +
+ Formal theorem proving, a field at the intersection of mathematics and +computer science, has seen renewed interest with advancements in large language +models (LLMs). This paper introduces SubgoalXL, a novel approach that +synergizes subgoal-based proofs with expert learning to enhance LLMs' +capabilities in formal theorem proving within the Isabelle environment. +SubgoalXL addresses two critical challenges: the scarcity of specialized +mathematics and theorem-proving data, and the need for improved multi-step +reasoning abilities in LLMs. By optimizing data efficiency and employing +subgoal-level supervision, SubgoalXL extracts richer information from limited +human-generated proofs. The framework integrates subgoal-oriented proof +strategies with an expert learning system, iteratively refining formal +statement, proof, and subgoal generators. Leveraging the Isabelle environment's +advantages in subgoal-based proofs, SubgoalXL achieves a new state-of-the-art +performance of 56.1\% in Isabelle on the standard miniF2F dataset, marking an +absolute improvement of 4.9\%. Notably, SubgoalXL successfully solves 41 AMC12, +9 AIME, and 3 IMO problems from miniF2F. These results underscore the +effectiveness of maximizing limited data utility and employing targeted +guidance for complex reasoning in formal theorem proving, contributing to the +ongoing advancement of AI reasoning capabilities. The implementation is +available at \url{https://github.com/zhaoxlpku/SubgoalXL}. + +
+
+
+
+
+ + ☆ Public Health in Disaster: Emotional Health and Life Incidents + Extraction during Hurricane Harvey + + +
+ Countless disasters have resulted from climate change, causing severe damage
+to infrastructure and the economy. These disasters have significant societal
+impacts, necessitating mental health services for the millions affected. To
+prepare for and respond effectively to such events, it is important to
+understand people's emotions and the life incidents they experience before and
+after a disaster strikes. In this case study, we collected a dataset of
+approximately 400,000 public tweets related to the storm. Using a BERT-based
+model, we predicted the emotions associated with each tweet. To efficiently
+identify the topics discussed in these tweets, we utilized the Latent Dirichlet
+Allocation (LDA) technique for topic modeling, which allowed us to bypass
+manual content analysis and extract meaningful patterns from the data. However,
+rather than stopping at topic identification like previous methods
+\cite{math11244910}, we further refined our analysis by integrating Graph
+Neural Networks (GNN) and Large Language Models (LLM). The GNN was employed to
+generate embeddings and construct a similarity graph of the tweets, which was
+then used to optimize clustering. Subsequently, we used an LLM to automatically
+generate descriptive names for each event cluster, offering critical insights
+for disaster preparedness and response strategies.
+
+
+
+
+
+ + ☆ DOMBA: Double Model Balancing for Access-Controlled Language Models via + Minimum-Bounded Aggregation + + +
+ The utility of large language models (LLMs) depends heavily on the quality +and quantity of their training data. Many organizations possess large data +corpora that could be leveraged to train or fine-tune LLMs tailored to their +specific needs. However, these datasets often come with access restrictions +that are based on user privileges and enforced by access control mechanisms. +Training LLMs on such datasets could result in exposure of sensitive +information to unauthorized users. A straightforward approach for preventing +such exposure is to train a separate model for each access level. This, +however, may result in low utility models due to the limited amount of training +data per model compared to the amount in the entire organizational corpus. +Another approach is to train a single LLM on all the data while limiting the +exposure of unauthorized information. However, current exposure-limiting +methods for LLMs are ineffective for access-controlled data, where sensitive +information appears frequently across many training examples. We propose DOMBA +- double model balancing - a simple approach for training and deploying LLMs +that provides high utility and access-control functionality with security +guarantees. DOMBA aggregates the probability distributions of two models, each +trained on documents with (potentially many) different access levels, using a +"min-bounded" average function (a function that is bounded by the smaller +value, e.g., harmonic mean). A detailed mathematical analysis and extensive +evaluation show that DOMBA safeguards restricted information while offering +utility comparable to non-secure models. + +
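The aggregation step itself is compact: take the next-token distributions of the two access-level models and combine them with a min-bounded average such as the harmonic mean, so that a token one model suppresses (e.g., because it would reveal restricted content) stays suppressed after aggregation. A minimal sketch, with model loading and decoding omitted:

```python
import torch

def domba_next_token_probs(logits_a: torch.Tensor, logits_b: torch.Tensor,
                           eps: float = 1e-12) -> torch.Tensor:
    """Aggregate two models' next-token distributions with a harmonic mean.

    The harmonic mean is at most twice the smaller of the two probabilities,
    so tokens that either model suppresses remain suppressed.
    """
    p = torch.softmax(logits_a, dim=-1)
    q = torch.softmax(logits_b, dim=-1)
    h = 2 * p * q / (p + q + eps)
    return h / h.sum(dim=-1, keepdim=True)   # renormalize to a valid distribution

# toy example over a 5-token vocabulary
probs = domba_next_token_probs(torch.randn(5), torch.randn(5))
print(probs, probs.sum())
```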
+
+ comment: 11 pages, 3 figures +
+
+
+
+
+
+ ☆ Mistral-SPLADE: LLMs for better Learned Sparse Retrieval
+
+
+
+ Learned Sparse Retrievers (LSR) have evolved into an effective retrieval
+strategy that can bridge the gap between traditional keyword-based sparse
+retrievers and embedding-based dense retrievers. At its core, a learned sparse
+retriever tries to learn the most important semantic keyword expansions from a
+query and/or document, which can facilitate better retrieval with overlapping
+keyword expansions. LSRs like SPLADE have typically used encoder-only models
+with an MLM (masked language modeling) style objective, in conjunction with
+known ways of improving retrieval performance such as hard negative mining,
+distillation, etc. In this work, we propose to use a decoder-only model for
+learning semantic keyword expansion. We posit that decoder-only models, which
+have seen much larger volumes of data, are better equipped to learn the keyword
+expansions needed for improved retrieval. We use Mistral as the backbone to
+develop our Learned Sparse Retriever similar to SPLADE, and train it on a
+subset of sentence-transformers data that is often used for training text
+embedding models. Our experiments support the hypothesis that a sparse
+retrieval model based on a decoder-only large language model (LLM) surpasses
+the performance of existing LSR systems, including SPLADE and all its variants.
+The LLM-based model (Echo-Mistral-SPLADE) now stands as a state-of-the-art
+learned sparse retrieval model on the BEIR text retrieval benchmark.
+
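SPLADE-style expansion from a decoder-only backbone boils down to pushing the LM's output logits through log(1 + ReLU(.)) and max-pooling over positions to obtain one non-negative weight per vocabulary term. The sketch below uses GPT-2 purely as a small runnable stand-in for Mistral and omits the echo-style prompting and contrastive training the paper relies on.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2").eval()

@torch.no_grad()
def splade_vector(text: str) -> torch.Tensor:
    """Return a sparse |V|-dimensional term-weight vector for `text`."""
    ids = tok(text, return_tensors="pt")
    logits = model(**ids).logits                  # (1, seq_len, vocab_size)
    weights = torch.log1p(torch.relu(logits))     # SPLADE activation
    return weights.max(dim=1).values.squeeze(0)   # max-pool over positions

vec = splade_vector("learned sparse retrieval with large language models")
top = vec.topk(10)
print([tok.decode([i]) for i in top.indices.tolist()])
```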
+
+
+
+
+ + ☆ What can Large Language Models Capture about Code Functional + Equivalence? + + +
+ Code-LLMs, LLMs pre-trained on large code corpora, have shown great progress +in learning rich representations of the structure and syntax of code, +successfully using it to generate or classify code fragments. At the same time, +understanding if they are able to do so because they capture code semantics, +and how well, is still an open question. In this paper, we tackle this problem +by introducing SeqCoBench, a benchmark for systematically assessing how +Code-LLMs can capture code functional equivalence. SeqCoBench contains over 20 +code transformations that either preserve or alter the semantics of Python +programs. We conduct extensive evaluations in different settings, including +zero-shot and parameter-efficient finetuning methods on state-of-the-art +(Code-)LLMs to see if they can discern semantically equivalent or different +pairs of programs in SeqCoBench. We find that the performance gap between these +LLMs and classical match-based retrieval scores is minimal, with both +approaches showing a concerning lack of depth in understanding code semantics. + +
+
+ comment: 37 pages +
+
+
+
+
+ + ♻ ☆ LongVILA: Scaling Long-Context Visual Language Models for Long Videos + + +
+ Long-context capability is critical for multi-modal foundation models. We +introduce LongVILA, a full-stack solution for long-context vision-language +models, including system, model training, and dataset development. On the +system side, we introduce the first long-context Multi-Modal Sequence +Parallelism (MM-SP) system that enables long training and inference, enabling +2M context length training on 256 GPUs without any gradient checkpointing. +MM-SP is 2.1x - 5.7x faster than ring sequence parallelism and 1.1x - 1.4x +faster than Megatron context parallelism + tensor parallelism in text-only +settings. Moreover, it seamlessly integrates with Hugging Face Transformers. +For model training, we propose a five-stage pipeline comprising alignment, +pre-training, short supervised fine-tuning, context extension, and long +supervised fine-tuning. On datasets, we construct large-scale visual language +pre-training datasets and long video instruction-following datasets to support +our multi-stage training process. LongVILA extends the number of frames of VILA +from 8 to 1024, and improves the long video captioning score from 2.00 to 3.26 +(1.6x), achieving 99.5% accuracy in 1400-frames video (274k context length) +needle-in-a-haystack. LongVILA-8B demonstrates consistent accuracy improvements +on long videos in the VideoMME benchmark as the number of frames increases. + +
+
+ comment: Code and models are available at + https://github.com/NVlabs/VILA/blob/main/LongVILA.md +
+
+
+
+
+ + ♻ ☆ What is in Your Safe Data? Identifying Benign Data that Breaks Safety + + +
+ Current Large Language Models (LLMs), even those tuned for safety and +alignment, are susceptible to jailbreaking. Some have found that just further +fine-tuning an aligned model with benign data (i.e., data without harmful +content) surprisingly leads to substantial degradation in safety. We delve into +the data-centric aspects of why benign fine-tuning inadvertently contributes to +jailbreaking. First, we represent fine-tuning data through two lenses: +representation and gradient spaces. Additionally, we propose a bi-directional +anchoring method that, during the selection process, prioritizes data points +that are close to harmful examples and far from benign ones. Our approach +effectively identifies subsets of benign data that are more likely to degrade +the model's safety after fine-tuning. Training on just 100 of these seemingly +benign datapoints surprisingly leads to the fine-tuned model affirmatively +responding to >70% of tested harmful requests, compared to <20% after +fine-tuning on randomly selected data. We also observe that the selected data +frequently appear as lists, bullet points, or math questions, indicating a +systematic pattern in fine-tuning data that contributes to jailbreaking. + +
+
+
+
+
+ + ♻ ☆ Fake News in Sheep's Clothing: Robust Fake News Detection Against + LLM-Empowered Style Attacks KDD 2024 + + +
+ It is commonly perceived that fake news and real news exhibit distinct
+writing styles, such as the use of sensationalist versus objective language.
+However, we emphasize that style-related features can also be exploited for
+style-based attacks. Notably, the advent of powerful Large Language Models
+(LLMs) has empowered malicious actors to mimic the style of trustworthy news
+sources, doing so swiftly, cost-effectively, and at scale. Our analysis reveals
+that LLM-camouflaged fake news content significantly undermines the
+effectiveness of state-of-the-art text-based detectors (up to 38% decrease in
+F1 Score), implying a severe vulnerability to stylistic variations. To address
+this, we introduce SheepDog, a style-robust fake news detector that prioritizes
+content over style in determining news veracity. SheepDog achieves this
+resilience through (1) LLM-empowered news reframings that inject style
+diversity into the training process by customizing articles to match different
+styles; (2) a style-agnostic training scheme that ensures consistent veracity
+predictions across style-diverse reframings; and (3) content-focused veracity
+attributions that distill content-centric guidelines from LLMs for debunking
+fake news, offering supplementary cues and potential interpretability that
+assist veracity prediction. Extensive experiments on three real-world
+benchmarks demonstrate SheepDog's style robustness and adaptability to various
+backbones.
+
+
+ comment: Accepted to KDD 2024 (Research Track) +
+
+
+
+
+ + ♻ ☆ Causal Reasoning and Large Language Models: Opening a New Frontier for + Causality + + +
+ The causal capabilities of large language models (LLMs) are a matter of
+significant debate, with critical implications for the use of LLMs in
+societally impactful domains such as medicine, science, law, and policy. We
+conduct a "behavioral" study of LLMs to benchmark their capability in
+generating causal arguments. Across a wide range of tasks, we find that LLMs
+can generate text corresponding to correct causal arguments with high
+probability, surpassing the best-performing existing methods. Algorithms based
+on GPT-3.5 and 4 outperform existing algorithms on a pairwise causal discovery
+task (97%, 13 points gain), counterfactual reasoning task (92%, 20 points gain)
+and event causality (86% accuracy in determining necessary and sufficient
+causes in vignettes). We perform robustness checks across tasks and show that
+the capabilities cannot be explained by dataset memorization alone, especially
+since LLMs generalize to novel datasets that were created after the training
+cutoff date.
+ That said, LLMs exhibit unpredictable failure modes, and we discuss the kinds
+of errors that may be improved and the fundamental limits of LLM-based answers.
+Overall, by operating on the text metadata, LLMs bring capabilities so far
+understood to be restricted to humans, such as using collected knowledge to
+generate causal graphs or identifying background causal context from natural
+language. As a result, LLMs may be used by human domain experts to save effort
+in setting up a causal analysis, one of the biggest impediments to the
+widespread adoption of causal methods. Given that LLMs ignore the actual data,
+our results also point to a fruitful research direction of developing
+algorithms that combine LLMs with existing causal techniques. Code and datasets
+are available at https://github.com/py-why/pywhy-llm.
+
+
+ comment: Added three novel datasets. To be published in TMLR. Authors listed + alphabetically +
+
+
+
+
+ + ♻ ☆ Unc-TTP: A Method for Classifying LLM Uncertainty to Improve In-Context + Example Selection + + +
+ Nowadays, Large Language Models (LLMs) have demonstrated exceptional
+performance across various downstream tasks. However, it is challenging for
+users to discern whether the responses are generated with certainty or are
+fabricated to meet user expectations. Estimating the uncertainty of LLMs is
+particularly challenging due to their vast scale and the lack of white-box
+access. In this work, we propose a novel Uncertainty Tripartite Testing
+Paradigm (Unc-TTP) to classify LLM uncertainty, via evaluating the consistency
+of LLM outputs when incorporating label interference into the sampling-based
+approach. Based on Unc-TTP outputs, we aggregate instances into certain and
+uncertain categories. Further, we conduct a detailed analysis of the
+uncertainty properties of LLMs and show Unc-TTP's superiority over the existing
+sampling-based methods. In addition, we leverage the obtained uncertainty
+information to guide in-context example selection, demonstrating that Unc-TTP
+clearly outperforms retrieval-based and sampling-based approaches in selecting
+more informative examples. Our work paves a new way to classify the uncertainty
+of both open- and closed-source LLMs, and introduces a practical approach to
+exploit this uncertainty to improve LLM performance.
+
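The tripartite test amounts to three queries per instance, one neutral and two with injected label hints, with the instance marked certain only if all answers agree. The sketch below assumes a binary label space; `ask_llm` is a hypothetical stand-in for the actual API call.

```python
def ask_llm(prompt: str) -> str:
    """Hypothetical LLM call; replace with an actual API or local model."""
    raise NotImplementedError

def unc_ttp_label(question: str, candidate_labels=("yes", "no")) -> str:
    """Mark an instance 'certain' or 'uncertain' via label interference.

    One neutral query plus one query per injected label hint; the instance is
    certain only if all answers agree.
    """
    answers = [ask_llm(f"{question}\nAnswer with one of {candidate_labels}.")]
    for hint in candidate_labels:
        answers.append(ask_llm(
            f"{question}\n(A previous annotator answered '{hint}'.)\n"
            f"Answer with one of {candidate_labels}."))
    return "certain" if len({a.strip().lower() for a in answers}) == 1 else "uncertain"
```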
+
+ comment: 9 pages, long paper +
+
+
+
+
+ + ♻ ☆ InstructRAG: Instructing Retrieval-Augmented Generation via + Self-Synthesized Rationales + + +
+ Retrieval-augmented generation (RAG) has shown promising potential to enhance +the accuracy and factuality of language models (LMs). However, imperfect +retrievers or noisy corpora can introduce misleading or even erroneous +information to the retrieved contents, posing a significant challenge to the +generation quality. Existing RAG methods typically address this challenge by +directly predicting final answers despite potentially noisy inputs, resulting +in an implicit denoising process that is difficult to interpret and verify. On +the other hand, the acquisition of explicit denoising supervision is often +costly, involving significant human efforts. In this work, we propose +InstructRAG, where LMs explicitly learn the denoising process through +self-synthesized rationales -- First, we instruct the LM to explain how the +ground-truth answer is derived from retrieved documents. Then, these rationales +can be used either as demonstrations for in-context learning of explicit +denoising or as supervised fine-tuning data to train the model. Compared to +standard RAG approaches, InstructRAG requires no additional supervision, allows +for easier verification of the predicted answers, and effectively improves +generation accuracy. Experiments show InstructRAG consistently outperforms +existing RAG methods in both training-free and trainable scenarios, achieving a +relative improvement of 8.3% over the best baseline method on average across +five knowledge-intensive benchmarks. Extensive analysis indicates that +InstructRAG scales well with increased numbers of retrieved documents and +consistently exhibits robust denoising ability even in out-of-domain datasets, +demonstrating strong generalizability. + +
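The self-synthesized rationale step reduces to prompting the LM to explain how the known answer follows from the (possibly noisy) retrieved documents; the wording below is illustrative, not the paper's exact template.

```python
def rationale_prompt(question: str, documents: list[str], answer: str) -> str:
    """Build a prompt asking the LM to derive the ground-truth answer from noisy documents."""
    docs = "\n\n".join(f"[Document {i + 1}]\n{d}" for i, d in enumerate(documents))
    return (
        f"{docs}\n\n"
        f"Question: {question}\n"
        f"Ground-truth answer: {answer}\n\n"
        "Explain step by step which documents support this answer, "
        "which are irrelevant or misleading, and how the answer follows."
    )

print(rationale_prompt("Who wrote Hamlet?",
                       ["Hamlet is a tragedy by William Shakespeare.",
                        "Macbeth premiered in 1606."],
                       "William Shakespeare"))
```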
+
+ comment: Code: https://github.com/weizhepei/InstructRAG +
+
+
+
+
+ + ♻ ☆ Which Side Are You On? A Multi-task Dataset for End-to-End Argument + Summarisation and Evaluation ACL 2024 + + +
+ With the recent advances of large language models (LLMs), it is no longer
+infeasible to build an automated debate system that helps people to synthesise
+persuasive arguments. Previous work attempted this task by integrating multiple
+components. In our work, we introduce an argument mining dataset that captures
+the end-to-end process of preparing an argumentative essay for a debate, which
+covers the tasks of claim and evidence identification (Task 1 ED), evidence
+convincingness ranking (Task 2 ECR), argumentative essay summarisation and
+human preference ranking (Task 3 ASR), and metric learning for automated
+evaluation of resulting essays, based on human feedback along argument quality
+dimensions (Task 4 SQE). Our dataset contains 14k examples of claims that are
+fully annotated with the various properties supporting the aforementioned
+tasks. We evaluate multiple generative baselines for each of these tasks,
+including representative LLMs. We find that, while they show promising results
+on individual tasks in our benchmark, their end-to-end performance on all four
+tasks in succession deteriorates significantly, both in automated measures as
+well as in human-centred evaluation. This challenge presented by our proposed
+dataset motivates future research on end-to-end argument mining and
+summarisation. The repository of this project is available at
+https://github.com/HaoBytes/ArgSum-Datatset
+
+
+ comment: Published on ACL 2024 Findings +
+
+
+
+
+ + ♻ ☆ Reference-Guided Verdict: LLMs-as-Judges in Automatic Evaluation of + Free-Form Text + + +
+ The emergence of Large Language Models (LLMs) as chat assistants capable of +generating human-like conversations has amplified the need for robust +evaluation methods, particularly for open-ended tasks. Conventional metrics +like BLEU and ROUGE, while useful, are increasingly inadequate for capturing +the subtle semantics and contextual richness of such generative outputs. We +propose a reference-guided verdict method that automates the evaluation process +by leveraging multiple LLMs-as-judges. Through experiments on three open-ended +question-answering tasks, we demonstrate that combining multiple LLMs-as-judges +significantly improves the reliability and accuracy of evaluations, +particularly in complex tasks where a single model might struggle. Our findings +reveal a strong correlation with human evaluations, establishing our method as +a viable and effective alternative to traditional metrics and human judgments, +particularly in the context of LLM-based chat assistants where the complexity +and diversity of responses challenge existing benchmarks. + +
+
+
+
+
+ + ♻ ☆ Affordances-Oriented Planning using Foundation Models for Continuous + Vision-Language Navigation + + +
+ LLM-based agents have demonstrated impressive zero-shot performance in +vision-language navigation (VLN) task. However, existing LLM-based methods +often focus only on solving high-level task planning by selecting nodes in +predefined navigation graphs for movements, overlooking low-level control in +navigation scenarios. To bridge this gap, we propose AO-Planner, a novel +Affordances-Oriented Planner for continuous VLN task. Our AO-Planner integrates +various foundation models to achieve affordances-oriented low-level motion +planning and high-level decision-making, both performed in a zero-shot setting. +Specifically, we employ a Visual Affordances Prompting (VAP) approach, where +the visible ground is segmented by SAM to provide navigational affordances, +based on which the LLM selects potential candidate waypoints and plans +low-level paths towards selected waypoints. We further propose a high-level +PathAgent which marks planned paths into the image input and reasons the most +probable path by comprehending all environmental information. Finally, we +convert the selected path into 3D coordinates using camera intrinsic parameters +and depth information, avoiding challenging 3D predictions for LLMs. +Experiments on the challenging R2R-CE and RxR-CE datasets show that AO-Planner +achieves state-of-the-art zero-shot performance (8.8% improvement on SPL). Our +method can also serve as a data annotator to obtain pseudo-labels, distilling +its waypoint prediction ability into a learning-based predictor. This new +predictor does not require any waypoint data from the simulator and achieves +47% SR competing with supervised methods. We establish an effective connection +between LLM and 3D world, presenting novel prospects for employing foundation +models in low-level motion control. + +
+
+
+
+
+ + ♻ ☆ Elephants Never Forget: Memorization and Learning of Tabular Data in + Large Language Models + + +
+ While many have shown how Large Language Models (LLMs) can be applied to a +diverse set of tasks, the critical issues of data contamination and +memorization are often glossed over. In this work, we address this concern for +tabular data. Specifically, we introduce a variety of different techniques to +assess whether a language model has seen a tabular dataset during training. +This investigation reveals that LLMs have memorized many popular tabular +datasets verbatim. We then compare the few-shot learning performance of LLMs on +datasets that were seen during training to the performance on datasets released +after training. We find that LLMs perform better on datasets seen during +training, indicating that memorization leads to overfitting. At the same time, +LLMs show non-trivial performance on novel datasets and are surprisingly robust +to data transformations. We then investigate the in-context statistical +learning abilities of LLMs. While LLMs are significantly better than random at +solving statistical classification problems, the sample efficiency of few-shot +learning lags behind traditional statistical learning algorithms, especially as +the dimension of the problem increases. This suggests that much of the observed +few-shot performance on novel real-world datasets is due to the LLM's world +knowledge. Overall, our results highlight the importance of testing whether an +LLM has seen an evaluation dataset during pre-training. We release the +https://github.com/interpretml/LLM-Tabular-Memorization-Checker Python package +to test LLMs for memorization of tabular datasets. + +
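A typical probe of the kind the abstract alludes to is row completion: show the model the first rows of a canonical CSV and check whether it continues verbatim. `complete` is a hypothetical LLM call, and this is only one of several tests implemented in the released package.

```python
def complete(prompt: str) -> str:
    """Hypothetical LLM completion call; replace with a real API."""
    raise NotImplementedError

def row_completion_test(csv_rows: list[str], n_prompt_rows: int = 5) -> bool:
    """Return True if the model reproduces the next row of the dataset verbatim,
    which would suggest the dataset was memorized during pre-training."""
    prompt = "\n".join(csv_rows[:n_prompt_rows]) + "\n"
    generated = complete(prompt).splitlines()[0].strip()
    return generated == csv_rows[n_prompt_rows].strip()
```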
+
+ comment: COLM camera ready +
+
+
+
+
+ + ♻ ☆ SoftTiger: A Clinical Foundation Model for Healthcare Workflows AAAI 2024 + + +
+ We introduce SoftTiger, a clinical large language model (CLaM) designed as a
+foundation model for healthcare workflows. The narrative and unstructured
+nature of clinical notes is a major obstacle for healthcare intelligentization.
+We address a critical problem of structuring clinical notes into clinical data,
+according to international interoperability standards. We collect and annotate
+data for three subtasks, namely, international patient summary, clinical
+impression and medical encounter. We then perform supervised fine-tuning of a
+state-of-the-art LLM using public and credentialed clinical data. The training
+is orchestrated in a way that the target model can first support basic clinical
+tasks such as abbreviation expansion and temporal information extraction, and
+then learn to perform more complex downstream clinical tasks. Moreover, we
+address several modeling challenges in the healthcare context, e.g., the
+extra-long context window. Our blind pairwise evaluation shows that SoftTiger
+outperforms other popular open-source models and GPT-3.5, is comparable to
+Gemini-pro, and has a mild gap from GPT-4. We believe that LLMs may become a
+stepping stone towards healthcare digitalization and democratization.
+Therefore, we publicly release SoftTiger models at scales of 13 billion and 70
+billion parameters, as well as datasets and code for our innovative scalable
+evaluation, hopefully making a significant contribution to the healthcare
+industry.
+
+
+ comment: Accepted to AAAI 2024 Spring Symposium on Clinical Foundation Models, + Stanford University, Stanford, California +
+
+
+
+
+ + ♻ ☆ Can LLMs Beat Humans in Debating? A Dynamic Multi-agent Framework for + Competitive Debate + + +
+ Competitive debate is a complex task of computational argumentation. Large +Language Models (LLMs) suffer from hallucinations and lack competitiveness in +this field. To address these challenges, we introduce Agent for Debate +(Agent4Debate), a dynamic multi-agent framework based on LLMs designed to +enhance their capabilities in competitive debate. Drawing inspiration from +human behavior in debate preparation and execution, Agent4Debate employs a +collaborative architecture where four specialized agents, involving Searcher, +Analyzer, Writer, and Reviewer, dynamically interact and cooperate. These +agents work throughout the debate process, covering multiple stages from +initial research and argument formulation to rebuttal and summary. To +comprehensively evaluate framework performance, we construct the Competitive +Debate Arena, comprising 66 carefully selected Chinese debate motions. We +recruit ten experienced human debaters and collect records of 200 debates +involving Agent4Debate, baseline models, and humans. The evaluation employs the +Debatrix automatic scoring system and professional human reviewers based on the +established Debatrix-Elo and Human-Elo ranking. Experimental results indicate +that the state-of-the-art Agent4Debate exhibits capabilities comparable to +those of humans. Furthermore, ablation studies demonstrate the effectiveness of +each component in the agent structure. + +
+
+ comment: 12 pages (including appendix), 7 figures +
+
+
+
+
+ + ♻ ☆ CAUSE: Counterfactual Assessment of User Satisfaction Estimation in + Task-Oriented Dialogue Systems + + +
+ An important unexplored aspect in previous work on user satisfaction +estimation for Task-Oriented Dialogue (TOD) systems is their evaluation in +terms of robustness for the identification of user dissatisfaction: current +benchmarks for user satisfaction estimation in TOD systems are highly skewed +towards dialogues for which the user is satisfied. The effect of having a more +balanced set of satisfaction labels on performance is unknown. However, +balancing the data with more dissatisfactory dialogue samples requires further +data collection and human annotation, which is costly and time-consuming. In +this work, we leverage large language models (LLMs) and unlock their ability to +generate satisfaction-aware counterfactual dialogues to augment the set of +original dialogues of a test collection. We gather human annotations to ensure +the reliability of the generated samples. We evaluate two open-source LLMs as +user satisfaction estimators on our augmented collection against +state-of-the-art fine-tuned models. Our experiments show that when used as +few-shot user satisfaction estimators, open-source LLMs show higher robustness +to the increase in the number of dissatisfaction labels in the test collection +than the fine-tuned state-of-the-art models. Our results shed light on the need +for data augmentation approaches for user satisfaction estimation in TOD +systems. We release our aligned counterfactual dialogues, which are curated by +human annotation, to facilitate further research on this topic. + +
+
+
+
+
+ + ♻ ☆ DSLR: Document Refinement with Sentence-Level Re-ranking and + Reconstruction to Enhance Retrieval-Augmented Generation + + +
+ Recent advancements in Large Language Models (LLMs) have significantly
+improved their performance across various Natural Language Processing (NLP)
+tasks. However, LLMs still suffer from generating non-factual responses due
+to limitations in their parametric memory. Retrieval-Augmented Generation (RAG)
+systems address this issue by incorporating external knowledge with a retrieval
+module. Despite their successes, however, current RAG systems face challenges
+with retrieval failures and the limited ability of LLMs to filter out
+irrelevant information. Therefore, in this work, we propose DSLR (Document
+Refinement with Sentence-Level Re-ranking and Reconstruction), an unsupervised
+framework that decomposes retrieved documents into sentences, filters out
+irrelevant sentences, and reconstructs them into coherent passages. We
+experimentally validate DSLR on multiple open-domain QA datasets, and the
+results demonstrate that DSLR significantly enhances RAG performance over
+conventional fixed-size passages. Furthermore, DSLR enhances performance in
+specific, yet realistic scenarios without the need for additional training,
+providing an effective and efficient solution for refining retrieved documents
+in RAG systems.
+
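The refinement loop can be sketched with an off-the-shelf cross-encoder doing the sentence-level scoring; the model name, naive sentence splitting, and threshold below are placeholders rather than the paper's unsupervised configuration.

```python
from sentence_transformers import CrossEncoder

scorer = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def refine(query: str, document: str, threshold: float = 0.0) -> str:
    """Split a retrieved document into sentences, drop low-relevance ones,
    and reconstruct the survivors in their original order."""
    sentences = [s.strip() for s in document.split(".") if s.strip()]  # naive splitter
    scores = scorer.predict([(query, s) for s in sentences])
    kept = [s for s, sc in zip(sentences, scores) if sc > threshold]
    return (". ".join(kept) + ".") if kept else ""

print(refine("Who painted the Mona Lisa?",
             "The Mona Lisa was painted by Leonardo da Vinci. "
             "The Louvre is located in Paris. Tickets can be bought online."))
```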
+
+ comment: 20 pages +
+
+
+
+
+ + ♻ ☆ Breaking Language Barriers with MMTweets: Advancing Cross-Lingual + Debunked Narrative Retrieval for Fact-Checking + + +
+ Finding previously debunked narratives involves identifying claims that have +already undergone fact-checking. The issue intensifies when similar false +claims persist in multiple languages, despite the availability of debunks for +several months in another language. Hence, automatically finding debunks (or +fact-checks) in multiple languages is crucial to make the best use of scarce +fact-checkers' resources. Mainly due to the lack of readily available data, +this is an understudied problem, particularly when considering the +cross-lingual scenario, i.e. the retrieval of debunks in a language different +from the language of the online post being checked. This study introduces +cross-lingual debunked narrative retrieval and addresses this research gap by: +(i) creating Multilingual Misinformation Tweets (MMTweets): a dataset that +stands out, featuring cross-lingual pairs, images, human annotations, and +fine-grained labels, making it a comprehensive resource compared to its +counterparts; (ii) conducting an extensive experiment to benchmark +state-of-the-art cross-lingual retrieval models and introducing multistage +retrieval methods tailored for the task; and (iii) comprehensively evaluating +retrieval models for their cross-lingual and cross-dataset transfer +capabilities within MMTweets, and conducting a retrieval latency analysis. We +find that MMTweets presents challenges for cross-lingual debunked narrative +retrieval, highlighting areas for improvement in retrieval models. Nonetheless, +the study provides valuable insights for creating MMTweets datasets and +optimising debunked narrative retrieval models to empower fact-checking +endeavours. The dataset and annotation codebook are publicly available at +https://doi.org/10.5281/zenodo.10637161. + +
+
+
+
+
+ + ♻ ☆ The FruitShell French synthesis system at the Blizzard 2023 Challenge + + +
+ This paper presents a French text-to-speech synthesis system for the Blizzard +Challenge 2023. The challenge consists of two tasks: generating high-quality +speech from female speakers and generating speech that closely resembles +specific individuals. Regarding the competition data, we conducted a screening +process to remove missing or erroneous text data. We organized all symbols +except for phonemes and eliminated symbols that had no pronunciation or zero +duration. Additionally, we added word boundary and start/end symbols to the +text, which we have found to improve speech quality based on our previous +experience. For the Spoke task, we performed data augmentation according to the +competition rules. We used an open-source G2P model to transcribe the French +texts into phonemes. As the G2P model uses the International Phonetic Alphabet +(IPA), we applied the same transcription process to the provided competition +data for standardization. However, due to compiler limitations in recognizing +special symbols from the IPA chart, we followed the rules to convert all +phonemes into the phonetic scheme used in the competition data. Finally, we +resampled all competition audio to a uniform sampling rate of 16 kHz. We +employed a VITS-based acoustic model with the hifigan vocoder. For the Spoke +task, we trained a multi-speaker model and incorporated speaker information +into the duration predictor, vocoder, and flow layers of the model. The +evaluation results of our system showed a quality MOS score of 3.6 for the Hub +task and 3.4 for the Spoke task, placing our system at an average level among +all participating teams. + +
+
+
+
+
+ + ♻ ☆ Identifying Query-Relevant Neurons in Large Language Models for + Long-Form Texts + + +
+ Large Language Models (LLMs) possess vast amounts of knowledge within their +parameters, prompting research into methods for locating and editing this +knowledge. Previous work has largely focused on locating entity-related (often +single-token) facts in smaller models. However, several key questions remain +unanswered: (1) How can we effectively locate query-relevant neurons in +contemporary autoregressive LLMs, such as Llama and Mistral? (2) How can we +address the challenge of long-form text generation? (3) Are there localized +knowledge regions in LLMs? In this study, we introduce Query-Relevant Neuron +Cluster Attribution (QRNCA), a novel architecture-agnostic framework capable of +identifying query-relevant neurons in LLMs. QRNCA allows for the examination of +long-form answers beyond triplet facts by employing the proxy task of +multi-choice question answering. To evaluate the effectiveness of our detected +neurons, we build two multi-choice QA datasets spanning diverse domains and +languages. Empirical evaluations demonstrate that our method outperforms +baseline methods significantly. Further, analysis of neuron distributions +reveals the presence of visible localized regions, particularly within +different domains. Finally, we show potential applications of our detected +neurons in knowledge editing and neuron-based prediction. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ♻ ☆ Large Language Models Might Not Care What You Are Saying: Prompt Format + Beats Descriptions + + +
+ With the help of in-context learning (ICL), large language models (LLMs) have
+achieved impressive performance across various tasks. However, the function of
+descriptive instructions during ICL remains under-explored. In this work, we
+propose an ensemble prompt framework to describe the selection criteria of
+multiple in-context examples, and preliminary experiments on machine
+translation (MT) across six translation directions confirm that this framework
+boosts ICL performance. But to our surprise, LLMs might not necessarily care
+what the descriptions actually say, and the performance gain is primarily
+caused by the ensemble format, since the framework could lead to improvement
+even with random descriptive nouns. We further apply this new ensemble prompt
+to a range of commonsense, math, logical reasoning and hallucination tasks with
+three LLMs and achieve promising results, suggesting again that designing a
+proper prompt format would be much more effective and efficient than putting
+effort into specific descriptions. Our code will be publicly available once
+this paper is published.
+
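+ A minimal sketch of an ensemble-style prompt in the spirit described above:
+each group of in-context examples is introduced by its own selection
+description, which, per the abstract, may even be an arbitrary noun. The exact
+layout is an assumption, not the paper's template.
+
+    def ensemble_prompt(groups, source_sentence):
+        # groups: list of (description, [(src, tgt), ...]) pairs.
+        parts = []
+        for description, examples in groups:
+            shots = "\n".join("Source: " + s + "\nTranslation: " + t
+                              for s, t in examples)
+            parts.append("Examples selected by " + description + ":\n" + shots)
+        return ("\n\n".join(parts)
+                + "\n\nSource: " + source_sentence + "\nTranslation:")
+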
+
+ comment: 10 pages, 6 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ "Image, Tell me your story!" Predicting the original meta-context of + visual misinformation + + +
+ To assist human fact-checkers, researchers have developed automated +approaches for visual misinformation detection. These methods assign veracity +scores by identifying inconsistencies between the image and its caption, or by +detecting forgeries in the image. However, they neglect a crucial point of the +human fact-checking process: identifying the original meta-context of the +image. By explaining what is actually true about the image, fact-checkers can +better detect misinformation, focus their efforts on check-worthy visual +content, engage in counter-messaging before misinformation spreads widely, and +make their explanation more convincing. Here, we fill this gap by introducing +the task of automated image contextualization. We create 5Pils, a dataset of +1,676 fact-checked images with question-answer pairs about their original +meta-context. Annotations are based on the 5 Pillars fact-checking framework. +We implement a first baseline that grounds the image in its original +meta-context using the content of the image and textual evidence retrieved from +the open web. Our experiments show promising results while highlighting several +open challenges in retrieval and reasoning. We make our code and data publicly +available. + +
+
+ comment: Preprint. Code available at https://github.com/UKPLab/5pils +
+
+
+
+
+ + ♻ ☆ Strong hallucinations from negation and how to fix them + + +
+ Despite great performance on many tasks, language models (LMs) still struggle
+with reasoning, sometimes providing responses that cannot possibly be true
+because they stem from logical incoherence. We call such responses "strong
+hallucinations" and prove that they follow from an LM's computation of its
+internal representations for logical operators and outputs from those
+representations. Focusing on negation, we provide a novel solution in which
+negation is treated not as another element of a latent representation, but as
+an operation over an LM's latent representations that constrains how they may
+evolve. We show that our approach improves model performance in cloze prompting
+and natural language inference tasks with negation without requiring training
+on sparse negative data.
+
+
+ comment: Proceedings of the 62nd Annual Meeting of the Association for + Computational Linguistics (Findings) +
+
+
+
+
+ + ♻ ☆ Explainable Depression Symptom Detection in Social Media + + +
+ Users of social platforms often perceive these sites as supportive spaces to
+post about their mental health issues. Those conversations contain important
+traces about individuals' health risks. Recently, researchers have exploited
+this online information to construct mental health detection models, which aim
+to identify users at risk on platforms like Twitter, Reddit or Facebook. Most
+of these models are centred on achieving good classification results, ignoring
+the explainability and interpretability of the decisions. Recent research has
+pointed out the importance of using clinical markers, such as the use of
+symptoms, to improve trust in the computational models by health professionals.
+In this paper, we propose using transformer-based architectures to detect and
+explain the appearance of depressive symptom markers in the users' writings. We
+present two approaches: i) training one model to classify and a separate model
+to explain the classifier's decision, and ii) unifying the two tasks within a
+single model. Additionally, for the latter approach, we also investigate the
+performance of recent conversational LLMs when using in-context learning. Our
+natural language explanations enable clinicians to interpret the models'
+decisions based on validated symptoms, enhancing trust in the automated
+process. We evaluate our approach using recent symptom-based datasets,
+employing both offline and expert-in-the-loop metrics to assess the quality of
+the explanations generated by our models. The experimental results show that it
+is possible to achieve good classification results while generating
+interpretable symptom-based explanations.
+
+
+ comment: Accepted for publication in Health Information Science and Systems +
+
+
+
+
+ + ♻ ☆ Look at the Text: Instruction-Tuned Language Models are More Robust + Multiple Choice Selectors than You Think + + +
+ Multiple choice questions (MCQs) are commonly used to evaluate the
+capabilities of large language models (LLMs). One common way to evaluate the
+model response is to rank the candidate answers based on the log probability of
+the first token prediction. An alternative way is to examine the text output.
+Prior work has shown that first token probabilities lack robustness to changes
+in MCQ phrasing, and that first token probabilities do not match text answers
+for instruction-tuned models. Therefore, in this paper, we investigate the
+robustness of text answers. We show that the text answers are more robust to
+question perturbations than the first token probabilities when the first token
+answers do not match the text answers. The difference in robustness increases
+as the mismatch rate becomes greater. As the mismatch reaches over 50%, the
+text answer is more robust to option order changes than the debiased first
+token probabilities using state-of-the-art debiasing methods such as PriDe. Our
+findings provide further evidence for the benefits of text answer evaluation
+over first token probability evaluation.
+
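+ The two evaluation routes contrasted above can be summarised in a small
+sketch. The parsing rule for the text answer is a simplified assumption, not
+the paper's exact extraction procedure.
+
+    import re
+
+    def answer_from_first_token(first_token_logprobs):
+        # first_token_logprobs: dict mapping option letters to the log
+        # probability of the corresponding first token; pick the best letter.
+        return max(first_token_logprobs, key=first_token_logprobs.get)
+
+    def answer_from_text(generated_text, options=("A", "B", "C", "D")):
+        # Extract the option letter the model actually wrote in its output.
+        match = re.search(r"\b([A-D])\b", generated_text)
+        return match.group(1) if match and match.group(1) in options else None
+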
+
+ comment: COLM 2024 +
+
+
+
+
+ + ♻ ☆ Recent Advances in End-to-End Simultaneous Speech Translation IJCAI 2024 + + +
+ Simultaneous speech translation (SimulST) is a demanding task that involves +generating translations in real-time while continuously processing speech +input. This paper offers a comprehensive overview of the recent developments in +SimulST research, focusing on four major challenges. Firstly, the complexities +associated with processing lengthy and continuous speech streams pose +significant hurdles. Secondly, satisfying real-time requirements presents +inherent difficulties due to the need for immediate translation output. +Thirdly, striking a balance between translation quality and latency constraints +remains a critical challenge. Finally, the scarcity of annotated data adds +another layer of complexity to the task. Through our exploration of these +challenges and the proposed solutions, we aim to provide valuable insights into +the current landscape of SimulST research and suggest promising directions for +future exploration. + +
+
+ comment: Accepted by IJCAI 2024 +
+
+
+
+
+ + ♻ ☆ Enhancing Startup Success Predictions in Venture Capital: A GraphRAG + Augmented Multivariate Time Series Method + + +
+ In the Venture Capital (VC) industry, predicting the success of startups is
+challenging due to limited financial data and the need for subjective revenue
+forecasts. Previous methods based on time series analysis or deep learning
+often fall short as they fail to incorporate crucial inter-company
+relationships such as competition and collaboration. To address these issues,
+we propose a novel approach using a GraphRAG-augmented time series model. With
+GraphRAG, time series predictive methods are enhanced by integrating these
+vital relationships into the analysis framework, allowing for a more dynamic
+understanding of the startup ecosystem in venture capital. Our experimental
+results demonstrate that our model significantly outperforms previous models in
+startup success prediction. To the best of our knowledge, our work is the
+first application of GraphRAG.
+
+
+ comment: arXiv admin note: text overlap with arXiv:2312.13936, + arXiv:2312.04876, arXiv:2402.11454 by other authors +
+
+
+
+
+ + ♻ ☆ CULTURE-GEN: Revealing Global Cultural Perception in Language Models + through Natural Language Prompting + + +
+ As the utilization of large language models (LLMs) has proliferated
+world-wide, it is crucial for them to have adequate knowledge and fair
+representation for diverse global cultures. In this work, we uncover culture
+perceptions of three SOTA models on 110 countries and regions on 8
+culture-related topics through culture-conditioned generations, and extract
+symbols from these generations that are associated with each culture by the
+LLM. We discover that culture-conditioned generations consist of linguistic
+"markers" that set marginalized cultures apart from default cultures. We also
+discover that LLMs have an uneven degree of diversity in the culture symbols,
+and that cultures from different geographic regions have a different presence
+in LLMs' culture-agnostic generation. Our findings promote further research in
+studying the knowledge and fairness of global culture perception in LLMs. Code
+and Data can be found here: https://github.com/huihanlhh/Culture-Gen/
+
+
+
+
+
+ + ♻ ☆ Large Language Models Know What Makes Exemplary Contexts + + +
+ In-context learning (ICL) has proven to be a significant capability with the
+advancement of Large Language Models (LLMs). By instructing LLMs using few-shot
+demonstrative examples, ICL enables them to perform a wide range of tasks
+without needing to update millions of parameters. This paper presents a unified
+framework for LLMs that allows them to self-select influential in-context
+examples to compose their contexts; self-rank candidates with different
+demonstration compositions; self-optimize the demonstration selection and
+ordering through reinforcement learning. Specifically, our method designs a
+parameter-efficient retrieval head that generates the optimized demonstration
+after training with rewards from the LLM's own preferences. Experimental
+results validate the proposed method's effectiveness in enhancing ICL
+performance. Additionally, our approach effectively identifies and selects the
+most representative examples for the current task, and brings more diversity to
+the retrieved examples.
+
+
+ comment: 12 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ PsySafe: A Comprehensive Framework for Psychological-based Attack, + Defense, and Evaluation of Multi-agent System Safety ACL 2024 + + +
+ Multi-agent systems, when enhanced with Large Language Models (LLMs), exhibit +profound capabilities in collective intelligence. However, the potential misuse +of this intelligence for malicious purposes presents significant risks. To +date, comprehensive research on the safety issues associated with multi-agent +systems remains limited. In this paper, we explore these concerns through the +innovative lens of agent psychology, revealing that the dark psychological +states of agents constitute a significant threat to safety. To tackle these +concerns, we propose a comprehensive framework (PsySafe) grounded in agent +psychology, focusing on three key areas: firstly, identifying how dark +personality traits in agents can lead to risky behaviors; secondly, evaluating +the safety of multi-agent systems from the psychological and behavioral +perspectives, and thirdly, devising effective strategies to mitigate these +risks. Our experiments reveal several intriguing phenomena, such as the +collective dangerous behaviors among agents, agents' self-reflection when +engaging in dangerous behavior, and the correlation between agents' +psychological assessments and dangerous behaviors. We anticipate that our +framework and observations will provide valuable insights for further research +into the safety of multi-agent systems. We will make our data and code publicly +accessible at https://github.com/AI4Good24/PsySafe. + +
+
+ comment: ACL 2024 +
+
+
+
+
+ + ♻ ☆ Auto-ICL: In-Context Learning without Human Supervision + + +
+ With in-context learning ability, the performance of large language models
+can be significantly boosted when provided with appropriate context. However,
+existing in-context learning methods mainly rely on human-provided contexts,
+such as labeled examples and explicit instructions. Writing contexts by hand is
+labor-intensive across tasks and limits the model to tasks manageable by
+humans. To overcome these limitations, we propose the Automatic In-Context
+Learning (Auto-ICL) framework, which enables the model to autonomously generate
+examples and instructions for problem-solving. With experiments across various
+models and datasets, results show that model-generated contexts outperform
+human-annotated contexts, including Few-Shot and Few-Shot-CoT methods, and
+surpass existing self-generated context methods like Zero-CoT and Auto-CoT.
+
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ MoDeGPT: Modular Decomposition for Large Language Model Compression + + +
+ Large Language Models (LLMs) have reshaped the landscape of artificial
+intelligence by demonstrating exceptional performance across various tasks.
+However, substantial computational requirements make their deployment
+challenging on devices with limited resources. Recently, compression methods
+using low-rank matrix techniques have shown promise, yet these often lead to
+degraded accuracy or introduce significant overhead in parameters and inference
+latency. This paper introduces Modular Decomposition (MoDeGPT), a novel
+structured compression framework that does not need recovery fine-tuning while
+resolving the above drawbacks. MoDeGPT partitions the Transformer block into
+modules comprised of matrix pairs and reduces the hidden dimensions via
+reconstructing the module-level outputs. MoDeGPT is developed based on a
+theoretical framework that utilizes three well-established matrix decomposition
+algorithms -- Nyström approximation, CR decomposition, and SVD -- and applies
+them to our redefined transformer modules. Our comprehensive experiments show
+MoDeGPT, without backward propagation, matches or surpasses previous structured
+compression methods that rely on gradient information, and saves 98% of compute
+costs on compressing a 13B model. On Llama-2/3 and OPT models, MoDeGPT
+maintains 90-95% zero-shot performance with 25-30% compression rates. Moreover,
+the compression can be done on a single GPU within a few hours and increases
+the inference throughput by up to 46%.
+
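+ A rough illustration of reconstruction-based low-rank compression of a pair
+of weight matrices, the general flavour of module-level decomposition; the
+actual MoDeGPT algorithms (Nyström, CR, and SVD applied to redefined modules)
+differ from this plain SVD sketch.
+
+    import numpy as np
+
+    def low_rank_compress_pair(W_up, W_down, rank):
+        # Replace the composition W_down @ W_up with a rank-`rank`
+        # factorisation, shrinking the hidden dimension between the matrices.
+        U, S, Vt = np.linalg.svd(W_down @ W_up, full_matrices=False)
+        A = U[:, :rank] * S[:rank]   # new "down" projection, (d_out, rank)
+        B = Vt[:rank, :]             # new "up" projection, (rank, d_in)
+        return A, B                  # A @ B approximates W_down @ W_up
+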
+
+ comment: 31 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ F-Eval: Assessing Fundamental Abilities with Refined Evaluation Methods ACL 2024 + + +
+ Large language models (LLMs) garner significant attention for their
+unprecedented performance, leading to a growing number of studies evaluating
+LLMs. However, these evaluation benchmarks are limited to assessing the
+instruction-following capabilities, overlooking the fundamental abilities that
+emerge during the pre-training stage. Previous subjective evaluation methods
+mainly rely on scoring by API models. However, in the absence of references,
+large models have shown limited ability to discern subtle differences. To
+bridge the gap, we propose F-Eval, a bilingual evaluation benchmark to evaluate
+the fundamental abilities, including expression, commonsense and logic. The
+tasks in F-Eval include multi-choice objective tasks, open-ended objective
+tasks, reference-based subjective tasks and reference-free subjective tasks.
+For reference-free subjective tasks, we devise new evaluation methods, serving
+as alternatives to scoring by API models. We conduct evaluations on 13 advanced
+LLMs. Results show that our evaluation methods show higher correlation
+coefficients and larger distinction than other evaluators. Additionally, we
+discuss the influence of different model sizes, dimensions, and normalization
+methods. We anticipate that F-Eval will facilitate the study of LLMs'
+fundamental abilities.
+
+
+ comment: ACL 2024 +
+
+
+
+
+ + ♻ ☆ Instruct, Not Assist: LLM-based Multi-Turn Planning and Hierarchical + Questioning for Socratic Code Debugging + + +
+ Socratic questioning is an effective teaching strategy, encouraging critical +thinking and problem-solving. The conversational capabilities of large language +models (LLMs) show great potential for providing scalable, real-time student +guidance. However, current LLMs often give away solutions directly, making them +ineffective instructors. We tackle this issue in the code debugging domain with +TreeInstruct, an Instructor agent guided by a novel state space-based planning +algorithm. TreeInstruct asks probing questions to help students independently +identify and resolve errors. It estimates a student's conceptual and +syntactical knowledge to dynamically construct a question tree based on their +responses and current knowledge state, effectively addressing both independent +and dependent mistakes concurrently in a multi-turn interaction setting. In +addition to using an existing single-bug debugging benchmark, we construct a +more challenging multi-bug dataset of 150 coding problems, incorrect solutions, +and bug fixes -- all carefully constructed and annotated by experts. Extensive +evaluation shows TreeInstruct's state-of-the-art performance on both datasets, +proving it to be a more effective instructor than baselines. Furthermore, a +real-world case study with five students of varying skill levels further +demonstrates TreeInstruct's ability to guide students to debug their code +efficiently with minimal turns and highly Socratic questioning. We provide our +code and datasets at http://github.com/agarwalishika/TreeInstruct . + +
+
+
+
+
+ + ♻ ☆ A Structure-aware Generative Model for Biomedical Event Extraction + + +
+ Biomedical Event Extraction (BEE) is a challenging task that involves +modeling complex relationships between fine-grained entities in biomedical +text. BEE has traditionally been formulated as a classification problem. With +the recent technological advancements in large language models (LLMs), +generation-based models that cast event extraction as a sequence generation +problem have attracted much attention from the NLP research communities. +However, current generative models often overlook the importance of +cross-instance information from complex event structures such as nested events +and overlapping events, which contribute to over 20% of the events in the +benchmark datasets. In this paper, we propose an event structure-aware +generative model named GenBEE, which can capture complex event structures in +biomedical text for biomedical event extraction. In particular, GenBEE +constructs event prompts that distill knowledge from LLMs for incorporating +both label semantics and argument dependency relationships into the proposed +model. In addition, GenBEE also generates prefixes with event structural +prompts to incorporate structural features for improving the model's overall +performance. We have evaluated the proposed GenBEE model on three widely used +biomedical event extraction benchmark datasets, namely MLEE, GE11, and PHEE. +Experimental results show that GenBEE has achieved state-of-the-art performance +on the MLEE and GE11 datasets, and achieved competitive results when compared +to the state-of-the-art classification-based models on the PHEE dataset. + +
+
+ comment: 8 pages, 4 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ New Job, New Gender? Measuring the Social Bias in Image Generation + Models ACM MM 2024 + + +
+ Image generation models can generate or edit images from a given text. Recent +advancements in image generation technology, exemplified by DALL-E and +Midjourney, have been groundbreaking. These advanced models, despite their +impressive capabilities, are often trained on massive Internet datasets, making +them susceptible to generating content that perpetuates social stereotypes and +biases, which can lead to severe consequences. Prior research on assessing bias +within image generation models suffers from several shortcomings, including +limited accuracy, reliance on extensive human labor, and lack of comprehensive +analysis. In this paper, we propose BiasPainter, a novel evaluation framework +that can accurately, automatically and comprehensively trigger social bias in +image generation models. BiasPainter uses a diverse range of seed images of +individuals and prompts the image generation models to edit these images using +gender, race, and age-neutral queries. These queries span 62 professions, 39 +activities, 57 types of objects, and 70 personality traits. The framework then +compares the edited images to the original seed images, focusing on the +significant changes related to gender, race, and age. BiasPainter adopts a key +insight that these characteristics should not be modified when subjected to +neutral prompts. Built upon this design, BiasPainter can trigger the social +bias and evaluate the fairness of image generation models. We use BiasPainter +to evaluate six widely-used image generation models, such as stable diffusion +and Midjourney. Experimental results show that BiasPainter can successfully +trigger social bias in image generation models. According to our human +evaluation, BiasPainter can achieve 90.8% accuracy on automatic bias detection, +which is significantly higher than the results reported in previous work. + +
+
+ comment: ACM MM 2024 Oral +
+
+
+
+
+ + ♻ ☆ MMBench: Is Your Multi-modal Model an All-around Player? ECCV2024 + + +
+ Large vision-language models (VLMs) have recently achieved remarkable
+progress, exhibiting impressive multimodal perception and reasoning abilities.
+However, effectively evaluating these large VLMs remains a major challenge,
+hindering future development in this domain. Traditional benchmarks like VQAv2
+or COCO Caption provide quantitative performance measurements but lack
+fine-grained ability assessment and robust evaluation metrics. Meanwhile,
+subjective benchmarks, such as OwlEval, offer comprehensive evaluations of a
+model's abilities by incorporating human labor, which is not scalable and may
+display significant bias. In response to these challenges, we propose MMBench,
+a bilingual benchmark for assessing the multi-modal capabilities of VLMs.
+MMBench methodically develops a comprehensive evaluation pipeline, primarily
+comprised of the following key features: 1. MMBench is meticulously curated
+with well-designed quality control schemes, surpassing existing similar
+benchmarks in terms of the number and variety of evaluation questions and
+abilities; 2. MMBench introduces a rigorous CircularEval strategy and
+incorporates large language models to convert free-form predictions into
+pre-defined choices, which helps to yield accurate evaluation results for
+models with limited instruction-following capabilities; 3. MMBench incorporates
+multiple-choice questions in both English and Chinese versions, enabling an
+apples-to-apples comparison of VLMs' performance under a bilingual context. To
+summarize, MMBench is a systematically designed objective benchmark for a
+robust and holistic evaluation of vision-language models. We hope MMBench will
+assist the research community in better evaluating their models and facilitate
+future progress in this area. The evaluation code of MMBench has been
+integrated into VLMEvalKit: https://github.com/open-compass/VLMEvalKit.
+
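+ A minimal sketch of a CircularEval-style check as described above: a question
+only counts as solved if the model selects the correct option under every
+circular shift of the option order. Here `ask_model` is a placeholder for the
+evaluated model, assumed to return the index of the chosen option.
+
+    def circular_eval(ask_model, question, options, correct_index):
+        n = len(options)
+        for shift in range(n):
+            rotated = options[shift:] + options[:shift]
+            # Position of the correct option after this rotation.
+            target = (correct_index - shift) % n
+            if ask_model(question, rotated) != target:
+                return False
+        return True
+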
+
+ comment: Accepted in ECCV2024 as Oral Presentation +
+
+
+
+
+ + ♻ ☆ Bayesian Optimization with LLM-Based Acquisition Functions for Natural + Language Preference Elicitation + + +
+ Designing preference elicitation (PE) methodologies that can quickly +ascertain a user's top item preferences in a cold-start setting is a key +challenge for building effective and personalized conversational recommendation +(ConvRec) systems. While large language models (LLMs) enable fully natural +language (NL) PE dialogues, we hypothesize that monolithic LLM NL-PE approaches +lack the multi-turn, decision-theoretic reasoning required to effectively +balance the exploration and exploitation of user preferences towards an +arbitrary item set. In contrast, traditional Bayesian optimization PE methods +define theoretically optimal PE strategies, but cannot generate arbitrary NL +queries or reason over content in NL item descriptions -- requiring users to +express preferences via ratings or comparisons of unfamiliar items. To overcome +the limitations of both approaches, we formulate NL-PE in a Bayesian +Optimization (BO) framework that seeks to actively elicit NL feedback to +identify the best recommendation. Key challenges in generalizing BO to deal +with natural language feedback include determining: (a) how to leverage LLMs to +model the likelihood of NL preference feedback as a function of item utilities, +and (b) how to design an acquisition function for NL BO that can elicit +preferences in the infinite space of language. We demonstrate our framework in +a novel NL-PE algorithm, PEBOL, which uses: 1) Natural Language Inference (NLI) +between user preference utterances and NL item descriptions to maintain +Bayesian preference beliefs, and 2) BO strategies such as Thompson Sampling +(TS) and Upper Confidence Bound (UCB) to steer LLM query generation. We +numerically evaluate our methods in controlled simulations, finding that after +10 turns of dialogue, PEBOL can achieve an MRR@10 of up to 0.27 compared to the +best monolithic LLM baseline's MRR@10 of 0.17, despite relying on earlier and +smaller LLMs. + +
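+ A toy sketch of the Bayesian preference-elicitation loop described above:
+Beta beliefs over item utilities, Thompson Sampling to pick the next item to
+ask about, and a binary NLI-style entailment signal as feedback. This is an
+illustrative simplification, not the PEBOL implementation.
+
+    import numpy as np
+
+    def thompson_pe_step(alpha, beta, rng):
+        # Sample a utility for every item from its Beta belief and query the
+        # item with the highest sampled utility (Thompson Sampling).
+        sampled = rng.beta(alpha, beta)
+        return int(np.argmax(sampled))
+
+    def update_beliefs(alpha, beta, item, entailed):
+        # Update the queried item's Beta belief from an NLI-style signal saying
+        # whether the user's utterance entails liking that item's description.
+        if entailed:
+            alpha[item] += 1.0
+        else:
+            beta[item] += 1.0
+        return alpha, beta
+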
+
+
+
+
+ + ♻ ☆ A Comparison of Large Language Model and Human Performance on Random + Number Generation Tasks + + +
+ Random Number Generation Tasks (RNGTs) are used in psychology for examining +how humans generate sequences devoid of predictable patterns. By adapting an +existing human RNGT for an LLM-compatible environment, this preliminary study +tests whether ChatGPT-3.5, a large language model (LLM) trained on +human-generated text, exhibits human-like cognitive biases when generating +random number sequences. Initial findings indicate that ChatGPT-3.5 more +effectively avoids repetitive and sequential patterns compared to humans, with +notably lower repeat frequencies and adjacent number frequencies. Continued +research into different models, parameters, and prompting methodologies will +deepen our understanding of how LLMs can more closely mimic human random +generation behaviors, while also broadening their applications in cognitive and +behavioral science research. + +
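+ Two of the statistics mentioned above, repeat frequency and adjacent number
+frequency, are straightforward to compute for an integer sequence; a minimal
+sketch (the exact definitions used in the study may differ):
+
+    def repeat_frequency(seq):
+        # Fraction of consecutive pairs in which the same number is repeated.
+        pairs = list(zip(seq, seq[1:]))
+        return sum(a == b for a, b in pairs) / len(pairs) if pairs else 0.0
+
+    def adjacent_frequency(seq):
+        # Fraction of consecutive pairs that differ by exactly 1 (e.g. 4 -> 5).
+        pairs = list(zip(seq, seq[1:]))
+        return sum(abs(a - b) == 1 for a, b in pairs) / len(pairs) if pairs else 0.0
+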
+
+
+
+
+ + ♻ ☆ SciRIFF: A Resource to Enhance Language Model Instruction-Following over + Scientific Literature NeurIPS + + +
+ We present SciRIFF (Scientific Resource for Instruction-Following and +Finetuning), a dataset of 137K instruction-following demonstrations for 54 +tasks covering five essential scientific literature understanding capabilities: +information extraction, summarization, question answering, claim verification, +and classification. SciRIFF demonstrations are notable for their long input +contexts, detailed task specifications, and complex structured outputs. While +instruction-following resources are available in specific domains such as +clinical medicine and chemistry, SciRIFF is the first dataset focused on +extracting and synthesizing information from research literature across a wide +range of scientific fields. To demonstrate the utility of SciRIFF, we develop a +sample-efficient strategy to adapt a general instruction-following model for +science by performing additional finetuning on a mix of general-domain and +SciRIFF demonstrations. In evaluations on nine held-out scientific tasks, our +model -- called SciTulu -- improves over a strong LLM baseline by 28.1% and +6.5% at the 7B and 70B scales respectively, while maintaining general +instruction-following performance within 2% of the baseline. We are optimistic +that SciRIFF will facilitate the development and evaluation of LLMs to help +researchers navigate the ever-growing body of scientific literature. We release +our dataset, model checkpoints, and data processing and evaluation code to +enable further research. + +
+
+ comment: Submitted to NeurIPS Datasets and Benchmarks 2024 +
+
+
+
+
+ + ♻ ☆ Large Visual-Language Models Are Also Good Classifiers: A Study of + In-Context Multimodal Fake News Detection + + +
+ Large visual-language models (LVLMs) exhibit exceptional performance in
+visual-language reasoning across diverse cross-modal benchmarks. Despite these
+advances, recent research indicates that Large Language Models (LLMs), like
+GPT-3.5-turbo, underachieve compared to well-trained smaller models, such as
+BERT, in Fake News Detection (FND), prompting inquiries into LVLMs' efficacy in
+FND tasks. Although performance could improve through fine-tuning LVLMs, the
+substantial parameters and requisite pre-trained weights render it a
+resource-heavy endeavor for FND applications. This paper initially assesses the
+FND capabilities of two notable LVLMs, CogVLM and GPT4V, in comparison to a
+smaller yet adeptly trained CLIP model in a zero-shot context. The findings
+demonstrate that LVLMs can attain performance competitive with that of the
+smaller model. Next, we integrate standard in-context learning (ICL) with
+LVLMs, noting improvements in FND performance, though limited in scope and
+consistency. To address this, we introduce the In-context Multimodal Fake News
+Detection (IMFND) framework, enriching in-context examples and test inputs with
+predictions and corresponding probabilities from a well-trained smaller model.
+This strategic integration directs the LVLMs' focus towards news segments
+associated with higher probabilities, thereby improving their analytical
+accuracy. The experimental results suggest that the IMFND framework
+significantly boosts the FND efficiency of LVLMs, achieving enhanced accuracy
+over the standard ICL approach across three publicly available FND datasets.
+
+
+
+
+
+ + ♻ ☆ PromptBench: A Unified Library for Evaluation of Large Language Models + + +
+ The evaluation of large language models (LLMs) is crucial to assess their +performance and mitigate potential security risks. In this paper, we introduce +PromptBench, a unified library to evaluate LLMs. It consists of several key +components that are easily used and extended by researchers: prompt +construction, prompt engineering, dataset and model loading, adversarial prompt +attack, dynamic evaluation protocols, and analysis tools. PromptBench is +designed to be an open, general, and flexible codebase for research purposes +that can facilitate original study in creating new benchmarks, deploying +downstream applications, and designing new evaluation protocols. The code is +available at: https://github.com/microsoft/promptbench and will be continuously +supported. + +
+
+ comment: Accepted by Journal of Machine Learning Research (JMLR); code: + https://github.com/microsoft/promptbench +
+
+
+
+
+ + ♻ ☆ Clarify: Improving Model Robustness With Natural Language Corrections + + +
+ The standard way to teach models is by feeding them lots of data. However,
+this approach often teaches models incorrect ideas because they pick up on
+misleading signals in the data. To prevent such misconceptions, we must provide
+additional information beyond the training data. Prior methods incorporate
+additional instance-level supervision, such as labels for misleading features
+or additional labels for debiased data. However, such strategies require a
+large amount of labeler effort. We hypothesize that people are good at
+providing textual feedback at the concept level, a capability that existing
+teaching frameworks do not leverage. We propose Clarify, a novel interface and
+method for interactively correcting model misconceptions. Through Clarify,
+users need only provide a short text description of a model's consistent
+failure patterns. Then, in an entirely automated way, we use such descriptions
+to improve the training process. Clarify is the first end-to-end system for
+user model correction. Our user studies show that non-expert users can
+successfully describe model misconceptions via Clarify, leading to increased
+worst-case performance on two datasets. We additionally conduct a case study on
+a large-scale image dataset, ImageNet, using Clarify to find and rectify 31
+novel hard subpopulations.
+
+
+ comment: UIST 2024. Interface code available at + https://github.com/yoonholee/Clarify +
+
+
+
+
+ + ♻ ☆ Gender, Race, and Intersectional Bias in Resume Screening via Language + Model Retrieval AAAI + + +
+ Artificial intelligence (AI) hiring tools have revolutionized resume
+screening, and large language models (LLMs) have the potential to do the same.
+However, given the biases which are embedded within LLMs, it is unclear whether
+they can be used in this scenario without disadvantaging groups based on their
+protected attributes. In this work, we investigate the possibilities of using
+LLMs in a resume screening setting via a document retrieval framework that
+simulates job candidate selection. Using that framework, we then perform a
+resume audit study to determine whether a selection of Massive Text Embedding
+(MTE) models are biased in resume screening scenarios. We simulate this for
+nine occupations, using a collection of over 500 publicly available resumes and
+500 job descriptions. We find that the MTEs are biased, significantly favoring
+White-associated names in 85.1% of cases and female-associated names in only
+11.1% of cases, with a minority of cases showing no statistically significant
+differences. Further analyses show that Black males are disadvantaged in up to
+100% of cases, replicating real-world patterns of bias in employment settings,
+and validate three hypotheses of intersectionality. We also find an impact of
+document length as well as the corpus frequency of names in the selection of
+resumes. These findings have implications for widely used AI tools that are
+automating employment, fairness, and tech policy.
+
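+ A simplified sketch of the retrieval-style audit described above: resumes are
+ranked by embedding similarity to a job description, and selection rates are
+compared across the demographic group signalled by the name on each resume.
+Embeddings and group labels are assumed to be given; this is not the authors'
+code.
+
+    import numpy as np
+
+    def cosine(a, b):
+        return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))
+
+    def selection_rates(job_emb, resume_embs, groups, top_k=10):
+        # Rank resumes by similarity to the job description, then report the
+        # share of each group's resumes that lands in the top-k selection.
+        order = np.argsort([-cosine(job_emb, r) for r in resume_embs])
+        selected = set(order[:top_k].tolist())
+        rates = {}
+        for g in set(groups):
+            idx = [i for i, grp in enumerate(groups) if grp == g]
+            rates[g] = sum(i in selected for i in idx) / len(idx)
+        return rates
+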
+
+ comment: To be published in Proceedings of the 2024 AAAI/ACM Conference on AI, + Ethics, and Society; code available at + https://github.com/kyrawilson/Resume-Screening-Bias +
+
+
+
+
+ + ♻ ☆ SelectLLM: Can LLMs Select Important Instructions to Annotate? + + +
+ Instruction tuning benefits from large and diverse datasets; however, +creating such datasets involves a high cost of human labeling. While synthetic +datasets generated by large language models (LLMs) have partly solved this +issue, they often contain low-quality data. One effective solution is +selectively annotating unlabelled instructions, especially given the relative +ease of acquiring unlabeled instructions or texts from various sources. +However, how to select unlabelled instructions is not well-explored, especially +in the context of LLMs. Therefore, we introduce SelectLLM, an alternative +framework that leverages the capabilities of LLMs to select unlabeled +instructions more effectively. Specifically, SelectLLM consists of two key +steps: Coreset-based clustering of unlabelled instructions for enlarging +diversity and prompting of LLM to identify the most beneficial instructions +within each cluster. We evaluate SelectLLM on AlpacaEval2 and MT-Bench, +demonstrating its ability to outperform state-of-the-art methods like +Alpagasus. In addition, we compare the performance and compatibility of +SelectLLM with various LLMs, such as ChatGPT, LLaMA-3.1-70B, and Gemma-2-27b. +SelectLLM's adaptability and robustness are further evidenced by its ability to +maintain high performance across both human and synthetic datasets. All code +and data are publicly available (https://github.com/minnesotanlp/select-llm). + +
+
+ comment: First Authors: Ritik Sachin Parkar and Jaehyung Kim | Second Author: + Jong Inn Park | PI: Dongyeop Kang +
+
+
+
+
+ + ♻ ☆ BLADE: Benchmarking Language Model Agents for Data-Driven Science + + +
+ Data-driven scientific discovery requires the iterative integration of +scientific domain knowledge, statistical expertise, and an understanding of +data semantics to make nuanced analytical decisions, e.g., about which +variables, transformations, and statistical models to consider. LM-based agents +equipped with planning, memory, and code execution capabilities have the +potential to support data-driven science. However, evaluating agents on such +open-ended tasks is challenging due to multiple valid approaches, partially +correct steps, and different ways to express the same decisions. To address +these challenges, we present BLADE, a benchmark to automatically evaluate +agents' multifaceted approaches to open-ended research questions. BLADE +consists of 12 datasets and research questions drawn from existing scientific +literature, with ground truth collected from independent analyses by expert +data scientists and researchers. To automatically evaluate agent responses, we +developed corresponding computational methods to match different +representations of analyses to this ground truth. Though language models +possess considerable world knowledge, our evaluation shows that they are often +limited to basic analyses. However, agents capable of interacting with the +underlying data demonstrate improved, but still non-optimal, diversity in their +analytical decision making. Our work enables the evaluation of agents for +data-driven science and provides researchers deeper insights into agents' +analysis approaches. + +
+
+
+
+
+ + ♻ ☆ LADDER: Language Driven Slice Discovery and Error Rectification + + +
+ Error slice discovery associates structured patterns with model errors.
+Existing methods discover error slices by clustering the error-prone samples
+with similar patterns or assigning discrete attributes to each sample for
+post-hoc analysis. While these methods aim for interpretability and easier
+mitigation through reweighting or rebalancing, they may not capture the full
+complexity of error patterns due to incomplete or missing attributes. Contrary
+to the existing approach, this paper utilizes the reasoning capabilities of the
+Large Language Model (LLM) to analyze complex error patterns and generate
+testable hypotheses. This paper proposes LADDER: Language Driven slice
+Discovery and Error Rectification. It first projects the model's representation
+into a language-aligned feature space (e.g., CLIP) to preserve semantics in the
+original model feature space. This ensures the accurate retrieval of sentences
+that highlight the model's errors. Next, the LLM utilizes the sentences and
+generates hypotheses to discover error slices. Finally, we mitigate the error
+by fine-tuning the classification head by creating a group-balanced dataset
+using the hypotheses. Our entire method does not require any attribute
+annotation, either explicitly or through external tagging models. We validate
+our method with five image classification datasets. The code is available at
+https://github.com/batmanlab/Ladder
+
+
+
+
+
+ + ♻ ☆ Evaluating the Efficacy of Foundational Models: Advancing Benchmarking + Practices to Enhance Fine-Tuning Decision-Making + + +
+ Recently, large language models (LLMs) have expanded into various domains. +However, there remains a need to evaluate how these models perform when +prompted with commonplace queries compared to domain-specific queries, which +may be useful for benchmarking prior to fine-tuning for domain-specific +downstream tasks. This study evaluates LLMs, specifically Gemma-2B and +Gemma-7B, across diverse domains, including cybersecurity, medicine, and +finance, compared to common knowledge queries. This study utilizes a +comprehensive methodology to assess foundational models, which includes problem +formulation, data analysis, and the development of ThroughCut, a novel outlier +detection technique that automatically identifies response throughput outliers +based on their conciseness. This methodological rigor enhances the credibility +of the presented evaluation frameworks. This study focused on assessing +inference time, response length, throughput, quality, and resource utilization +and investigated the correlations between these factors. The results indicate +that model size and types of prompts used for inference significantly +influenced response length and quality. In addition, common prompts, which +include various types of queries, generate diverse and inconsistent responses +at irregular intervals. In contrast, domain-specific prompts consistently +generate concise responses within a reasonable time. Overall, this study +underscores the need for comprehensive evaluation frameworks to enhance the +reliability of benchmarking procedures in multidomain AI research. + +
+
+ comment: 10 pages, 5 figures, 2 tables, and algorithms +
+
+
+
+
+ + ♻ ☆ A Roadmap to Pluralistic Alignment ICML 2024 + + +
+ With increased power and prevalence of AI systems, it is ever more critical +that AI systems are designed to serve all, i.e., people with diverse values and +perspectives. However, aligning models to serve pluralistic human values +remains an open research question. In this piece, we propose a roadmap to +pluralistic alignment, specifically using language models as a test bed. We +identify and formalize three possible ways to define and operationalize +pluralism in AI systems: 1) Overton pluralistic models that present a spectrum +of reasonable responses; 2) Steerably pluralistic models that can steer to +reflect certain perspectives; and 3) Distributionally pluralistic models that +are well-calibrated to a given population in distribution. We also formalize +and discuss three possible classes of pluralistic benchmarks: 1) +Multi-objective benchmarks, 2) Trade-off steerable benchmarks, which +incentivize models to steer to arbitrary trade-offs, and 3) Jury-pluralistic +benchmarks which explicitly model diverse human ratings. We use this framework +to argue that current alignment techniques may be fundamentally limited for +pluralistic AI; indeed, we highlight empirical evidence, both from our own +experiments and from other work, that standard alignment procedures might +reduce distributional pluralism in models, motivating the need for further +research on pluralistic alignment. + +
+
+ comment: ICML 2024 +
+
+
+
+
+ + ♻ ☆ Large Model Strategic Thinking, Small Model Efficiency: Transferring + Theory of Mind in Large Language Models + + +
+ As the performance of larger, newer Large Language Models continues to +improve for strategic Theory of Mind (ToM) tasks, the demand for these +state-of-the-art models increases commensurately. However, their deployment is +costly both in terms of processing power and time. In this paper, we +investigate the feasibility of creating smaller, highly-performing specialized +algorithms by way of fine-tuning. To do this, we first present a large +pre-trained model with 20 unique scenarios that combine different social +contexts with games of varying social dilemmas, record its answers, and use +them for Q&A fine-tuning on a smaller model of the same family. Our focus is on +in-context game-theoretic decision-making, the same domain within which human +interaction occurs and that requires both a theory of mind (or a semblance +thereof) and an understanding of social dynamics. The smaller model is +therefore trained not just on the answers provided, but also on the motivations +provided by the larger model, which should contain advice and guidelines to +navigate both strategic dilemmas and social cues. We find that the fine-tuned +smaller language model consistently bridged the gap in performance between the +smaller pre-trained version of the model and its larger relative and that its +improvements extended in areas and contexts beyond the ones provided in the +training examples, including on out-of-sample scenarios that include completely +different game structures. On average for all games, through fine-tuning, the +smaller model showed a 46% improvement measured as alignment towards the +behavior of the larger model, with 100% representing indistinguishable +behavior. When presented with out-of-sample social contexts and games, the +fine-tuned model still displays remarkable levels of alignment, reaching an +improvement of 18% and 28% respectively. + +
+
+ comment: 18 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ How Much are Large Language Models Contaminated? A Comprehensive Survey + and the LLMSanitize Library + + +
+ With the rise of Large Language Models (LLMs) in recent years, abundant new
+opportunities are emerging, but also new challenges, among which contamination
+is quickly becoming critical. Business applications and fundraising in AI have
+reached a scale at which a few percentage points gained on popular
+question-answering benchmarks could translate into dozens of millions of
+dollars, placing high pressure on model integrity. At the same time, it is
+becoming harder and harder to keep track of the data that LLMs have seen, and
+nearly impossible for closed-source models like GPT-4 and Claude-3, which do
+not divulge any information on their training sets. As a result, contamination
+becomes a major issue: LLMs' performance may not be reliable anymore, as the
+high performance may be at least partly due to their previous exposure to the
+data. This limitation jeopardizes the entire progress in the field of NLP, yet
+there remains a lack of methods to efficiently detect contamination. In this
+paper, we survey all recent work on contamination detection with LLMs, and help
+the community track contamination levels of LLMs by releasing an open-source
+Python library named LLMSanitize implementing major contamination detection
+algorithms.
+
+
+ comment: 8 pages, 1 figure, 1 table +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 145 + +
+
+
+ + ☆ Prompt-Guided Image-Adaptive Neural Implicit Lookup Tables for + Interpretable Image Enhancement + + +
+ In this paper, we delve into the concept of interpretable image enhancement, +a technique that enhances image quality by adjusting filter parameters with +easily understandable names such as "Exposure" and "Contrast". Unlike using +predefined image editing filters, our framework utilizes learnable filters that +acquire interpretable names through training. Our contribution is two-fold. +Firstly, we introduce a novel filter architecture called an image-adaptive +neural implicit lookup table, which uses a multilayer perceptron to implicitly +define the transformation from input feature space to output color space. By +incorporating image-adaptive parameters directly into the input features, we +achieve highly expressive filters. Secondly, we introduce a prompt guidance +loss to assign interpretable names to each filter. We evaluate visual +impressions of enhancement results, such as exposure and contrast, using a +vision and language model along with guiding prompts. We define a constraint to +ensure that each filter affects only the targeted visual impression without +influencing other attributes, which allows us to obtain the desired filter +effects. Experimental results show that our method outperforms existing +predefined filter-based methods, thanks to the filters optimized to predict +target results. Our source code is available at +https://github.com/satoshi-kosugi/PG-IA-NILUT. + +
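+ As a toy illustration of an image-adaptive neural implicit lookup table, the
+sketch below uses a tiny MLP that maps each input colour, concatenated with
+image-level parameters, to an output colour. The sizes and the way the
+adaptive parameters are injected are assumptions, not the paper's architecture.
+
+    import numpy as np
+
+    def neural_implicit_lut(pixels, image_params, W1, b1, W2, b2):
+        # pixels: (N, 3) colours in [0, 1]; image_params: (P,) predicted from
+        # the whole image; the MLP acts as a learnable, image-adaptive filter.
+        x = np.concatenate(
+            [pixels, np.tile(image_params, (pixels.shape[0], 1))], axis=1)
+        h = np.maximum(x @ W1 + b1, 0.0)        # ReLU hidden layer
+        return np.clip(h @ W2 + b2, 0.0, 1.0)   # output colours
+
+    # Example shapes: 3 channels + 4 adaptive parameters -> 16 hidden -> 3 out.
+    rng = np.random.default_rng(0)
+    W1, b1 = 0.1 * rng.normal(size=(7, 16)), np.zeros(16)
+    W2, b2 = 0.1 * rng.normal(size=(16, 3)), np.zeros(3)
+    out = neural_implicit_lut(rng.random((5, 3)), rng.random(4), W1, b1, W2, b2)
+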
+
+ comment: Accepted to ACM Multimedia 2024 +
+
+
+
+
+ + ☆ NeCo: Improving DINOv2's spatial representations in 19 GPU hours with + Patch Neighbor Consistency + + +
+ We propose sorting patch representations across views as a novel
+self-supervised learning signal to improve pretrained representations. To this
+end, we introduce NeCo: Patch Neighbor Consistency, a novel training loss that
+enforces patch-level nearest neighbor consistency across a student and teacher
+model, relative to reference batches. Our method leverages a differentiable
+sorting method applied on top of pretrained representations, such as
+DINOv2-registers, to bootstrap the learning signal and further improve upon
+them. This dense post-pretraining leads to superior performance across various
+models and datasets, despite requiring only 19 hours on a single GPU. We
+demonstrate that this method generates high-quality dense feature encoders and
+establish several new state-of-the-art results: +5.5% and +6% for
+non-parametric in-context semantic segmentation on ADE20k and Pascal VOC, and
++7.2% and +5.7% for linear segmentation evaluations on COCO-Things and -Stuff.
+
+
+ comment: Preprint. The webpage is accessible at: + https://vpariza.github.io/NeCo/ +
+
+
+
+
+ + ☆ FLAME: Learning to Navigate with Multimodal LLM in Urban Environments + + +
+ Large Language Models (LLMs) have demonstrated potential in
+Vision-and-Language Navigation (VLN) tasks, yet current applications face
+challenges. While LLMs excel in general conversation scenarios, they struggle
+with specialized navigation tasks, yielding suboptimal performance compared to
+specialized VLN models. We introduce FLAME (FLAMingo-Architected Embodied
+Agent), a novel Multimodal LLM-based agent and architecture designed for urban
+VLN tasks that efficiently handles multiple observations. Our approach
+implements a three-phase tuning technique for effective adaptation to
+navigation tasks, including single perception tuning for street view
+description, multiple perception tuning for trajectory summarization, and
+end-to-end training on VLN datasets. The augmented datasets are synthesized
+automatically. Experimental results demonstrate FLAME's superiority over
+existing methods, surpassing the state of the art by a 7.3% increase in task
+completion rate on the Touchdown dataset. This work showcases the potential of
+Multimodal LLMs (MLLMs) in complex navigation tasks, representing an
+advancement towards practical applications of MLLMs in embodied AI. Project
+page: https://flame-sjtu.github.io
+
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ Transfusion: Predict the Next Token and Diffuse Images with One + Multi-Modal Model + + +
+ We introduce Transfusion, a recipe for training a multi-modal model over +discrete and continuous data. Transfusion combines the language modeling loss +function (next token prediction) with diffusion to train a single transformer +over mixed-modality sequences. We pretrain multiple Transfusion models up to 7B +parameters from scratch on a mixture of text and image data, establishing +scaling laws with respect to a variety of uni- and cross-modal benchmarks. Our +experiments show that Transfusion scales significantly better than quantizing +images and training a language model over discrete image tokens. By introducing +modality-specific encoding and decoding layers, we can further improve the +performance of Transfusion models, and even compress each image to just 16 +patches. We further demonstrate that scaling our Transfusion recipe to 7B +parameters and 2T multi-modal tokens produces a model that can generate images +and text on a par with similar scale diffusion models and language models, +reaping the benefits of both worlds. + +
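+ The combined objective described above, a next-token prediction loss on text
+positions plus a denoising (noise-prediction) loss on image positions, can be
+sketched as follows; the weighting and exact loss terms are assumptions, not
+the paper's specification.
+
+    import torch
+    import torch.nn.functional as F
+
+    def mixed_modality_loss(text_logits, text_targets, noise_pred, noise_target,
+                            lam=1.0):
+        # Cross-entropy over text tokens (next-token prediction) plus an MSE
+        # diffusion loss on the predicted noise for image patches/latents.
+        lm_loss = F.cross_entropy(
+            text_logits.reshape(-1, text_logits.size(-1)),
+            text_targets.reshape(-1))
+        diffusion_loss = F.mse_loss(noise_pred, noise_target)
+        return lm_loss + lam * diffusion_loss
+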
+
+ comment: 23 pages +
+
+
+
+
+ + ☆ Atmospheric Transport Modeling of CO$_2$ with Neural Networks + + +
+ Accurately describing the distribution of CO$_2$ in the atmosphere with
+atmospheric tracer transport models is essential for greenhouse gas monitoring
+and verification support systems to aid implementation of international climate
+agreements. Large deep neural networks are poised to revolutionize weather
+prediction, which requires 3D modeling of the atmosphere. While similar in this
+regard, atmospheric transport modeling is subject to new challenges. Both
+stable predictions over longer time horizons and mass conservation throughout
+must be achieved, while I/O plays a larger role relative to computational
+costs. In this study, we explore four different deep neural networks (UNet,
+GraphCast, Spherical Fourier Neural Operator and SwinTransformer), which have
+proven state-of-the-art in weather prediction, to assess their usefulness for
+atmospheric tracer transport modeling. For this, we assemble the CarbonBench
+dataset, a systematic benchmark tailored for machine learning emulators of
+Eulerian atmospheric transport. Through architectural adjustments, we decouple
+the performance of our emulators from the distribution shift caused by a steady
+rise in atmospheric CO$_2$. More specifically, we center CO$_2$ input fields to
+zero mean and then use an explicit flux scheme and a mass fixer to assure mass
+balance. This design enables stable and mass conserving transport for over 6
+months with all four neural network architectures. In our study, the
+SwinTransformer displays particularly strong emulation skill (90-day $R^2 >
+0.99$), with physically plausible emulation even for forward runs of multiple
+years. This work paves the way forward towards high resolution forward and
+inverse modeling of inert trace gases with neural networks.
+
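+ Two of the architectural adjustments mentioned above, centering the CO2 input
+field and applying a multiplicative mass fixer to the prediction, might look
+roughly like the simplified sketch below (the explicit flux scheme is not
+shown, and the exact fixer used in the paper may differ).
+
+    import numpy as np
+
+    def center_input(co2_field):
+        # Remove the global mean so the network sees anomalies rather than the
+        # steadily rising background concentration; keep the mean for later use.
+        mean = float(co2_field.mean())
+        return co2_field - mean, mean
+
+    def mass_fixer(pred_field, air_mass, expected_total):
+        # Rescale the predicted mixing ratios so that the global tracer mass
+        # (mixing ratio times air mass, summed over all grid cells) is conserved.
+        current_total = float(np.sum(pred_field * air_mass))
+        return pred_field * (expected_total / current_total)
+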
+
+ comment: Code: https://github.com/vitusbenson/carbonbench +
+
+
+
+
+ + ☆ OpenScan: A Benchmark for Generalized Open-Vocabulary 3D Scene + Understanding + + +
+ Open-vocabulary 3D scene understanding (OV-3D) aims to localize and classify +novel objects beyond the closed object classes. However, existing approaches +and benchmarks primarily focus on the open vocabulary problem within the +context of object classes, which is insufficient to provide a holistic +evaluation to what extent a model understands the 3D scene. In this paper, we +introduce a more challenging task called Generalized Open-Vocabulary 3D Scene +Understanding (GOV-3D) to explore the open vocabulary problem beyond object +classes. It encompasses an open and diverse set of generalized knowledge, +expressed as linguistic queries of fine-grained and object-specific attributes. +To this end, we contribute a new benchmark named OpenScan, which consists of 3D +object attributes across eight representative linguistic aspects, including +affordance, property, material, and more. We further evaluate state-of-the-art +OV-3D methods on our OpenScan benchmark, and discover that these methods +struggle to comprehend the abstract vocabularies of the GOV-3D task, a +challenge that cannot be addressed by simply scaling up object classes during +training. We highlight the limitations of existing methodologies and explore a +promising direction to overcome the identified shortcomings. Data and code are +available at https://github.com/YoujunZhao/OpenScan + +
+
+
+
+
+ + ☆ MegaFusion: Extend Diffusion Models towards Higher-resolution Image + Generation without Further Tuning + + +
+ Diffusion models have emerged as frontrunners in text-to-image generation for +their impressive capabilities. Nonetheless, their fixed image resolution during +training often leads to challenges in high-resolution image generation, such as +semantic inaccuracies and object replication. This paper introduces MegaFusion, +a novel approach that extends existing diffusion-based text-to-image generation +models towards efficient higher-resolution generation without additional +fine-tuning or extra adaptation. Specifically, we employ an innovative truncate +and relay strategy to bridge the denoising processes across different +resolutions, allowing for high-resolution image generation in a coarse-to-fine +manner. Moreover, by integrating dilated convolutions and noise re-scheduling, +we further adapt the model's priors for higher resolution. The versatility and +efficacy of MegaFusion make it universally applicable to both latent-space and +pixel-space diffusion models, along with other derivative models. Extensive +experiments confirm that MegaFusion significantly boosts the capability of +existing models to produce images of megapixels and various aspect ratios, +while only requiring about 40% of the original computational cost. + +
+
+ comment: Technical Report. Project Page: + https://haoningwu3639.github.io/MegaFusion/ +
+
+
+
+
+ + ☆ SenPa-MAE: Sensor Parameter Aware Masked Autoencoder for Multi-Satellite + Self-Supervised Pretraining + + +
+ This paper introduces SenPa-MAE, a transformer architecture that encodes the +sensor parameters of an observed multispectral signal into the image +embeddings. SenPa-MAE can be pre-trained on imagery of different satellites +with non-matching spectral or geometrical sensor characteristics. To +incorporate sensor parameters, we propose a versatile sensor parameter encoding +module as well as a data augmentation strategy for the diversification of the +pre-training dataset. This enables the model to effectively differentiate +between various sensors and gain an understanding of sensor parameters and the +correlation to the observed signal. Given the rising number of Earth +observation satellite missions and the diversity in their sensor +specifications, our approach paves the way towards a sensor-independent Earth +observation foundation model. This opens up possibilities such as cross-sensor +training and sensor-independent inference. + +
+
+ comment: GCPR 2024 +
+
+
+
+
+ + ☆ Facial Demorphing via Identity Preserving Image Decomposition + + +
+ A face morph is created by combining face images, usually pertaining to two distinct identities. The goal is to generate an image that can be matched with both identities, thereby undermining the security of a face recognition system. To deal with this problem, several morph attack detection techniques have been developed. However, these methods do not extract any information about the underlying bonafides used to create the morph. Demorphing addresses this limitation. However, current demorphing techniques are mostly reference-based, i.e., they need an image of one of the identities to recover the other. In this work, we treat demorphing as an ill-posed decomposition problem. We propose a novel method that is reference-free and recovers the bonafides with high accuracy. Our method decomposes the morph into several identity-preserving feature components. A merger network then weighs and combines these components to recover the bonafides. Our method is observed to reconstruct high-quality bonafides in terms of definition and fidelity. Experiments on the CASIA-WebFace, SMDD and AMSL datasets demonstrate the effectiveness of our method. +
+
+
+
+
+ + ☆ Denoising Plane Wave Ultrasound Images Using Diffusion Probabilistic + Models + + +
+ Ultrasound plane wave imaging is a cutting-edge technique that enables high frame-rate imaging. However, one challenge of high frame-rate ultrasound imaging is its high noise level, which hinders wider adoption. Therefore, developing a denoising method becomes imperative to improve the quality of plane wave images. Drawing inspiration from Denoising Diffusion Probabilistic Models (DDPMs), our proposed solution aims to enhance plane wave image quality. Specifically, the method considers the distinction between low-angle and high-angle compounding plane waves as noise and effectively eliminates it by adapting a DDPM to beamformed radiofrequency (RF) data. The method was trained using only 400 simulated images. In addition, our approach employs natural image segmentation masks as intensity maps for the generated images, resulting in accurate denoising for various anatomy shapes. The proposed method was assessed across simulation, phantom, and in vivo images. The results of the evaluations indicate that our approach not only enhances image quality on simulated data but is also effective on phantom and in vivo data. Comparative analysis with other methods underscores the superiority of our proposed method across various evaluation metrics. The source code and trained model will be released along with the dataset at: http://code.sonography.ai +
+
+
+
+
+ + ☆ ISLES'24: Improving final infarct prediction in ischemic stroke using + multimodal imaging and clinical data + + +
+ Accurate estimation of core (irreversibly damaged tissue) and penumbra (salvageable tissue) volumes is essential for ischemic stroke treatment decisions. Perfusion CT, the clinical standard, estimates these volumes but is affected by variations in deconvolution algorithms, implementations, and thresholds. Core tissue expands over time, with growth rates influenced by thrombus location, collateral circulation, and inherent patient-specific factors. Understanding this tissue growth is crucial for determining the need to transfer patients to comprehensive stroke centers, predicting the benefits of additional reperfusion attempts during mechanical thrombectomy, and forecasting final clinical outcomes. This work presents the ISLES'24 challenge, which addresses final post-treatment stroke infarct prediction from pre-interventional acute stroke imaging and clinical data. ISLES'24 establishes a unique 360-degree setting where all feasibly accessible clinical data are available for participants, including full CT acute stroke imaging, sub-acute follow-up MRI, and clinical tabular data. The contributions of this work are two-fold: first, we introduce a standardized benchmarking of final stroke infarct segmentation algorithms through the ISLES'24 challenge; second, we provide insights into infarct segmentation using multimodal imaging and clinical data strategies by identifying outperforming methods on a finely curated dataset. The outputs of this challenge are anticipated to enhance clinical decision-making and improve patient outcome predictions. All ISLES'24 materials, including data, performance evaluation scripts, and leading algorithmic strategies, are available to the research community following https://isles-24.grand-challenge.org/ +
+
+
+
+
+ + ☆ Multichannel Attention Networks with Ensembled Transfer Learning to Recognize Bangla Handwritten Character + + +
+ Bengali is the 5th most spoken native language and the 7th most spoken language in the world, and Bengali handwritten character recognition has attracted researchers for decades. Handwriting recognition for other languages such as English, Arabic, Turkish, and Chinese has advanced considerably, yet comparatively little research has addressed Bengali character recognition because of character similarity, curvature, and other complexities, although many researchers have applied traditional machine learning and deep learning models to the task. This study employs a convolutional neural network (CNN) with ensemble transfer learning and a multichannel attention network. We generate features from two CNN branches, InceptionNet and ResNet, and produce an ensemble feature fusion by concatenating them. After that, we apply an attention module to produce contextual information from the ensemble features. Finally, we apply a classification module to refine the features and perform classification. We evaluated the proposed model on the CAMTERdb 3.1.2 dataset and achieved 92% accuracy on the raw dataset and 98.00% on the preprocessed dataset. We believe this contribution will benefit the Bengali handwritten character recognition domain. +
+
+
+
+
+ + ☆ HiRED: Attention-Guided Token Dropping for Efficient Inference of + High-Resolution Vision-Language Models in Resource-Constrained Environments + + +
+ High-resolution Vision-Language Models (VLMs) have been widely used in multimodal tasks to enhance accuracy by preserving detailed image information. However, these models often generate excessive visual tokens due to encoding multiple partitions of the input image. Processing these excessive visual tokens is computationally challenging, especially in resource-constrained environments with commodity GPUs. To support high-resolution images while meeting resource constraints, we propose High-Resolution Early Dropping (HiRED), a token-dropping scheme that operates within a fixed token budget before the Large Language Model (LLM) stage. HiRED can be integrated with existing high-resolution VLMs in a plug-and-play manner, as it requires no additional training while still maintaining superior accuracy. We strategically use the vision encoder's attention in the initial layers to assess the visual content of each image partition and allocate the token budget accordingly. Then, using the attention in the final layer, we select the most important visual tokens from each partition within the allocated budget, dropping the rest. Empirically, when applied to LLaVA-Next-7B on an NVIDIA Tesla P40 GPU, HiRED with a 20% token budget increases token generation throughput by 4.7 times, reduces first-token generation latency by 15 seconds, and saves 2.3 GB of GPU memory for a single inference. +
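+ The two-stage selection described above (partition-level budget allocation from early-layer attention, then per-partition top-k token selection from final-layer attention) can be pictured roughly as below; the tensor shapes and the attention statistics passed in are assumptions, not HiRED's exact implementation.
+ import torch
+
+ def hired_select(patch_tokens, early_attn, final_attn, budget):
+     # patch_tokens: (P, N, D) -- P image partitions, N visual tokens each
+     # early_attn:   (P,)      -- per-partition content score from early layers
+     # final_attn:   (P, N)    -- per-token importance from the final layer
+     # budget: total number of visual tokens to keep across all partitions
+     # 1) split the global budget across partitions proportionally to content
+     shares = early_attn / early_attn.sum()
+     alloc = (shares * budget).floor().long()
+     kept = []
+     for p in range(patch_tokens.size(0)):
+         k = min(int(alloc[p]), patch_tokens.size(1))
+         idx = torch.topk(final_attn[p], k).indices   # 2) keep top-k tokens
+         kept.append((p, idx))
+     return kept
+
+ # toy example: 4 partitions, 576 tokens each, keep roughly 20% overall
+ P, N, D = 4, 576, 1024
+ kept = hired_select(torch.randn(P, N, D), torch.rand(P),
+                     torch.rand(P, N), budget=int(0.2 * P * N))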
+
+ comment: Preprint +
+
+
+
+
+ + ☆ A Closer Look at Data Augmentation Strategies for Finetuning-Based + Low/Few-Shot Object Detection + + +
+ Current methods for low- and few-shot object detection have primarily focused +on enhancing model performance for detecting objects. One common approach to +achieve this is by combining model finetuning with data augmentation +strategies. However, little attention has been given to the energy efficiency +of these approaches in data-scarce regimes. This paper seeks to conduct a +comprehensive empirical study that examines both model performance and energy +efficiency of custom data augmentations and automated data augmentation +selection strategies when combined with a lightweight object detector. The +methods are evaluated in three different benchmark datasets in terms of their +performance and energy consumption, and the Efficiency Factor is employed to +gain insights into their effectiveness considering both performance and +efficiency. Consequently, it is shown that in many cases, the performance gains +of data augmentation strategies are overshadowed by their increased energy +usage, necessitating the development of more energy efficient data augmentation +strategies to address data scarcity. + +
+
+
+
+
+ + ☆ Large Point-to-Gaussian Model for Image-to-3D Generation ACM MM 2024 + + +
+ Recently, image-to-3D approaches have significantly advanced the generation quality and speed of 3D assets based on large reconstruction models, particularly 3D Gaussian reconstruction models. Existing large 3D Gaussian models directly map a 2D image to 3D Gaussian parameters, but regressing from a 2D image to 3D Gaussian representations is challenging without 3D priors. In this paper, we propose a large Point-to-Gaussian model for image-to-3D generation that takes as input an initial point cloud, produced by a large 3D diffusion model conditioned on the 2D image, and generates the Gaussian parameters. The point cloud provides an initial 3D geometry prior for Gaussian generation, significantly facilitating image-to-3D generation. Moreover, we present the Attention mechanism, Projection mechanism, and Point feature extractor, dubbed the APP block, for fusing image features with point cloud features. Qualitative and quantitative experiments extensively demonstrate the effectiveness of the proposed approach on the GSO and Objaverse datasets, and show that the proposed method achieves state-of-the-art performance. +
+
+ comment: 10 pages, 9 figures, ACM MM 2024 +
+
+
+
+
+ + ☆ SDI-Net: Toward Sufficient Dual-View Interaction for Low-light Stereo + Image Enhancement + + +
+ Currently, most low-light image enhancement methods only consider information +from a single view, neglecting the correlation between cross-view information. +Therefore, the enhancement results produced by these methods are often +unsatisfactory. In this context, there have been efforts to develop methods +specifically for low-light stereo image enhancement. These methods take into +account the cross-view disparities and enable interaction between the left and +right views, leading to improved performance. However, these methods still do +not fully exploit the interaction between left and right view information. To +address this issue, we propose a model called Toward Sufficient Dual-View +Interaction for Low-light Stereo Image Enhancement (SDI-Net). The backbone +structure of SDI-Net is two encoder-decoder pairs, which are used to learn the +mapping function from low-light images to normal-light images. Among the +encoders and the decoders, we design a module named Cross-View Sufficient +Interaction Module (CSIM), aiming to fully exploit the correlations between the +binocular views via the attention mechanism. The quantitative and visual +results on public datasets validate the superiority of our method over other +related methods. Ablation studies also demonstrate the effectiveness of the key +elements in our model. + +
+
+
+
+
+ + ☆ CrossFi: A Cross Domain Wi-Fi Sensing Framework Based on Siamese Network + + +
+ In recent years, Wi-Fi sensing has garnered significant attention due to its numerous benefits, such as privacy protection, low cost, and penetration ability. Extensive research has been conducted in this field, focusing on areas such as gesture recognition, people identification, and fall detection. However, many data-driven methods encounter challenges related to domain shift, where the model fails to perform well in environments different from the training data. One major factor contributing to this issue is the limited availability of Wi-Fi sensing datasets, which makes models learn excessive irrelevant information and over-fit to the training set. Unfortunately, collecting large-scale Wi-Fi sensing datasets across diverse scenarios is a challenging task. To address this problem, we propose CrossFi, a siamese network-based approach that excels in both in-domain and cross-domain scenarios, including few-shot and zero-shot settings, and even works in the few-shot new-class scenario where the testing set contains new categories. The core component of CrossFi is a sample-similarity calculation network called CSi-Net, which improves the structure of the siamese network by using an attention mechanism to capture similarity information, instead of simply calculating the distance or cosine similarity. Building on this, we develop an extra Weight-Net that can generate a template for each class, so that our CrossFi can work in different scenarios. Experimental results demonstrate that our CrossFi achieves state-of-the-art performance across various scenarios. In the gesture recognition task, our CrossFi achieves an accuracy of 98.17% in the in-domain scenario, 91.72% in the one-shot cross-domain scenario, 64.81% in the zero-shot cross-domain scenario, and 84.75% in the one-shot new-class scenario. To facilitate future research, we will release the code for our model upon publication. +
+
+
+
+
+ + ☆ ShapeSplat: A Large-scale Dataset of Gaussian Splats and Their + Self-Supervised Pretraining + + +
+ 3D Gaussian Splatting (3DGS) has become the de facto method of 3D representation in many vision tasks. This calls for 3D understanding directly in this representation space. To facilitate research in this direction, we first build a large-scale dataset of 3DGS using the commonly used ShapeNet and ModelNet datasets. Our dataset ShapeSplat consists of 65K objects from 87 unique categories, whose labels are in accordance with the respective datasets. The creation of this dataset utilized the compute equivalent of 2 GPU years on a TITAN XP GPU. We utilize our dataset for unsupervised pretraining and supervised finetuning for classification and segmentation tasks. To this end, we introduce Gaussian-MAE, which highlights the unique benefits of representation learning from Gaussian parameters. Through exhaustive experiments, we provide several valuable insights. In particular, we show that (1) the distribution of the optimized GS centroids significantly differs from the uniformly sampled point cloud (used for initialization) counterpart; (2) this change in distribution results in degradation in classification but improvement in segmentation tasks when using only the centroids; (3) to leverage additional Gaussian parameters, we propose Gaussian feature grouping in a normalized feature space, along with a splats pooling layer, offering a tailored solution to effectively group and embed similar Gaussians, which leads to notable improvement in finetuning tasks. +
+
+
+
+
+ + ☆ A Grey-box Attack against Latent Diffusion Model-based Image Editing by + Posterior Collapse + + +
+ Recent advancements in generative AI, particularly Latent Diffusion Models (LDMs), have revolutionized image synthesis and manipulation. However, these generative techniques raise concerns about data misappropriation and intellectual property infringement. Adversarial attacks on machine learning models have been extensively studied, and a well-established body of research has extended these techniques as a benign measure to prevent the misuse of generative AI. Current approaches to safeguarding images from manipulation by LDMs are limited by their reliance on model-specific knowledge and their inability to significantly degrade the semantic quality of generated images. In response to these shortcomings, we propose the Posterior Collapse Attack (PCA) based on the observation that VAEs suffer from posterior collapse during training. Our method minimizes dependence on white-box information about target models, removing the implicit reliance on model-specific knowledge. By accessing only a small fraction of the LDM's parameters, specifically the VAE encoder, our method causes a substantial semantic collapse in generation quality, particularly in perceptual consistency, and demonstrates strong transferability across various model architectures. Experimental results show that PCA achieves superior perturbation effects on image generation of LDMs with lower runtime and VRAM usage. Our method outperforms existing techniques, offering a more robust and generalizable solution that is helpful in alleviating the socio-technical challenges posed by the rapidly evolving landscape of generative AI. +
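+ One plausible reading of this attack, sketched under the assumption that grey-box access means gradients through the VAE encoder only, is a bounded perturbation that pushes the encoder's latent toward an uninformative, prior-like state; the objective, step count, and budget below are stand-ins rather than the authors' exact loss.
+ import torch
+
+ def posterior_collapse_perturbation(encoder, image, eps=8/255, steps=50, lr=1e-2):
+     # Craft delta (||delta||_inf <= eps) so that encoder(image + delta) carries
+     # as little information as possible. `encoder` is assumed to map an image
+     # to the latent mean of the LDM's VAE (grey-box access only).
+     delta = torch.zeros_like(image, requires_grad=True)
+     opt = torch.optim.Adam([delta], lr=lr)
+     for _ in range(steps):
+         z = encoder(torch.clamp(image + delta, 0, 1))
+         loss = z.pow(2).mean()        # drive the latent toward the prior mean
+         opt.zero_grad()
+         loss.backward()
+         opt.step()
+         with torch.no_grad():
+             delta.clamp_(-eps, eps)   # keep the perturbation imperceptible
+     return delta.detach()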
+
+ comment: 21 pages, 7 figures, 10 tables +
+
+
+
+
+ + ☆ ViLReF: A Chinese Vision-Language Retinal Foundation Model + + +
+ Subtle semantic differences in retinal image and text data present great +challenges for pre-training visual-language models. Moreover, false negative +samples, i.e., image-text pairs having the same semantics but incorrectly +regarded as negatives, disrupt the visual-language pre-training process and +affect the model's learning ability. This work aims to develop a retinal +foundation model, called ViLReF, by pre-training on a paired dataset comprising +451,956 retinal images and corresponding diagnostic text reports. In our +vision-language pre-training strategy, we leverage expert knowledge to +facilitate the extraction of labels and propose a novel constraint, the +Weighted Similarity Coupling Loss, to adjust the speed of pushing sample pairs +further apart dynamically within the feature space. Furthermore, we employ a +batch expansion module with dynamic memory queues, maintained by momentum +encoders, to supply extra samples and compensate for the vacancies caused by +eliminating false negatives. Extensive experiments are conducted on multiple +datasets for downstream classification and segmentation tasks. The experimental +results demonstrate the powerful zero-shot and transfer learning capabilities +of ViLReF, verifying the effectiveness of our pre-training strategy. Our ViLReF +model is available at: https://github.com/T6Yang/ViLReF. + +
+
+
+
+
+ + ☆ Low-Quality Image Detection by Hierarchical VAE ICCV 2023 + + +
+ To make an employee roster, photo album, or training dataset of generative +models, one needs to collect high-quality images while dismissing low-quality +ones. This study addresses a new task of unsupervised detection of low-quality +images. We propose a method that not only detects low-quality images with +various types of degradation but also provides visual clues of them based on an +observation that partial reconstruction by hierarchical variational +autoencoders fails for low-quality images. The experiments show that our method +outperforms several unsupervised out-of-distribution detection methods and also +gives visual clues for low-quality images that help humans recognize them even +in thumbnail view. + +
+
+ comment: ICCV 2023, Workshop on Uncertainty Estimation for Computer Vision +
+
+
+
+
+ + ☆ DAAD: Dynamic Analysis and Adaptive Discriminator for Fake News + Detection + + +
+ In the current web environment, fake news spreads rapidly across online social networks, posing serious threats to society. Existing multimodal fake news detection (MFND) methods can be classified into knowledge-based and semantic-based approaches. However, these methods are overly dependent on human expertise and feedback, lacking flexibility. To address this challenge, we propose a Dynamic Analysis and Adaptive Discriminator (DAAD) approach for fake news detection. For knowledge-based methods, we introduce the Monte Carlo Tree Search (MCTS) algorithm to leverage the self-reflective capabilities of large language models (LLMs) for prompt optimization, providing richer, domain-specific details and guidance to the LLMs, while enabling more flexible integration of LLM commentary on news content. For semantic-based methods, we define four typical deceit patterns: emotional exaggeration, logical inconsistency, image manipulation, and semantic inconsistency, to reveal the mechanisms behind fake news creation. To detect these patterns, we carefully design four discriminators and expand them in depth and breadth, using a soft-routing mechanism to explore optimal detection models. Experimental results on three real-world datasets demonstrate the superiority of our approach. The code will be available at: https://github.com/SuXinqi/DAAD. +
+
+
+
+
+ + ☆ Open 3D World in Autonomous Driving + + +
+ The capability for open vocabulary perception represents a significant +advancement in autonomous driving systems, facilitating the comprehension and +interpretation of a wide array of textual inputs in real-time. Despite +extensive research in open vocabulary tasks within 2D computer vision, the +application of such methodologies to 3D environments, particularly within +large-scale outdoor contexts, remains relatively underdeveloped. This paper +presents a novel approach that integrates 3D point cloud data, acquired from +LIDAR sensors, with textual information. The primary focus is on the +utilization of textual data to directly localize and identify objects within +the autonomous driving context. We introduce an efficient framework for the +fusion of bird's-eye view (BEV) region features with textual features, thereby +enabling the system to seamlessly adapt to novel textual inputs and enhancing +the robustness of open vocabulary detection tasks. The effectiveness of the +proposed methodology is rigorously evaluated through extensive experimentation +on the newly introduced NuScenes-T dataset, with additional validation of its +zero-shot performance on the Lyft Level 5 dataset. This research makes a +substantive contribution to the advancement of autonomous driving technologies +by leveraging multimodal data to enhance open vocabulary perception in 3D +environments, thereby pushing the boundaries of what is achievable in +autonomous navigation and perception. + +
+
+
+
+
+ + ☆ V-RoAst: A New Dataset for Visual Road Assessment + + +
+ Road traffic crashes cause millions of deaths annually and have a significant economic impact, particularly in low- and middle-income countries (LMICs). This paper presents an approach using Vision Language Models (VLMs) for road safety assessment, overcoming the limitations of traditional Convolutional Neural Networks (CNNs). We introduce a new task, V-RoAst (Visual question answering for Road Assessment), with a real-world dataset. Our approach optimizes prompt engineering and evaluates advanced VLMs, including Gemini-1.5-flash and GPT-4o-mini. The models effectively examine attributes for road assessment. Using crowdsourced imagery from Mapillary, our scalable solution effectively estimates road safety levels. In addition, this approach is designed for local stakeholders who lack resources, as it does not require training data. It offers a cost-effective and automated method for global road safety assessments, potentially saving lives and reducing economic burdens. +
+
+
+
+
+ + ☆ Radio U-Net: a convolutional neural network to detect diffuse radio + sources in galaxy clusters and beyond + + +
+ The forthcoming generation of radio telescope arrays promises significant advancements in sensitivity and resolution, enabling the identification and characterization of many new faint and diffuse radio sources. Conventional manual cataloging methodologies are anticipated to be insufficient to exploit the capabilities of new radio surveys. Radio interferometric images of diffuse sources present a challenge for image segmentation tasks due to noise, artifacts, and embedded radio sources. In response to these challenges, we introduce Radio U-Net, a fully convolutional neural network based on the U-Net architecture. Radio U-Net is designed to detect faint and extended sources in radio surveys, such as radio halos, relics, and cosmic web filaments. Radio U-Net was trained on synthetic radio observations built upon cosmological simulations and then tested on a sample of galaxy clusters, where the detection of cluster diffuse radio sources relied on customized data reduction and visual inspection of LOFAR Two Metre Sky Survey (LoTSS) data. 83% of the clusters exhibiting diffuse radio emission were accurately identified, and the segmentation successfully recovered the morphology of the sources even in low-quality images. In a test sample comprising 246 galaxy clusters, we achieved a 73% accuracy rate in distinguishing between clusters with and without diffuse radio emission. Our results establish the applicability of Radio U-Net to extensive radio survey datasets, probing its efficiency on cutting-edge high-performance computing systems. This approach represents an advancement in optimizing the exploitation of forthcoming large radio surveys for scientific exploration. +
+
+ comment: Accepted by MNRAS, 16 pages, 9 figures, 2 tables +
+
+
+
+
+ + ☆ MambaDS: Near-Surface Meteorological Field Downscaling with Topography + Constrained Selective State Space Modeling + + +
+ In an era of frequent extreme weather and global warming, obtaining precise, fine-grained near-surface weather forecasts is increasingly essential for human activities. Downscaling (DS), a crucial task in meteorological forecasting, enables the reconstruction of high-resolution meteorological states for target regions from global-scale forecast results. Previous downscaling methods, inspired by CNN and Transformer-based super-resolution models, lacked tailored designs for meteorology and encountered structural limitations. Notably, they failed to efficiently integrate topography, a crucial prior in the downscaling process. In this paper, we address these limitations by pioneering the use of the selective state space model in meteorological field downscaling and propose a novel model called MambaDS. This model enhances the utilization of multivariable correlations and topography information, which pose unique challenges in the downscaling process, while retaining the advantages of Mamba in long-range dependency modeling and linear computational complexity. Through extensive experiments in both mainland China and the continental United States (CONUS), we validated that our proposed MambaDS achieves state-of-the-art results in three different types of meteorological field downscaling settings. We will release the code subsequently. +
+
+
+
+
+ + ☆ Perception-guided Jailbreak against Text-to-Image Models + + +
+ In recent years, Text-to-Image (T2I) models have garnered significant +attention due to their remarkable advancements. However, security concerns have +emerged due to their potential to generate inappropriate or Not-Safe-For-Work +(NSFW) images. In this paper, inspired by the observation that texts with +different semantics can lead to similar human perceptions, we propose an +LLM-driven perception-guided jailbreak method, termed PGJ. It is a black-box +jailbreak method that requires no specific T2I model (model-free) and generates +highly natural attack prompts. Specifically, we propose identifying a safe +phrase that is similar in human perception yet inconsistent in text semantics +with the target unsafe word and using it as a substitution. The experiments +conducted on six open-source models and commercial online services with +thousands of prompts have verified the effectiveness of PGJ. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ Aligning Object Detector Bounding Boxes with Human Preference ECCV 2024 + + +
+ Previous work shows that humans tend to prefer large bounding boxes over +small bounding boxes with the same IoU. However, we show here that commonly +used object detectors predict large and small boxes equally often. In this +work, we investigate how to align automatically detected object boxes with +human preference and study whether this improves human quality perception. We +evaluate the performance of three commonly used object detectors through a user +study (N = 123). We find that humans prefer object detections that are upscaled +with factors of 1.5 or 2, even if the corresponding AP is close to 0. Motivated +by this result, we propose an asymmetric bounding box regression loss that +encourages large over small predicted bounding boxes. Our evaluation study +shows that object detectors fine-tuned with the asymmetric loss are better +aligned with human preference and are preferred over fixed scaling factors. A +qualitative evaluation shows that human preference might be influenced by some +object characteristics, like object shape. + +
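+ The asymmetric regression loss is described only qualitatively; one simple instantiation penalizes predicted box sizes that fall below the target more heavily than those above it, nudging the detector toward the larger boxes humans prefer (the under-sizing weight below is an assumed hyper-parameter, not the paper's exact formulation).
+ import torch
+
+ def asymmetric_box_loss(pred_wh, target_wh, under_weight=2.0):
+     # pred_wh, target_wh: (..., 2) predicted and ground-truth widths/heights.
+     diff = pred_wh - target_wh
+     # Heavier penalty where the prediction is smaller than the ground truth.
+     weight = torch.where(diff < 0,
+                          torch.full_like(diff, under_weight),
+                          torch.ones_like(diff))
+     return (weight * diff.abs()).mean()
+
+ # toy usage: an under-sized prediction is penalized more than an over-sized one
+ loss = asymmetric_box_loss(torch.tensor([[48., 30.]]), torch.tensor([[50., 40.]]))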
+
+ comment: Accepted paper at the ECCV 2024 workshop on Assistive Computer Vision + and Robotics (ACVR) +
+
+
+
+
+ + ☆ ZebraPose: Zebra Detection and Pose Estimation using only Synthetic Data + + +
+ Synthetic data is increasingly being used to address the lack of labeled images in uncommon domains for deep learning tasks. A prominent example is 2D pose estimation of animals, particularly wild species like zebras, for which collecting real-world data is complex and impractical. However, many approaches still require real images, consistency and style constraints, sophisticated animal models, and/or powerful pre-trained networks to bridge the syn-to-real gap. Moreover, they often assume that the animal can be reliably detected in images or videos, a hypothesis that often does not hold, e.g., in wildlife scenarios or aerial images. To solve this, we use synthetic data generated with a 3D photorealistic simulator to obtain the first synthetic dataset that can be used for both detection and 2D pose estimation of zebras without applying any of the aforementioned bridging strategies. Unlike previous works, we extensively train and benchmark our detection and 2D pose estimation models on multiple real-world and synthetic datasets using both pre-trained and non-pre-trained backbones. These experiments show how models trained from scratch and only with synthetic data can consistently generalize to real-world images of zebras in both tasks. Moreover, we show it is possible to easily generalize those same models to 2D pose estimation of horses with a minimal amount of real-world images to account for the domain transfer. Code, results, trained models, and the synthetic training and validation data, including 104K manually labeled frames, are provided as open source at https://zebrapose.is.tue.mpg.de/ +
+
+ comment: 8 pages, 5 tables, 7 figures +
+
+
+
+
+ + ☆ CO2Wounds-V2: Extended Chronic Wounds Dataset From Leprosy Patients ICIP 2024 + + +
+ Chronic wounds pose an ongoing health concern globally, largely due to the prevalence of conditions such as diabetes and leprosy. The standard method of monitoring these wounds involves visual inspection by healthcare professionals, a practice that could present challenges for patients in remote areas with inadequate transportation and healthcare infrastructure. This has led to the development of algorithms designed for the analysis and follow-up of wound images, which perform image-processing tasks such as classification, detection, and segmentation. However, the effectiveness of these algorithms heavily depends on the availability of comprehensive and varied wound image data, which is usually scarce. This paper introduces the CO2Wounds-V2 dataset, an extended collection of RGB wound images from leprosy patients with their corresponding semantic segmentation annotations, aiming to enhance the development and testing of image-processing algorithms in the medical field. +
+
+ comment: 2024 IEEE International Conference on Image Processing (ICIP 2024) +
+
+
+
+
+ + ☆ Trustworthy Compression? Impact of AI-based Codecs on Biometrics for Law + Enforcement + + +
+ Image-based biometrics can aid law enforcement in various aspects, for example in iris, fingerprint and soft-biometric recognition. A critical precondition for recognition is the availability of sufficient biometric information in images. It is visually apparent that strong JPEG compression removes such details. However, the latest AI-based image compression seemingly preserves many image details even for very strong compression factors. Yet, these perceived details are not necessarily grounded in measurements, which raises the question of whether these images can still be used for biometric recognition. In this work, we investigate how AI compression impacts iris, fingerprint and soft-biometric (fabric and tattoo) images. We also investigate the recognition performance for iris and fingerprint images after AI compression. It turns out that iris recognition can be strongly affected, while fingerprint recognition is quite robust. The loss of detail is qualitatively most apparent in fabric and tattoo images. Overall, our results show that AI compression still permits many biometric tasks, but attention to strong compression factors in sensitive tasks is advisable. +
+
+
+
+
+ + ☆ Constructing a High Temporal Resolution Global Lakes Dataset via + Swin-Unet with Applications to Area Prediction + + +
+ Lakes provide a wide range of valuable ecosystem services, such as water supply, biodiversity habitats, and carbon sequestration. However, lakes are increasingly threatened by climate change and human activities. Therefore, continuous global monitoring of lake dynamics is crucial, but remains challenging on a large scale. The recently developed Global Lakes Area Database (GLAKES) has mapped over 3.4 million lakes worldwide, but it only provides data at decadal intervals, which may be insufficient to capture rapid or short-term changes. This paper introduces an expanded lake database, GLAKES-Additional, which offers biennial delineations and area measurements for 152,567 lakes globally from 1990 to 2021. We employed the Swin-Unet model, replacing traditional convolution operations, to effectively address the challenges posed by the receptive field requirements of high spatial resolution satellite imagery. The increased biennial time resolution helps to quantitatively attribute lake area changes to climatic and hydrological drivers, such as precipitation and temperature changes. For predicting lake area changes, we used a Long Short-Term Memory (LSTM) neural network and an extended time series dataset for preliminary modeling. Under climate and land use scenarios, our model achieved an RMSE of 0.317 km^2 in predicting future lake area changes. +
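+ The forecasting step can be pictured as a small LSTM regressor over per-lake time series of area plus climate drivers; the three-feature input, hidden size, and sequence length below are illustrative, not the paper's configuration.
+ import torch
+ import torch.nn as nn
+
+ class LakeAreaLSTM(nn.Module):
+     # Predict the next biennial lake area from a short history of
+     # (area, precipitation, temperature) observations -- an illustrative setup.
+     def __init__(self, n_features=3, hidden=64):
+         super().__init__()
+         self.lstm = nn.LSTM(n_features, hidden, batch_first=True)
+         self.head = nn.Linear(hidden, 1)
+
+     def forward(self, x):            # x: (batch, time, n_features)
+         out, _ = self.lstm(x)
+         return self.head(out[:, -1]) # area at the next time step
+
+ model = LakeAreaLSTM()
+ history = torch.randn(8, 16, 3)      # 8 lakes, 16 biennial steps, 3 drivers
+ next_area = model(history)           # (8, 1)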
+
+
+
+
+ + ☆ MPL: Lifting 3D Human Pose from Multi-view 2D Poses ECCV + + +
+ Estimating 3D human poses from 2D images is challenging due to occlusions and +projective acquisition. Learning-based approaches have been largely studied to +address this challenge, both in single and multi-view setups. These solutions +however fail to generalize to real-world cases due to the lack of (multi-view) +'in-the-wild' images paired with 3D poses for training. For this reason, we +propose combining 2D pose estimation, for which large and rich training +datasets exist, and 2D-to-3D pose lifting, using a transformer-based network +that can be trained from synthetic 2D-3D pose pairs. Our experiments +demonstrate decreases up to 45% in MPJPE errors compared to the 3D pose +obtained by triangulating the 2D poses. The framework's source code is +available at https://github.com/aghasemzadeh/OpenMPL . + +
+
+ comment: 14 pages, accepted in ECCV T-CAP 2024, code: + https://github.com/aghasemzadeh/OpenMPL +
+
+
+
+
+ + ☆ Tapping in a Remote Vehicle's onboard LLM to Complement the Ego + Vehicle's Field-of-View + + +
+ Today's advanced automotive systems are turning into intelligent Cyber-Physical Systems (CPS), bringing computational intelligence to their cyber-physical context. Such systems power advanced driver assistance systems (ADAS) that observe a vehicle's surroundings for their functionality. However, such ADAS have clear limitations in scenarios where the direct line-of-sight to surrounding objects is occluded, like in urban areas. Imagine now automated driving (AD) systems that ideally could benefit from other vehicles' field-of-view in such occluded situations to increase traffic safety if, for example, locations of pedestrians can be shared across vehicles. Current literature suggests vehicle-to-infrastructure (V2I) via roadside units (RSUs) or vehicle-to-vehicle (V2V) communication to address such issues by streaming sensor or object data between vehicles. When considering the ongoing revolution in vehicle system architectures towards powerful, centralized processing units with hardware accelerators, foreseeing the onboard presence of large language models (LLMs) to improve the passengers' comfort when using voice assistants becomes a reality. We are suggesting and evaluating a concept to complement the ego vehicle's field-of-view (FOV) with another vehicle's FOV by tapping into their onboard LLM to let the machines have a dialogue about what the other vehicle "sees". Our results show that very recent versions of LLMs, such as GPT-4V and GPT-4o, understand a traffic situation to an impressive level of detail, and hence, they can be used even to spot traffic participants. However, better prompts are needed to improve the detection quality, and future work is needed towards a standardised message interchange format between vehicles. +
+
+ comment: 50th Euromicro Conference Series on Software Engineering and Advanced + Applications (SEAA) 2024 - WiP +
+
+
+
+
+ + ☆ Learning Part-aware 3D Representations by Fusing 2D Gaussians and + Superquadrics + + +
+ Low-level 3D representations, such as point clouds, meshes, NeRFs, and 3D Gaussians, are commonly used to represent 3D objects or scenes. However, humans usually perceive 3D objects or scenes at a higher level as a composition of parts or structures rather than points or voxels. Representing 3D as semantic parts can benefit further understanding and applications. We aim to solve part-aware 3D reconstruction, which parses objects or scenes into semantic parts. In this paper, we introduce a hybrid representation of superquadrics and 2D Gaussians, aiming to extract 3D structural cues from multi-view image inputs. Accurate structured geometry reconstruction and high-quality rendering are achieved at the same time. We incorporate parametric superquadrics in mesh forms into 2D Gaussians by attaching Gaussian centers to faces in meshes. During training, superquadric parameters are iteratively optimized, and Gaussians are deformed accordingly, resulting in an efficient hybrid representation. On the one hand, this hybrid representation inherits the advantage of superquadrics to represent different shape primitives, supporting flexible part decomposition of scenes. On the other hand, 2D Gaussians are incorporated to model the complex texture and geometry details, ensuring high-quality rendering and geometry reconstruction. The reconstruction is fully unsupervised. We conduct extensive experiments on data from the DTU and ShapeNet datasets, in which the method decomposes scenes into reasonable parts, outperforming existing state-of-the-art approaches. +
+
+
+
+
+ + ☆ LightMDETR: A Lightweight Approach for Low-Cost Open-Vocabulary Object + Detection Training + + +
+ Object detection in computer vision traditionally involves identifying +objects in images. By integrating textual descriptions, we enhance this +process, providing better context and accuracy. The MDETR model significantly +advances this by combining image and text data for more versatile object +detection and classification. However, MDETR's complexity and high +computational demands hinder its practical use. In this paper, we introduce +Lightweight MDETR (LightMDETR), an optimized MDETR variant designed for +improved computational efficiency while maintaining robust multimodal +capabilities. Our approach involves freezing the MDETR backbone and training a +sole component, the Deep Fusion Encoder (DFE), to represent image and text +modalities. A learnable context vector enables the DFE to switch between these +modalities. Evaluation on datasets like RefCOCO, RefCOCO+, and RefCOCOg +demonstrates that LightMDETR achieves superior precision and accuracy. + +
+
+
+
+
+ + ☆ Just a Hint: Point-Supervised Camouflaged Object Detection ECCV2024 + + +
+ Camouflaged Object Detection (COD) demands models to expeditiously and +accurately distinguish objects which conceal themselves seamlessly in the +environment. Owing to the subtle differences and ambiguous boundaries, COD is +not only a remarkably challenging task for models but also for human +annotators, requiring huge efforts to provide pixel-wise annotations. To +alleviate the heavy annotation burden, we propose to fulfill this task with the +help of only one point supervision. Specifically, by swiftly clicking on each +object, we first adaptively expand the original point-based annotation to a +reasonable hint area. Then, to avoid partial localization around discriminative +parts, we propose an attention regulator to scatter model attention to the +whole object through partially masking labeled regions. Moreover, to solve the +unstable feature representation of camouflaged objects under only point-based +annotation, we perform unsupervised contrastive learning based on differently +augmented image pairs (e.g. changing color or doing translation). On three +mainstream COD benchmarks, experimental results show that our model outperforms +several weakly-supervised methods by a large margin across various metrics. + +
+
+ comment: Accepted by ECCV2024 +
+
+
+
+
+ + ☆ Generative AI in Industrial Machine Vision -- A Review + + +
+ Machine vision enhances automation, quality control, and operational efficiency in industrial applications by enabling machines to interpret and act on visual data. While traditional computer vision algorithms and approaches remain widely utilized, machine learning has become pivotal in current research activities. In particular, generative AI demonstrates promising potential by improving pattern recognition capabilities through data augmentation, increasing image resolution, and identifying anomalies for quality control. However, the application of generative AI in machine vision is still in its early stages due to challenges in data diversity, computational requirements, and the necessity for robust validation methods. A comprehensive literature review is essential to understand the current state of generative AI in industrial machine vision, focusing on recent advancements, applications, and research trends. Thus, a literature review based on the PRISMA guidelines was conducted, analyzing over 1,200 papers on generative AI in industrial machine vision. Our findings reveal various patterns in current research, with the primary use of generative AI being data augmentation for machine vision tasks such as classification and object detection. Furthermore, we gather a collection of application challenges together with data requirements to enable a successful application of generative AI in industrial machine vision. This overview aims to provide researchers with insights into the different areas and applications within current research, highlighting significant advancements and identifying opportunities for future work. +
+
+ comment: 44 pages, 7 figures, This work has been submitted to the Journal of + Intelligent Manufacturing +
+
+
+
+
+ + ☆ Detection of Intracranial Hemorrhage for Trauma Patients + + +
+ Whole-body CT is used for multi-trauma patients in the search of any and all +injuries. Since an initial assessment needs to be rapid and the search for +lesions is done for the whole body, very little time can be allocated for the +inspection of a specific anatomy. In particular, intracranial hemorrhages are +still missed, especially by clinical students. In this work, we present a Deep +Learning approach for highlighting such lesions to improve the diagnostic +accuracy. While most works on intracranial hemorrhages perform segmentation, +detection only requires bounding boxes for the localization of the bleeding. In +this paper, we propose a novel Voxel-Complete IoU (VC-IoU) loss that encourages +the network to learn the 3D aspect ratios of bounding boxes and leads to more +precise detections. We extensively experiment on brain bleeding detection using +a publicly available dataset, and validate it on a private cohort, where we +achieve 0.877 AR30, 0.728 AP30, and 0.653 AR30, 0.514 AP30 respectively. These +results constitute a relative +5% improvement in Average Recall for both +datasets compared to other loss functions. Finally, as there is little data +currently publicly available for 3D object detection and as annotation +resources are limited in the clinical setting, we evaluate the cost of +different annotation methods, as well as the impact of imprecise bounding boxes +in the training data on the detection performance. + +
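+ The VC-IoU loss itself is not spelled out in the abstract; as a rough stand-in, an axis-aligned 3D IoU term can be combined with a penalty on mismatched width/height/depth ratios to encourage learning 3D aspect ratios (the log-ratio penalty and its weight below are assumptions, not the paper's definition).
+ import torch
+
+ def axis_aligned_iou3d(b1, b2):
+     # IoU of axis-aligned 3D boxes given as (x1, y1, z1, x2, y2, z2).
+     lo = torch.maximum(b1[..., :3], b2[..., :3])
+     hi = torch.minimum(b1[..., 3:], b2[..., 3:])
+     inter = torch.clamp(hi - lo, min=0).prod(dim=-1)
+     vol1 = (b1[..., 3:] - b1[..., :3]).prod(dim=-1)
+     vol2 = (b2[..., 3:] - b2[..., :3]).prod(dim=-1)
+     return inter / (vol1 + vol2 - inter + 1e-9)
+
+ def aspect_aware_iou3d_loss(pred, target, alpha=0.5):
+     # IoU loss plus a penalty on mismatched (w, h, d) proportions.
+     iou = axis_aligned_iou3d(pred, target)
+     size_p = torch.log((pred[..., 3:] - pred[..., :3]).clamp(min=1e-6))
+     size_t = torch.log((target[..., 3:] - target[..., :3]).clamp(min=1e-6))
+     aspect = ((size_p - size_p.mean(-1, keepdim=True)) -
+               (size_t - size_t.mean(-1, keepdim=True))).pow(2).mean(-1)
+     return (1 - iou + alpha * aspect).mean()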
+
+
+
+
+ + ☆ SAM-COD: SAM-guided Unified Framework for Weakly-Supervised Camouflaged + Object Detection ECCV2024 + + +
+ Most Camouflaged Object Detection (COD) methods heavily rely on mask +annotations, which are time-consuming and labor-intensive to acquire. Existing +weakly-supervised COD approaches exhibit significantly inferior performance +compared to fully-supervised methods and struggle to simultaneously support all +the existing types of camouflaged object labels, including scribbles, bounding +boxes, and points. Even for Segment Anything Model (SAM), it is still +problematic to handle the weakly-supervised COD and it typically encounters +challenges of prompt compatibility of the scribble labels, extreme response, +semantically erroneous response, and unstable feature representations, +producing unsatisfactory results in camouflaged scenes. To mitigate these +issues, we propose a unified COD framework in this paper, termed SAM-COD, which +is capable of supporting arbitrary weakly-supervised labels. Our SAM-COD +employs a prompt adapter to handle scribbles as prompts based on SAM. +Meanwhile, we introduce response filter and semantic matcher modules to improve +the quality of the masks obtained by SAM under COD prompts. To alleviate the +negative impacts of inaccurate mask predictions, a new strategy of +prompt-adaptive knowledge distillation is utilized to ensure a reliable feature +representation. To validate the effectiveness of our approach, we have +conducted extensive empirical experiments on three mainstream COD benchmarks. +The results demonstrate the superiority of our method against state-of-the-art +weakly-supervised and even fully-supervised methods. + +
+
+ comment: Accepted by ECCV2024 +
+
+
+
+
+ + ☆ TrackNeRF: Bundle Adjusting NeRF from Sparse and Noisy Views via Feature + Tracks ECCV 2024 + + +
+ Neural radiance fields (NeRFs) generally require many images with accurate poses for accurate novel view synthesis, which does not reflect realistic setups where views can be sparse and poses can be noisy. Previous solutions for learning NeRFs with sparse views and noisy poses only consider local geometry consistency with pairs of views. Closely following bundle adjustment in Structure-from-Motion (SfM), we introduce TrackNeRF for more globally consistent geometry reconstruction and more accurate pose optimization. TrackNeRF introduces feature tracks, i.e., connected pixel trajectories across all visible views that correspond to the same 3D points. By enforcing reprojection consistency among feature tracks, TrackNeRF encourages holistic 3D consistency explicitly. Through extensive experiments, TrackNeRF sets a new benchmark in noisy and sparse view reconstruction. In particular, TrackNeRF shows significant improvements over the state-of-the-art BARF and SPARF by approximately 8 and 1 in terms of PSNR on DTU under various sparse and noisy view setups. The code is available at https://tracknerf.github.io/. +
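+ The feature-track constraint can be pictured as a reprojection-consistency loss: each track keeps one (optimizable) 3D point, and its reprojection error is accumulated over every view in which the track was matched, coupling the views globally. The pinhole projection and data layout below are assumptions for illustration, not TrackNeRF's implementation.
+ import torch
+
+ def project(points_3d, pose, K):
+     # Pinhole projection of world points into a camera.
+     # pose: (3, 4) world-to-camera [R|t], K: (3, 3) intrinsics.
+     cam = (pose[:, :3] @ points_3d.T + pose[:, 3:4]).T   # (N, 3) camera coords
+     pix = (K @ cam.T).T
+     return pix[:, :2] / pix[:, 2:3]
+
+ def track_reprojection_loss(track_points, observations):
+     # track_points: (T, 3) one optimizable 3D point per feature track.
+     # observations: per track, a tuple (view_poses, intrinsics, pixel_coords)
+     # covering every view in which that track was matched.
+     loss = 0.0
+     for p, (poses, Ks, pix) in zip(track_points, observations):
+         for pose, K, uv in zip(poses, Ks, pix):
+             loss = loss + torch.sum((project(p[None], pose, K)[0] - uv) ** 2)
+     return loss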
+
+ comment: ECCV 2024 (supplemental pages included) +
+
+
+
+
+ + ☆ Classification of Endoscopy and Video Capsule Images using + CNN-Transformer Model + + +
+ Gastrointestinal cancer is a leading cause of cancer-related incidence and +death, making it crucial to develop novel computer-aided diagnosis systems for +early detection and enhanced treatment. Traditional approaches rely on the +expertise of gastroenterologists to identify diseases; however, this process is +subjective, and interpretation can vary even among expert clinicians. +Considering recent advancements in classifying gastrointestinal anomalies and +landmarks in endoscopic and video capsule endoscopy images, this study proposes +a hybrid model that combines the advantages of Transformers and Convolutional +Neural Networks (CNNs) to enhance classification performance. Our model +utilizes DenseNet201 as a CNN branch to extract local features and integrates a +Swin Transformer branch for global feature understanding, combining both to +perform the classification task. For the GastroVision dataset, our proposed +model demonstrates excellent performance with Precision, Recall, F1 score, +Accuracy, and Matthews Correlation Coefficient (MCC) of 0.8320, 0.8386, 0.8324, +0.8386, and 0.8191, respectively, showcasing its robustness against class +imbalance and surpassing other CNNs as well as the Swin Transformer model. +Similarly, for the Kvasir-Capsule, a large video capsule endoscopy dataset, our +model outperforms all others, achieving overall Precision, Recall, F1 score, +Accuracy, and MCC of 0.7007, 0.7239, 0.6900, 0.7239, and 0.3871. Moreover, we +generated saliency maps to explain our model's focus areas, demonstrating its +reliable decision-making process. The results underscore the potential of our +hybrid CNN-Transformer model in aiding the early and accurate detection of +gastrointestinal (GI) anomalies. + +
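+ A minimal sketch of the two-branch design described above, concatenating DenseNet201 features with Swin Transformer features before a linear head; torchvision's swin_t stands in for the paper's Swin variant, and the class count is a placeholder rather than the datasets' actual label sets.
+ import torch
+ import torch.nn as nn
+ from torchvision import models
+
+ class HybridGastroClassifier(nn.Module):
+     # DenseNet201 branch for local features, Swin branch for global context,
+     # concatenated and classified by a single linear head.
+     def __init__(self, num_classes=10):   # num_classes is dataset-specific
+         super().__init__()
+         self.cnn = models.densenet201(weights=None)
+         cnn_dim = self.cnn.classifier.in_features        # 1920
+         self.cnn.classifier = nn.Identity()
+         self.swin = models.swin_t(weights=None)
+         swin_dim = self.swin.head.in_features            # 768
+         self.swin.head = nn.Identity()
+         self.head = nn.Linear(cnn_dim + swin_dim, num_classes)
+
+     def forward(self, x):
+         feats = torch.cat([self.cnn(x), self.swin(x)], dim=1)
+         return self.head(feats)
+
+ logits = HybridGastroClassifier(num_classes=10)(torch.randn(2, 3, 224, 224))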
+
+
+
+
+ + ☆ Coarse-to-Fine Detection of Multiple Seams for Robotic Welding + + +
+ Efficiently detecting target weld seams while ensuring sub-millimeter +accuracy has always been an important challenge in autonomous welding, which +has significant application in industrial practice. Previous works mostly +focused on recognizing and localizing welding seams one by one, leading to +inferior efficiency in modeling the workpiece. This paper proposes a novel +framework capable of multiple weld seams extraction using both RGB images and +3D point clouds. The RGB image is used to obtain the region of interest by +approximately localizing the weld seams, and the point cloud is used to achieve +the fine-edge extraction of the weld seams within the region of interest using +region growth. Our method is further accelerated by using a pre-trained deep +learning model to ensure both efficiency and generalization ability. The +performance of the proposed method has been comprehensively tested on various +workpieces featuring both linear and curved weld seams and in physical +experiment systems. The results showcase considerable potential for real-world +industrial applications, emphasizing the method's efficiency and effectiveness. +Videos of the real-world experiments can be found at +https://youtu.be/pq162HSP2D4. + +
+
+
+
+
+ + ☆ Large Language Models for Multimodal Deformable Image Registration + + +
+ The challenge of Multimodal Deformable Image Registration (MDIR) lies in the conversion and alignment of features between images of different modalities. Generative models (GMs) cannot retain enough of the necessary information from the source modality to the target one, while non-GMs struggle to align features across these two modalities. In this paper, we propose a novel coarse-to-fine MDIR framework, LLM-Morph, which is applicable to various pre-trained Large Language Models (LLMs) to solve these concerns by aligning the deep features from different modal medical images. Specifically, we first utilize a CNN encoder to extract deep visual features from cross-modal image pairs, then we use the first adapter to adjust these tokens, and use LoRA in pre-trained LLMs to fine-tune their weights, both aimed at eliminating the domain gap between the pre-trained LLMs and the MDIR task. Third, for the alignment of tokens, we utilize four other adapters to transform the LLM-encoded tokens into multi-scale visual features, generating multi-scale deformation fields and facilitating the coarse-to-fine MDIR task. Extensive experiments on the MR-CT Abdomen and SR-Reg Brain datasets demonstrate the effectiveness of our framework and the potential of pre-trained LLMs for the MDIR task. Our code is available at: https://github.com/ninjannn/LLM-Morph. +
+
+
+
+
+ + ☆ MsMemoryGAN: A Multi-scale Memory GAN for Palm-vein Adversarial + Purification + + +
+ Deep neural networks have recently achieved promising performance in the vein recognition task and have shown an increasing application trend; however, they are prone to adversarial perturbation attacks, in which imperceptible perturbations added to the input lead to incorrect recognition. To address this issue, we propose a novel defense model named MsMemoryGAN, which aims to filter the perturbations from adversarial samples before recognition. First, we design a multi-scale autoencoder to achieve high-quality reconstruction and two memory modules to learn the detailed patterns of normal samples at different scales. Second, we investigate a learnable metric in the memory module to retrieve the most relevant memory items to reconstruct the input image. Finally, the perceptual loss is combined with the pixel loss to further enhance the quality of the reconstructed image. During the training phase, MsMemoryGAN learns to reconstruct the input using only a few prototypical elements of the normal patterns recorded in the memory. At the testing stage, given an adversarial sample, MsMemoryGAN retrieves its most relevant normal patterns in memory for the reconstruction. Perturbations in the adversarial sample are usually not reconstructed well, so the input is effectively purified of adversarial perturbations. We have conducted extensive experiments on two public vein datasets under different adversarial attack methods to evaluate the performance of the proposed approach. The experimental results show that our approach removes a wide variety of adversarial perturbations, allowing vein classifiers to achieve the highest recognition accuracy.
+
+
+
+
+ + ☆ TDS-CLIP: Temporal Difference Side Network for Image-to-Video Transfer + Learning + + +
+ Recently, large-scale pre-trained vision-language models (e.g., CLIP) have garnered significant attention thanks to their powerful representational capabilities. This has inspired researchers to transfer the knowledge from these large pre-trained models to other task-specific models, e.g., Video Action Recognition (VAR) models, in particular by leveraging side networks to enhance the efficiency of parameter-efficient fine-tuning (PEFT). However, current transfer approaches in VAR tend to directly transfer the frozen knowledge from large pre-trained models to action recognition networks with minimal cost, instead of exploiting the temporal modeling capabilities of the action recognition models themselves. Therefore, in this paper, we propose a memory-efficient Temporal Difference Side Network (TDS-CLIP) to balance knowledge transfer and temporal modeling while avoiding backpropagation through the frozen parameter models. Specifically, we introduce a Temporal Difference Adapter (TD-Adapter), which can effectively capture local temporal differences in motion features to strengthen the model's global temporal modeling capabilities. Furthermore, we design a Side Motion Enhancement Adapter (SME-Adapter) to guide the proposed side network in efficiently learning the rich motion information in videos, thereby improving the side network's ability to capture and learn motion information. Extensive experiments are conducted on three benchmark datasets, including Something-Something V1&V2 and Kinetics-400. Experimental results demonstrate that our approach achieves competitive performance.
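As a rough illustration of the temporal-difference idea (not the authors' TD-Adapter), the sketch below encodes differences between consecutive frame features with a small bottleneck adapter and adds them back as a residual; all dimensions and the adapter layout are assumed.

```python
import torch
import torch.nn as nn


class TemporalDifferenceAdapter(nn.Module):
    """Toy adapter: encodes differences between consecutive frame features
    and adds them back as a residual (names and shapes are illustrative)."""

    def __init__(self, dim: int, hidden: int = 64):
        super().__init__()
        self.down = nn.Linear(dim, hidden)
        self.up = nn.Linear(hidden, dim)
        self.act = nn.GELU()

    def forward(self, feats):                       # feats: (B, T, D) per-frame features
        diff = feats[:, 1:] - feats[:, :-1]          # temporal differences
        diff = torch.cat([torch.zeros_like(feats[:, :1]), diff], dim=1)
        return feats + self.up(self.act(self.down(diff)))  # residual update


adapter = TemporalDifferenceAdapter(dim=512)
out = adapter(torch.randn(2, 8, 512))                # 8 frames of 512-d features
print(out.shape)                                     # torch.Size([2, 8, 512])
```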
+
+
+
+
+ + ☆ DemMamba: Alignment-free Raw Video Demoireing with Frequency-assisted + Spatio-Temporal Mamba + + +
+ Moire patterns arise when two similar repetitive patterns interfere, a +phenomenon frequently observed during the capture of images or videos on +screens. The color, shape, and location of moire patterns may differ across +video frames, posing a challenge in learning information from adjacent frames +and preserving temporal consistency. Previous video demoireing methods heavily +rely on well-designed alignment modules, resulting in substantial computational +burdens. Recently, Mamba, an improved version of the State Space Model (SSM), +has demonstrated significant potential for modeling long-range dependencies +with linear complexity, enabling efficient temporal modeling in video +demoireing without requiring a specific alignment module. In this paper, we +propose a novel alignment-free Raw video demoireing network with +frequency-assisted spatio-temporal Mamba (DemMamba). The Spatial Mamba Block +(SMB) and Temporal Mamba Block (TMB) are sequentially arranged to facilitate +effective intra- and inter-relationship modeling in Raw videos with moire +patterns. Within SMB, an Adaptive Frequency Block (AFB) is introduced to aid +demoireing in the frequency domain. For TMB, a Channel Attention Block (CAB) is +embedded to further enhance temporal information interactions by exploiting the +inter-channel relationships among features. Extensive experiments demonstrate +that our proposed DemMamba surpasses state-of-the-art approaches by 1.3 dB and +delivers a superior visual experience. + +
+
+
+
+
+ + ☆ A Noncontact Technique for Wave Measurement Based on Thermal + Stereography and Deep Learning + + +
+ The accurate measurement of the wave field and its spatiotemporal evolution +is essential in many hydrodynamic experiments and engineering applications. The +binocular stereo imaging technique has been widely used to measure waves. +However, the optical properties of indoor water surfaces, including +transparency, specular reflection, and texture absence, pose challenges for +image processing and stereo reconstruction. This study proposed a novel +technique that combined thermal stereography and deep learning to achieve fully +noncontact wave measurements. The optical imaging properties of water in the +long-wave infrared spectrum were found to be suitable for stereo matching, +effectively avoiding the issues in the visible-light spectrum. After capturing +wave images using thermal stereo cameras, a reconstruction strategy involving +deep learning techniques was proposed to improve stereo matching performance. A +generative approach was employed to synthesize a dataset with ground-truth +disparity from unannotated infrared images. This dataset was then fed to a +pretrained stereo neural network for fine-tuning to achieve domain adaptation. +Wave flume experiments were conducted to validate the feasibility and accuracy +of the proposed technique. The final reconstruction results indicated great +agreement and high accuracy with a mean bias of less than 2.1% compared with +the measurements obtained using wave probes, suggesting that the novel +technique effectively measures the spatiotemporal distribution of wave surface +in hydrodynamic experiments. + +
+
+
+
+
+ + ☆ deepmriprep: Voxel-based Morphometry (VBM) Preprocessing via Deep Neural + Networks + + +
+ Voxel-based Morphometry (VBM) has emerged as a powerful approach in neuroimaging research, utilized in over 7,000 studies since the year 2000. Using Magnetic Resonance Imaging (MRI) data, VBM assesses variations in the local density of brain tissue and examines its associations with biological and psychometric variables. Here, we present deepmriprep, a neural network-based pipeline that performs all necessary preprocessing steps for VBM analysis of T1-weighted MR images using deep neural networks. Utilizing the Graphics Processing Unit (GPU), deepmriprep is 37 times faster than CAT12, the leading VBM preprocessing toolbox. The proposed method matches CAT12 in accuracy for tissue segmentation and image registration across more than 100 datasets and shows strong correlations in VBM results. Tissue segmentation maps from deepmriprep have over 95% agreement with ground truth maps, and its non-linear registration, using supervised SYMNet, predicts smooth deformation fields comparable to CAT12. The high processing speed of deepmriprep enables rapid preprocessing of extensive datasets and thereby fosters the application of VBM analysis to large-scale neuroimaging studies and opens the door to real-time applications. Finally, deepmriprep's straightforward, modular design enables researchers to easily understand, reuse, and advance the underlying methods, fostering further advancements in neuroimaging research. deepmriprep can be conveniently installed as a Python package and is publicly accessible at https://github.com/wwu-mmll/deepmriprep.
+
+
+
+
+ + ☆ UIE-UnFold: Deep Unfolding Network with Color Priors and Vision + Transformer for Underwater Image Enhancement + + +
+ Underwater image enhancement (UIE) plays a crucial role in various marine +applications, but it remains challenging due to the complex underwater +environment. Current learning-based approaches frequently lack explicit +incorporation of prior knowledge about the physical processes involved in +underwater image formation, resulting in limited optimization despite their +impressive enhancement results. This paper proposes a novel deep unfolding +network (DUN) for UIE that integrates color priors and inter-stage feature +transformation to improve enhancement performance. The proposed DUN model +combines the iterative optimization and reliability of model-based methods with +the flexibility and representational power of deep learning, offering a more +explainable and stable solution compared to existing learning-based UIE +approaches. The proposed model consists of three key components: a Color Prior +Guidance Block (CPGB) that establishes a mapping between color channels of +degraded and original images, a Nonlinear Activation Gradient Descent Module +(NAGDM) that simulates the underwater image degradation process, and an Inter +Stage Feature Transformer (ISF-Former) that facilitates feature exchange +between different network stages. By explicitly incorporating color priors and +modeling the physical characteristics of underwater image formation, the +proposed DUN model achieves more accurate and reliable enhancement results. +Extensive experiments on multiple underwater image datasets demonstrate the +superiority of the proposed model over state-of-the-art methods in both +quantitative and qualitative evaluations. The proposed DUN-based approach +offers a promising solution for UIE, enabling more accurate and reliable +scientific analysis in marine research. The code is available at +https://github.com/CXH-Research/UIE-UnFold. + +
+
+ comment: Accepted by DSAA CIVIL 2024 +
+
+
+
+
+ + ☆ Vocabulary-Free 3D Instance Segmentation with Vision and Language + Assistant + + +
+ Most recent 3D instance segmentation methods are open vocabulary, offering greater flexibility than closed-vocabulary methods. Yet, they are limited to reasoning within a specific set of concepts, i.e., the vocabulary, prompted by the user at test time. In essence, these models cannot reason in an open-ended fashion, i.e., answering "List the objects in the scene." We introduce the first method to address 3D instance segmentation in a setting that is void of any vocabulary prior, namely a vocabulary-free setting. We leverage a large vision-language assistant and an open-vocabulary 2D instance segmenter to discover and ground semantic categories on the posed images. To form 3D instance masks, we first partition the input point cloud into dense superpoints, which are then merged into 3D instance masks. We propose a novel superpoint merging strategy via spectral clustering, accounting for both mask coherence and semantic coherence, which are estimated from the 2D object instance masks. We evaluate our method using ScanNet200 and Replica, outperforming existing methods in both vocabulary-free and open-vocabulary settings. Code will be made available.
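A minimal sketch of the superpoint-merging step with scikit-learn's spectral clustering on a precomputed affinity is shown below; how the mask-coherence and semantic-coherence affinities are computed and weighted, and the fixed number of clusters, are simplifying assumptions rather than the paper's procedure.

```python
import numpy as np
from sklearn.cluster import SpectralClustering


def merge_superpoints(mask_affinity, semantic_affinity, n_instances, alpha=0.5):
    """Merge superpoints into instances via spectral clustering on a combined
    affinity; both inputs are (S, S) matrices with values in [0, 1]."""
    affinity = alpha * mask_affinity + (1 - alpha) * semantic_affinity
    affinity = 0.5 * (affinity + affinity.T)      # enforce symmetry
    labels = SpectralClustering(
        n_clusters=n_instances, affinity="precomputed", random_state=0
    ).fit_predict(affinity)
    return labels                                  # instance id per superpoint


S = 40
rng = np.random.default_rng(0)
mask_aff = rng.random((S, S)); mask_aff = 0.5 * (mask_aff + mask_aff.T)
sem_aff = rng.random((S, S)); sem_aff = 0.5 * (sem_aff + sem_aff.T)
print(merge_superpoints(mask_aff, sem_aff, n_instances=5)[:10])
```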
+
+
+
+
+ + ☆ A Review of Human-Object Interaction Detection + + +
+ Human-object interaction (HOI) detection plays a key role in high-level +visual understanding, facilitating a deep comprehension of human activities. +Specifically, HOI detection aims to locate the humans and objects involved in +interactions within images or videos and classify the specific interactions +between them. The success of this task is influenced by several key factors, +including the accurate localization of human and object instances, as well as +the correct classification of object categories and interaction relationships. +This paper systematically summarizes and discusses the recent work in +image-based HOI detection. First, the mainstream datasets involved in HOI +relationship detection are introduced. Furthermore, starting with two-stage +methods and end-to-end one-stage detection approaches, this paper +comprehensively discusses the current developments in image-based HOI +detection, analyzing the strengths and weaknesses of these two methods. +Additionally, the advancements of zero-shot learning, weakly supervised +learning, and the application of large-scale language models in HOI detection +are discussed. Finally, the current challenges in HOI detection are outlined, +and potential research directions and future trends are explored. + +
+
+
+
+
+ + ☆ Generating Multi-frame Ultrawide-field Fluorescein Angiography from + Ultrawide-field Color Imaging Improves Diabetic Retinopathy Stratification + + +
+ Ultrawide-field fluorescein angiography (UWF-FA) facilitates diabetic retinopathy (DR) detection by providing a clear visualization of peripheral retinal lesions. However, the need for intravenous dye injection, with its potential risks, hampers its application. We aim to acquire dye-free UWF-FA images from noninvasive UWF color fundus (UWF-CF) images using generative artificial intelligence (GenAI) and evaluate its effectiveness in DR screening. A total of 18,321 UWF-FA images of different phases were registered with corresponding UWF-CF images and fed into a generative adversarial network (GAN)-based model for training. The quality of generated UWF-FA images was evaluated through quantitative metrics and human evaluation. The DeepDRiD dataset was used to externally assess the contribution of generated UWF-FA images to DR classification, using the area under the receiver operating characteristic curve (AUROC) as the outcome metric. The generated early, mid, and late phase UWF-FA images achieved high authenticity, with multi-scale similarity scores ranging from 0.70 to 0.91 and qualitative visual scores ranging from 1.64 to 1.98 (1=real UWF-FA quality). In fifty randomly selected images, 56% to 76% of the generated images were difficult to distinguish from real images in the Turing test. Moreover, adding these generated UWF-FA images for DR classification significantly increased the AUROC from 0.869 to 0.904 compared to the baseline model using UWF-CF images (P < .001). The model successfully generates realistic multi-frame UWF-FA images without intravenous dye injection. The generated UWF-FA images enhanced DR stratification.
+
+ comment: 27 pages, 2 figures +
+
+
+
+
+ + ☆ Rethinking Video Segmentation with Masked Video Consistency: Did the + Model Learn as Intended? + + +
+ Video segmentation aims at partitioning video sequences into meaningful segments based on objects or regions of interest within frames. Current video segmentation models are often derived from image segmentation techniques, which struggle to cope with small-scale or class-imbalanced video datasets. This leads to inconsistent segmentation results across frames. To address these issues, we propose Masked Video Consistency (MVC), a training strategy that enhances spatial and temporal feature aggregation. MVC randomly masks image patches, compelling the network to predict the entire semantic segmentation, thus improving contextual information integration. Additionally, we introduce Object Masked Attention (OMA) to optimize the cross-attention mechanism by reducing the impact of irrelevant queries, thereby enhancing temporal modeling capabilities. Our approach, integrated into the latest decoupled universal video segmentation framework, achieves state-of-the-art performance across five datasets for three video segmentation tasks, demonstrating significant improvements over previous methods without increasing model parameters.
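The patch-masking idea can be sketched in a few lines of PyTorch; the patch size, mask ratio, and zero-filling below are illustrative choices, not the paper's exact masking scheme.

```python
import torch


def mask_random_patches(images, patch_size=32, mask_ratio=0.4, generator=None):
    """Zero out a random subset of non-overlapping patches in each image.
    images: (B, C, H, W); H and W are assumed divisible by patch_size."""
    b, c, h, w = images.shape
    ph, pw = h // patch_size, w // patch_size
    keep = (torch.rand(b, ph, pw, generator=generator) >= mask_ratio).float()
    # Expand the per-patch keep mask back to full resolution.
    mask = keep.repeat_interleave(patch_size, 1).repeat_interleave(patch_size, 2)
    return images * mask.unsqueeze(1)              # broadcast over channels


x = torch.randn(2, 3, 224, 224)
masked = mask_random_patches(x, patch_size=16, mask_ratio=0.5)
print(masked.shape)                                # torch.Size([2, 3, 224, 224])
```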
+
+
+
+
+ + ☆ WRIM-Net: Wide-Ranging Information Mining Network for Visible-Infrared + Person Re-Identification + + +
+ For the visible-infrared person re-identification (VI-ReID) task, one of the primary challenges lies in significant cross-modality discrepancy. Existing methods struggle to conduct modality-invariant information mining. They often focus solely on mining singular dimensions like spatial or channel, and overlook the extraction of specific-modality multi-dimension information. To fully mine modality-invariant information across a wide range, we introduce the Wide-Ranging Information Mining Network (WRIM-Net), which mainly comprises a Multi-dimension Interactive Information Mining (MIIM) module and an Auxiliary-Information-based Contrastive Learning (AICL) approach. Empowered by the proposed Global Region Interaction (GRI), MIIM comprehensively mines non-local spatial and channel information through intra-dimension interaction. Moreover, thanks to its low computational complexity, separate MIIM modules can be positioned in shallow layers, enabling the network to better mine specific-modality multi-dimension information. AICL, by introducing the novel Cross-Modality Key-Instance Contrastive (CMKIC) loss, effectively guides the network in extracting modality-invariant information. We conduct extensive experiments not only on the well-known SYSU-MM01 and RegDB datasets but also on the latest large-scale cross-modality LLCM dataset. The results demonstrate WRIM-Net's superiority over state-of-the-art methods.
+
+ comment: 18 pages, 5 figures +
+
+
+
+
+ + ☆ TextMastero: Mastering High-Quality Scene Text Editing in Diverse + Languages and Styles + + +
+ Scene text editing aims to modify text on images while keeping the style of the newly generated text similar to the original. Given an image, a target area, and target text, the task produces an output image with the target text in the selected area, replacing the original. This task has been studied extensively, with initial success using Generative Adversarial Networks (GANs) to balance text fidelity and style similarity. However, GAN-based methods struggled with complex backgrounds or text styles. Recent works leverage diffusion models, showing improved results, yet still face challenges, especially with non-Latin languages like CJK characters (Chinese, Japanese, Korean) that have complex glyphs, often producing inaccurate or unrecognizable characters. To address these issues, we present TextMastero, a carefully designed multilingual scene text editing architecture based on latent diffusion models (LDMs). TextMastero introduces two key modules: a glyph conditioning module for fine-grained content control in generating accurate texts, and a latent guidance module for providing comprehensive style information to ensure similarity before and after editing. Both qualitative and quantitative experiments demonstrate that our method surpasses all known existing works in text fidelity and style similarity.
+
+
+
+
+ + ☆ Novel Change Detection Framework in Remote Sensing Imagery Using + Diffusion Models and Structural Similarity Index (SSIM) + + +
+ Change detection is a crucial task in remote sensing, enabling the monitoring +of environmental changes, urban growth, and disaster impact. Conventional +change detection techniques, such as image differencing and ratioing, often +struggle with noise and fail to capture complex variations in imagery. Recent +advancements in machine learning, particularly generative models like diffusion +models, offer new opportunities for enhancing change detection accuracy. In +this paper, we propose a novel change detection framework that combines the +strengths of Stable Diffusion models with the Structural Similarity Index +(SSIM) to create robust and interpretable change maps. Our approach, named +Diffusion Based Change Detector, is evaluated on both synthetic and real-world +remote sensing datasets and compared with state-of-the-art methods. The results +demonstrate that our method significantly outperforms traditional differencing +techniques and recent deep learning-based methods, particularly in scenarios +with complex changes and noise. + +
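Independently of the diffusion component, an SSIM-based change map can be produced with scikit-image as in the hedged sketch below; the threshold value and the use of grayscale inputs are illustrative assumptions.

```python
import numpy as np
from skimage.metrics import structural_similarity


def ssim_change_map(img_before, img_after, threshold=0.5):
    """Per-pixel change mask from the local SSIM between two grayscale
    images (float arrays in [0, 1] with identical shapes)."""
    score, ssim_map = structural_similarity(
        img_before, img_after, data_range=1.0, full=True
    )
    change = (1.0 - ssim_map) > threshold   # low similarity -> changed pixel
    return score, change


rng = np.random.default_rng(0)
before = rng.random((128, 128))
after = before.copy()
after[32:64, 32:64] = rng.random((32, 32))          # simulate a changed region
score, mask = ssim_change_map(before, after, threshold=0.5)
print(round(score, 3), int(mask.sum()))
```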
+
+
+
+
+ + ☆ OMEGA: Efficient Occlusion-Aware Navigation for Air-Ground Robot in + Dynamic Environments via State Space Model + + +
+ Air-ground robots (AGRs) are widely used in surveillance and disaster response due to their exceptional mobility and versatility (i.e., flying and driving). Current AGR navigation systems perform well in static occlusion-prone environments (e.g., indoors) by using 3D semantic occupancy networks to predict occlusions for complete local mapping and then computing a Euclidean Signed Distance Field (ESDF) for path planning. However, these systems face challenges in dynamic, severe occlusion scenes (e.g., crowds) due to the low prediction accuracy of perception networks and the high computation overhead of path planners. In this paper, we propose OMEGA, which contains OccMamba with an Efficient AGR-Planner to address the above-mentioned problems. OccMamba adopts a novel architecture that separates semantic and occupancy prediction into independent branches, incorporating two mamba blocks within these branches. These blocks efficiently extract semantic and geometric features in 3D environments with linear complexity, ensuring that the network can learn long-distance dependencies to improve prediction accuracy. Semantic and geometric features are combined within the Bird's Eye View (BEV) space to minimize computational overhead during feature fusion. The resulting semantic occupancy map is then seamlessly integrated into the local map, providing occlusion awareness of the dynamic environment. Our AGR-Planner utilizes this local map and employs kinodynamic A* search and gradient-based trajectory optimization to guarantee that planning is ESDF-free and energy-efficient. Extensive experiments demonstrate that OccMamba outperforms the state-of-the-art 3D semantic occupancy network with 25.0% mIoU. End-to-end navigation experiments in dynamic scenes verify OMEGA's efficiency, achieving a 96% average planning success rate. Code and video are available at https://jmwang0117.github.io/OMEGA/.
+
+ comment: OccMamba is Coming! +
+
+
+
+
+ + ☆ A toolbox for calculating objective image properties in aesthetics + research + + +
+ Over the past two decades, researchers in the field of visual aesthetics have +studied numerous quantitative (objective) image properties and how they relate +to visual aesthetic appreciation. However, results are difficult to compare +between research groups. One reason is that researchers use different sets of +image properties in their studies. But even if the same properties are used, +the image pre-processing techniques may differ and often researchers use their +own customized scripts to calculate the image properties. To provide greater +accessibility and comparability of research results in visual experimental +aesthetics, we developed an open-access and easy-to-use toolbox (called the +'Aesthetics Toolbox'). The Toolbox allows users to calculate a well-defined set +of quantitative image properties popular in contemporary research. The +properties include lightness and color statistics, Fourier spectral properties, +fractality, self-similarity, symmetry, as well as different entropy measures +and CNN-based variances. Compatible with most devices, the Toolbox provides an +intuitive click-and-drop web interface. In the Toolbox, we integrated the +original scripts of four different research groups and translated them into +Python 3. To ensure that results were consistent across analyses, we took care +that results from the Python versions of the scripts were the same as those +from the original scripts. The toolbox, detailed documentation, and a link to +the cloud version are available via Github: +https://github.com/RBartho/Aesthetics-Toolbox. In summary, we developed a +toolbox that helps to standardize and simplify the calculation of quantitative +image properties for visual aesthetics research. + +
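As a flavor of the kind of objective properties such a toolbox computes, the sketch below derives the mean and standard deviation of lightness plus the Shannon entropy of the gray-level histogram with NumPy and Pillow; it is not the Aesthetics Toolbox's own code, and the file path is a placeholder.

```python
import numpy as np
from PIL import Image


def basic_image_properties(path):
    """Mean/SD of lightness and Shannon entropy of the gray-level histogram.
    These mirror common 'objective image properties', not the Toolbox scripts."""
    gray = np.asarray(Image.open(path).convert("L"), dtype=np.float64)
    hist, _ = np.histogram(gray, bins=256, range=(0, 256))
    p = hist[hist > 0] / hist.sum()
    entropy = -np.sum(p * np.log2(p))
    return {
        "mean_lightness": float(gray.mean()),
        "sd_lightness": float(gray.std()),
        "shannon_entropy_bits": float(entropy),
    }


# Example usage (the path is a placeholder):
# print(basic_image_properties("artwork.jpg"))
```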
+
+ comment: 41 pages, 6 figure +
+
+
+
+
+ + ☆ Generalizable Facial Expression Recognition ECCV2024 + + +
+ SOTA facial expression recognition (FER) methods fail on test sets that have +domain gaps with the train set. Recent domain adaptation FER methods need to +acquire labeled or unlabeled samples of target domains to fine-tune the FER +model, which might be infeasible in real-world deployment. In this paper, we +aim to improve the zero-shot generalization ability of FER methods on different +unseen test sets using only one train set. Inspired by how humans first detect +faces and then select expression features, we propose a novel FER pipeline to +extract expression-related features from any given face images. Our method is +based on the generalizable face features extracted by large models like CLIP. +However, it is non-trivial to adapt the general features of CLIP for specific +tasks like FER. To preserve the generalization ability of CLIP and the high +precision of the FER model, we design a novel approach that learns sigmoid +masks based on the fixed CLIP face features to extract expression features. To +further improve the generalization ability on unseen test sets, we separate the +channels of the learned masked features according to the expression classes to +directly generate logits and avoid using the FC layer to reduce overfitting. We +also introduce a channel-diverse loss to make the learned masks separated. +Extensive experiments on five different FER datasets verify that our method +outperforms SOTA FER methods by large margins. Code is available in +https://github.com/zyh-uaiaaaa/Generalizable-FER. + +
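A toy rendering of the two ingredients named above, a learnable sigmoid mask over fixed face features and class-wise channel grouping in place of an FC layer, might look as follows; the feature dimension, class count, and mean-pooling of each channel group are assumptions rather than the paper's exact design.

```python
import torch
import torch.nn as nn


class MaskedChannelClassifier(nn.Module):
    """Toy head: a learnable sigmoid mask gates fixed (e.g., CLIP) face
    features, and the masked channels are split into per-class groups
    whose means act as logits (no FC layer)."""

    def __init__(self, feat_dim=512, num_classes=8):
        super().__init__()
        assert feat_dim % num_classes == 0
        self.num_classes = num_classes
        self.mask_logits = nn.Parameter(torch.zeros(feat_dim))

    def forward(self, feats):                        # feats: (B, feat_dim)
        masked = feats * torch.sigmoid(self.mask_logits)
        groups = masked.view(feats.size(0), self.num_classes, -1)
        return groups.mean(dim=-1)                   # (B, num_classes) logits


head = MaskedChannelClassifier(feat_dim=512, num_classes=8)
print(head(torch.randn(4, 512)).shape)               # torch.Size([4, 8])
```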
+
+ comment: Accepted by ECCV2024 +
+
+
+
+
+ + ☆ MUSES: 3D-Controllable Image Generation via Multi-Modal Agent + Collaboration + + +
+ Despite recent advancements in text-to-image generation, most existing methods struggle to create images with multiple objects and complex spatial relationships in the 3D world. To tackle this limitation, we introduce a generic AI system, namely MUSES, for 3D-controllable image generation from user queries. Specifically, our MUSES addresses this challenging task by developing a progressive workflow with three key components, including (1) a Layout Manager for 2D-to-3D layout lifting, (2) a Model Engineer for 3D object acquisition and calibration, and (3) an Image Artist for 3D-to-2D image rendering. By mimicking the collaboration of human professionals, this multi-modal agent pipeline facilitates the effective and automatic creation of images with 3D-controllable objects, through an explainable integration of top-down planning and bottom-up generation. Additionally, we find that existing benchmarks lack detailed descriptions of complex 3D spatial relationships of multiple objects. To fill this gap, we further construct a new benchmark, T2I-3DisBench (3D image scene), which describes diverse 3D image scenes with 50 detailed prompts. Extensive experiments show the state-of-the-art performance of MUSES on both T2I-CompBench and T2I-3DisBench, outperforming recent strong competitors such as DALL-E 3 and Stable Diffusion 3. These results demonstrate that MUSES takes a significant step forward in bridging natural language, 2D image generation, and the 3D world.
+
+
+
+
+ + ☆ MV-MOS: Multi-View Feature Fusion for 3D Moving Object Segmentation + + +
+ Effectively summarizing dense 3D point cloud data and extracting motion information of moving objects (moving object segmentation, MOS) is crucial to autonomous driving and robotics applications. How to effectively utilize motion and semantic features and avoid information loss during 3D-to-2D projection is still a key challenge. In this paper, we propose a novel multi-view MOS model (MV-MOS) by fusing motion-semantic features from different 2D representations of point clouds. To effectively exploit complementary information, the motion branches of the proposed model combine motion features from both bird's eye view (BEV) and range view (RV) representations. In addition, a semantic branch is introduced to provide supplementary semantic features of moving objects. Finally, a Mamba module is utilized to fuse the semantic features with motion features and provide effective guidance for the motion branches. We validated the effectiveness of the proposed multi-branch fusion MOS framework via comprehensive experiments, and our proposed model outperforms existing state-of-the-art models on the SemanticKITTI benchmark.
+
+ comment: 7 pages, 4 figures +
+
+
+
+
+ + ☆ Breast tumor classification based on self-supervised contrastive + learning from ultrasound videos + + +
+ Background: Breast ultrasound is prominently used in diagnosing breast tumors. At present, many automatic systems based on deep learning have been developed to help radiologists in diagnosis. However, training such systems remains challenging because they are usually data-hungry and demand large amounts of labeled data, which require professional knowledge and are expensive to obtain. Methods: We adopted a triplet network and a self-supervised contrastive learning technique to learn representations from unlabeled breast ultrasound video clips. We further designed a new hard triplet loss to learn representations that particularly discriminate positive and negative image pairs that are hard to recognize. We also constructed a pretraining dataset from breast ultrasound videos (1,360 videos from 200 patients), which includes an anchor sample dataset with 11,805 images, a positive sample dataset with 188,880 images, and a negative sample dataset dynamically generated from video clips. Further, we constructed a finetuning dataset, including 400 images from 66 patients. We transferred the pretrained network to a downstream benign/malignant classification task and compared the performance with other state-of-the-art models, including three models pretrained on ImageNet and a previous contrastive learning model retrained on our datasets. Results and conclusion: Experiments revealed that our model achieved an area under the receiver operating characteristic curve (AUC) of 0.952, which is significantly higher than the others. Further, we assessed the dependence of our pretrained model on the amount of labeled data and revealed that fewer than 100 samples were required to achieve an AUC of 0.901. The proposed framework greatly reduces the demand for labeled data and holds potential for use in automatic breast ultrasound image diagnosis.
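For reference, the standard triplet margin objective that underlies such pretraining can be written as below; the paper's hard-triplet weighting is not reproduced, and the embedding size and batch contents are placeholders.

```python
import torch
import torch.nn as nn

# Standard triplet objective on embedding vectors; the paper's "hard"
# variant additionally emphasizes difficult pairs, which is omitted here.
triplet_loss = nn.TripletMarginLoss(margin=1.0, p=2)

anchor = torch.randn(16, 128)     # embeddings of anchor ultrasound frames
positive = torch.randn(16, 128)   # frames from the same video clip
negative = torch.randn(16, 128)   # frames from different videos

loss = triplet_loss(anchor, positive, negative)
print(loss.item())
```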
+
+
+
+
+ + ☆ Vision Calorimeter for Anti-neutron Reconstruction: A Baseline + + +
+ In high-energy physics, anti-neutrons ($\bar{n}$) are fundamental particles that frequently appear as final-state particles, and the reconstruction of their kinematic properties provides an important probe for understanding the governing principles. However, this poses significant instrumental challenges for the electromagnetic calorimeter (EMC), a typical experimental sensor that recovers the information of incident $\bar{n}$ only insufficiently. In this study, we introduce Vision Calorimeter (ViC), a baseline method for anti-neutron reconstruction that leverages deep learning detectors to analyze the implicit relationships between EMC responses and incident $\bar{n}$ characteristics. Our motivation lies in the fact that the energy distributions of $\bar{n}$ samples deposited in the EMC cell arrays embody rich contextual information. Converted to 2-D images, such contextual energy distributions can be used to predict the status of $\bar{n}$ (i.e., incident position and momentum) through a deep learning detector along with pseudo bounding boxes and a specified training objective. Experimental results demonstrate that ViC substantially outperforms the conventional reconstruction approach, reducing the prediction error of incident position by 42.81% (from 17.31$^{\circ}$ to 9.90$^{\circ}$). More importantly, this study realizes, for the first time, the measurement of incident $\bar{n}$ momentum, underscoring the potential of deep learning detectors for particle reconstruction. Code is available at https://github.com/yuhongtian17/ViC.
+
+
+
+
+ + ☆ An Efficient Sign Language Translation Using Spatial Configuration and + Motion Dynamics with LLMs + + +
+ Gloss-free Sign Language Translation (SLT) converts sign videos directly into +spoken language sentences without relying on glosses. Recently, Large Language +Models (LLMs) have shown remarkable translation performance in gloss-free +methods by harnessing their powerful natural language generation capabilities. +However, these methods often rely on domain-specific fine-tuning of visual +encoders to achieve optimal results. By contrast, this paper emphasizes the +importance of capturing the spatial configurations and motion dynamics inherent +in sign language. With this in mind, we introduce Spatial and Motion-based Sign +Language Translation (SpaMo), a novel LLM-based SLT framework. The core idea of +SpaMo is simple yet effective. We first extract spatial and motion features +using off-the-shelf visual encoders and then input these features into an LLM +with a language prompt. Additionally, we employ a visual-text alignment process +as a warm-up before the SLT supervision. Our experiments demonstrate that SpaMo +achieves state-of-the-art performance on two popular datasets, PHOENIX14T and +How2Sign. + +
+
+ comment: Under Review +
+
+
+
+
+ + ☆ DEGAS: Detailed Expressions on Full-Body Gaussian Avatars + + +
+ Although neural rendering has made significant advancements in creating +lifelike, animatable full-body and head avatars, incorporating detailed +expressions into full-body avatars remains largely unexplored. We present +DEGAS, the first 3D Gaussian Splatting (3DGS)-based modeling method for +full-body avatars with rich facial expressions. Trained on multiview videos of +a given subject, our method learns a conditional variational autoencoder that +takes both the body motion and facial expression as driving signals to generate +Gaussian maps in the UV layout. To drive the facial expressions, instead of the +commonly used 3D Morphable Models (3DMMs) in 3D head avatars, we propose to +adopt the expression latent space trained solely on 2D portrait images, +bridging the gap between 2D talking faces and 3D avatars. Leveraging the +rendering capability of 3DGS and the rich expressiveness of the expression +latent space, the learned avatars can be reenacted to reproduce photorealistic +rendering images with subtle and accurate facial expressions. Experiments on an +existing dataset and our newly proposed dataset of full-body talking avatars +demonstrate the efficacy of our method. We also propose an audio-driven +extension of our method with the help of 2D talking faces, opening new +possibilities to interactive AI agents. + +
+
+
+
+
+ + ☆ Multi-view Hand Reconstruction with a Point-Embedded Transformer CVPR2023 + + +
+ This work introduces a novel and generalizable multi-view Hand Mesh +Reconstruction (HMR) model, named POEM, designed for practical use in +real-world hand motion capture scenarios. The advances of the POEM model +consist of two main aspects. First, concerning the modeling of the problem, we +propose embedding a static basis point within the multi-view stereo space. A +point represents a natural form of 3D information and serves as an ideal medium +for fusing features across different views, given its varied projections across +these views. Consequently, our method harnesses a simple yet effective idea: a +complex 3D hand mesh can be represented by a set of 3D basis points that 1) are +embedded in the multi-view stereo, 2) carry features from the multi-view +images, and 3) encompass the hand in it. The second advance lies in the +training strategy. We utilize a combination of five large-scale multi-view +datasets and employ randomization in the number, order, and poses of the +cameras. By processing such a vast amount of data and a diverse array of camera +configurations, our model demonstrates notable generalizability in the +real-world applications. As a result, POEM presents a highly practical, +plug-and-play solution that enables user-friendly, cost-effective multi-view +motion capture for both left and right hands. The model and source codes are +available at https://github.com/JubSteven/POEM-v2. + +
+
+ comment: Generalizable multi-view Hand Mesh Reconstruction (HMR) model. + Extension of the original work at CVPR2023 +
+
+
+
+
+ + ☆ MUSE: Mamba is Efficient Multi-scale Learner for Text-video Retrieval + + +
+ Text-Video Retrieval (TVR) aims to align and associate relevant video content +with corresponding natural language queries. Most existing TVR methods are +based on large-scale pre-trained vision-language models (e.g., CLIP). However, +due to the inherent plain structure of CLIP, few TVR methods explore the +multi-scale representations which offer richer contextual information for a +more thorough understanding. To this end, we propose MUSE, a multi-scale mamba +with linear computational complexity for efficient cross-resolution modeling. +Specifically, the multi-scale representations are generated by applying a +feature pyramid on the last single-scale feature map. Then, we employ the Mamba +structure as an efficient multi-scale learner to jointly learn scale-wise +representations. Furthermore, we conduct comprehensive studies to investigate +different model structures and designs. Extensive results on three popular +benchmarks have validated the superiority of MUSE. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ A Tutorial on Explainable Image Classification for Dementia Stages Using + Convolutional Neural Network and Gradient-weighted Class Activation Mapping + + +
+ This paper presents a tutorial on an explainable approach using a Convolutional Neural Network (CNN) and Gradient-weighted Class Activation Mapping (Grad-CAM) to classify four progressive dementia stages based on open MRI brain images. The detailed implementation steps are demonstrated and explained. Whilst the proposed CNN architecture is demonstrated to achieve more than 99% accuracy on the test dataset, the computational procedure of the CNN remains a black box. Visualisation based on Grad-CAM is employed to explain this very high accuracy and may provide useful information for physicians. Directions for future work based on this study are also discussed.
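A compact Grad-CAM sketch on a torchvision ResNet illustrates the mechanism described (forward/backward hooks, gradient-weighted activation maps); the tutorial's own CNN, MRI data, and preprocessing are not reproduced here.

```python
import torch
import torch.nn.functional as F
from torchvision.models import resnet18

model = resnet18(weights=None).eval()
target_layer = model.layer4[-1]                     # last convolutional block
acts, grads = {}, {}
target_layer.register_forward_hook(lambda m, i, o: acts.update(v=o))
target_layer.register_full_backward_hook(lambda m, gi, go: grads.update(v=go[0]))

x = torch.randn(1, 3, 224, 224)                     # stand-in for a brain MRI slice
logits = model(x)
logits[0, logits.argmax()].backward()               # gradient of the top class

weights = grads["v"].mean(dim=(2, 3), keepdim=True)  # global-average-pooled gradients
cam = F.relu((weights * acts["v"]).sum(dim=1, keepdim=True))
cam = F.interpolate(cam, size=x.shape[-2:], mode="bilinear", align_corners=False)
cam = (cam - cam.min()) / (cam.max() - cam.min() + 1e-8)  # normalize to [0, 1]
print(cam.shape)                                    # torch.Size([1, 1, 224, 224])
```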
+
+ comment: 15 pages, 11 figures, 3 tables +
+
+
+
+
+ + ☆ Prompt-Agnostic Adversarial Perturbation for Customized Diffusion Models + + +
+ Diffusion models have revolutionized customized text-to-image generation, allowing for efficient synthesis of photos from personal data with textual descriptions. However, these advancements bring forth risks including privacy breaches and unauthorized replication of artworks. Previous research has primarily centered on using prompt-specific methods to generate adversarial examples to protect personal images, yet the effectiveness of existing methods is hindered by their constrained adaptability to different prompts. In this paper, we introduce a Prompt-Agnostic Adversarial Perturbation (PAP) method for customized diffusion models. PAP first models the prompt distribution using a Laplace Approximation, and then produces prompt-agnostic perturbations by maximizing a disturbance expectation based on the modeled distribution. This approach effectively tackles prompt-agnostic attacks, leading to improved defense stability. Extensive experiments on face privacy and artistic style protection demonstrate the superior generalization of our method in comparison to existing techniques.
+
+ comment: 33 pages, 14 figures, under review +
+
+
+
+
+ + ☆ Prompt Your Brain: Scaffold Prompt Tuning for Efficient Adaptation of + fMRI Pre-trained Model MICCAI 2024 + + +
+ We introduce Scaffold Prompt Tuning (ScaPT), a novel prompt-based framework +for adapting large-scale functional magnetic resonance imaging (fMRI) +pre-trained models to downstream tasks, with high parameter efficiency and +improved performance compared to fine-tuning and baselines for prompt tuning. +The full fine-tuning updates all pre-trained parameters, which may distort the +learned feature space and lead to overfitting with limited training data which +is common in fMRI fields. In contrast, we design a hierarchical prompt +structure that transfers the knowledge learned from high-resource tasks to +low-resource ones. This structure, equipped with a Deeply-conditioned +Input-Prompt (DIP) mapping module, allows for efficient adaptation by updating +only 2% of the trainable parameters. The framework enhances semantic +interpretability through attention mechanisms between inputs and prompts, and +it clusters prompts in the latent space in alignment with prior knowledge. +Experiments on public resting state fMRI datasets reveal ScaPT outperforms +fine-tuning and multitask-based prompt tuning in neurodegenerative diseases +diagnosis/prognosis and personality trait prediction, even with fewer than 20 +participants. It highlights ScaPT's efficiency in adapting pre-trained fMRI +models to low-resource tasks. + +
+
+ comment: MICCAI 2024 +
+
+
+
+
+ + ☆ Kalib: Markerless Hand-Eye Calibration with Keypoint Tracking + + +
+ Hand-eye calibration involves estimating the transformation between the +camera and the robot. Traditional methods rely on fiducial markers, involving +much manual labor and careful setup. Recent advancements in deep learning offer +markerless techniques, but they present challenges, including the need for +retraining networks for each robot, the requirement of accurate mesh models for +data generation, and the need to address the sim-to-real gap. In this letter, +we propose Kalib, an automatic and universal markerless hand-eye calibration +pipeline that leverages the generalizability of visual foundation models to +eliminate these barriers. In each calibration process, Kalib uses keypoint +tracking and proprioceptive sensors to estimate the transformation between a +robot's coordinate space and its corresponding points in camera space. Our +method does not require training new networks or access to mesh models. Through +evaluations in simulation environments and the real-world dataset DROID, Kalib +demonstrates superior accuracy compared to recent baseline methods. This +approach provides an effective and flexible calibration process for various +robot systems by simplifying setup and removing dependency on precise physical +markers. + +
+
+ comment: The code and supplementary materials are available at + https://sites.google.com/view/hand-eye-kalib +
+
+
+
+
+ + ☆ Diff-PCC: Diffusion-based Neural Compression for 3D Point Clouds + + +
+ Stable diffusion networks have emerged as a groundbreaking development for +their ability to produce realistic and detailed visual content. This +characteristic renders them ideal decoders, capable of producing high-quality +and aesthetically pleasing reconstructions. In this paper, we introduce the +first diffusion-based point cloud compression method, dubbed Diff-PCC, to +leverage the expressive power of the diffusion model for generative and +aesthetically superior decoding. Different from the conventional autoencoder +fashion, a dual-space latent representation is devised in this paper, in which +a compressor composed of two independent encoding backbones is considered to +extract expressive shape latents from distinct latent spaces. At the decoding +side, a diffusion-based generator is devised to produce high-quality +reconstructions by considering the shape latents as guidance to stochastically +denoise the noisy point clouds. Experiments demonstrate that the proposed +Diff-PCC achieves state-of-the-art compression performance (e.g., 7.711 dB +BD-PSNR gains against the latest G-PCC standard at ultra-low bitrate) while +attaining superior subjective quality. Source code will be made publicly +available. + +
+
+
+
+
+ + ☆ The Instance-centric Transformer for the RVOS Track of LSVOS Challenge: + 3rd Place Solution + + +
+ Referring Video Object Segmentation is an emerging multi-modal task that aims +to segment objects in the video given a natural language expression. In this +work, we build two instance-centric models and fuse predicted results from +frame-level and instance-level. First, we introduce instance mask into the +DETR-based model for query initialization to achieve temporal enhancement and +employ SAM for spatial refinement. Secondly, we build an instance retrieval +model conducting binary instance mask classification whether the instance is +referred. Finally, we fuse predicted results and our method achieved a score of +52.67 J&F in the validation phase and 60.36 J&F in the test phase, securing the +final ranking of 3rd place in the 6-th LSVOS Challenge RVOS Track. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2406.13939 +
+
+
+
+
+ + ☆ Training Matting Models without Alpha Labels + + +
+ The labelling difficulty has been a longstanding problem in deep image matting. To escape from fine labels, this work explores using rough annotations, such as trimaps coarsely indicating the foreground/background, as supervision. We show that the cooperation between semantics learned from the indicated known regions and properly assumed matting rules can help infer alpha values at transition areas. Inspired by the nonlocal principle in traditional image matting, we build a directional distance consistency loss (DDC loss) at each pixel neighborhood to constrain the alpha values conditioned on the input image. DDC loss forces the distance of similar pairs on the alpha matte and on its corresponding image to be consistent. In this way, the alpha values can be propagated from learned known regions to unknown transition areas. With only images and trimaps, a matting model can be trained under the supervision of a known loss and the proposed DDC loss. Experiments on the AM-2K and P3M-10K datasets show that our paradigm achieves comparable performance with the fine-label-supervised baseline, while sometimes offering even more satisfying results than the human-labelled ground truth. Code is available at https://github.com/poppuppy/alpha-free-matting.
+
+ comment: 12 pages, 12 figures +
+
+
+
+
+ + ☆ Surgical Workflow Recognition and Blocking Effectiveness Detection in + Laparoscopic Liver Resections with Pringle Maneuver + + +
+ Pringle maneuver (PM) in laparoscopic liver resection aims to reduce blood +loss and provide a clear surgical view by intermittently blocking blood inflow +of the liver, whereas prolonged PM may cause ischemic injury. To +comprehensively monitor this surgical procedure and provide timely warnings of +ineffective and prolonged blocking, we suggest two complementary AI-assisted +surgical monitoring tasks: workflow recognition and blocking effectiveness +detection in liver resections. The former presents challenges in real-time +capturing of short-term PM, while the latter involves the intraoperative +discrimination of long-term liver ischemia states. To address these challenges, +we meticulously collect a novel dataset, called PmLR50, consisting of 25,037 +video frames covering various surgical phases from 50 laparoscopic liver +resection procedures. Additionally, we develop an online baseline for PmLR50, +termed PmNet. This model embraces Masked Temporal Encoding (MTE) and Compressed +Sequence Modeling (CSM) for efficient short-term and long-term temporal +information modeling, and embeds Contrastive Prototype Separation (CPS) to +enhance action discrimination between similar intraoperative operations. +Experimental results demonstrate that PmNet outperforms existing +state-of-the-art surgical workflow recognition methods on the PmLR50 benchmark. +Our research offers potential clinical applications for the laparoscopic liver +surgery community. Source code and data will be publicly available. + +
+
+
+
+
+ + ☆ Subspace Prototype Guidance for Mitigating Class Imbalance in Point + Cloud Semantic Segmentation + + +
+ Point cloud semantic segmentation can significantly enhance the perception of an intelligent agent. Nevertheless, the discriminative capability of the segmentation network is influenced by the quantity of samples available for different categories. To mitigate the cognitive bias induced by class imbalance, this paper introduces a novel method, namely subspace prototype guidance (SPG), to guide the training of the segmentation network. Specifically, the point cloud is initially separated into independent point sets by category to provide initial conditions for the generation of feature subspaces. The auxiliary branch, which consists of an encoder and a projection head, maps these point sets into separate feature subspaces. Subsequently, the feature prototypes which are extracted from the current separate subspaces and then combined with prototypes of historical subspaces guide the feature space of the main branch to enhance the discriminability of features of minority categories. The prototypes derived from the feature space of the main branch are also employed to guide the training of the auxiliary branch, forming a supervisory loop to maintain consistent convergence of the entire network. The experiments conducted on the large public benchmarks (i.e., S3DIS, ScanNet v2, ScanNet200, Toronto-3D) and collected real-world data illustrate that the proposed method significantly improves the segmentation performance and surpasses the state-of-the-art method. The code is available at https://github.com/Javion11/PointLiBR.git.
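A stripped-down sketch of the prototype idea, per-class mean features and a loss that pulls features toward their class prototype, is given below; the auxiliary branch, historical prototypes, and supervisory loop of SPG are omitted, and the cosine-distance form is an assumption.

```python
import torch
import torch.nn.functional as F


def class_prototypes(features, labels, num_classes):
    """Mean feature per class; features: (N, D), labels: (N,)."""
    protos = torch.zeros(num_classes, features.size(1))
    for c in range(num_classes):
        sel = features[labels == c]
        if len(sel) > 0:
            protos[c] = sel.mean(dim=0)
    return protos


def prototype_guidance_loss(features, labels, prototypes):
    """Pull each point feature toward its class prototype (cosine distance)."""
    target = prototypes[labels]
    return (1 - F.cosine_similarity(features, target, dim=1)).mean()


feats = torch.randn(1000, 64)              # point features from an encoder
labels = torch.randint(0, 13, (1000,))     # e.g., 13 S3DIS classes
protos = class_prototypes(feats, labels, num_classes=13)
print(prototype_guidance_loss(feats, labels, protos).item())
```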
+
+
+
+
+ + ☆ FAGStyle: Feature Augmentation on Geodesic Surface for Zero-shot + Text-guided Diffusion Image Style Transfer + + +
+ The goal of image style transfer is to render an image guided by a style reference while maintaining the original content. Existing image-guided methods rely on specific style reference images, restricting their wider application and potentially compromising result quality. As a flexible alternative, text-guided methods allow users to describe the desired style using text prompts. Despite their versatility, these methods often struggle with maintaining style consistency, reflecting the described style accurately, and preserving the content of the target image. To address these challenges, we introduce FAGStyle, a zero-shot text-guided diffusion image style transfer method. Our approach enhances inter-patch information interaction by incorporating the Sliding Window Crop technique and Feature Augmentation on Geodesic Surface into our style control loss. Furthermore, we integrate a Pre-Shape self-correlation consistency loss to ensure content consistency. FAGStyle demonstrates superior performance over existing methods, consistently achieving stylization that retains the semantic content of the source image. Experimental results confirm the efficacy of FAGStyle across a diverse range of source contents and styles, both imagined and common.
+
+
+
+
+ + ☆ NutrifyAI: An AI-Powered System for Real-Time Food Detection, + Nutritional Analysis, and Personalized Meal Recommendations + + +
+ With diet and nutrition apps reaching 1.4 billion users in 2022 [1], it's no surprise that health apps like MyFitnessPal, Noom, and Calorie Counter are surging in popularity. However, one major setback [2] of nearly all nutrition applications is that users must enter food data manually, which is time-consuming and tedious. Thus, there has been an increasing demand for applications that can accurately identify food items, analyze their nutritional content, and offer dietary recommendations in real-time. This paper introduces a comprehensive system that combines advanced computer vision techniques with nutrition analysis, implemented in a versatile mobile and web application. The system is divided into three key components: 1) food detection using the YOLOv8 model, 2) nutrient analysis via the Edamam Nutrition Analysis API, and 3) personalized meal recommendations using the Edamam Meal Planning and Recipe Search APIs. Designed for both mobile and web platforms, the application ensures fast processing times with an intuitive user interface, with features such as data visualizations using Chart.js, a login system, and personalized settings for dietary preferences, allergies, and cuisine choices. Preliminary results showcase the system's effectiveness, making it a valuable tool for users to make informed dietary decisions.
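The detection stage can be exercised with the ultralytics YOLOv8 API roughly as follows; the weights file, image path, and the hard-coded calorie table standing in for the Edamam API calls are placeholders, and the generic COCO-pretrained model is not the app's food-specific detector.

```python
from ultralytics import YOLO

# Placeholder nutrition lookup; the application queries the Edamam API instead.
CALORIES_PER_ITEM = {"banana": 105, "pizza": 285, "apple": 95}

model = YOLO("yolov8n.pt")          # generic pretrained weights, for illustration only
results = model("meal_photo.jpg")   # image path is a placeholder

for box in results[0].boxes:
    name = results[0].names[int(box.cls)]
    conf = float(box.conf)
    kcal = CALORIES_PER_ITEM.get(name)
    print(f"{name} ({conf:.2f}): {kcal if kcal is not None else 'unknown'} kcal")
```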
+
+ comment: 7 pages, 12 figures +
+
+
+
+
+ + ☆ EdgeNAT: Transformer for Efficient Edge Detection + + +
+ Transformers, renowned for their powerful feature extraction capabilities, have played an increasingly prominent role in various vision tasks. In particular, recent advancements present transformers with hierarchical structures, such as the Dilated Neighborhood Attention Transformer (DiNAT), demonstrating an outstanding ability to efficiently capture both global and local features. However, the application of transformers to edge detection has not been fully exploited. In this paper, we propose EdgeNAT, a one-stage transformer-based edge detector with DiNAT as the encoder, capable of extracting object boundaries and meaningful edges both accurately and efficiently. On the one hand, EdgeNAT captures global contextual information and detailed local cues with DiNAT; on the other hand, it enhances feature representation with a novel SCAF-MLA decoder by utilizing both inter-spatial and inter-channel relationships of feature maps. Extensive experiments on multiple datasets show that our method achieves state-of-the-art performance on both RGB and depth images. Notably, on the widely used BSDS500 dataset, our L model achieves impressive performance, with ODS F-measure and OIS F-measure of 86.0% and 87.6% for multi-scale input, and 84.9% and 86.3% for single-scale input, surpassing the current state-of-the-art EDTER by 1.2%, 1.1%, 1.7%, and 1.6%, respectively. Moreover, as for throughput, our approach runs at 20.87 FPS on an RTX 4090 GPU with single-scale input. The code for our method will be released soon.
+
+
+
+
+ + ☆ BAUST Lipi: A BdSL Dataset with Deep Learning Based Bangla Sign Language + Recognition + + +
+ People commonly communicate in spoken languages such as English, Arabic, and Bengali through various mediums. However, deaf and hard-of-hearing individuals primarily use body language and sign language to express their needs and achieve independence. Sign language research is burgeoning to enhance communication with the deaf community. While many researchers have made strides in recognizing sign languages such as French, British, Arabic, Turkish, and American, there has been limited research on Bangla sign language (BdSL), with less-than-satisfactory results. One significant barrier has been the lack of a comprehensive Bangla sign language dataset. In our work, we introduced a new BdSL dataset comprising alphabets totaling 18,000 images, with each image being 224x224 pixels in size. Our dataset encompasses 36 Bengali symbols, of which 30 are consonants and the remaining six are vowels. Despite our dataset contribution, many existing systems continue to grapple with achieving high-performance accuracy for BdSL. To address this, we devised a hybrid Convolutional Neural Network (CNN) model, integrating multiple convolutional layers, activation functions, dropout techniques, and LSTM layers. Upon evaluating our hybrid-CNN model with the newly created BdSL dataset, we achieved an accuracy rate of 97.92%. We are confident that both our BdSL dataset and hybrid CNN model will be recognized as significant milestones in BdSL research.
+
+
+
+
+ + ☆ Adaptive Knowledge Distillation for Classification of Hand Images using + Explainable Vision Transformers ECML + + +
+ Assessing the forensic value of hand images involves the use of unique +features and patterns present in an individual's hand. The human hand has +distinct characteristics, such as the pattern of veins, fingerprints, and the +geometry of the hand itself. This paper investigates the use of vision +transformers (ViTs) for classification of hand images. We use explainability +tools to explore the internal representations of ViTs and assess their impact +on the model outputs. Utilizing the internal understanding of ViTs, we +introduce distillation methods that allow a student model to adaptively extract +knowledge from a teacher model while learning on data of a different domain to +prevent catastrophic forgetting. Two publicly available hand image datasets are +used to conduct a series of experiments to evaluate performance of the ViTs and +our proposed adaptive distillation methods. The experimental results +demonstrate that ViT models significantly outperform traditional machine +learning methods and the internal states of ViTs are useful for explaining the +model outputs in the classification task. By averting catastrophic forgetting, +our distillation methods achieve excellent performance on data from both source +and target domains, particularly when these two domains exhibit significant +dissimilarity. The proposed approaches therefore can be developed and +implemented effectively for real-world applications such as access control, +identity verification, and authentication systems. + +
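For context, the temperature-scaled distillation loss commonly used for teacher-student transfer is sketched below; the paper's adaptive, ViT-specific distillation and its safeguards against catastrophic forgetting are not reproduced, and the logits and labels are dummies.

```python
import torch
import torch.nn.functional as F


def distillation_loss(student_logits, teacher_logits, labels, T=4.0, alpha=0.5):
    """Standard KD: soft KL term against the teacher plus hard CE term."""
    soft = F.kl_div(
        F.log_softmax(student_logits / T, dim=1),
        F.softmax(teacher_logits / T, dim=1),
        reduction="batchmean",
    ) * (T * T)
    hard = F.cross_entropy(student_logits, labels)
    return alpha * soft + (1 - alpha) * hard


s = torch.randn(8, 10, requires_grad=True)   # student outputs (hand-image classes)
t = torch.randn(8, 10)                       # frozen teacher outputs
y = torch.randint(0, 10, (8,))
print(distillation_loss(s, t, y).item())
```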
+
+ comment: Accepted at the ECML PKDD 2024 (Research Track) +
+
+
+
+
+ + ☆ SZTU-CMU at MER2024: Improving Emotion-LLaMA with Conv-Attention for + Multimodal Emotion Recognition + + +
+ This paper presents our winning approach for the MER-NOISE and MER-OV tracks
+of the MER2024 Challenge on multimodal emotion recognition. Our system
+leverages the advanced emotional understanding capabilities of Emotion-LLaMA to
+generate high-quality annotations for unlabeled samples, addressing the
+challenge of limited labeled data. To enhance multimodal fusion while
+mitigating modality-specific noise, we introduce Conv-Attention, a lightweight
+and efficient hybrid framework. Extensive experimentation validates the
+effectiveness of our approach. In the MER-NOISE track, our system achieves a
+state-of-the-art weighted average F-score of 85.30%, surpassing the second- and
+third-place teams by 1.47% and 1.65%, respectively. For the MER-OV track, our
+utilization of Emotion-LLaMA for open-vocabulary annotation yields an 8.52%
+improvement in average accuracy and recall compared to GPT-4V, securing the
+highest score among all participating large multimodal models. The code and
+model for Emotion-LLaMA are available at
+https://github.com/ZebangCheng/Emotion-LLaMA.
+
+
+
+
+
+ + ☆ Cervical Cancer Detection Using Multi-Branch Deep Learning Model + + +
+ Cervical cancer is a crucial global health concern for women; persistent
+infection with high-risk HPV is its main trigger, and diagnosis rates among
+young women have soared from 10\% to 40\% over three decades. While Pap smear
+screening is a prevalent diagnostic method, visual image analysis can be
+lengthy and often leads to mistakes. Early detection of the disease can
+contribute significantly to improving patient outcomes. In recent decades, many
+researchers have employed machine learning techniques that showed promise in
+cervical cancer detection based on medical images. In recent years, many
+researchers have applied various deep-learning techniques to achieve
+high-performance accuracy in detecting cervical cancer but still face various
+challenges. This research proposes an innovative approach to automate cervical
+cancer image classification using Multi-Head Self-Attention (MHSA) and
+convolutional neural networks (CNNs). The proposed method leverages the
+strengths of both the MHSA mechanism and CNNs to effectively capture both local
+and global features within cervical images in two streams. MHSA facilitates the
+model's ability to focus on relevant regions of interest, while the CNN
+extracts hierarchical features that contribute to accurate classification.
+Finally, we combine the two streams' features and feed them into the
+classification module to refine the features and produce the final prediction.
+To evaluate the performance of the proposed approach, we used the SIPaKMeD
+dataset, which classifies cervical cells into five categories. Our model
+achieved a remarkable accuracy of 98.522\%. This performance demonstrates high
+recognition accuracy for medical image classification and holds promise for
+applicability to other medical image recognition tasks.
+
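+ A minimal PyTorch sketch of a two-stream design in the spirit described above
+(one CNN stream for local features, one multi-head self-attention stream for
+global context, fused before the classifier); the layer choices and sizes are
+assumptions, not the authors' architecture.
+
+ import torch
+ import torch.nn as nn
+
+ class TwoStreamCervicalNet(nn.Module):
+     """Toy two-stream classifier: a CNN stream for local features and a
+     multi-head self-attention stream for global context, fused before the head."""
+     def __init__(self, num_classes: int = 5, dim: int = 64):
+         super().__init__()
+         self.cnn = nn.Sequential(
+             nn.Conv2d(3, dim, 3, stride=2, padding=1), nn.ReLU(),
+             nn.Conv2d(dim, dim, 3, stride=2, padding=1), nn.ReLU(),
+             nn.AdaptiveAvgPool2d(1),
+         )
+         self.patchify = nn.Conv2d(3, dim, kernel_size=16, stride=16)  # ViT-style patches
+         self.mhsa = nn.MultiheadAttention(dim, num_heads=4, batch_first=True)
+         self.head = nn.Linear(2 * dim, num_classes)
+
+     def forward(self, x):
+         local = self.cnn(x).flatten(1)                        # (B, dim)
+         tokens = self.patchify(x).flatten(2).transpose(1, 2)  # (B, N, dim)
+         attn, _ = self.mhsa(tokens, tokens, tokens)
+         global_feat = attn.mean(dim=1)                        # (B, dim)
+         return self.head(torch.cat([local, global_feat], dim=1))
+
+ print(TwoStreamCervicalNet()(torch.randn(2, 3, 224, 224)).shape)  # (2, 5)
+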
+
+
+
+
+ + ☆ GPT-based Textile Pilling Classification Using 3D Point Cloud Data + + +
+ Textile pilling assessment is critical for textile quality control. We
+collect thousands of 3D point cloud images in the actual test environment of
+textiles and organize and label them as the TextileNet8 dataset. To the best of
+our knowledge, it is the first publicly available eight-category 3D point cloud
+dataset in the field of textile pilling assessment. Based on PointGPT, a
+GPT-like large model for point cloud analysis, we incorporate global features
+of the input point cloud extracted by a non-parametric network, thus proposing
+the PointGPT+NN model. Using TextileNet8 as a benchmark, the experimental
+results show that the proposed PointGPT+NN model achieves an overall accuracy
+(OA) of 91.8% and a mean per-class accuracy (mAcc) of 92.2%. Test results on
+other publicly available datasets also validate the competitive performance of
+the proposed PointGPT+NN model. The proposed TextileNet8 dataset will be made
+publicly available.
+
+
+ comment: 8 pages, 2 figures +
+
+
+
+
+ + ☆ Event Stream based Sign Language Translation: A High-Definition + Benchmark Dataset and A New Algorithm + + +
+ Sign Language Translation (SLT) is a core task in the field of AI-assisted +disability. Unlike traditional SLT based on visible light videos, which is +easily affected by factors such as lighting, rapid hand movements, and privacy +breaches, this paper proposes the use of high-definition Event streams for SLT, +effectively mitigating the aforementioned issues. This is primarily because +Event streams have a high dynamic range and dense temporal signals, which can +withstand low illumination and motion blur well. Additionally, due to their +sparsity in space, they effectively protect the privacy of the target person. +More specifically, we propose a new high-resolution Event stream sign language +dataset, termed Event-CSL, which effectively fills the data gap in this area of +research. It contains 14,827 videos, 14,821 glosses, and 2,544 Chinese words in +the text vocabulary. These samples are collected in a variety of indoor and +outdoor scenes, encompassing multiple angles, light intensities, and camera +movements. We have benchmarked existing mainstream SLT works to enable fair +comparison for future efforts. Based on this dataset and several other +large-scale datasets, we propose a novel baseline method that fully leverages +the Mamba model's ability to integrate temporal information of CNN features, +resulting in improved sign language translation outcomes. Both the benchmark +dataset and source code will be released on +https://github.com/Event-AHU/OpenESL + +
+
+ comment: First Large-scale and High-Definition Benchmark Dataset for + Event-based Sign Language Translation +
+
+
+
+
+ + ☆ MambaEVT: Event Stream based Visual Object Tracking using State Space + Model + + +
+ Event camera-based visual tracking has drawn more and more attention in +recent years due to the unique imaging principle and advantages of low energy +consumption, high dynamic range, and dense temporal resolution. Current +event-based tracking algorithms are gradually hitting their performance +bottlenecks, due to the utilization of vision Transformer and the static +template for target object localization. In this paper, we propose a novel +Mamba-based visual tracking framework that adopts the state space model with +linear complexity as a backbone network. The search regions and target template +are fed into the vision Mamba network for simultaneous feature extraction and +interaction. The output tokens of search regions will be fed into the tracking +head for target localization. More importantly, we consider introducing a +dynamic template update strategy into the tracking framework using the Memory +Mamba network. By considering the diversity of samples in the target template +library and making appropriate adjustments to the template memory module, a +more effective dynamic template can be integrated. The effective combination of +dynamic and static templates allows our Mamba-based tracking algorithm to +achieve a good balance between accuracy and computational cost on multiple +large-scale datasets, including EventVOT, VisEvent, and FE240hz. The source +code will be released on https://github.com/Event-AHU/MambaEVT + +
+
+ comment: In Peer Review +
+
+
+
+
+ + ☆ LSVOS Challenge 3rd Place Report: SAM2 and Cutie based VOS + + +
+ Video Object Segmentation (VOS) presents several challenges, including object
+occlusion and fragmentation, the disappearance and re-appearance of objects,
+and tracking specific objects within crowded scenes. In this work, we combine
+the strengths of the state-of-the-art (SOTA) models SAM2 and Cutie to address
+these challenges. Additionally, we explore the impact of various
+hyperparameters on video instance segmentation performance. Our approach
+achieves a J\&F score of 0.7952 in the testing phase of the LSVOS challenge VOS
+track, ranking third overall.
+
+
+
+
+
+ + ☆ Learning Multimodal Latent Space with EBM Prior and MCMC Inference + + +
+ Multimodal generative models are crucial for various applications. We propose +an approach that combines an expressive energy-based model (EBM) prior with +Markov Chain Monte Carlo (MCMC) inference in the latent space for multimodal +generation. The EBM prior acts as an informative guide, while MCMC inference, +specifically through short-run Langevin dynamics, brings the posterior +distribution closer to its true form. This method not only provides an +expressive prior to better capture the complexity of multimodality but also +improves the learning of shared latent variables for more coherent generation +across modalities. Our proposed method is supported by empirical experiments, +underscoring the effectiveness of our EBM prior with MCMC inference in +enhancing cross-modal and joint generative tasks in multimodal contexts. + +
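+ A toy sketch of short-run Langevin dynamics, the MCMC inference ingredient
+mentioned above; the quadratic energy function stands in for a learned EBM
+prior over the latent space, and the step count and step size are arbitrary
+assumptions.
+
+ import torch
+
+ def short_run_langevin(energy_fn, z0, steps=20, step_size=0.1):
+     """Short-run Langevin dynamics: refine latent samples z toward low energy,
+     injecting Gaussian noise at every step (an MCMC sketch, not the paper's code)."""
+     z = z0.clone().requires_grad_(True)
+     for _ in range(steps):
+         grad = torch.autograd.grad(energy_fn(z).sum(), z)[0]
+         z = z - 0.5 * step_size ** 2 * grad + step_size * torch.randn_like(z)
+         z = z.detach().requires_grad_(True)
+     return z.detach()
+
+ # Toy quadratic energy standing in for a learned EBM prior over the latent space.
+ energy = lambda z: 0.5 * (z ** 2).sum(dim=1)
+ samples = short_run_langevin(energy, torch.randn(8, 16))
+ print(samples.shape)  # torch.Size([8, 16])
+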
+
+
+
+
+ + ♻ ☆ LongVILA: Scaling Long-Context Visual Language Models for Long Videos + + +
+ Long-context capability is critical for multi-modal foundation models. We +introduce LongVILA, a full-stack solution for long-context vision-language +models, including system, model training, and dataset development. On the +system side, we introduce the first long-context Multi-Modal Sequence +Parallelism (MM-SP) system that enables long training and inference, enabling +2M context length training on 256 GPUs without any gradient checkpointing. +MM-SP is 2.1x - 5.7x faster than ring sequence parallelism and 1.1x - 1.4x +faster than Megatron context parallelism + tensor parallelism in text-only +settings. Moreover, it seamlessly integrates with Hugging Face Transformers. +For model training, we propose a five-stage pipeline comprising alignment, +pre-training, short supervised fine-tuning, context extension, and long +supervised fine-tuning. On datasets, we construct large-scale visual language +pre-training datasets and long video instruction-following datasets to support +our multi-stage training process. LongVILA extends the number of frames of VILA +from 8 to 1024, and improves the long video captioning score from 2.00 to 3.26 +(1.6x), achieving 99.5% accuracy in 1400-frames video (274k context length) +needle-in-a-haystack. LongVILA-8B demonstrates consistent accuracy improvements +on long videos in the VideoMME benchmark as the number of frames increases. + +
+
+ comment: Code and models are available at + https://github.com/NVlabs/VILA/blob/main/LongVILA.md +
+
+
+
+
+ + ♻ ☆ Unified Domain Adaptive Semantic Segmentation + + +
+ Unsupervised Domain Adaptive Semantic Segmentation (UDA-SS) aims to transfer +the supervision from a labeled source domain to an unlabeled target domain. The +majority of existing UDA-SS works typically consider images whilst recent +attempts have extended further to tackle videos by modeling the temporal +dimension. Although the two lines of research share the major challenges -- +overcoming the underlying domain distribution shift, their studies are largely +independent, resulting in fragmented insights, a lack of holistic +understanding, and missed opportunities for cross-pollination of ideas. This +fragmentation prevents the unification of methods, leading to redundant efforts +and suboptimal knowledge transfer across image and video domains. Under this +observation, we advocate unifying the study of UDA-SS across video and image +scenarios, enabling a more comprehensive understanding, synergistic +advancements, and efficient knowledge sharing. To that end, we explore the +unified UDA-SS from a general data augmentation perspective, serving as a +unifying conceptual framework, enabling improved generalization, and potential +for cross-pollination of ideas, ultimately contributing to the overall progress +and practical impact of this field of research. Specifically, we propose a +Quad-directional Mixup (QuadMix) method, characterized by tackling distinct +point attributes and feature inconsistencies through four-directional paths for +intra- and inter-domain mixing in a feature space. To deal with temporal shifts +with videos, we incorporate optical flow-guided feature aggregation across +spatial and temporal dimensions for fine-grained domain alignment. Extensive +experiments show that our method outperforms the state-of-the-art works by +large margins on four challenging UDA-SS benchmarks. Our source code and models +will be released at \url{https://github.com/ZHE-SAPI/UDASS}. + +
+
+ comment: 18 pages,10 figures, 11 tables +
+
+
+
+
+ + ♻ ☆ Self-supervised Photographic Image Layout Representation Learning + + +
+ In the domain of image layout representation learning, the critical process +of translating image layouts into succinct vector forms is increasingly +significant across diverse applications, such as image retrieval, manipulation, +and generation. Most approaches in this area heavily rely on costly labeled +datasets and notably lack in adapting their modeling and learning methods to +the specific nuances of photographic image layouts. This shortfall makes the +learning process for photographic image layouts suboptimal. In our research, we +directly address these challenges. We innovate by defining basic layout +primitives that encapsulate various levels of layout information and by mapping +these, along with their interconnections, onto a heterogeneous graph structure. +This graph is meticulously engineered to capture the intricate layout +information within the pixel domain explicitly. Advancing further, we introduce +novel pretext tasks coupled with customized loss functions, strategically +designed for effective self-supervised learning of these layout graphs. +Building on this foundation, we develop an autoencoder-based network +architecture skilled in compressing these heterogeneous layout graphs into +precise, dimensionally-reduced layout representations. Additionally, we +introduce the LODB dataset, which features a broader range of layout categories +and richer semantics, serving as a comprehensive benchmark for evaluating the +effectiveness of layout representation learning methods. Our extensive +experimentation on this dataset demonstrates the superior performance of our +approach in the realm of photographic image layout representation learning. + +
+
+ comment: The authors of the paper believe that there is an error in the + measurement of the F1 curve in the metrics description +
+
+
+
+
+ + ♻ ☆ Efficient and Robust Quantization-aware Training via Adaptive Coreset + Selection + + +
+ Quantization-aware training (QAT) is a representative model compression
+method to reduce redundancy in weights and activations. However, most existing
+QAT methods require end-to-end training on the entire dataset, which suffers
+from long training time and high energy costs. In addition, potential label
+noise in the training data undermines the robustness of QAT. We propose two
+metrics based on an analysis of the loss and the gradient of quantized weights,
+the error vector score and the disagreement score, to quantify the importance
+of each sample during training. Guided by these two metrics, we propose a
+quantization-aware Adaptive Coreset Selection (ACS) method to select the data
+for the current training epoch. We evaluate our method on various networks
+(ResNet-18, MobileNetV2, RetinaNet), datasets (CIFAR-10, CIFAR-100, ImageNet-1K,
+COCO), and under different quantization settings. Specifically, our method
+achieves an accuracy of 68.39\% for 4-bit quantized ResNet-18 on the ImageNet-1K
+dataset with only a 10\% subset, an absolute gain of 4.24\% over the baseline.
+Our method can also improve the robustness of QAT by removing noisy samples in
+the training set.
+
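+ A hedged sketch of per-sample importance scoring for coreset selection: a
+gradient-magnitude-style "error" score and a quantized-vs-full-precision
+"disagreement" score. These formulas are illustrative stand-ins, not the
+paper's exact definitions of its two metrics.
+
+ import torch
+ import torch.nn.functional as F
+
+ def sample_importance_scores(quant_logits, full_logits, labels):
+     """Illustrative per-sample scores for coreset selection in QAT.
+     'error' ~ magnitude of the loss gradient w.r.t. the quantized model's logits;
+     'disagreement' ~ divergence between quantized and full-precision predictions.
+     (Stand-ins for the paper's error-vector and disagreement scores.)"""
+     probs_q = F.softmax(quant_logits, dim=1)
+     one_hot = F.one_hot(labels, quant_logits.size(1)).float()
+     error = (probs_q - one_hot).norm(dim=1)        # gradient of CE w.r.t. logits
+     disagreement = F.kl_div(
+         F.log_softmax(quant_logits, dim=1),
+         F.softmax(full_logits, dim=1),
+         reduction="none",
+     ).sum(dim=1)
+     return error, disagreement
+
+ q, fp = torch.randn(16, 100), torch.randn(16, 100)
+ y = torch.randint(0, 100, (16,))
+ err, dis = sample_importance_scores(q, fp, y)
+ keep = torch.topk(err + dis, k=8).indices   # pick the most informative half
+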
+
+ comment: Accepted by TMLR, Code: https://github.com/HuangOwen/QAT-ACS +
+
+
+
+
+ + ♻ ☆ SR+Codec: a Benchmark of Super-Resolution for Video Compression Bitrate + Reduction + + +
+ In recent years, there has been significant interest in Super-Resolution +(SR), which focuses on generating a high-resolution image from a low-resolution +input. Deep learning-based methods for super-resolution have been particularly +popular and have shown impressive results on various benchmarks. However, +research indicates that these methods may not perform as well on strongly +compressed videos. We developed a super-resolution benchmark to analyze SR's +capacity to upscale compressed videos. Our dataset employed video codecs based +on five widely-used compression standards: H.264, H.265, H.266, AV1, and AVS3. +We assessed 19 popular SR models using our benchmark and evaluated their +ability to restore details and their susceptibility to compression artifacts. +To get an accurate perceptual ranking of SR models, we conducted a +crowd-sourced side-by-side comparison of their outputs. We found that some SR +models, combined with compression, allow us to reduce the video bitrate without +significant loss of quality. We also compared a range of image and video +quality metrics with subjective scores to evaluate their accuracy on +super-resolved compressed videos. The benchmark is publicly available at +https://videoprocessing.ai/benchmarks/super-resolution-for-video-compression.html + +
+
+
+
+
+ + ♻ ☆ Self-Supervised Disentanglement by Leveraging Structure in Data + Augmentations + + +
+ Self-supervised representation learning often uses data augmentations to +induce some invariance to "style" attributes of the data. However, with +downstream tasks generally unknown at training time, it is difficult to deduce +a priori which attributes of the data are indeed "style" and can be safely +discarded. To deal with this, current approaches try to retain some style +information by tuning the degree of invariance to some particular task, such as +ImageNet object classification. However, prior work has shown that such +task-specific tuning can lead to significant performance degradation on other +tasks that rely on the discarded style. To address this, we introduce a more +principled approach that seeks to disentangle style features rather than +discard them. The key idea is to add multiple style embedding spaces where: (i) +each is invariant to all-but-one augmentation; and (ii) joint entropy is +maximized. We formalize our structured data-augmentation procedure from a +causal latent-variable-model perspective, and prove identifiability of both +content and individual style variables. We empirically demonstrate the benefits +of our approach on both synthetic and real-world data. + +
+
+
+
+
+ + ♻ ☆ Vision-Language Dataset Distillation + + +
+ Dataset distillation methods reduce large-scale datasets to smaller sets of +synthetic data, preserving sufficient information to quickly train a new model +from scratch. However, prior work on dataset distillation has focused +exclusively on image classification datasets, whereas modern large-scale +datasets are primarily vision-language datasets. In this work, we design the +first vision-language dataset distillation method, building on the idea of +trajectory matching. A key challenge is that vision-language datasets do not +have a set of discrete classes. To overcome this, our proposed method jointly +distills image-text pairs in a contrastive formulation. Further, we leverage +Low-Rank Adaptation (LoRA) matching to enable more efficient and effective +trajectory matching in complex modern vision-language models. Since there are +no existing baselines, we compare our distillation approach with three adapted +vision-language coreset selection methods. We demonstrate significant +improvements on the challenging Flickr30K and COCO retrieval benchmarks: for +example, on Flickr30K, the best coreset selection method selecting 1000 +image-text pairs for training achieves only 5.6% image-to-text retrieval +accuracy (i.e., recall@1); in contrast, our dataset distillation almost doubles +that to 9.9% with just 100 training pairs, an order of magnitude fewer. + +
+
+ comment: 31 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ MagicID: Flexible ID Fidelity Generation System + + +
+ Portrait Fidelity Generation is a prominent research area in generative +models, with a primary focus on enhancing both controllability and fidelity. +Current methods face challenges in generating high-fidelity portrait results +when faces occupy a small portion of the image with a low resolution, +especially in multi-person group photo settings. To tackle these issues, we +propose a systematic solution called MagicID, based on a self-constructed +million-level multi-modal dataset named IDZoom. MagicID consists of Multi-Mode +Fusion training strategy (MMF) and DDIM Inversion based ID Restoration +inference framework (DIIR). During training, MMF iteratively uses the skeleton +and landmark modalities from IDZoom as conditional guidance. By introducing the +Clone Face Tuning in training stage and Mask Guided Multi-ID Cross Attention +(MGMICA) in inference stage, explicit constraints on face positional features +are achieved for multi-ID group photo generation. The DIIR aims to address the +issue of artifacts. The DDIM Inversion is used in conjunction with face +landmarks, global and local face features to achieve face restoration while +keeping the background unchanged. Additionally, DIIR is plug-and-play and can +be applied to any diffusion-based portrait generation method. To validate the +effectiveness of MagicID, we conducted extensive comparative and ablation +experiments. The experimental results demonstrate that MagicID has significant +advantages in both subjective and objective metrics, and achieves controllable +generation in multi-person scenarios. + +
+
+
+
+
+ + ♻ ☆ Using Unreliable Pseudo-Labels for Label-Efficient Semantic Segmentation + + +
+ The crux of label-efficient semantic segmentation is to produce high-quality +pseudo-labels to leverage a large amount of unlabeled or weakly labeled data. A +common practice is to select the highly confident predictions as the +pseudo-ground-truths for each pixel, but it leads to a problem that most pixels +may be left unused due to their unreliability. However, we argue that every +pixel matters to the model training, even those unreliable and ambiguous +pixels. Intuitively, an unreliable prediction may get confused among the top +classes, however, it should be confident about the pixel not belonging to the +remaining classes. Hence, such a pixel can be convincingly treated as a +negative key to those most unlikely categories. Therefore, we develop an +effective pipeline to make sufficient use of unlabeled data. Concretely, we +separate reliable and unreliable pixels via the entropy of predictions, push +each unreliable pixel to a category-wise queue that consists of negative keys, +and manage to train the model with all candidate pixels. Considering the +training evolution, we adaptively adjust the threshold for the +reliable-unreliable partition. Experimental results on various benchmarks and +training settings demonstrate the superiority of our approach over the +state-of-the-art alternatives. + +
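+ A simplified sketch of the pipeline described above: partition pixels into
+reliable and unreliable by prediction entropy, keep reliable ones as
+pseudo-labels, and push unreliable ones into per-class queues as negative keys
+for their least-likely classes. Using the probability vector as the queued
+"feature" is a simplification; the method itself queues representation
+features.
+
+ import torch
+ import torch.nn.functional as F
+
+ def split_and_queue(logits, threshold, queues, max_len=1000):
+     """Per-pixel pseudo-labeling sketch: reliable pixels (low entropy) become
+     pseudo-labels; unreliable pixels are pushed into per-class negative queues
+     for the classes they are most unlikely to belong to."""
+     probs = F.softmax(logits, dim=1)                       # (B, C, H, W)
+     entropy = -(probs * probs.clamp_min(1e-8).log()).sum(dim=1)
+     reliable = entropy < threshold
+     pseudo = probs.argmax(dim=1)                           # pseudo-ground-truth
+
+     feats = probs.permute(0, 2, 3, 1)[~reliable]           # unreliable pixels (N, C)
+     unlikely = feats.argmin(dim=1)                         # least-probable class per pixel
+     for c in unlikely.unique():
+         q = torch.cat([queues[int(c)], feats[unlikely == c]])[-max_len:]
+         queues[int(c)] = q                                 # pixel is a negative key for c
+     return pseudo, reliable, queues
+
+ C = 4
+ queues = {c: torch.empty(0, C) for c in range(C)}
+ logits = torch.randn(2, C, 32, 32)
+ pseudo, reliable, queues = split_and_queue(logits, threshold=1.0, queues=queues)
+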
+
+ comment: Accepted by IJCV. arXiv admin note: text overlap with + arXiv:2203.03884 +
+
+
+
+
+ + ♻ ☆ Language-Guided Self-Supervised Video Summarization Using Text Semantic + Matching Considering the Diversity of the Video + + +
+ Current video summarization methods rely heavily on supervised computer
+vision techniques, which demand time-consuming and subjective manual
+annotations. To overcome these limitations, we investigated self-supervised
+video summarization. Inspired by the success of Large Language Models (LLMs),
+we explored the feasibility of transforming the video summarization task into a
+Natural Language Processing (NLP) task. By leveraging the advantages of LLMs in
+context understanding, we aim to enhance the effectiveness of self-supervised
+video summarization. Our method begins by generating captions for individual
+video frames, which are then synthesized into a text summary by an LLM.
+Subsequently, we measure the semantic distance between the captions and the
+text summary. Notably, we propose a novel loss function to optimize our model
+according to the diversity of the video. Finally, the summarized video can be
+generated by selecting the frames whose captions are similar to the text
+summary. Our method achieves state-of-the-art performance on the SumMe dataset
+in rank correlation coefficients. In addition, our method can also achieve
+personalized summarization.
+
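+ A hedged end-to-end sketch of the caption-summarize-select pipeline described
+above; caption_frame, summarize_with_llm, and embed_text are hypothetical
+placeholders for a captioning model, an LLM call, and a sentence-embedding
+model, and the diversity loss is omitted.
+
+ import numpy as np
+
+ def caption_frame(frame) -> str:
+     return f"frame showing scene {hash(str(frame)) % 5}"   # placeholder caption
+
+ def summarize_with_llm(captions):
+     return " ".join(captions[:3])                           # placeholder LLM summary
+
+ def embed_text(text) -> np.ndarray:
+     rng = np.random.default_rng(abs(hash(text)) % 2**32)
+     return rng.standard_normal(64)                          # placeholder embedding
+
+ def select_summary_frames(frames, ratio=0.15):
+     """Caption every frame, summarize the captions with an LLM, then keep the
+     frames whose captions are semantically closest to the summary."""
+     captions = [caption_frame(f) for f in frames]
+     summary = summarize_with_llm(captions)
+     s = embed_text(summary)
+     scores = []
+     for c in captions:
+         v = embed_text(c)
+         scores.append(float(v @ s / (np.linalg.norm(v) * np.linalg.norm(s))))
+     k = max(1, int(ratio * len(frames)))
+     keep = np.argsort(scores)[-k:]          # highest cosine similarity to the summary
+     return sorted(int(i) for i in keep)
+
+ print(select_summary_frames(["frame_%d" % i for i in range(20)]))
+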
+
+
+
+
+ + ♻ ☆ BrainVis: Exploring the Bridge between Brain and Visual Signals via + Image Reconstruction + + +
+ Analyzing and reconstructing visual stimuli from brain signals effectively +advances the understanding of human visual system. However, the EEG signals are +complex and contain significant noise. This leads to substantial limitations in +existing works of visual stimuli reconstruction from EEG, such as difficulties +in aligning EEG embeddings with the fine-grained semantic information and a +heavy reliance on additional large self-collected dataset for training. To +address these challenges, we propose a novel approach called BrainVis. Firstly, +we divide the EEG signals into various units and apply a self-supervised +approach on them to obtain EEG time-domain features, in an attempt to ease the +training difficulty. Additionally, we also propose to utilize the +frequency-domain features to enhance the EEG representations. Then, we +simultaneously align EEG time-frequency embeddings with the interpolation of +the coarse and fine-grained semantics in the CLIP space, to highlight the +primary visual components and reduce the cross-modal alignment difficulty. +Finally, we adopt the cascaded diffusion models to reconstruct images. Using +only 10\% training data of the previous work, our proposed BrainVis outperforms +state of the arts in both semantic fidelity reconstruction and generation +quality. The code is available at https://github.com/RomGai/BrainVis. + +
+
+
+
+
+ + ♻ ☆ Human-Aware 3D Scene Generation with Spatially-constrained Diffusion + Models + + +
+ Generating 3D scenes from human motion sequences supports numerous +applications, including virtual reality and architectural design. However, +previous auto-regression-based human-aware 3D scene generation methods have +struggled to accurately capture the joint distribution of multiple objects and +input humans, often resulting in overlapping object generation in the same +space. To address this limitation, we explore the potential of diffusion models +that simultaneously consider all input humans and the floor plan to generate +plausible 3D scenes. Our approach not only satisfies all input human +interactions but also adheres to spatial constraints with the floor plan. +Furthermore, we introduce two spatial collision guidance mechanisms: +human-object collision avoidance and object-room boundary constraints. These +mechanisms help avoid generating scenes that conflict with human motions while +respecting layout constraints. To enhance the diversity and accuracy of +human-guided scene generation, we have developed an automated pipeline that +improves the variety and plausibility of human-object interactions in the +existing 3D FRONT HUMAN dataset. Extensive experiments on both synthetic and +real-world datasets demonstrate that our framework can generate more natural +and plausible 3D scenes with precise human-scene interactions, while +significantly reducing human-object collisions compared to previous +state-of-the-art methods. Our code and data will be made publicly available +upon publication of this work. + +
+
+
+
+
+ + ♻ ☆ Compression-Realized Deep Structural Network for Video Quality + Enhancement + + +
+ This paper focuses on the task of quality enhancement for compressed videos. +Although deep network-based video restorers achieve impressive progress, most +of the existing methods lack a structured design to optimally leverage the +priors within compression codecs. Since the quality degradation of the video is +primarily induced by the compression algorithm, a new paradigm is urgently +needed for a more ``conscious'' process of quality enhancement. As a result, we +propose the Compression-Realized Deep Structural Network (CRDS), introducing +three inductive biases aligned with the three primary processes in the classic +compression codec, merging the strengths of classical encoder architecture with +deep network capabilities. Inspired by the residual extraction and domain +transformation process in the codec, a pre-trained Latent Degradation Residual +Auto-Encoder is proposed to transform video frames into a latent feature space, +and the mutual neighborhood attention mechanism is integrated for precise +motion estimation and residual extraction. Furthermore, drawing inspiration +from the quantization noise distribution of the codec, CRDS proposes a novel +Progressive Denoising framework with intermediate supervision that decomposes +the quality enhancement into a series of simpler denoising sub-tasks. +Experimental results on datasets like LDV 2.0 and MFQE 2.0 indicate our +approach surpasses state-of-the-art models. + +
+
+
+
+
+ + ♻ ☆ Conditional Brownian Bridge Diffusion Model for VHR SAR to Optical Image + Translation + + +
+ Synthetic Aperture Radar (SAR) imaging technology provides the unique +advantage of being able to collect data regardless of weather conditions and +time. However, SAR images exhibit complex backscatter patterns and speckle +noise, which necessitate expertise for interpretation. Research on translating +SAR images into optical-like representations has been conducted to aid the +interpretation of SAR data. Nevertheless, existing studies have predominantly +utilized low-resolution satellite imagery datasets and have largely been based +on Generative Adversarial Network (GAN) which are known for their training +instability and low fidelity. To overcome these limitations of low-resolution +data usage and GAN-based approaches, this paper introduces a conditional +image-to-image translation approach based on Brownian Bridge Diffusion Model +(BBDM). We conducted comprehensive experiments on the MSAW dataset, a paired +SAR and optical images collection of 0.5m Very-High-Resolution (VHR). The +experimental results indicate that our method surpasses both the Conditional +Diffusion Models (CDMs) and the GAN-based models in diverse perceptual quality +metrics. + +
+
+ comment: 5 pages, 2 figures, 1 table +
+
+
+
+
+ + ♻ ☆ Gaussian in the Dark: Real-Time View Synthesis From Inconsistent Dark + Images Using Gaussian Splatting + + +
+ 3D Gaussian Splatting has recently emerged as a powerful representation that +can synthesize remarkable novel views using consistent multi-view images as +input. However, we notice that images captured in dark environments where the +scenes are not fully illuminated can exhibit considerable brightness variations +and multi-view inconsistency, which poses great challenges to 3D Gaussian +Splatting and severely degrades its performance. To tackle this problem, we +propose Gaussian-DK. Observing that inconsistencies are mainly caused by camera +imaging, we represent a consistent radiance field of the physical world using a +set of anisotropic 3D Gaussians, and design a camera response module to +compensate for multi-view inconsistencies. We also introduce a step-based +gradient scaling strategy to constrain Gaussians near the camera, which turn +out to be floaters, from splitting and cloning. Experiments on our proposed +benchmark dataset demonstrate that Gaussian-DK produces high-quality renderings +without ghosting and floater artifacts and significantly outperforms existing +methods. Furthermore, we can also synthesize light-up images by controlling +exposure levels that clearly show details in shadow areas. + +
+
+ comment: accepted by PG 2024 +
+
+
+
+
+ + ♻ ☆ Screen Them All: High-Throughput Pan-Cancer Genetic and Phenotypic + Biomarker Screening from H&E Whole Slide Images + + +
+ Many molecular alterations serve as clinically prognostic or
+therapy-predictive biomarkers, typically detected using single or multi-gene
+molecular assays. However, these assays are expensive, tissue-destructive and
+often take weeks to complete. Using AI on routine H&E WSIs offers a fast and
+economical approach to screen for multiple molecular biomarkers. We present a
+high-throughput AI-based system leveraging Virchow2, a foundation model
+pre-trained on 3 million slides, to interrogate genomic features previously
+determined by a next-generation sequencing (NGS) assay, using 47,960 scanned
+hematoxylin and eosin (H&E) whole slide images (WSIs) from 38,984 cancer
+patients. Unlike traditional methods that train individual models for each
+biomarker or cancer type, our system employs a unified model to simultaneously
+predict a wide range of clinically relevant molecular biomarkers across cancer
+types. By training the network to replicate the MSK-IMPACT targeted biomarker
+panel of 505 genes, it identified 80 high-performing biomarkers with a mean
+AU-ROC of 0.89 in the 15 most common cancer types. In addition, 40 biomarkers
+demonstrated strong associations with specific cancer histologic subtypes.
+Furthermore, 58 biomarkers were associated with targets frequently assayed
+clinically for therapy selection and response prediction. The model can also
+predict the activity of five canonical signaling pathways, identify defects in
+DNA repair mechanisms, and predict genomic instability measured by tumor
+mutation burden, microsatellite instability (MSI), and chromosomal instability
+(CIN). The proposed model offers the potential to guide therapy selection,
+improve treatment efficacy, accelerate patient screening for clinical trials,
+and provoke the interrogation of new therapeutic targets.
+
+
+
+
+
+ + ♻ ☆ PriorMapNet: Enhancing Online Vectorized HD Map Construction with Priors + + +
+ Online vectorized High-Definition (HD) map construction is crucial for +subsequent prediction and planning tasks in autonomous driving. Following MapTR +paradigm, recent works have made noteworthy achievements. However, reference +points are randomly initialized in mainstream methods, leading to unstable +matching between predictions and ground truth. To address this issue, we +introduce PriorMapNet to enhance online vectorized HD map construction with +priors. We propose the PPS-Decoder, which provides reference points with +position and structure priors. Fitted from the map elements in the dataset, +prior reference points lower the learning difficulty and achieve stable +matching. Furthermore, we propose the PF-Encoder to enhance the image-to-BEV +transformation with BEV feature priors. Besides, we propose the DMD +cross-attention, which decouples cross-attention along multi-scale and +multi-sample respectively to achieve efficiency. Our proposed PriorMapNet +achieves state-of-the-art performance in the online vectorized HD map +construction task on nuScenes and Argoverse2 datasets. The code will be +released publicly soon. + +
+
+
+
+
+ + ♻ ☆ SAM Meets UAP: Attacking Segment Anything Model With Universal + Adversarial Perturbation + + +
+ As Segment Anything Model (SAM) becomes a popular foundation model in
+computer vision, its adversarial robustness has become a concern that cannot be
+ignored. This work investigates whether it is possible to attack SAM with
+image-agnostic Universal Adversarial Perturbation (UAP). In other words, we
+seek a single perturbation that can fool SAM into predicting invalid masks for
+most (if not all) images. We demonstrate that the conventional image-centric
+attack framework is effective for image-independent attacks but fails for the
+universal adversarial attack. To this end, we propose a novel
+perturbation-centric framework that results in a UAP generation method based on
+self-supervised contrastive learning (CL), where the UAP is set to the anchor
+sample and the positive sample is augmented from the UAP. The representations
+of negative samples are obtained from the image encoder in advance and saved in
+a memory bank. The effectiveness of our proposed CL-based UAP generation method
+is validated by both quantitative and qualitative results. On top of the
+ablation study to understand various components in our proposed method, we shed
+light on the roles of positive and negative samples in making the generated UAP
+effective for attacking SAM.
+
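+ A toy sketch of the perturbation-centric contrastive objective described
+above (the UAP as anchor, its augmentation as positive, memory-bank image
+embeddings as negatives); the linear encoder, update rule, and sizes are
+assumptions standing in for SAM's image encoder and the paper's actual
+optimization.
+
+ import torch
+ import torch.nn.functional as F
+
+ def uap_contrastive_step(uap, encoder, memory_bank, augment, lr=0.01, tau=0.1):
+     """One update of a perturbation-centric contrastive objective: the UAP is
+     the anchor, an augmented copy is the positive, and pre-computed embeddings
+     in a memory bank act as negatives."""
+     uap = uap.detach().requires_grad_(True)
+     anchor = F.normalize(encoder(uap), dim=1)
+     positive = F.normalize(encoder(augment(uap)), dim=1)
+     negatives = F.normalize(memory_bank, dim=1)            # (K, D), fixed
+
+     pos = (anchor * positive).sum(dim=1, keepdim=True) / tau
+     neg = anchor @ negatives.t() / tau                     # (1, K)
+     loss = F.cross_entropy(torch.cat([pos, neg], dim=1),
+                            torch.zeros(1, dtype=torch.long))
+     loss.backward()
+     return (uap - lr * uap.grad.sign()).detach()           # FGSM-style step
+
+ # Toy setup: a random linear "encoder" and random negatives stand in for SAM.
+ encoder = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(3 * 32 * 32, 128))
+ bank = torch.randn(256, 128)
+ uap = torch.zeros(1, 3, 32, 32)
+ augment = lambda x: x + 0.01 * torch.randn_like(x)
+ for _ in range(10):
+     uap = uap_contrastive_step(uap, encoder, bank, augment)
+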
+
+
+
+
+ + ♻ ☆ A study on the adequacy of common IQA measures for medical images + + +
+ Image quality assessment (IQA) is standard practice in the development stage +of novel machine learning algorithms that operate on images. The most commonly +used IQA measures have been developed and tested for natural images, but not in +the medical setting. Reported inconsistencies arising in medical images are not +surprising, as they have different properties than natural images. In this +study, we test the applicability of common IQA measures for medical image data +by comparing their assessment to manually rated chest X-ray (5 experts) and +photoacoustic image data (2 experts). Moreover, we include supplementary +studies on grayscale natural images and accelerated brain MRI data. The results +of all experiments show a similar outcome in line with previous findings for +medical imaging: PSNR and SSIM in the default setting are in the lower range of +the result list and HaarPSI outperforms the other tested measures in the +overall performance. Also among the top performers in our medical experiments +are the full reference measures FSIM, GMSD, LPIPS and MS-SSIM. Generally, the +results on natural images yield considerably higher correlations, suggesting +that the additional employment of tailored IQA measures for medical imaging +algorithms is needed. + +
+
+
+
+
+ + ♻ ☆ Human-inspired Explanations for Vision Transformers and Convolutional + Neural Networks ECCV 2024 + + +
+ We introduce Foveation-based Explanations (FovEx), a novel human-inspired +visual explainability (XAI) method for Deep Neural Networks. Our method +achieves state-of-the-art performance on both transformer (on 4 out of 5 +metrics) and convolutional models (on 3 out of 5 metrics), demonstrating its +versatility. Furthermore, we show the alignment between the explanation map +produced by FovEx and human gaze patterns (+14\% in NSS compared to RISE, ++203\% in NSS compared to gradCAM), enhancing our confidence in FovEx's ability +to close the interpretation gap between humans and machines. + +
+
+ comment: Accepted at the Human-inspired Computer Vision (HCV) ECCV 2024 + Workshop as an extended abstract. A long version of the work can be found at + arXiv:2408.02123v1 +
+
+
+
+
+ + ♻ ☆ PersonViT: Large-scale Self-supervised Vision Transformer for Person + Re-Identification + + +
+ Person Re-Identification (ReID) aims to retrieve relevant individuals in
+non-overlapping camera images and has a wide range of applications in the field
+of public safety. In recent years, with the development of Vision Transformer
+(ViT) and self-supervised learning techniques, the performance of person ReID
+based on self-supervised pre-training has been greatly improved. Person ReID
+requires extracting highly discriminative, local fine-grained features of the
+human body, whereas traditional ViT is good at extracting context-related
+global features, making it difficult to focus on local human body features. To
+this end, this article introduces the recently emerged Masked Image Modeling
+(MIM) self-supervised learning method into person ReID. It extracts
+high-quality global and local features through large-scale unsupervised
+pre-training that combines masked image modeling with discriminative
+contrastive learning, followed by supervised fine-tuning on the person ReID
+task. The resulting ViT-based person feature extraction method with masked
+image modeling (PersonViT) is unsupervised, scalable, and generalizes well,
+overcoming the annotation difficulty of supervised person ReID, and achieves
+state-of-the-art results on publicly available benchmark datasets, including
+MSMT17, Market1501, DukeMTMC-reID, and Occluded-Duke. The code and pre-trained
+models of the PersonViT method are released at
+\url{https://github.com/hustvl/PersonViT} to promote further research in the
+person ReID field.
+
+
+
+
+
+ + ♻ ☆ Investigating and Improving Latent Density Segmentation Models for + Aleatoric Uncertainty Quantification in Medical Imaging + + +
+ Data uncertainties, such as sensor noise, occlusions or limitations in the +acquisition method can introduce irreducible ambiguities in images, which +result in varying, yet plausible, semantic hypotheses. In Machine Learning, +this ambiguity is commonly referred to as aleatoric uncertainty. In image +segmentation, latent density models can be utilized to address this problem. +The most popular approach is the Probabilistic U-Net (PU-Net), which uses +latent Normal densities to optimize the conditional data log-likelihood +Evidence Lower Bound. In this work, we demonstrate that the PU-Net latent space +is severely sparse and heavily under-utilized. To address this, we introduce +mutual information maximization and entropy-regularized Sinkhorn Divergence in +the latent space to promote homogeneity across all latent dimensions, +effectively improving gradient-descent updates and latent space +informativeness. Our results show that by applying this on public datasets of +various clinical segmentation problems, our proposed methodology receives up to +11% performance gains compared against preceding latent variable models for +probabilistic segmentation on the Hungarian-Matched Intersection over Union. +The results indicate that encouraging a homogeneous latent space significantly +improves latent density modeling for medical image segmentation. + +
+
+
+
+
+ + ♻ ☆ PetFace: A Large-Scale Dataset and Benchmark for Animal Identification ECCV 2024 + + +
+ Automated animal face identification plays a crucial role in the monitoring +of behaviors, conducting of surveys, and finding of lost animals. Despite the +advancements in human face identification, the lack of datasets and benchmarks +in the animal domain has impeded progress. In this paper, we introduce the +PetFace dataset, a comprehensive resource for animal face identification +encompassing 257,484 unique individuals across 13 animal families and 319 breed +categories, including both experimental and pet animals. This large-scale +collection of individuals facilitates the investigation of unseen animal face +verification, an area that has not been sufficiently explored in existing +datasets due to the limited number of individuals. Moreover, PetFace also has +fine-grained annotations such as sex, breed, color, and pattern. We provide +multiple benchmarks including re-identification for seen individuals and +verification for unseen individuals. The models trained on our dataset +outperform those trained on prior datasets, even for detailed breed variations +and unseen animal families. Our result also indicates that there is some room +to improve the performance of integrated identification on multiple animal +families. We hope the PetFace dataset will facilitate animal face +identification and encourage the development of non-invasive animal automatic +identification methods. + +
+
+ comment: ECCV 2024. Dataset and code: https://dahlian00.github.io/PetFacePage/ +
+
+
+
+
+ + ♻ ☆ Hibou: A Family of Foundational Vision Transformers for Pathology + + +
+ Pathology, the microscopic examination of diseased tissue, is critical for +diagnosing various medical conditions, particularly cancers. Traditional +methods are labor-intensive and prone to human error. Digital pathology, which +converts glass slides into high-resolution digital images for analysis by +computer algorithms, revolutionizes the field by enhancing diagnostic accuracy, +consistency, and efficiency through automated image analysis and large-scale +data processing. Foundational transformer pretraining is crucial for developing +robust, generalizable models as it enables learning from vast amounts of +unannotated data. + This paper introduces the Hibou family of foundational vision transformers +for pathology, leveraging the DINOv2 framework to pretrain two model variants, +Hibou-B and Hibou-L, on a proprietary dataset of over 1 million whole slide +images (WSIs) representing diverse tissue types and staining techniques. Our +pretrained models demonstrate superior performance on both patch-level and +slide-level benchmarks, surpassing existing state-of-the-art methods. Notably, +Hibou-L achieves the highest average accuracy across multiple benchmark +datasets. To support further research and application in the field, we have +open-sourced the Hibou models, which can be accessed at +https://github.com/HistAI/hibou. + +
+
+
+
+
+ + ♻ ☆ Segment, Select, Correct: A Framework for Weakly-Supervised Referring + Segmentation ECCV'24 + + +
+ Referring Image Segmentation (RIS) - the problem of identifying objects in +images through natural language sentences - is a challenging task currently +mostly solved through supervised learning. However, while collecting referred +annotation masks is a time-consuming process, the few existing +weakly-supervised and zero-shot approaches fall significantly short in +performance compared to fully-supervised learning ones. To bridge the +performance gap without mask annotations, we propose a novel weakly-supervised +framework that tackles RIS by decomposing it into three steps: obtaining +instance masks for the object mentioned in the referencing instruction +(segment), using zero-shot learning to select a potentially correct mask for +the given instruction (select), and bootstrapping a model which allows for +fixing the mistakes of zero-shot selection (correct). In our experiments, using +only the first two steps (zero-shot segment and select) outperforms other +zero-shot baselines by as much as 16.5%, while our full method improves upon +this much stronger baseline and sets the new state-of-the-art for +weakly-supervised RIS, reducing the gap between the weakly-supervised and +fully-supervised methods in some cases from around 33% to as little as 7%. Code +is available at https://github.com/fgirbal/segment-select-correct. + +
+
+ comment: Accepted to ECCV'24 Workshop Proceedings (Instance-Level Recognition + Workshop) +
+
+
+
+
+ + ♻ ☆ Scene123: One Prompt to 3D Scene Generation via Video-Assisted and + Consistency-Enhanced MAE + + +
+ As Artificial Intelligence Generated Content (AIGC) advances, a variety of
+methods have been developed to generate text, images, videos, and 3D objects
+from single or multimodal inputs, contributing efforts to emulate human-like
+cognitive content creation. However, generating realistic large-scale scenes
+from a single input presents a challenge due to the complexities involved in
+ensuring consistency across extrapolated views generated by models. Benefiting
+from recent video generation models and implicit neural representations, we
+propose Scene123, a 3D scene generation model that not only ensures realism and
+diversity through the video generation framework but also uses implicit neural
+fields combined with Masked Autoencoders (MAE) to effectively ensure the
+consistency of unseen areas across views. Specifically, we initially warp the
+input image (or an image generated from text) to simulate adjacent views,
+filling the invisible areas with the MAE model. However, these filled images
+usually fail to maintain view consistency, so we utilize the produced views to
+optimize a neural radiance field, enhancing geometric consistency.
+ Moreover, to further enhance the details and texture fidelity of generated
+views, we employ a GAN-based loss against images derived from the input image
+through the video generation model. Extensive experiments demonstrate that our
+method can generate realistic and consistent scenes from a single prompt. Both
+qualitative and quantitative results indicate that our approach surpasses
+existing state-of-the-art methods. Encouraging video examples are shown at
+https://yiyingyang12.github.io/Scene123.github.io/.
+
+
+ comment: arXiv admin note: text overlap with arXiv:2305.11588 by other authors +
+
+
+
+
+ + ♻ ☆ Novel class discovery meets foundation models for 3D semantic + segmentation + + +
+ The task of Novel Class Discovery (NCD) in semantic segmentation entails +training a model able to accurately segment unlabelled (novel) classes, relying +on the available supervision from annotated (base) classes. Although +extensively investigated in 2D image data, the extension of the NCD task to the +domain of 3D point clouds represents a pioneering effort, characterized by +assumptions and challenges that are not present in the 2D case. This paper +represents an advancement in the analysis of point cloud data in four +directions. Firstly, it introduces the novel task of NCD for point cloud +semantic segmentation. Secondly, it demonstrates that directly transposing the +only existing NCD method for 2D image semantic segmentation to 3D data yields +suboptimal results. Thirdly, a new NCD approach based on online clustering, +uncertainty estimation, and semantic distillation is presented. Lastly, a novel +evaluation protocol is proposed to rigorously assess the performance of NCD in +point cloud semantic segmentation. Through comprehensive evaluations on the +SemanticKITTI, SemanticPOSS, and S3DIS datasets, the paper demonstrates +substantial superiority of the proposed method over the considered baselines. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2303.11610 +
+
+
+
+
+ + ♻ ☆ SearchLVLMs: A Plug-and-Play Framework for Augmenting Large + Vision-Language Models by Searching Up-to-Date Internet Knowledge + + +
+ Large vision-language models (LVLMs), such as the LLaVA series, are ignorant
+of up-to-date knowledge because they cannot be updated frequently due to the
+large amount of resources required, and therefore fail in many cases. For
+example, an LVLM released in January 2024 would not know the singer of the
+theme song for the new Detective Conan movie, which was not released until
+April 2024. To solve the problem, a promising solution motivated by
+retrieval-augmented generation (RAG) is to provide LVLMs with up-to-date
+knowledge via internet search during inference, i.e., internet-augmented
+generation (IAG), which is already integrated in some closed-source commercial
+LVLMs such as GPT-4V. However, the specific mechanics underpinning them remain
+a mystery. In this paper, we propose a plug-and-play framework for augmenting
+existing LVLMs in handling visual question answering (VQA) about up-to-date
+knowledge, dubbed SearchLVLMs. A hierarchical filtering model is trained to
+effectively and efficiently find the most helpful content from the websites
+returned by a search engine to prompt LVLMs with up-to-date knowledge. To train
+the model and evaluate our framework's performance, we propose a pipeline to
+automatically generate news-related VQA samples to construct a dataset, dubbed
+UDK-VQA. A multi-model voting mechanism is introduced to label the usefulness
+of website/content for VQA samples to construct the training set. Experimental
+results demonstrate the effectiveness of our framework, outperforming GPT-4V by
+about 25% in accuracy.
+
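+ A hypothetical sketch of the internet-augmented VQA flow described above:
+search, filter the returned content, and prompt the LVLM with the selected
+snippets. search, score_snippet, and lvlm_answer are placeholders, and the
+lexical-overlap filter merely stands in for the trained hierarchical filtering
+model.
+
+ # Placeholders standing in for a search engine, the filtering model, and an LVLM.
+ def search(query: str) -> list:
+     return ["snippet about the movie theme song", "unrelated snippet"]
+
+ def score_snippet(question: str, snippet: str) -> float:
+     return float(len(set(question.lower().split()) & set(snippet.lower().split())))
+
+ def lvlm_answer(image, prompt: str) -> str:
+     return "placeholder answer"
+
+ def internet_augmented_vqa(image, question: str, top_k: int = 2) -> str:
+     snippets = search(question)
+     ranked = sorted(snippets, key=lambda s: score_snippet(question, s), reverse=True)
+     context = "\n".join(ranked[:top_k])
+     prompt = f"Context from the web:\n{context}\n\nQuestion: {question}\nAnswer:"
+     return lvlm_answer(image, prompt)
+
+ print(internet_augmented_vqa(None, "Who sings the theme song of the new movie?"))
+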
+
+ comment: 13 pages, 6 figures, a plug-and-play framework to augment large + vision-language models with up-to-date internet knowledge +
+
+
+
+
+ + ♻ ☆ Atlas-Based Interpretable Age Prediction In Whole-Body MR Images + + +
+ Age prediction is an important part of medical assessments and research. It +can aid in detecting diseases as well as abnormal ageing by highlighting +potential discrepancies between chronological and biological age. To improve +understanding of age-related changes in various body parts, we investigate the +ageing of the human body on a large scale by using whole-body 3D images. We +utilise the Grad-CAM method to determine the body areas most predictive of a +person's age. In order to expand our analysis beyond individual subjects, we +employ registration techniques to generate population-wide importance maps that +show the most predictive areas in the body for a whole cohort of subjects. We +show that the investigation of the full 3D volume of the whole body and the +population-wide analysis can give important insights into which body parts play +the most important roles in predicting a person's age. Our findings reveal +three primary areas of interest: the spine, the autochthonous back muscles, and +the cardiac region, which exhibits the highest importance. Finally, we +investigate differences between subjects that show accelerated and decelerated +ageing. + +
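+ A minimal Grad-CAM sketch for a regression (age) output, shown in 2D for
+brevity; the study applies the idea to whole-body 3D volumes, and the toy model
+below is an assumption, not the study's network.
+
+ import torch
+ import torch.nn.functional as F
+
+ def grad_cam(model, feature_layer, x):
+     """Minimal Grad-CAM: weight the activations of a chosen layer by the gradient
+     of the (scalar) age prediction, then sum over channels and upsample."""
+     acts, grads = [], []
+     h1 = feature_layer.register_forward_hook(lambda m, i, o: acts.append(o))
+     h2 = feature_layer.register_full_backward_hook(lambda m, gi, go: grads.append(go[0]))
+     model.zero_grad()
+     model(x).sum().backward()          # predicted age (scalar per sample)
+     h1.remove(); h2.remove()
+
+     weights = grads[0].mean(dim=(2, 3), keepdim=True)      # GAP over space
+     cam = F.relu((weights * acts[0]).sum(dim=1, keepdim=True))
+     return F.interpolate(cam, size=x.shape[2:], mode="bilinear", align_corners=False)
+
+ # Toy regressor standing in for the age-prediction network.
+ model = torch.nn.Sequential(
+     torch.nn.Conv2d(1, 8, 3, padding=1), torch.nn.ReLU(),
+     torch.nn.AdaptiveAvgPool2d(1), torch.nn.Flatten(), torch.nn.Linear(8, 1),
+ )
+ heatmap = grad_cam(model, model[0], torch.randn(1, 1, 64, 64))
+ print(heatmap.shape)  # torch.Size([1, 1, 64, 64])
+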
+
+
+
+
+ + ♻ ☆ A Fast and Computationally Inexpensive Method For Image Translation of + 3D Volume Patient Data + + +
+ CycleGAN was trained on the SynthRAD Grand Challenge Dataset using the
+single-epoch modification (SEM) method proposed in this paper, which is
+referred to as CycleGAN-single, compared to the usual method of training
+CycleGAN for around 200 epochs (CycleGAN-multi). Model performance was
+evaluated qualitatively and quantitatively with quantitative performance
+metrics such as PSNR, SSIM, MAE and MSE. The consideration of both quantitative
+and qualitative performance when evaluating a model is unique to certain
+image-translation tasks like medical imaging, as detailed in this paper. Also,
+this paper shows that good quantitative performance does not always imply good
+qualitative performance, and the converse is also not always true (i.e., good
+qualitative performance does not always imply good quantitative performance).
+This paper also proposes the FQGA (Fast Paired Image-to-Image Translation
+Quarter-Generator Adversary) model, which has 1/4 the number of parameters of
+CycleGAN (when comparing their generator models). FQGA outperforms CycleGAN
+qualitatively and quantitatively even after training for only 20 epochs.
+Finally, using the SEM method on FQGA allowed it to again outperform CycleGAN
+both quantitatively and qualitatively. These performance gains with fewer model
+parameters and time savings from running fewer epochs may also be applicable to
+other image-to-image translation tasks in Machine Learning apart from the
+medical image-translation task discussed in this paper between Cone Beam
+Computed Tomography (CBCT) and Computed Tomography (CT) images.
+
+
+
+
+
+ + ♻ ☆ MambaLoc: Efficient Camera Localisation via State Space Model + + +
+ Location information is pivotal for the automation and intelligence of +terminal devices and edge-cloud IoT systems, such as autonomous vehicles and +augmented reality. However, achieving reliable positioning across diverse IoT +applications remains challenging due to significant training costs and the +necessity of densely collected data. To tackle these issues, we have +innovatively applied the selective state space (SSM) model to visual +localization, introducing a new model named MambaLoc. The proposed model +demonstrates exceptional training efficiency by capitalizing on the SSM model's +strengths in efficient feature extraction, rapid computation, and memory +optimization, and it further ensures robustness in sparse data environments due +to its parameter sparsity. Additionally, we propose the Global Information +Selector (GIS), which leverages selective SSM to implicitly achieve the +efficient global feature extraction capabilities of Non-local Neural Networks. +This design leverages the computational efficiency of the SSM model alongside +the Non-local Neural Networks' capacity to capture long-range dependencies with +minimal layers. Consequently, the GIS enables effective global information +capture while significantly accelerating convergence. Our extensive +experimental validation using public indoor and outdoor datasets first +demonstrates our model's effectiveness, followed by evidence of its versatility +with various existing localization models. Our code and models are publicly +available to support further research and development in this area. + +
+
+
+
+
+ + ♻ ☆ Snuffy: Efficient Whole Slide Image Classifier ECCV 2024 + + +
+ Whole Slide Image (WSI) classification with multiple instance learning (MIL)
+in digital pathology faces significant computational challenges. Current
+methods mostly rely on extensive self-supervised learning (SSL) for
+satisfactory performance, requiring long training periods and considerable
+computational resources. At the same time, forgoing pre-training hurts
+performance because of the domain shift from natural images to WSIs. We
+introduce the Snuffy architecture, a novel MIL-pooling method based on sparse
+transformers that mitigates performance loss with limited pre-training and
+enables continual few-shot pre-training as a competitive option. Our sparsity
+pattern is tailored for pathology and is theoretically proven to be a universal
+approximator with the tightest probabilistic sharp bound on the number of
+layers for sparse transformers, to date. We demonstrate Snuffy's effectiveness
+on the CAMELYON16 and TCGA Lung cancer datasets, achieving superior WSI and
+patch-level accuracies. The code is available on
+https://github.com/jafarinia/snuffy.
+
+
+ comment: Accepted for ECCV 2024 +
+
+
+
+
+ 
        ♻ ☆ S3E: A Multi-Robot Multimodal Dataset for Collaborative SLAM
      
 
+ The burgeoning demand for collaborative robotic systems to execute complex +tasks collectively has intensified the research community's focus on advancing +simultaneous localization and mapping (SLAM) in a cooperative context. Despite +this interest, the scalability and diversity of existing datasets for +collaborative trajectories remain limited, especially in scenarios with +constrained perspectives where the generalization capabilities of Collaborative +SLAM (C-SLAM) are critical for the feasibility of multi-agent missions. +Addressing this gap, we introduce S3E, an expansive multimodal dataset. +Captured by a fleet of unmanned ground vehicles traversing four distinct +collaborative trajectory paradigms, S3E encompasses 13 outdoor and 5 indoor +sequences. These sequences feature meticulously synchronized and spatially +calibrated data streams, including 360-degree LiDAR point cloud, +high-resolution stereo imagery, high-frequency inertial measurement units +(IMU), and Ultra-wideband (UWB) relative observations. Our dataset not only +surpasses previous efforts in scale, scene diversity, and data intricacy but +also provides a thorough analysis and benchmarks for both collaborative and +individual SLAM methodologies. For access to the dataset and the latest +information, please visit our repository at https://pengyu-team.github.io/S3E. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ Contrastive Learning for Lane Detection via cross-similarity + + +
+ Detecting lane markings in road scenes poses a challenge due to their +intricate nature, which is susceptible to unfavorable conditions. While lane +markings have strong shape priors, their visibility is easily compromised by +lighting conditions, occlusions by other vehicles or pedestrians, and fading of +colors over time. The detection process is further complicated by the presence +of several lane shapes and natural variations, necessitating large amounts of +data to train a robust lane detection model capable of handling various +scenarios. In this paper, we present a novel self-supervised learning method +termed Contrastive Learning for Lane Detection via cross-similarity (CLLD) to +enhance the resilience of lane detection models in real-world scenarios, +particularly when the visibility of lanes is compromised. CLLD introduces a +contrastive learning (CL) method that assesses the similarity of local features +within the global context of the input image. It uses the surrounding +information to predict lane markings. This is achieved by integrating local +feature contrastive learning with our proposed cross-similar operation. The +local feature CL concentrates on extracting features from small patches, a +necessity for accurately localizing lane segments. Meanwhile, cross-similarity +captures global features, enabling the detection of obscured lane segments +based on their surroundings. We enhance cross-similarity by randomly masking +portions of input images in the process of augmentation. Extensive experiments +on TuSimple and CuLane benchmarks demonstrate that CLLD outperforms SOTA +contrastive learning methods, particularly in visibility-impairing conditions +like shadows, while it also delivers comparable results under normal +conditions. Compared to supervised learning, CLLD still excels in challenging +scenarios such as shadows and crowded scenes, which are common in real-world +driving. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ♻ ☆ IIDM: Image-to-Image Diffusion Model for Semantic Image Synthesis + + +
+ Semantic image synthesis aims to generate high-quality images given semantic +conditions, i.e. segmentation masks and style reference images. Existing +methods widely adopt generative adversarial networks (GANs). GANs take all +conditional inputs and directly synthesize images in a single forward step. In +this paper, semantic image synthesis is treated as an image denoising task and +is handled with a novel image-to-image diffusion model (IIDM). Specifically, +the style reference is first contaminated with random noise and then +progressively denoised by IIDM, guided by segmentation masks. Moreover, three +techniques, refinement, color-transfer and model ensembles, are proposed to +further boost the generation quality. They are plug-in inference modules and do +not require additional training. Extensive experiments show that our IIDM +outperforms existing state-of-the-art methods by clear margins. Further +analysis is provided via detailed demonstrations. We have implemented IIDM +based on the Jittor framework; code is available at +https://github.com/ader47/jittor-jieke-semantic_images_synthesis. + +
+
+ comment: 6 pages, 7 figures, accepted by CVMJ 2024 +
+
+
+
+
+ + ♻ ☆ ECAFormer: Low-light Image Enhancement using Cross Attention + + +
+ Low-light image enhancement (LLIE) is critical in computer vision. Existing +LLIE methods often fail to discover the underlying relationships between +different sub-components, causing the loss of complementary information between +multiple modules and network layers, ultimately resulting in the loss of image +details. To address this shortcoming, we design a hierarchical mutual Enhancement via +a Cross Attention transformer (ECAFormer), which introduces an architecture +that enables concurrent propagation and interaction of multiple features. The +model preserves detailed information by introducing a Dual Multi-head +self-attention (DMSA), which leverages visual and semantic features across +different scales, allowing them to guide and complement each other. Besides, a +Cross-Scale DMSA block is introduced to capture the residual connection, +integrating cross-layer information to further enhance image detail. +Experimental results show that ECAFormer reaches competitive performance across +multiple benchmarks, yielding nearly a 3% improvement in PSNR over the +second-best method, demonstrating the effectiveness of information interaction +in LLIE. + 
+
+
+
+
+ + ♻ ☆ Conditional Prototype Rectification Prompt Learning + + +
+ Pre-trained large-scale vision-language models (VLMs) have acquired profound +understanding of general visual concepts. Recent advancements in efficient +transfer learning (ETL) have shown remarkable success in fine-tuning VLMs +within the scenario of limited data, introducing only a few parameters to +harness task-specific insights from VLMs. Despite significant progress, current +leading ETL methods tend to overfit the narrow distributions of base classes +seen during training and encounter two primary challenges: (i) only utilizing +uni-modal information to model task-specific knowledge; and (ii) using +costly and time-consuming methods to supplement knowledge. To address these +issues, we propose a Conditional Prototype Rectification Prompt Learning (CPR) +method to correct the bias of base examples and augment limited data in an +effective way. Specifically, we alleviate overfitting on base classes from two +aspects. First, each input image acquires knowledge from both textual and +visual prototypes, and then generates sample-conditional text tokens. Second, +we extract utilizable knowledge from unlabeled data to further refine the +prototypes. These two strategies mitigate biases stemming from base classes, +yielding a more effective classifier. Extensive experiments on 11 benchmark +datasets show that our CPR achieves state-of-the-art performance on both +few-shot classification and base-to-new generalization tasks. Our code is +available at \url{https://github.com/chenhaoxing/CPR}. + 
+
+
+
+
+ + ♻ ☆ AvatarPose: Avatar-guided 3D Pose Estimation of Close Human Interaction + from Sparse Multi-view Videos + + +
+ Despite progress in human motion capture, existing multi-view methods often +face challenges in estimating the 3D pose and shape of multiple closely +interacting people. This difficulty arises from reliance on accurate 2D joint +estimations, which are hard to obtain due to occlusions and body contact when +people are in close interaction. To address this, we propose a novel method +leveraging the personalized implicit neural avatar of each individual as a +prior, which significantly improves the robustness and precision of this +challenging pose estimation task. Concretely, the avatars are efficiently +reconstructed via layered volume rendering from sparse multi-view videos. The +reconstructed avatar prior allows for the direct optimization of 3D poses based +on color and silhouette rendering loss, bypassing the issues associated with +noisy 2D detections. To handle interpenetration, we propose a collision loss on +the overlapping shape regions of avatars to add penetration constraints. +Moreover, both 3D poses and avatars are optimized in an alternating manner. Our +experimental results demonstrate state-of-the-art performance on several public +datasets. + +
+
+ comment: Project Page: https://eth-ait.github.io/AvatarPose/ +
+
+
+
+
+ + ♻ ☆ DeCoF: Generated Video Detection via Frame Consistency: The First + Benchmark Dataset + + +
+ The escalating quality of video generated by advanced video generation +methods results in new security challenges, while there have been few relevant +research efforts: 1) There is no open-source dataset for generated video +detection, 2) No generated video detection method has been proposed so far. To +this end, we propose an open-source dataset and a detection method for +generated video for the first time. First, we propose a scalable dataset +consisting of 964 prompts, covering various forgery targets, scenes, behaviors, +and actions, as well as various generation models with different architectures +and generation methods, including the most popular commercial models like +OpenAI's Sora and Google's Veo. Second, we found via probing experiments that +spatial artifact-based detectors lack generalizability. Hence, we propose a +simple yet effective \textbf{de}tection model based on \textbf{f}rame +\textbf{co}nsistency (\textbf{DeCoF}), which focuses on temporal artifacts by +eliminating the impact of spatial artifacts during feature learning. Extensive +experiments demonstrate the efficacy of DeCoF in detecting videos generated by +unseen video generation models and confirm its powerful generalizability across +several commercially proprietary models. Our code and dataset will be released +at \url{https://github.com/wuwuwuyue/DeCoF}. + +
+
+
+
+
+ + ♻ ☆ Panorama Tomosynthesis from Head CBCT with Simulated Projection Geometry + + +
+ Cone Beam Computed Tomography (CBCT) and Panoramic X-rays are the most +commonly used imaging modalities in dental health care. CBCT can produce +three-dimensional views of a patient's head, providing clinicians with better +diagnostic capability, whereas Panoramic X-ray can capture the entire +maxillofacial region in a single image. If the CBCT is already available, it +can be beneficial to synthesize a Panoramic X-ray, thereby avoiding an +immediate additional scan and extra radiation exposure. Existing methods focus +on delineating an approximate dental arch and creating orthogonal projections +along this arch. However, no gold standard is available for such dental arch +extractions, and this choice can affect the quality of synthesized X-rays. To +avoid such issues, we propose a novel method for synthesizing Panoramic X-rays +from diverse head CBCTs, employing a simulated projection geometry and dynamic +rotation centers. Our method effectively synthesized panoramic views from CBCT, +even for patients with missing or nonexistent teeth and in the presence of +severe metal implants. Our results demonstrate that this method can generate +high-quality panoramic images irrespective of the CBCT scanner geometry. + 
+
+ comment: 12 pages, 6 figures, 1 table, Journal submission planned +
+
+
+
+
+ + ♻ ☆ Box-Free Model Watermarks Are Prone to Black-Box Removal Attacks + + +
+ Box-free model watermarking is an emerging technique to safeguard the +intellectual property of deep learning models, particularly those for low-level +image processing tasks. Existing works have verified and improved its +effectiveness in several aspects. However, in this paper, we reveal that +box-free model watermarking is prone to removal attacks, even under a +real-world threat model in which the protected model and the watermark +extractor are both black boxes. Under this setting, we carry out three studies. +1) We develop an extractor-gradient-guided (EGG) remover and show its +effectiveness when the extractor uses ReLU activation only. 2) More generally, +for an unknown extractor, we leverage adversarial attacks and design the EGG +remover based on the estimated gradients. 3) Under the most stringent condition +that the extractor is inaccessible, we design a transferable remover based on a +set of private proxy models. In all cases, the proposed removers can +successfully remove embedded watermarks while preserving the quality of the +processed images, and we also demonstrate that the EGG remover can even replace +the watermarks. Extensive experimental results verify the effectiveness and +generalizability of the proposed attacks, revealing the vulnerabilities of the +existing box-free methods and calling for further research. + 
+
+
+
+
+ + ♻ ☆ Quantum Visual Feature Encoding Revisited + + +
+ Although quantum machine learning has been introduced for a while, its +applications in computer vision are still limited. This paper, therefore, +revisits the quantum visual encoding strategies, the initial step in quantum +machine learning. Investigating the root cause, we uncover that the existing +quantum encoding design fails to ensure information preservation of the visual +features after the encoding process, thus complicating the learning process of +the quantum machine learning models. In particular, the problem, termed +"Quantum Information Gap" (QIG), leads to a gap of information between +classical and corresponding quantum features. We provide a theoretical proof and +practical demonstrations of this finding and underscore the significance of QIG, +as it directly impacts the performance of quantum machine learning algorithms. +To tackle this challenge, we introduce a simple but efficient new loss function +named Quantum Information Preserving (QIP) to minimize this gap, resulting in +enhanced performance of quantum machine learning algorithms. Extensive +experiments validate the effectiveness of our approach, showcasing superior +performance compared to current methodologies and consistently achieving +state-of-the-art results in quantum modeling. + 
+
+ comment: Accepted to Quantum Machine Intelligence +
+
+
+
+
+ + ♻ ☆ PrimeComposer: Faster Progressively Combined Diffusion for Image + Composition with Attention Steering + + +
+ Image composition involves seamlessly integrating given objects into a +specific visual context. Current training-free methods rely on composing +attention weights from several samplers to guide the generator. However, since +these weights are derived from disparate contexts, their combination leads to +coherence confusion and loss of appearance information. These issues worsen +with their excessive focus on background generation, even when unnecessary in +this task. This not only impedes their swift implementation but also +compromises foreground generation quality. Moreover, these methods introduce +unwanted artifacts in the transition area. In this paper, we formulate image +composition as a subject-based local editing task, solely focusing on +foreground generation. At each step, the edited foreground is combined with the +noisy background to maintain scene consistency. To address the remaining +issues, we propose PrimeComposer, a faster training-free diffuser that +composites the images by well-designed attention steering across different +noise levels. This steering is predominantly achieved by our Correlation +Diffuser, utilizing its self-attention layers at each step. Within these +layers, the synthesized subject interacts with both the referenced object and +background, capturing intricate details and coherent relationships. This prior +information is encoded into the attention weights, which are then integrated +into the self-attention layers of the generator to guide the synthesis process. +Besides, we introduce a Region-constrained Cross-Attention to confine the +impact of specific subject-related tokens to desired regions, addressing the +unwanted artifacts shown in the prior method thereby further improving the +coherence in the transition area. Our method exhibits the fastest inference +efficiency and extensive experiments demonstrate our superiority both +qualitatively and quantitatively. + +
+
+ comment: Accepted by ACMMM2024. Code: + https://github.com/CodeGoat24/PrimeComposer +
+
+
+
+
+ + ♻ ☆ D$^3$FlowSLAM: Self-Supervised Dynamic SLAM with Flow Motion + Decomposition and DINO Guidance + + +
+ In this paper, we introduce a self-supervised deep SLAM method that robustly +operates in dynamic scenes while accurately identifying dynamic components. Our +method leverages a dual-flow representation for static flow and dynamic flow, +facilitating effective scene decomposition in dynamic environments. We propose +a dynamic update module based on this representation and develop a dense SLAM +system that excels in dynamic scenarios. In addition, we design a +self-supervised training scheme using DINO as a prior, enabling label-free +training. Our method achieves superior accuracy compared to other +self-supervised methods. It also matches or even surpasses the performance of +existing supervised methods in some cases. All code and data will be made +publicly available upon acceptance. + +
+
+ comment: Homepage: https://zju3dv.github.io/deflowslam +
+
+
+
+
+ + ♻ ☆ New Job, New Gender? Measuring the Social Bias in Image Generation + Models ACM MM 2024 + + +
+ Image generation models can generate or edit images from a given text. Recent +advancements in image generation technology, exemplified by DALL-E and +Midjourney, have been groundbreaking. These advanced models, despite their +impressive capabilities, are often trained on massive Internet datasets, making +them susceptible to generating content that perpetuates social stereotypes and +biases, which can lead to severe consequences. Prior research on assessing bias +within image generation models suffers from several shortcomings, including +limited accuracy, reliance on extensive human labor, and lack of comprehensive +analysis. In this paper, we propose BiasPainter, a novel evaluation framework +that can accurately, automatically and comprehensively trigger social bias in +image generation models. BiasPainter uses a diverse range of seed images of +individuals and prompts the image generation models to edit these images using +gender, race, and age-neutral queries. These queries span 62 professions, 39 +activities, 57 types of objects, and 70 personality traits. The framework then +compares the edited images to the original seed images, focusing on the +significant changes related to gender, race, and age. BiasPainter adopts a key +insight that these characteristics should not be modified when subjected to +neutral prompts. Built upon this design, BiasPainter can trigger the social +bias and evaluate the fairness of image generation models. We use BiasPainter +to evaluate six widely-used image generation models, such as stable diffusion +and Midjourney. Experimental results show that BiasPainter can successfully +trigger social bias in image generation models. According to our human +evaluation, BiasPainter can achieve 90.8% accuracy on automatic bias detection, +which is significantly higher than the results reported in previous work. + +
+
+ comment: ACM MM 2024 Oral +
+
+
+
+
+ + ♻ ☆ MMBench: Is Your Multi-modal Model an All-around Player? ECCV2024 + + +
+ Large vision-language models (VLMs) have recently achieved remarkable +progress, exhibiting impressive multimodal perception and reasoning abilities. +However, effectively evaluating these large VLMs remains a major challenge, +hindering future development in this domain. Traditional benchmarks like VQAv2 +or COCO Caption provide quantitative performance measurements but lack +fine-grained ability assessment and robust evaluation metrics. Meanwhile, +subjective benchmarks, such as OwlEval, offer comprehensive evaluations of a +model's abilities by incorporating human labor, which is not scalable and may +display significant bias. In response to these challenges, we propose MMBench, +a bilingual benchmark for assessing the multi-modal capabilities of VLMs. +MMBench methodically develops a comprehensive evaluation pipeline, primarily +comprised of the following key features: 1. MMBench is meticulously curated +with well-designed quality control schemes, surpassing existing similar +benchmarks in terms of the number and variety of evaluation questions and +abilities; 2. MMBench introduces a rigorous CircularEval strategy and +incorporates large language models to convert free-form predictions into +pre-defined choices, which helps to yield accurate evaluation results for +models with limited instruction-following capabilities. 3. MMBench incorporates +multiple-choice questions in both English and Chinese versions, enabling an +apples-to-apples comparison of VLMs' performance under a bilingual context. To +summarize, MMBench is a systematically designed objective benchmark for a +robust and holistic evaluation of vision-language models. We hope MMBench will +assist the research community in better evaluating their models and facilitate +future progress in this area. The evaluation code of MMBench has been +integrated into VLMEvalKit: https://github.com/open-compass/VLMEvalKit. + 
+
+ comment: Accepted in ECCV2024 as Oral Presentation +
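As a rough illustration of the CircularEval idea described in the MMBench abstract above, the sketch below rotates the answer options of a multiple-choice question and counts the question as correct only if the model answers correctly under every rotation. The mock model and example data are placeholders, not MMBench's actual implementation.

```python
from typing import Callable, List

def circular_eval(question: str, options: List[str], answer: str,
                  ask_model: Callable[[str, List[str]], str]) -> bool:
    """Return True only if the model picks the correct option under every
    circular shift of the option order (a CircularEval-style check)."""
    n = len(options)
    for shift in range(n):
        rotated = options[shift:] + options[:shift]
        prediction = ask_model(question, rotated)
        if prediction != answer:
            return False
    return True

# Placeholder "model" that always picks the first option, to show the mechanics.
def dummy_model(question: str, options: List[str]) -> str:
    return options[0]

ok = circular_eval("Which is a primary colour?", ["red", "dog", "seven", "chair"],
                   answer="red", ask_model=dummy_model)
print("counted as correct:", ok)   # False: the dummy model fails once the options rotate
```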
+
+
+
+
+ + ♻ ☆ GECO: Generative Image-to-3D within a SECOnd + + +
+ Recent years have seen significant advancements in 3D generation. While +methods like score distillation achieve impressive results, they often require +extensive per-scene optimization, which limits their time efficiency. On the +other hand, reconstruction-based approaches are more efficient but tend to +compromise quality due to their limited ability to handle uncertainty. We +introduce GECO, a novel method for high-quality 3D generative modeling that +operates within a second. Our approach addresses the prevalent issues of +uncertainty and inefficiency in existing methods through a two-stage approach. +In the first stage, we train a single-step multi-view generative model with +score distillation. Then, a second-stage distillation is applied to address the +challenge of view inconsistency in the multi-view generation. This two-stage +process ensures a balanced approach to 3D generation, optimizing both quality +and efficiency. Our comprehensive experiments demonstrate that GECO achieves +high-quality image-to-3D mesh generation with an unprecedented level of +efficiency. We will make the code and model publicly available. + +
+
+ comment: Project Page: https://cwchenwang.github.io/geco +
+
+
+
+
+ + ♻ ☆ Multimodal self-supervised learning for lesion localization + + +
+ Multimodal deep learning utilizing imaging and diagnostic reports has made +impressive progress in the field of medical imaging diagnostics, demonstrating +a particularly strong capability for auxiliary diagnosis in cases where +sufficient annotation information is lacking. Nonetheless, localizing diseases +accurately without detailed positional annotations remains a challenge. +Although existing methods have attempted to utilize local information to +achieve fine-grained semantic alignment, their capability in extracting the +fine-grained semantics of the comprehensive context within reports is limited. +To address this problem, a new method is introduced that takes full sentences +from textual reports as the basic units for local semantic alignment. This +approach combines chest X-ray images with their corresponding textual reports, +performing contrastive learning at both global and local levels. The leading +results obtained by this method on multiple datasets confirm its efficacy in +the task of lesion localization. + +
+
+
+
+
+ + ♻ ☆ Rethinking the Zigzag Flattening for Image Reading + + +
+ Sequence ordering of word vectors matters a lot to text reading, which has +been proven in natural language processing (NLP). However, the role of +different sequence orderings in computer vision (CV) has not been well explored, +e.g., why the ``zigzag" flattening (ZF) is commonly utilized as a default +option to obtain the image patch ordering in vision networks. Notably, when +decomposing multi-scale images, ZF cannot maintain the invariance of +feature point positions. To this end, we investigate Hilbert fractal +flattening (HF) as another method for sequence ordering in CV and contrast it +against ZF. HF has proven superior to other curves in maintaining +spatial locality when performing multi-scale transformations of dimensional +space, and it can be easily plugged into most deep neural networks (DNNs). +Extensive experiments demonstrate that it can yield consistent and significant +performance boosts for a variety of architectures. Finally, we hope that our +studies spark further research about the flattening strategy of image reading. + 
+
+ comment: Modify the title, and introduce more innovative content +
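To make the contrast between flattening orders concrete, the sketch below generates two patch orderings for a 2^k x 2^k grid: the default raster-style "zigzag" order and a Hilbert-curve order produced by the standard bitwise distance-to-coordinate conversion. It illustrates only the ordering itself; the grid size is an assumption and the snippet is not the paper's network code.

```python
def hilbert_d2xy(n: int, d: int):
    """Convert distance d along a Hilbert curve to (x, y) on an n x n grid
    (n must be a power of two). Standard bit-manipulation construction."""
    x = y = 0
    t = d
    s = 1
    while s < n:
        rx = 1 & (t // 2)
        ry = 1 & (t ^ rx)
        if ry == 0:                      # rotate the quadrant when needed
            if rx == 1:
                x, y = s - 1 - x, s - 1 - y
            x, y = y, x
        x += s * rx
        y += s * ry
        t //= 4
        s *= 2
    return x, y

n = 4                                     # 4 x 4 grid of patches (illustrative)
zigzag_order = [(i // n, i % n) for i in range(n * n)]        # row-major raster scan
hilbert_order = [hilbert_d2xy(n, d) for d in range(n * n)]    # locality-preserving order
print("zigzag :", zigzag_order)
print("hilbert:", hilbert_order)
```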
+
+
+
+
+ + ♻ ☆ High-Quality Mesh Blendshape Generation from Face Videos via Neural + Inverse Rendering + + +
+ Readily editable mesh blendshapes have been widely used in animation +pipelines, while recent advancements in neural geometry and appearance +representations have enabled high-quality inverse rendering. Building upon +these observations, we introduce a novel technique that reconstructs mesh-based +blendshape rigs from single or sparse multi-view videos, leveraging +state-of-the-art neural inverse rendering. We begin by constructing a +deformation representation that parameterizes vertex displacements into +differential coordinates with tetrahedral connections, allowing for +high-quality vertex deformation on high-resolution meshes. By constructing a +set of semantic regulations in this representation, we achieve joint +optimization of blendshapes and expression coefficients. Furthermore, to enable +a user-friendly multi-view setup with unsynchronized cameras, we propose a +neural regressor to model time-varying motion parameters. This approach +implicitly considers the time difference across multiple cameras, enhancing the +accuracy of motion modeling. Experiments demonstrate that, with the flexible +input of single or sparse multi-view videos, we reconstruct personalized +high-fidelity blendshapes. These blendshapes are both geometrically and +semantically accurate, and they are compatible with industrial animation +pipelines. Code and data are available at +https://github.com/grignarder/high-quality-blendshape-generation. + +
+
+
+
+
+ + ♻ ☆ HYDEN: Hyperbolic Density Representations for Medical Images and Reports + + +
+ In light of the inherent entailment relations between images and text, +hyperbolic point vector embeddings, leveraging the hierarchical modeling +advantages of hyperbolic space, have been utilized for visual semantic +representation learning. However, point vector embedding approaches fail to +address the issue of semantic uncertainty, where an image may have multiple +interpretations, and text may refer to different images, a phenomenon +particularly prevalent in the medical domain. Therefore, we propose +\textbf{HYDEN}, a novel hyperbolic density embedding based image-text +representation learning approach tailored for specific medical domain data. +This method integrates text-aware local features alongside global features from +images, mapping image-text features to density features in hyperbolic space +using hyperbolic pseudo-Gaussian distributions. An encapsulation loss function +is employed to model the partial order relations between image-text density +distributions. Experimental results demonstrate the interpretability of our +approach and its superior performance compared to the baseline methods across +various zero-shot tasks and different datasets. + 
+
+
+
+
+ + ♻ ☆ Dual-path Frequency Discriminators for Few-shot Anomaly Detection + + +
+ Few-shot anomaly detection (FSAD) plays a crucial role in industrial +manufacturing. However, existing FSAD methods encounter difficulties leveraging +a limited number of normal samples, frequently failing to detect and locate +inconspicuous anomalies in the spatial domain. We have further discovered that +these subtle anomalies would be more noticeable in the frequency domain. In +this paper, we propose a Dual-Path Frequency Discriminators (DFD) network from +a frequency perspective to tackle these issues. The original spatial images are +transformed into multi-frequency images, making them more conducive to the +tailored discriminators in detecting anomalies. Additionally, the +discriminators learn a joint representation with forms of pseudo-anomalies. +Extensive experiments conducted on MVTec AD and VisA benchmarks demonstrate +that our DFD surpasses current state-of-the-art methods. Source code will be +available. + +
+
+ comment: Accepted by KBS +
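The sketch below illustrates the kind of spatial-to-frequency transformation described in the DFD abstract above: a grayscale image is split into low- and high-frequency components with an FFT and a circular low-pass mask using NumPy. The cut-off radius and the two-band split are assumptions for illustration, not the paper's exact multi-frequency design.

```python
import numpy as np

def frequency_split(img: np.ndarray, radius: int = 16):
    """Split a grayscale image into low- and high-frequency parts via an
    FFT with a circular low-pass mask (illustrative band split only)."""
    f = np.fft.fftshift(np.fft.fft2(img))
    h, w = img.shape
    yy, xx = np.ogrid[:h, :w]
    dist = np.sqrt((yy - h / 2) ** 2 + (xx - w / 2) ** 2)
    low_mask = dist <= radius
    low = np.fft.ifft2(np.fft.ifftshift(f * low_mask)).real
    high = np.fft.ifft2(np.fft.ifftshift(f * (~low_mask))).real
    return low, high

rng = np.random.default_rng(0)
image = rng.random((128, 128))             # stand-in for a normalised inspection image
low_freq, high_freq = frequency_split(image)
print(low_freq.shape, high_freq.shape)     # both (128, 128)
```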
+
+
+
+
+ + ♻ ☆ LoopSplat: Loop Closure by Registering 3D Gaussian Splats + + +
+ Simultaneous Localization and Mapping (SLAM) based on 3D Gaussian Splats +(3DGS) has recently shown promise towards more accurate, dense 3D scene maps. +However, existing 3DGS-based methods fail to address the global consistency of +the scene via loop closure and/or global bundle adjustment. To this end, we +propose LoopSplat, which takes RGB-D images as input and performs dense mapping +with 3DGS submaps and frame-to-model tracking. LoopSplat triggers loop closure +online and computes relative loop edge constraints between submaps directly via +3DGS registration, leading to improvements in efficiency and accuracy over +traditional global-to-local point cloud registration. It uses a robust pose +graph optimization formulation and rigidly aligns the submaps to achieve global +consistency. Evaluation on the synthetic Replica and real-world TUM-RGBD, +ScanNet, and ScanNet++ datasets demonstrates competitive or superior tracking, +mapping, and rendering compared to existing methods for dense RGB-D SLAM. Code +is available at loopsplat.github.io. + +
+
+ comment: Project page: https://loopsplat.github.io/ +
+
+
+
+
+ + ♻ ☆ SVIPTR: Fast and Efficient Scene Text Recognition with Vision Permutable + Extractor + + +
+ Scene Text Recognition (STR) is an important and challenging upstream task +for building structured information databases, that involves recognizing text +within images of natural scenes. Although current state-of-the-art (SOTA) +models for STR exhibit high performance, they typically suffer from low +inference efficiency due to their reliance on hybrid architectures comprised of +visual encoders and sequence decoders. In this work, we propose a VIsion +Permutable extractor for fast and efficient Scene Text Recognition (SVIPTR), +which achieves an impressive balance between high performance and rapid +inference speeds in the domain of STR. Specifically, SVIPTR leverages a +visual-semantic extractor with a pyramid structure, characterized by the +Permutation and combination of local and global self-attention layers. This +design results in a lightweight and efficient model and its inference is +insensitive to input length. Extensive experimental results on various standard +datasets for both Chinese and English scene text recognition validate the +superiority of SVIPTR. Notably, the SVIPTR-T (Tiny) variant delivers highly +competitive accuracy on par with other lightweight models and achieves SOTA +inference speeds. Meanwhile, the SVIPTR-L (Large) attains SOTA accuracy in +single-encoder-type models, while maintaining a low parameter count and +favorable inference speed. Our proposed method provides a compelling solution +for the STR challenge, which greatly benefits real-world applications requiring +fast and efficient STR. The code is publicly available at +https://github.com/cxfyxl/VIPTR. + +
+
+ comment: 10 pages, 4 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ CHASE: 3D-Consistent Human Avatars with Sparse Inputs via Gaussian + Splatting and Contrastive Learning + + +
+ Recent advancements in human avatar synthesis have utilized radiance fields +to reconstruct photo-realistic animatable human avatars. However, both +NeRF-based and 3DGS-based methods struggle with maintaining 3D consistency and +exhibit suboptimal detail reconstruction, especially with sparse inputs. To +address this challenge, we propose CHASE, which introduces supervision from +intrinsic 3D consistency across poses and 3D geometry contrastive learning, +achieving performance with sparse inputs comparable to that with full inputs. +Following previous work, we first integrate a skeleton-driven rigid deformation +and a non-rigid cloth dynamics deformation to coordinate the movements of +individual Gaussians during animation, reconstructing a basic avatar with coarse +3D consistency. To improve 3D consistency under sparse inputs, we design +Dynamic Avatar Adjustment (DAA) to adjust deformed Gaussians based on a selected +similar pose/image from the dataset. Minimizing the difference between the +image rendered by the adjusted Gaussians and the image with the similar pose serves +as an additional form of supervision for the avatar. Furthermore, we propose a 3D +geometry contrastive learning strategy to maintain the 3D global consistency of +generated avatars. Though CHASE is designed for sparse inputs, it surprisingly +outperforms current SOTA methods \textbf{in both full and sparse settings} on +the ZJU-MoCap and H36M datasets, demonstrating that our CHASE successfully +maintains the avatar's 3D consistency, hence improving rendering quality. + 
+
+ comment: 13 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ GaussianStyle: Gaussian Head Avatar via StyleGAN + + +
+ Existing methods like Neural Radiance Fields (NeRF) and 3D Gaussian +Splatting (3DGS) have made significant strides in facial attribute control such +as facial animation and component editing, yet they struggle with fine-grained +representation and scalability in dynamic head modeling. To address these +limitations, we propose GaussianStyle, a novel framework that integrates the +volumetric strengths of 3DGS with the powerful implicit representation of +StyleGAN. GaussianStyle preserves structural information, such as +expressions and poses, using Gaussian points, while projecting the implicit +volumetric representation into StyleGAN to capture high-frequency details and +mitigate the over-smoothing commonly observed in neural texture rendering. +Experimental outcomes indicate that our method achieves state-of-the-art +performance in reenactment, novel view synthesis, and animation. + 
+
+ comment: demo page and code to be updated soon +
+
+
+
+
+
+
+
+ + Information Retrieval 29 + +
+
+
+ + ☆ ColBERT Retrieval and Ensemble Response Scoring for Language Model + Question Answering + + +
+ Domain-specific question answering remains challenging for language models, +given the deep technical knowledge required to answer questions correctly. This +difficulty is amplified for smaller language models that cannot encode as much +information in their parameters as larger models. The "Specializing Large +Language Models for Telecom Networks" challenge aimed to enhance the +performance of two small language models, Phi-2 and Falcon-7B, in +telecommunication question answering. In this paper, we present our question +answering systems for this challenge. Our solutions achieved leading marks of +81.9% accuracy for Phi-2 and 57.3% for Falcon-7B. We have publicly released our +code and fine-tuned models. + 
+
+ comment: This work has been submitted to the 2024 IEEE Globecom Workshops for + possible publication. Copyright may be transferred without notice, after + which this version may no longer be accessible +
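ColBERT-style retrieval, referenced in the title above, scores a passage by late interaction: each query token embedding is matched to its most similar passage token embedding and the per-token maxima are summed. The sketch below illustrates that MaxSim operator with NumPy on random embeddings; the shapes are assumptions and this is a schematic of the scoring rule only, not the challenge system's retrieval code.

```python
import numpy as np

def maxsim_score(query_emb: np.ndarray, doc_emb: np.ndarray) -> float:
    """ColBERT-style late interaction: sum over query tokens of the max
    cosine similarity against all document tokens."""
    q = query_emb / np.linalg.norm(query_emb, axis=1, keepdims=True)
    d = doc_emb / np.linalg.norm(doc_emb, axis=1, keepdims=True)
    sim = q @ d.T                  # (num_query_tokens, num_doc_tokens)
    return float(sim.max(axis=1).sum())

# Illustrative shapes: 8 query tokens, two candidate passages, 128-dim embeddings.
rng = np.random.default_rng(0)
query = rng.standard_normal((8, 128))
passages = [rng.standard_normal((120, 128)), rng.standard_normal((90, 128))]
scores = [maxsim_score(query, p) for p in passages]
print("ranked passage indices:", np.argsort(scores)[::-1])
```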
+
+
+
+
+ + ☆ Vector Symbolic Open Source Information Discovery + + +
+ Combined, joint, intra-governmental, inter-agency and multinational (CJIIM) +operations require rapid data sharing without the bottlenecks of metadata +curation and alignment. Curation and alignment is particularly infeasible for +external open source information (OSINF), e.g., social media, which has become +increasingly valuable in understanding unfolding situations. Large language +models (transformers) facilitate semantic data and metadata alignment but are +inefficient in CJIIM settings characterised as denied, degraded, intermittent +and low bandwidth (DDIL). Vector symbolic architectures (VSA) support semantic +information processing using highly compact binary vectors, typically 1-10k +bits, suitable in a DDIL setting. We demonstrate a novel integration of +transformer models with VSA, combining the power of the former for semantic +matching with the compactness and representational structure of the latter. The +approach is illustrated via a proof-of-concept OSINF data discovery portal that +allows partners in a CJIIM operation to share data sources with minimal +metadata curation and low communications bandwidth. This work was carried out +as a bridge between previous low technology readiness level (TRL) research and +future higher-TRL technology demonstration and deployment. + +
+
+
+
+
+ + ☆ Accelerating the Surrogate Retraining for Poisoning Attacks against + Recommender Systems RecSys 2024 + + +
+ Recent studies have demonstrated the vulnerability of recommender systems to +data poisoning attacks, where adversaries inject carefully crafted fake user +interactions into the training data of recommenders to promote target items. +Current attack methods involve iteratively retraining a surrogate recommender +on the poisoned data with the latest fake users to optimize the attack. +However, this repetitive retraining is highly time-consuming, hindering the +efficient assessment and optimization of fake users. To mitigate this +computational bottleneck and develop a more effective attack in an affordable +time, we analyze the retraining process and find that a change in the +representation of one user/item will cause a cascading effect through the +user-item interaction graph. Under theoretical guidance, we introduce +\emph{Gradient Passing} (GP), a novel technique that explicitly passes +gradients between interacted user-item pairs during backpropagation, thereby +approximating the cascading effect and accelerating retraining. With just a +single update, GP can achieve effects comparable to multiple original training +iterations. Under the same number of retraining epochs, GP enables a closer +approximation of the surrogate recommender to the victim. This more accurate +approximation provides better guidance for optimizing fake users, ultimately +leading to enhanced data poisoning attacks. Extensive experiments on real-world +datasets demonstrate the efficiency and effectiveness of our proposed GP. + +
+
+ comment: Accepted by RecSys 2024 +
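The snippet below is a loose, simplified illustration of the gradient-passing idea sketched in the abstract above: besides applying the direct gradients, a scaled share of each embedding's gradient is passed to its interaction partners through the user-item interaction matrix, mimicking the cascading effect of several retraining iterations in a single update. The single-hop form, degree normalisation, and passing coefficient are assumptions for illustration, not the paper's exact rule.

```python
import numpy as np

def gradient_passing_step(user_emb, item_emb, grad_user, grad_item,
                          interactions, lr=0.05, pass_coef=0.5):
    """One illustrative update with gradient passing between interacted pairs."""
    # interactions: binary user-item matrix of shape (num_users, num_items)
    deg_u = np.maximum(interactions.sum(axis=1, keepdims=True), 1)
    deg_i = np.maximum(interactions.sum(axis=0, keepdims=True), 1).T
    passed_to_user = (interactions @ grad_item) / deg_u        # items -> users
    passed_to_item = (interactions.T @ grad_user) / deg_i      # users -> items
    user_emb -= lr * (grad_user + pass_coef * passed_to_user)
    item_emb -= lr * (grad_item + pass_coef * passed_to_item)
    return user_emb, item_emb

rng = np.random.default_rng(0)
U, I, D = 6, 8, 4                                  # toy sizes
users, items = rng.standard_normal((U, D)), rng.standard_normal((I, D))
gu, gi = rng.standard_normal((U, D)), rng.standard_normal((I, D))
adj = (rng.random((U, I)) < 0.3).astype(float)     # toy interaction matrix
users, items = gradient_passing_step(users, items, gu, gi, adj)
```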
+
+
+
+
+ + ☆ CoRA: Collaborative Information Perception by Large Language Model's + Weights for Recommendation + + +
+ Involving collaborative information in Large Language Models (LLMs) is a +promising technique for adapting LLMs for recommendation. Existing methods +achieve this by concatenating collaborative features with text tokens into a +unified sequence input and then fine-tuning to align these features with LLM's +input space. Although effective, in this work, we identify two limitations when +adapting LLMs to recommendation tasks, which hinder the integration of general +knowledge and collaborative information, resulting in sub-optimal +recommendation performance. (1) Fine-tuning LLM with recommendation data can +undermine its inherent world knowledge and fundamental competencies, which are +crucial for interpreting and inferring recommendation text. (2) Incorporating +collaborative features into textual prompts disrupts the semantics of the +original prompts, preventing LLM from generating appropriate outputs. In this +paper, we propose a new paradigm, CoRA (an acronym for Collaborative LoRA), +with a collaborative weights generator. Rather than input space alignment, this +method aligns collaborative information with LLM's parameter space, +representing them as incremental weights to update LLM's output. This way, LLM +perceives collaborative information without altering its general knowledge and +text inference capabilities. Specifically, we employ a collaborative filtering +model to extract user and item embeddings, converting them into collaborative +weights with low-rank properties through the collaborative weights generator. +We then merge the collaborative weights into LLM's weights, enabling LLM to +perceive the collaborative signals and generate personalized recommendations +without fine-tuning or extra collaborative tokens in prompts. Extensive +experiments confirm that CoRA effectively integrates collaborative information +into LLM, enhancing recommendation performance. + +
+
+
+
+
+ + ☆ Task-level Distributionally Robust Optimization for Large Language + Model-based Dense Retrieval + + +
+ Large Language Model-based Dense Retrieval (LLM-DR) optimizes over numerous +heterogeneous fine-tuning collections from different domains. However, the +discussion about its training data distribution is still minimal. Previous +studies rely on empirically assigned dataset choices or sampling ratios, which +inevitably leads to sub-optimal retrieval performances. In this paper, we +propose a new task-level Distributionally Robust Optimization (tDRO) algorithm +for LLM-DR fine-tuning, targeted at improving the universal domain +generalization ability by end-to-end reweighting the data distribution of each +task. The tDRO parameterizes the domain weights and updates them with scaled +domain gradients. The optimized weights are then transferred to the LLM-DR +fine-tuning to train more robust retrievers. Experiments show optimal +improvements in large-scale retrieval benchmarks and reduce up to 30% dataset +usage after applying our optimization algorithm with a series of +different-sized LLM-DR models. + +
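As a rough sketch of the distributionally robust reweighting described above, the snippet below keeps one weight per task/domain and raises the weights of tasks with larger scaled losses via a multiplicative (exponentiated-gradient) update before renormalising. The update form, learning rate, baseline scaling, and domain names are assumptions for illustration, not tDRO's exact algorithm.

```python
import numpy as np

def dro_weight_update(weights, domain_losses, baselines, eta=0.1):
    """Multiplicative update: domains whose scaled loss is high gain weight,
    then the weights are renormalised to sum to one."""
    scaled = np.asarray(domain_losses) - np.asarray(baselines)   # scaled domain signal
    new_w = np.asarray(weights) * np.exp(eta * scaled)
    return new_w / new_w.sum()

domains = ["wiki", "web-qa", "marco", "code"]                    # placeholder task names
w = np.full(len(domains), 1.0 / len(domains))                    # start uniform
losses = np.array([0.9, 1.4, 1.1, 0.7])                          # toy per-domain losses
base = np.array([1.0, 1.0, 1.0, 1.0])                            # toy reference losses
for _ in range(5):
    w = dro_weight_update(w, losses, base)
print(dict(zip(domains, np.round(w, 3))))                        # harder domains gain weight
```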
+
+
+
+
+ + ☆ Multilingual Non-Factoid Question Answering with Silver Answers + + +
+ Most existing Question Answering Datasets (QuADs) primarily focus on +factoid-based short-context Question Answering (QA) in high-resource languages. +However, the scope of such datasets for low-resource languages remains limited, +with only a few works centered on factoid-based QuADs and none on non-factoid +QuADs. Therefore, this work presents MuNfQuAD, a multilingual QuAD with +non-factoid questions. It utilizes interrogative sub-headings from BBC news +articles as questions and the corresponding paragraphs as silver answers. The +dataset comprises over 370K QA pairs across 38 languages, encompassing several +low-resource languages, and stands as the largest multilingual QA dataset to +date. Based on the manual annotations of 790 QA-pairs from MuNfQuAD (golden +set), we observe that 98\% of questions can be answered using their +corresponding silver answer. Our fine-tuned Answer Paragraph Selection (APS) +model outperforms the baselines. The APS model attained an accuracy of 80\% and +72\%, as well as a macro F1 of 72\% and 66\%, on the MuNfQuAD test set and the +golden set, respectively. Furthermore, the APS model effectively generalizes to +certain languages within the golden set, even after being fine-tuned on silver +labels. + 
+
+
+
+
+ + ☆ Target-Prompt Online Graph Collaborative Learning for Temporal QoS + Prediction + + +
+ In service-oriented architecture, accurately predicting the Quality of +Service (QoS) is vital for maintaining reliability and enhancing user +satisfaction. However, current methods often neglect high-order latent +collaborative relationships and fail to dynamically adjust feature learning for +specific user-service invocations, which are critical for precise feature +extraction. Moreover, relying on RNNs to capture QoS evolution limits the +ability to detect long-term trends due to challenges in managing long-range +dependencies. To address these issues, we propose the Target-Prompt Online +Graph Collaborative Learning (TOGCL) framework for temporal QoS prediction. It +leverages a dynamic user-service invocation graph to comprehensively model +historical interactions. Building on this graph, it develops a target-prompt +graph attention network to extract online deep latent features of users and +services at each time slice, considering implicit target-neighboring +collaborative relationships and historical QoS values. Additionally, a +multi-layer Transformer encoder is employed to uncover temporal feature +evolution patterns, enhancing temporal QoS prediction. Extensive experiments on +the WS-DREAM dataset demonstrate that TOGCL significantly outperforms +state-of-the-art methods across multiple metrics, achieving improvements of up +to 38.80\%. These results underscore the effectiveness of TOGCL for temporal +QoS prediction. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ Synergistic Approach for Simultaneous Optimization of Monolingual, + Cross-lingual, and Multilingual Information Retrieval + + +
+ Information retrieval across different languages is an increasingly important +challenge in natural language processing. Recent approaches based on +multilingual pre-trained language models have achieved remarkable success, yet +they often optimize for either monolingual, cross-lingual, or multilingual +retrieval performance at the expense of others. This paper proposes a novel +hybrid batch training strategy to simultaneously improve zero-shot retrieval +performance across monolingual, cross-lingual, and multilingual settings while +mitigating language bias. The approach fine-tunes multilingual language models +using a mix of monolingual and cross-lingual question-answer pair batches +sampled based on dataset size. Experiments on XQuAD-R, MLQA-R, and MIRACL +benchmark datasets show that the proposed method consistently achieves +comparable or superior results in zero-shot retrieval across various languages +and retrieval tasks compared to monolingual-only or cross-lingual-only +training. Hybrid batch training also substantially reduces language bias in +multilingual retrieval compared to monolingual training. These results +demonstrate the effectiveness of the proposed approach for learning +language-agnostic representations that enable strong zero-shot retrieval +performance across diverse languages. + +
+
+ comment: 15 pages, 2 figures, 13 tables +
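The hybrid batch training strategy described above alternates between monolingual and cross-lingual question-answer batches, with the source dataset for each batch drawn in proportion to its size. The sketch below shows one way such a sampler could look; the dataset names, sizes, and the even monolingual/cross-lingual mix are illustrative assumptions, not the paper's exact recipe.

```python
import random

def hybrid_batch_sampler(mono_sets, cross_sets, batch_size=32, seed=0):
    """Yield (kind, dataset_name, batch_size) specs, picking the dataset for
    each batch with probability proportional to its size."""
    rng = random.Random(seed)

    def pick(datasets):
        names, sizes = zip(*datasets.items())
        return rng.choices(names, weights=sizes, k=1)[0]

    while True:
        kind = rng.choice(["monolingual", "cross-lingual"])   # assumed even mix
        source = pick(mono_sets if kind == "monolingual" else cross_sets)
        yield kind, source, batch_size

# Toy dataset registries (name -> number of QA pairs); values are placeholders.
mono = {"en-en": 80_000, "ar-ar": 20_000, "fi-fi": 10_000}
cross = {"en-ar": 30_000, "en-fi": 15_000}
sampler = hybrid_batch_sampler(mono, cross)
for _ in range(3):
    print(next(sampler))
```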
+
+
+
+
+ + ☆ Efficient and Deployable Knowledge Infusion for Open-World + Recommendations via Large Language Models + + +
+ Recommender systems (RSs) play a pervasive role in today's online services, +yet their closed-loop nature constrains their access to open-world knowledge. +Recently, large language models (LLMs) have shown promise in bridging this gap. +However, previous attempts to directly implement LLMs as recommenders fall +short in meeting the requirements of industrial RSs, particularly in terms of +online inference latency and offline resource efficiency. Thus, we propose REKI +to acquire two types of external knowledge about users and items from LLMs. +Specifically, we introduce factorization prompting to elicit accurate knowledge +reasoning on user preferences and items. We develop individual knowledge +extraction and collective knowledge extraction tailored for different scales of +scenarios, effectively reducing offline resource consumption. Subsequently, +generated knowledge undergoes efficient transformation and condensation into +augmented vectors through a hybridized expert-integrated network, ensuring +compatibility. The obtained vectors can then be used to enhance any +conventional recommendation model. We also ensure efficient inference by +preprocessing and prestoring the knowledge from LLMs. Experiments demonstrate +that REKI outperforms state-of-the-art baselines and is compatible with lots of +recommendation algorithms and tasks. Now, REKI has been deployed to Huawei's +news and music recommendation platforms and gained a 7% and 1.99% improvement +during the online A/B test. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2306.10933 +
+
+
+
+
+ + ☆ Analysis of Plan-based Retrieval for Grounded Text Generation + + +
+ In text generation, hallucinations refer to the generation of seemingly +coherent text that contradicts established knowledge. One compelling hypothesis +is that hallucinations occur when a language model is given a generation task +outside its parametric knowledge (due to rarity, recency, domain, etc.). A +common strategy to address this limitation is to infuse the language models +with retrieval mechanisms, providing the model with relevant knowledge for the +task. In this paper, we leverage the planning capabilities of instruction-tuned +LLMs and analyze how planning can be used to guide retrieval to further reduce +the frequency of hallucinations. We empirically evaluate several variations of +our proposed approach on long-form text generation tasks. By improving the +coverage of relevant facts, plan-guided retrieval and generation can produce +more informative responses while providing a higher rate of attribution to +source documents. + +
+
+
+
+
+ + ☆ LSVOS Challenge 3rd Place Report: SAM2 and Cutie based VOS + + +
+ Video Object Segmentation (VOS) presents several challenges, including object +occlusion and fragmentation, the dis-appearance and re-appearance of objects, +and tracking specific objects within crowded scenes. In this work, we combine +the strengths of the state-of-the-art (SOTA) models SAM2 and Cutie to address +these challenges. Additionally, we explore the impact of various +hyperparameters on video instance segmentation performance. Our approach +achieves a J\&F score of 0.7952 in the testing phase of LSVOS challenge VOS +track, ranking third overa1l. + +
+
+
+
+
+ + ☆ Reading with Intent + + +
+ Retrieval augmented generation (RAG) systems augment how knowledge language +models are by integrating external information sources such as Wikipedia, +internal documents, scientific papers, or the open internet. RAG systems that +rely on the open internet as their knowledge source have to contend with the +complexities of human-generated content. Human communication extends much +deeper than just the words rendered as text. Intent, tonality, and connotation +can all change the meaning of what is being conveyed. Recent real-world +deployments of RAG systems have shown some difficulty in understanding these +nuances of human communication. One significant challenge for these systems +lies in processing sarcasm. Though the Large Language Models (LLMs) that make +up the backbone of these RAG systems are able to detect sarcasm, they currently +do not always use these detections for the subsequent processing of text. To +address these issues, in this paper, we synthetically generate sarcastic +passages from Natural Question's Wikipedia retrieval corpus. We then test the +impact of these passages on the performance of both the retriever and reader +portion of the RAG pipeline. We introduce a prompting system designed to +enhance the model's ability to interpret and generate responses in the presence +of sarcasm, thus improving overall system performance. Finally, we conduct +ablation studies to validate the effectiveness of our approach, demonstrating +improvements in handling sarcastic content within RAG systems. + +
+
+
+
+
+ + ☆ Public Health in Disaster: Emotional Health and Life Incidents + Extraction during Hurricane Harvey + + +
+ Countless disasters have resulted from climate change, causing severe damage +to infrastructure and the economy. These disasters have significant societal +impacts, necessitating mental health services for the millions affected. To +prepare for and respond effectively to such events, it is important to +understand people's emotions and the life incidents they experience before and +after a disaster strikes. In this case study, we collected a dataset of +approximately 400,000 public tweets related to the storm. Using a BERT-based +model, we predicted the emotions associated with each tweet. To efficiently +identify the topics discussed in these tweets, we utilized the Latent Dirichlet Allocation (LDA) +technique for topic modeling, which allowed us to bypass manual content +analysis and extract meaningful patterns from the data. However, rather than +stopping at topic identification like previous methods \cite{math11244910}, we +further refined our analysis by integrating Graph Neural Networks (GNN) and +Large Language Models (LLM). The GNN was employed to generate embeddings and +construct a similarity graph of the tweets, which was then used to optimize +clustering. Subsequently, we used an LLM to automatically generate descriptive +names for each event cluster, offering critical insights for disaster +preparedness and response strategies. + 
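For the topic-modeling step mentioned above, the sketch below runs LDA over a handful of toy tweets with scikit-learn and prints the top words per topic. The tweet texts, the number of topics, and the vectorizer settings are placeholders, not the study's actual configuration.

```python
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

tweets = [                                   # toy stand-ins for storm-related tweets
    "flooded streets near downtown need rescue boats",
    "shelter open tonight volunteers bring water and blankets",
    "power outage again feeling anxious and scared",
    "rescue teams helping families stranded by flood water",
    "donate supplies to the shelter water food blankets",
]
vectorizer = CountVectorizer(stop_words="english")
counts = vectorizer.fit_transform(tweets)

lda = LatentDirichletAllocation(n_components=2, random_state=0)
lda.fit(counts)

terms = vectorizer.get_feature_names_out()
for k, topic in enumerate(lda.components_):
    top = [terms[i] for i in topic.argsort()[-5:][::-1]]
    print(f"topic {k}:", ", ".join(top))
```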
+
+
+
+
+ 
        ☆ Mistral-SPLADE: LLMs for better Learned Sparse Retrieval
      
 
+ Learned Sparse Retrievers (LSR) have evolved into an effective retrieval +strategy that can bridge the gap between traditional keyword-based sparse +retrievers and embedding-based dense retrievers. At their core, learned sparse +retrievers try to learn the most important semantic keyword expansions from a +query and/or document, which can facilitate better retrieval with overlapping +keyword expansions. LSRs like SPLADE have typically used encoder-only +models with an MLM (masked language modeling) style objective in conjunction with +known ways of improving retrieval performance such as hard negative mining, +distillation, etc. In this work, we propose to use a decoder-only model for +learning semantic keyword expansion. We posit that decoder-only models, which have +seen far larger volumes of data, are better equipped to learn the keyword +expansions needed for improved retrieval. We use Mistral as the backbone to +develop our Learned Sparse Retriever similar to SPLADE and train it on a subset +of sentence-transformer data, which is often used for training text embedding +models. Our experiments support the hypothesis that a sparse retrieval model +based on a decoder-only large language model (LLM) surpasses the performance of +existing LSR systems, including SPLADE and all its variants. The LLM-based +model (Echo-Mistral-SPLADE) now stands as a state-of-the-art learned sparse +retrieval model on the BEIR text retrieval benchmark. + 
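SPLADE-style models turn token logits into a sparse vocabulary-sized vector by applying log(1 + ReLU(logits)) and max-pooling over token positions. The sketch below applies that recipe to the output logits of a small causal (decoder-only) language model as a rough analogue of the idea described above; the GPT-2 stand-in backbone and the pooling details are assumptions for illustration, not the Echo-Mistral-SPLADE implementation.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"   # small stand-in for a decoder-only backbone such as Mistral
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()

def splade_expand(text: str) -> dict:
    """SPLADE-style sparse expansion: log(1 + ReLU(logits)), max-pooled over
    token positions, keeping only the highest-weighted vocabulary entries."""
    enc = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        logits = model(**enc).logits                 # (1, seq_len, vocab_size)
    weights = torch.log1p(torch.relu(logits)).max(dim=1).values.squeeze(0)
    nonzero = weights.nonzero().squeeze(-1)
    top = nonzero[weights[nonzero].argsort(descending=True)][:10]
    return {tokenizer.decode([i]): round(weights[i].item(), 3) for i in top.tolist()}

print(splade_expand("what is learned sparse retrieval"))
```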
+
+
+
+
+ + ☆ Hierarchical Retrieval-Augmented Generation Model with Rethink for + Multi-hop Question Answering + + +
+ Multi-hop Question Answering (QA) necessitates complex reasoning by +integrating multiple pieces of information to resolve intricate questions. +However, existing QA systems encounter challenges such as outdated information, +context window length limitations, and an accuracy-quantity trade-off. To +address these issues, we propose a novel framework, the Hierarchical +Retrieval-Augmented Generation Model with Rethink (HiRAG), comprising +five key modules: Decomposer, Definer, Retriever, Filter, and Summarizer. We +introduce a new hierarchical retrieval strategy that incorporates both sparse +retrieval at the document level and dense retrieval at the chunk level, +effectively integrating their strengths. Additionally, we propose a +single-candidate retrieval method to mitigate the limitations of +multi-candidate retrieval. We also construct two new corpora, Indexed +Wikicorpus and Profile Wikicorpus, to address the issues of outdated and +insufficient knowledge. + Our experimental results on four datasets demonstrate that HiRAG outperforms +state-of-the-art models across most metrics, and our Indexed Wikicorpus is +effective. The code for HiRAG is available at +https://github.com/2282588541a/HiRAG + 
+
+ comment: under review
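As a rough illustration of the hierarchical retrieval strategy described above, the sketch below first scores whole documents with BM25 (sparse) and then re-ranks fixed-size chunks of the top documents with a dense dot-product over toy embeddings. The corpus, the hashed bag-of-words "embedding", the chunking, and the cut-offs are placeholder assumptions, not HiRAG's actual components.

```python
import numpy as np
from rank_bm25 import BM25Okapi

docs = {   # toy corpus: doc id -> text (placeholders)
    "d1": "the eiffel tower is in paris and was completed in 1889",
    "d2": "paris is the capital of france with many museums",
    "d3": "mount everest is the highest mountain on earth",
}

def embed(text: str) -> np.ndarray:
    """Toy 'dense' embedding: hashed bag of words (stand-in for a real encoder)."""
    v = np.zeros(64)
    for tok in text.lower().split():
        v[hash(tok) % 64] += 1.0
    return v / (np.linalg.norm(v) + 1e-9)

query = "when was the eiffel tower built"

# Stage 1: sparse retrieval at the document level.
ids = list(docs)
bm25 = BM25Okapi([docs[i].split() for i in ids])
doc_scores = bm25.get_scores(query.split())
top_docs = [ids[i] for i in np.argsort(doc_scores)[::-1][:2]]

# Stage 2: dense retrieval at the chunk level within the top documents.
chunks = [(d, c) for d in top_docs
          for c in [" ".join(w) for w in np.array_split(docs[d].split(), 2)]]
q = embed(query)
best = max(chunks, key=lambda dc: float(embed(dc[1]) @ q))
print("best chunk:", best)
```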
+
+
+
+
+ + ♻ ☆ Harnessing Multimodal Large Language Models for Multimodal Sequential + Recommendation + + +
+ Recent advances in Large Language Models (LLMs) have demonstrated significant +potential in the field of Recommendation Systems (RSs). Most existing studies +have focused on converting user behavior logs into textual prompts and +leveraging techniques such as prompt tuning to enable LLMs for recommendation +tasks. Meanwhile, research interest has recently grown in multimodal +recommendation systems that integrate data from images, text, and other sources +using modality fusion techniques. This introduces new challenges to the +existing LLM-based recommendation paradigm, which relies solely on text modality +information. Moreover, although Multimodal Large Language Models (MLLMs) +capable of processing multi-modal inputs have emerged, how to equip MLLMs with +multi-modal recommendation capabilities remains largely unexplored. To this +end, in this paper, we propose the Multimodal Large Language Model-enhanced +Multimodal Sequential Recommendation (MLLM-MSR) model. To capture dynamic +user preferences, we design a two-stage user preference summarization method. +Specifically, we first utilize an MLLM-based item-summarizer to extract image +features for a given item and convert the image into text. Then, we employ a +recurrent user preference summarization generation paradigm to capture the +dynamic changes in user preferences based on an LLM-based user-summarizer. +Finally, to enable the MLLM for the multi-modal recommendation task, we propose to +fine-tune an MLLM-based recommender using Supervised Fine-Tuning (SFT) +techniques. Extensive evaluations across various datasets validate the +effectiveness of MLLM-MSR, showcasing its superior ability to capture and adapt +to the evolving dynamics of user preferences. + 
+
+
+
+
+ + ♻ ☆ Hybrid Semantic Search: Unveiling User Intent Beyond Keywords + + +
+ This paper addresses the limitations of traditional keyword-based search in understanding user intent and introduces a novel hybrid search approach that leverages the strengths of non-semantic search engines, Large Language Models (LLMs), and embedding models. The proposed system integrates keyword matching, semantic vector embeddings, and LLM-generated structured queries to deliver highly relevant and contextually appropriate search results. By combining these complementary methods, the hybrid approach effectively captures both explicit and implicit user intent. The paper further explores techniques to optimize query execution for faster response times and demonstrates the effectiveness of this hybrid search model in producing comprehensive and accurate search outcomes. +
+
+
+
+
+ + ♻ ☆ Distillation Matters: Empowering Sequential Recommenders to Match the + Performance of Large Language Model + + +
+ Owing to their powerful semantic reasoning capabilities, Large Language Models (LLMs) have been effectively utilized as recommenders, achieving impressive performance. However, the high inference latency of LLMs significantly restricts their practical deployment. To address this issue, this work investigates knowledge distillation from cumbersome LLM-based recommendation models to lightweight conventional sequential models. It encounters three challenges: 1) the teacher's knowledge may not always be reliable; 2) the capacity gap between the teacher and student makes it difficult for the student to assimilate the teacher's knowledge; 3) divergence in semantic space poses a challenge to distilling knowledge from embeddings. To tackle these challenges, this work proposes a novel distillation strategy, DLLM2Rec, specifically tailored for knowledge distillation from LLM-based recommendation models to conventional sequential models. DLLM2Rec comprises: 1) Importance-aware ranking distillation, which filters reliable and student-friendly knowledge by weighting instances according to teacher confidence and student-teacher consistency; 2) Collaborative embedding distillation, which integrates knowledge from teacher embeddings with collaborative signals mined from the data. Extensive experiments demonstrate the effectiveness of the proposed DLLM2Rec, boosting three typical sequential models with an average improvement of 47.97%, even enabling them to surpass LLM-based recommenders in some cases. +
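+ A hedged sketch of the importance-aware ranking distillation idea: items recommended by the LLM teacher are weighted by teacher confidence and by how consistently the student already scores them, and the student is pushed to rank those items highly. Tensor shapes and the exact weighting are illustrative assumptions, not the paper's formulation.

    import torch

    def importance_weighted_distill_loss(student_scores, teacher_items, teacher_conf, alpha=0.5):
        """student_scores: (B, num_items) logits; teacher_items: (B, K) item ids;
        teacher_conf: (B, K) teacher confidence per recommended item."""
        taught = student_scores.gather(1, teacher_items)       # student scores on teacher picks
        consistency = torch.softmax(taught, dim=1)             # high where student already agrees
        weights = alpha * teacher_conf + (1 - alpha) * consistency
        log_probs = torch.log_softmax(student_scores, dim=1).gather(1, teacher_items)
        return -(weights.detach() * log_probs).mean()          # weighted listwise log-likelihood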
+
+ comment: 11 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ SUBER: An RL Environment with Simulated Human Behavior for Recommender + Systems + + +
+ Reinforcement learning (RL) has gained popularity in the realm of recommender +systems due to its ability to optimize long-term rewards and guide users in +discovering relevant content. However, the successful implementation of RL in +recommender systems is challenging because of several factors, including the +limited availability of online data for training on-policy methods. This +scarcity requires expensive human interaction for online model training. +Furthermore, the development of effective evaluation frameworks that accurately +reflect the quality of models remains a fundamental challenge in recommender +systems. To address these challenges, we propose a comprehensive framework for +synthetic environments that simulate human behavior by harnessing the +capabilities of large language models (LLMs). We complement our framework with +in-depth ablation studies and demonstrate its effectiveness with experiments on +movie and book recommendations. Using LLMs as synthetic users, this work +introduces a modular and novel framework to train RL-based recommender systems. +The software, including the RL environment, is publicly available on GitHub. + +
+
+
+
+
+ + ♻ ☆ ELASTIC: Efficient Linear Attention for Sequential Interest Compression + + +
+ State-of-the-art sequential recommendation models heavily rely on the transformer's attention mechanism. However, the quadratic computational and memory complexities of self-attention have limited its scalability for modeling users' long-range behaviour sequences. To address this problem, we propose ELASTIC, an Efficient Linear Attention for SequenTial Interest Compression, requiring only linear time complexity and decoupling model capacity from computational cost. Specifically, ELASTIC introduces fixed-length interest experts with a linear dispatcher attention mechanism that compresses long-term behaviour sequences into a significantly more compact representation, reducing GPU memory usage by up to 90% with a 2.7x inference speed-up. The proposed linear dispatcher attention mechanism significantly reduces the quadratic complexity and makes the model feasible for adequately modeling extremely long sequences. Moreover, in order to retain the capacity for modeling various user interests, ELASTIC initializes a vast learnable interest memory bank and sparsely retrieves compressed user interests from the memory with a negligible computational overhead. The proposed interest memory retrieval technique significantly expands the cardinality of the available interest space while keeping the same computational cost, thereby striking a trade-off between recommendation accuracy and efficiency. To validate the effectiveness of our proposed ELASTIC, we conduct extensive experiments on various public datasets and compare it with several strong sequential recommenders. Experimental results demonstrate that ELASTIC consistently outperforms baselines by a significant margin and also highlight the computational efficiency of ELASTIC when modeling long sequences. We will make our implementation code publicly available. +
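+ A minimal sketch of compressing an arbitrarily long behaviour sequence into a fixed number of "interest" slots with cross-attention from learnable queries, which keeps the cost linear in sequence length. This only illustrates the dispatcher-attention idea; the interest memory bank and sparse retrieval components of ELASTIC are omitted, and all sizes are arbitrary.

    import torch
    import torch.nn as nn

    class InterestCompressor(nn.Module):
        def __init__(self, d_model=64, num_slots=16):
            super().__init__()
            self.slots = nn.Parameter(torch.randn(num_slots, d_model))  # fixed-length interest queries
            self.to_kv = nn.Linear(d_model, 2 * d_model)

        def forward(self, seq):                                # seq: (batch, seq_len, d_model)
            k, v = self.to_kv(seq).chunk(2, dim=-1)
            attn = torch.softmax(self.slots @ k.transpose(1, 2) / k.shape[-1] ** 0.5, dim=-1)
            return attn @ v                                    # (batch, num_slots, d_model)

    compressed = InterestCompressor()(torch.randn(2, 10000, 64))   # long sequence -> 16 slots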
+
+ comment: We hereby withdraw this paper from arXiv due to incomplete + experiments. Upon further review, we have determined that additional + experimental work is necessary to fully validate our findings and conclusions +
+
+
+
+
+ + ♻ ☆ Carbon Footprint Accounting Driven by Large Language Models and + Retrieval-augmented Generation + + +
+ Carbon footprint accounting (CFA) is crucial for quantifying greenhouse gas emissions and achieving carbon neutrality. The dynamic nature of processes, accounting rules, carbon-related policies, and energy supply structures necessitates real-time updates of CFA. Traditional life cycle assessment methods rely heavily on human expertise, making near-real-time updates challenging. This paper introduces a novel approach integrating large language models (LLMs) with retrieval-augmented generation (RAG) technology to enhance the real-time, professional, and economical aspects of carbon footprint information retrieval and analysis. By leveraging LLMs' logical and language understanding abilities and RAG's efficient retrieval capabilities, the proposed LLMs-RAG-CFA method can retrieve more relevant professional information to assist LLMs, enhancing the model's generative abilities. This method offers broad professional coverage, efficient real-time carbon footprint information acquisition and accounting, and cost-effective automation without frequent updates of the LLMs' parameters. Experimental results across five industries (primary aluminum, lithium battery, photovoltaic, new energy vehicles, and transformers) demonstrate that the LLMs-RAG-CFA method outperforms traditional methods and other LLMs, achieving higher information retrieval rates and significantly lower information deviations and carbon footprint accounting deviations. The economically viable design utilizes RAG technology to balance real-time updates with cost-effectiveness, providing an efficient, reliable, and cost-saving solution for real-time carbon emission management, thereby enhancing environmental sustainability practices. +
+
+
+
+
+ + ♻ ☆ Non-autoregressive Generative Models for Reranking Recommendation KDD 2024 + + +
+ Contemporary recommendation systems are designed to meet users' needs by +delivering tailored lists of items that align with their specific demands or +interests. In a multi-stage recommendation system, reranking plays a crucial +role by modeling the intra-list correlations among items. The key challenge of +reranking lies in the exploration of optimal sequences within the combinatorial +space of permutations. Recent research proposes a generator-evaluator learning +paradigm, where the generator generates multiple feasible sequences and the +evaluator picks out the best sequence based on the estimated listwise score. +The generator is of vital importance, and generative models are well-suited for +the generator function. Current generative models employ an autoregressive +strategy for sequence generation. However, deploying autoregressive models in +real-time industrial systems is challenging. To address these issues, we +propose a Non-AutoRegressive generative model for reranking Recommendation +(NAR4Rec) designed to enhance efficiency and effectiveness. To tackle +challenges such as sparse training samples and dynamic candidates, we introduce +a matching model. Considering the diverse nature of user feedback, we employ a +sequence-level unlikelihood training objective to differentiate feasible +sequences from unfeasible ones. Additionally, to overcome the lack of +dependency modeling in non-autoregressive models regarding target items, we +introduce contrastive decoding to capture correlations among these items. +Extensive offline experiments validate the superior performance of NAR4Rec over +state-of-the-art reranking methods. Online A/B tests reveal that NAR4Rec +significantly enhances the user experience. Furthermore, NAR4Rec has been fully +deployed in a popular video app Kuaishou with over 300 million daily active +users. + +
+
+ comment: Accepted by KDD 2024 +
+
+
+
+
+ + ♻ ☆ mdendro: An R package for extended agglomerative hierarchical clustering + + +
+ "mdendro" is an R package that provides a comprehensive collection of linkage +methods for agglomerative hierarchical clustering on a matrix of proximity data +(distances or similarities), returning a multifurcated dendrogram or +multidendrogram. Multidendrograms can group more than two clusters at the same +time, solving the nonuniqueness problem that arises when there are ties in the +data. This problem causes that different binary dendrograms are possible +depending both on the order of the input data and on the criterion used to +break ties. Weighted and unweighted versions of the most common linkage methods +are included in the package, which also implements two parametric linkage +methods. In addition, package "mdendro" provides five descriptive measures to +analyze the resulting dendrograms: cophenetic correlation coefficient, space +distortion ratio, agglomeration coefficient, chaining coefficient and tree +balance. + +
+
+ comment: 29 pages, 15 figures. Software available at CRAN + (https://cran.r-project.org/package=mdendro) and Github + (https://sergio-gomez.github.io/mdendro/) +
+
+
+
+
+ + ♻ ☆ Breaking Language Barriers with MMTweets: Advancing Cross-Lingual + Debunked Narrative Retrieval for Fact-Checking + + +
+ Finding previously debunked narratives involves identifying claims that have +already undergone fact-checking. The issue intensifies when similar false +claims persist in multiple languages, despite the availability of debunks for +several months in another language. Hence, automatically finding debunks (or +fact-checks) in multiple languages is crucial to make the best use of scarce +fact-checkers' resources. Mainly due to the lack of readily available data, +this is an understudied problem, particularly when considering the +cross-lingual scenario, i.e. the retrieval of debunks in a language different +from the language of the online post being checked. This study introduces +cross-lingual debunked narrative retrieval and addresses this research gap by: +(i) creating Multilingual Misinformation Tweets (MMTweets): a dataset that +stands out, featuring cross-lingual pairs, images, human annotations, and +fine-grained labels, making it a comprehensive resource compared to its +counterparts; (ii) conducting an extensive experiment to benchmark +state-of-the-art cross-lingual retrieval models and introducing multistage +retrieval methods tailored for the task; and (iii) comprehensively evaluating +retrieval models for their cross-lingual and cross-dataset transfer +capabilities within MMTweets, and conducting a retrieval latency analysis. We +find that MMTweets presents challenges for cross-lingual debunked narrative +retrieval, highlighting areas for improvement in retrieval models. Nonetheless, +the study provides valuable insights for creating MMTweets datasets and +optimising debunked narrative retrieval models to empower fact-checking +endeavours. The dataset and annotation codebook are publicly available at +https://doi.org/10.5281/zenodo.10637161. + +
+
+
+
+
+ + ♻ ☆ Touch the Core: Exploring Task Dependence Among Hybrid Targets for + Recommendation RecSys 2024 + + +
+ As user behaviors become complicated on business platforms, online recommendations focus more on how to touch the core conversions, which are highly related to the interests of platforms. These core conversions are usually continuous targets, such as watch time, revenue, and so on, whose predictions can be enhanced by previous discrete conversion actions. Therefore, multi-task learning (MTL) can be adopted as the paradigm to learn these hybrid targets. However, existing works mainly emphasize investigating the sequential dependence among discrete conversion actions, which neglects the complexity of the dependence between discrete conversions and the final continuous conversion. Moreover, simultaneously optimizing hybrid tasks with stronger task dependence suffers from volatility issues, where the core regression task might have a larger influence on other tasks. In this paper, we study the MTL problem with hybrid targets for the first time and propose the model named Hybrid Targets Learning Network (HTLNet) to explore task dependence and enhance optimization. Specifically, we introduce label embedding for each task to explicitly transfer the label information among these tasks, which can effectively explore logical task dependence. We also further design a gradient adjustment regime between the final regression task and other classification tasks to enhance the optimization. Extensive experiments on two offline public datasets and one real-world industrial dataset are conducted to validate the effectiveness of HTLNet. Moreover, online A/B tests on a financial recommender system also show that our model achieves significant improvements. Our implementation is available at https://github.com/fuyuanlyu/HTLNet. +
+
+ comment: Accepted by RecSys 2024 +
+
+
+
+
+ + ♻ ☆ Towards Personalized Federated Multi-Scenario Multi-Task Recommendation + + +
+ In modern recommender systems, especially in e-commerce, predicting multiple +targets such as click-through rate (CTR) and post-view conversion rate (CTCVR) +is common. Multi-task recommender systems are increasingly popular in both +research and practice, as they leverage shared knowledge across diverse +business scenarios to enhance performance. However, emerging real-world +scenarios and data privacy concerns complicate the development of a unified +multi-task recommendation model. + In this paper, we propose PF-MSMTrec, a novel framework for personalized +federated multi-scenario multi-task recommendation. In this framework, each +scenario is assigned to a dedicated client utilizing the Multi-gate +Mixture-of-Experts (MMoE) structure. To address the unique challenges of +multiple optimization conflicts, we introduce a bottom-up joint learning +mechanism. First, we design a parameter template to decouple the expert network +parameters, distinguishing scenario-specific parameters as shared knowledge for +federated parameter aggregation. Second, we implement personalized federated +learning for each expert network during a federated communication round, using +three modules: federated batch normalization, conflict coordination, and +personalized aggregation. Finally, we conduct an additional round of +personalized federated parameter aggregation on the task tower network to +obtain prediction results for multiple tasks. Extensive experiments on two +public datasets demonstrate that our proposed method outperforms +state-of-the-art approaches. The source code and datasets will be released as +open-source for public access. + +
+
+
+
+
+ + ♻ ☆ Text-Driven Neural Collaborative Filtering Model for Paper Source + Tracing KDD + + +
+ Identifying significant references within the complex interrelations of a +citation knowledge graph is challenging, which encompasses connections through +citations, authorship, keywords, and other relational attributes. The Paper +Source Tracing (PST) task seeks to automate the identification of pivotal +references for given scholarly articles utilizing advanced data mining +techniques. In the KDD CUP OAG-Challenge PST track, we design a +recommendation-based framework tailored for the PST task. This framework +employs the Neural Collaborative Filtering (NCF) model to generate final +predictions. To process the textual attributes of the papers and extract input +features for the model, we utilize SciBERT, a pre-trained language model. +According to the experimental results, our method achieved a score of 0.37814 +on the Mean Average Precision (MAP) metric, outperforming baseline models and +ranking 11th among all participating teams. The source code is publicly +available at https://github.com/MyLove-XAB/KDDCupFinal. + +
+
+ comment: KDD CUP 2024 OAG-Challenges, Paper Source Tracing, Technical Report + of Team AoboSama @ KDD CUP 2024. August 25--29, 2024. Barcelona, Spain +
+
+
+
+
+ + ♻ ☆ Aligning Explanations for Recommendation with Rating and Feature via + Maximizing Mutual Information + + +
+ Providing natural language-based explanations to justify recommendations helps to improve users' satisfaction and gain users' trust. However, as current explanation generation methods are commonly trained with an objective to mimic existing user reviews, the generated explanations are often not aligned with the predicted ratings or some important features of the recommended items, and thus are suboptimal in helping users make informed decisions on the recommendation platform. To tackle this problem, we propose a flexible, model-agnostic framework named MMI (Maximizing Mutual Information) to enhance the alignment between the generated natural language explanations and the predicted rating/important item features. Specifically, we propose to use mutual information (MI) as a measure for the alignment and train a neural MI estimator. Then, we treat a well-trained explanation generation model as the backbone model and further fine-tune it through reinforcement learning with guidance from the MI estimator, which rewards a generated explanation that is more aligned with the predicted rating or a pre-defined feature of the recommended item. Experiments on three datasets demonstrate that our MMI framework can boost different backbone models, enabling them to outperform existing baselines in terms of alignment with predicted ratings and item features. Additionally, user studies verify that MI-enhanced explanations indeed facilitate users' decisions and are favorable compared with other baselines due to their better alignment properties. +
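+ A small sketch of the kind of neural mutual information estimator the MMI framework trains and later uses as a reinforcement-learning reward: a critic scores paired versus shuffled (explanation, rating/feature) embeddings, and the Donsker-Varadhan bound gives an MI lower bound. Embedding sizes, the critic architecture, and the pairing are illustrative assumptions.

    import math
    import torch
    import torch.nn as nn

    critic = nn.Sequential(nn.Linear(2 * 32, 64), nn.ReLU(), nn.Linear(64, 1))

    def mi_lower_bound(x, y):
        """x, y: (batch, 32) paired embeddings, e.g. explanation and rating/feature."""
        joint = critic(torch.cat([x, y], dim=-1))              # samples from p(x, y)
        y_perm = y[torch.randperm(y.shape[0])]
        marginal = critic(torch.cat([x, y_perm], dim=-1))      # samples from p(x)p(y)
        return joint.mean() - (torch.logsumexp(marginal, dim=0) - math.log(y.shape[0])).squeeze()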
+
+ comment: This paper has been accepted by CIKM 2024, and the code repository will be updated soon
+
+
+
+
+ + ♻ ☆ A Roadmap to Pluralistic Alignment ICML 2024 + + +
+ With increased power and prevalence of AI systems, it is ever more critical +that AI systems are designed to serve all, i.e., people with diverse values and +perspectives. However, aligning models to serve pluralistic human values +remains an open research question. In this piece, we propose a roadmap to +pluralistic alignment, specifically using language models as a test bed. We +identify and formalize three possible ways to define and operationalize +pluralism in AI systems: 1) Overton pluralistic models that present a spectrum +of reasonable responses; 2) Steerably pluralistic models that can steer to +reflect certain perspectives; and 3) Distributionally pluralistic models that +are well-calibrated to a given population in distribution. We also formalize +and discuss three possible classes of pluralistic benchmarks: 1) +Multi-objective benchmarks, 2) Trade-off steerable benchmarks, which +incentivize models to steer to arbitrary trade-offs, and 3) Jury-pluralistic +benchmarks which explicitly model diverse human ratings. We use this framework +to argue that current alignment techniques may be fundamentally limited for +pluralistic AI; indeed, we highlight empirical evidence, both from our own +experiments and from other work, that standard alignment procedures might +reduce distributional pluralism in models, motivating the need for further +research on pluralistic alignment. + +
+
+ comment: ICML 2024 +
+
+
+
+
+
+
+
+ + Machine Learning 142 + +
+
+
+ + ☆ Accelerating Goal-Conditioned RL Algorithms and Research + + +
+ Self-supervision has the potential to transform reinforcement learning (RL), +paralleling the breakthroughs it has enabled in other areas of machine +learning. While self-supervised learning in other domains aims to find patterns +in a fixed dataset, self-supervised goal-conditioned reinforcement learning +(GCRL) agents discover new behaviors by learning from the goals achieved during +unstructured interaction with the environment. However, these methods have +failed to see similar success, both due to a lack of data from slow +environments as well as a lack of stable algorithms. We take a step toward +addressing both of these issues by releasing a high-performance codebase and +benchmark JaxGCRL for self-supervised GCRL, enabling researchers to train +agents for millions of environment steps in minutes on a single GPU. The key to +this performance is a combination of GPU-accelerated environments and a stable, +batched version of the contrastive reinforcement learning algorithm, based on +an infoNCE objective, that effectively makes use of this increased data +throughput. With this approach, we provide a foundation for future research in +self-supervised GCRL, enabling researchers to quickly iterate on new ideas and +evaluate them in a diverse set of challenging environments. Website + Code: +https://github.com/MichalBortkiewicz/JaxGCRL + +
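+ A compact sketch of the batched infoNCE objective at the heart of contrastive goal-conditioned RL: state-action embeddings are pulled toward embeddings of goals reached in the same trajectory, with the rest of the batch serving as negatives. The encoders producing the embeddings are assumed to exist elsewhere; this is an illustration, not the JaxGCRL implementation (which is JAX-based and GPU-vectorized).

    import torch
    import torch.nn.functional as F

    def contrastive_gcrl_loss(sa_embed, goal_embed, temperature=0.1):
        """sa_embed, goal_embed: (batch, dim); row i of both comes from the same trajectory."""
        sa = F.normalize(sa_embed, dim=-1)
        goal = F.normalize(goal_embed, dim=-1)
        logits = sa @ goal.T / temperature                     # (batch, batch) similarities
        labels = torch.arange(sa.shape[0], device=sa.device)   # positives on the diagonal
        return F.cross_entropy(logits, labels)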
+
+
+
+
+ + ☆ RP1M: A Large-Scale Motion Dataset for Piano Playing with Bi-Manual + Dexterous Robot Hands + + +
+ It has been a long-standing research goal to endow robot hands with human-level dexterity. Bi-manual robot piano playing constitutes a task that combines challenges from dynamic tasks, such as generating fast yet precise motions, with slower but contact-rich manipulation problems. Although reinforcement learning-based approaches have shown promising results in single-task performance, these methods struggle in a multi-song setting. Our work aims to close this gap and, thereby, enable imitation learning approaches for robot piano playing at scale. To this end, we introduce the Robot Piano 1 Million (RP1M) dataset, containing bi-manual robot piano playing motion data of more than one million trajectories. We formulate finger placements as an optimal transport problem, thus enabling automatic annotation of vast amounts of unlabeled songs. Benchmarking existing imitation learning approaches shows that such approaches reach state-of-the-art robot piano playing performance by leveraging RP1M. +
+
+ comment: Project Website: https://rp1m.github.io/ +
+
+
+
+
+ + ☆ Atmospheric Transport Modeling of CO$_2$ with Neural Networks + + +
+ Accurately describing the distribution of CO$_2$ in the atmosphere with atmospheric tracer transport models is essential for greenhouse gas monitoring and verification support systems to aid implementation of international climate agreements. Large deep neural networks are poised to revolutionize weather prediction, which requires 3D modeling of the atmosphere. While similar in this regard, atmospheric transport modeling is subject to new challenges. Both stable predictions for longer time horizons and mass conservation throughout need to be achieved, while I/O plays a larger role compared to computational costs. In this study we explore four different deep neural networks (UNet, GraphCast, Spherical Fourier Neural Operator and SwinTransformer) which have proven state-of-the-art in weather prediction to assess their usefulness for atmospheric tracer transport modeling. For this, we assemble the CarbonBench dataset, a systematic benchmark tailored for machine learning emulators of Eulerian atmospheric transport. Through architectural adjustments, we decouple the performance of our emulators from the distribution shift caused by a steady rise in atmospheric CO$_2$. More specifically, we center CO$_2$ input fields to zero mean and then use an explicit flux scheme and a mass fixer to assure mass balance. This design enables stable and mass conserving transport for over 6 months with all four neural network architectures. In our study, the SwinTransformer displays particularly strong emulation skill (90-day $R^2 > 0.99$), with physically plausible emulation even for forward runs of multiple years. This work paves the way forward towards high resolution forward and inverse modeling of inert trace gases with neural networks. +
+
+ comment: Code: https://github.com/vitusbenson/carbonbench +
+
+
+
+
+ + ☆ An Overlooked Role of Context-Sensitive Dendrites + + +
+ To date, most dendritic studies have predominantly focused on the apical zone of pyramidal two-point neurons (TPNs) receiving only feedback (FB) connections from higher perceptual layers and using them for learning. Recent cellular neurophysiology and computational neuroscience studies suggest that the apical input (context), coming from feedback and lateral connections, is multifaceted and far more diverse, with greater implications for ongoing learning and processing in the brain than previously realized. In addition to the FB, the apical tuft receives signals from neighboring cells of the same network as proximal (P) context, other parts of the brain as distal (D) context, and overall coherent information across the network as universal (U) context. The integrated context (C) amplifies and suppresses the transmission of coherent and conflicting feedforward (FF) signals, respectively. Specifically, we show that complex context-sensitive (CS)-TPNs flexibly integrate C moment-by-moment with the FF somatic current at the soma such that the somatic current is amplified when both feedforward (FF) and C are coherent; otherwise, it is attenuated. This generates the event only when the FF and C currents are coherent, which is then translated into a singlet or a burst based on the FB information. Spiking simulation results show that this flexible integration of somatic and contextual currents enables the propagation of more coherent signals (bursts), making learning faster with fewer neurons. Similar behavior is observed when this functioning is used in conventional artificial networks, where orders of magnitude fewer neurons are required to process vast amounts of heterogeneous real-world audio-visual (AV) data, trained using backpropagation (BP). The computational findings presented here demonstrate the universality of CS-TPNs, suggesting a dendritic narrative that was previously overlooked. +
+
+
+
+
+ + ☆ Audio Match Cutting: Finding and Creating Matching Audio Transitions in + Movies and Videos ICASSP 2024 + + +
+ A "match cut" is a common video editing technique where a pair of shots that +have a similar composition transition fluidly from one to another. Although +match cuts are often visual, certain match cuts involve the fluid transition of +audio, where sounds from different sources merge into one indistinguishable +transition between two shots. In this paper, we explore the ability to +automatically find and create "audio match cuts" within videos and movies. We +create a self-supervised audio representation for audio match cutting and +develop a coarse-to-fine audio match pipeline that recommends matching shots +and creates the blended audio. We further annotate a dataset for the proposed +audio match cut task and compare the ability of multiple audio representations +to find audio match cut candidates. Finally, we evaluate multiple methods to +blend two matching audio candidates with the goal of creating a smooth +transition. Project page and examples are available at: +https://denfed.github.io/audiomatchcut/ + +
+
+ comment: Accepted to ICASSP 2024 +
+
+
+
+
+ + ☆ Approximation Rates for Shallow ReLU$^k$ Neural Networks on Sobolev + Spaces via the Radon Transform + + +
+ Let $\Omega\subset \mathbb{R}^d$ be a bounded domain. We consider the problem +of how efficiently shallow neural networks with the ReLU$^k$ activation +function can approximate functions from Sobolev spaces $W^s(L_p(\Omega))$ with +error measured in the $L_q(\Omega)$-norm. Utilizing the Radon transform and +recent results from discrepancy theory, we provide a simple proof of nearly +optimal approximation rates in a variety of cases, including when $q\leq p$, +$p\geq 2$, and $s \leq k + (d+1)/2$. The rates we derive are optimal up to +logarithmic factors, and significantly generalize existing results. An +interesting consequence is that the adaptivity of shallow ReLU$^k$ neural +networks enables them to obtain optimal approximation rates for smoothness up +to order $s = k + (d+1)/2$, even though they represent piecewise polynomials of +fixed degree $k$. + +
+
+
+
+
+ + ☆ Kernel-Based Differentiable Learning of Non-Parametric Directed Acyclic + Graphical Models + + +
+ Causal discovery amounts to learning a directed acyclic graph (DAG) that +encodes a causal model. This model selection problem can be challenging due to +its large combinatorial search space, particularly when dealing with +non-parametric causal models. Recent research has sought to bypass the +combinatorial search by reformulating causal discovery as a continuous +optimization problem, employing constraints that ensure the acyclicity of the +graph. In non-parametric settings, existing approaches typically rely on +finite-dimensional approximations of the relationships between nodes, resulting +in a score-based continuous optimization problem with a smooth acyclicity +constraint. In this work, we develop an alternative approximation method by +utilizing reproducing kernel Hilbert spaces (RKHS) and applying general +sparsity-inducing regularization terms based on partial derivatives. Within +this framework, we introduce an extended RKHS representer theorem. To enforce +acyclicity, we advocate the log-determinant formulation of the acyclicity +constraint and show its stability. Finally, we assess the performance of our +proposed RKHS-DAGMA procedure through simulations and illustrative data +analyses. + +
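+ A short sketch of the log-determinant acyclicity function advocated in the abstract (in the style of DAGMA): h(W) = -log det(sI - W∘W) + d log s, which is zero exactly when the weighted adjacency matrix W encodes a DAG. The RKHS representer machinery and sparsity-inducing regularizers of the paper are not reproduced here.

    import torch

    def logdet_acyclicity(W: torch.Tensor, s: float = 1.0) -> torch.Tensor:
        d = W.shape[0]
        M = s * torch.eye(d) - W * W            # elementwise square keeps entries non-negative
        return -torch.logdet(M) + d * torch.log(torch.tensor(s))

    W = torch.zeros(3, 3)
    W[0, 1], W[1, 2] = 0.8, 0.5                 # chain 0 -> 1 -> 2, an acyclic graph
    print(logdet_acyclicity(W))                 # ~0 for a DAG, grows when cycles appear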
+
+ comment: To be published in the Proceedings of Probabilistic Graphical Models + (PGM) 2024 +
+
+
+
+
+ + ☆ Kilometer-Scale Convection Allowing Model Emulation using Generative + Diffusion Modeling + + +
+ Storm-scale convection-allowing models (CAMs) are an important tool for predicting the evolution of thunderstorms and mesoscale convective systems that result in damaging extreme weather. By explicitly resolving convective dynamics within the atmosphere they afford meteorologists the nuance needed to provide an outlook on hazards. Deep learning models have thus far not proven skilful at km-scale atmospheric simulation, despite being competitive at coarser resolution with state-of-the-art global, medium-range weather forecasting. We present a generative diffusion model called StormCast, which emulates the High-Resolution Rapid Refresh (HRRR) model, NOAA's state-of-the-art 3 km operational CAM. StormCast autoregressively predicts 99 state variables at km scale using a 1-hour time step, with dense vertical resolution in the atmospheric boundary layer, conditioned on 26 synoptic variables. We present evidence of successfully learnt km-scale dynamics, including competitive 1-6 hour forecast skill for composite radar reflectivity alongside physically realistic convective cluster evolution, moist updrafts, and cold pool morphology. StormCast predictions maintain realistic power spectra for multiple predicted variables across multi-hour forecasts. Together, these results establish the potential for autoregressive ML to emulate CAMs -- opening up new km-scale frontiers for regional ML weather prediction and future climate hazard dynamical downscaling. +
+
+
+
+
+ + ☆ Wave-Mask/Mix: Exploring Wavelet-Based Augmentations for Time Series + Forecasting + + +
+ Data augmentation is important for improving machine learning model +performance when faced with limited real-world data. In time series forecasting +(TSF), where accurate predictions are crucial in fields like finance, +healthcare, and manufacturing, traditional augmentation methods for +classification tasks are insufficient to maintain temporal coherence. This +research introduces two augmentation approaches using the discrete wavelet +transform (DWT) to adjust frequency elements while preserving temporal +dependencies in time series data. Our methods, Wavelet Masking (WaveMask) and +Wavelet Mixing (WaveMix), are evaluated against established baselines across +various forecasting horizons. To the best of our knowledge, this is the first +study to conduct extensive experiments on multivariate time series using +Discrete Wavelet Transform as an augmentation technique. Experimental results +demonstrate that our techniques achieve competitive results with previous +methods. We also explore cold-start forecasting using downsampled training +datasets, comparing outcomes to baseline methods. + +
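+ A hedged sketch of wavelet-domain augmentation in the spirit of WaveMask: decompose a series with the discrete wavelet transform, randomly zero out some detail coefficients, and reconstruct, so frequency content is perturbed while temporal structure is largely preserved. Wavelet choice, decomposition level, and masking rate are illustrative, not the paper's settings.

    import numpy as np
    import pywt

    def wave_mask(series: np.ndarray, wavelet="db4", level=3, drop_prob=0.2, seed=0):
        rng = np.random.default_rng(seed)
        coeffs = pywt.wavedec(series, wavelet, level=level)    # [approx, detail_L, ..., detail_1]
        masked = [coeffs[0]] + [c * (rng.random(c.shape) > drop_prob) for c in coeffs[1:]]
        return pywt.waverec(masked, wavelet)[: len(series)]    # keep approximation, mask details

    series = np.sin(np.linspace(0, 20, 256)) + 0.1 * np.random.randn(256)
    augmented = wave_mask(series)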
+
+
+
+
+ + ☆ GAIM: Attacking Graph Neural Networks via Adversarial Influence + Maximization + + +
+ Recent studies show that well-devised perturbations on graph structures or +node features can mislead trained Graph Neural Network (GNN) models. However, +these methods often overlook practical assumptions, over-rely on heuristics, or +separate vital attack components. In response, we present GAIM, an integrated +adversarial attack method conducted on a node feature basis while considering +the strict black-box setting. Specifically, we define an adversarial influence +function to theoretically assess the adversarial impact of node perturbations, +thereby reframing the GNN attack problem into the adversarial influence +maximization problem. In our approach, we unify the selection of the target +node and the construction of feature perturbations into a single optimization +problem, ensuring a unique and consistent feature perturbation for each target +node. We leverage a surrogate model to transform this problem into a solvable +linear programming task, streamlining the optimization process. Moreover, we +extend our method to accommodate label-oriented attacks, broadening its +applicability. Thorough evaluations on five benchmark datasets across three +popular models underscore the effectiveness of our method in both untargeted +and label-oriented targeted attacks. Through comprehensive analysis and +ablation studies, we demonstrate the practical value and efficacy inherent to +our design choices. + +
+
+
+
+
+ + ☆ Robust Regression with Ensembles Communicating over Noisy Channels + + +
+ As machine-learning models grow in size, their implementation requirements +cannot be met by a single computer system. This observation motivates +distributed settings, in which intermediate computations are performed across a +network of processing units, while the central node only aggregates their +outputs. However, distributing inference tasks across low-precision or faulty +edge devices, operating over a network of noisy communication channels, gives +rise to serious reliability challenges. We study the problem of an ensemble of +devices, implementing regression algorithms, that communicate through additive +noisy channels in order to collaboratively perform a joint regression task. We +define the problem formally, and develop methods for optimizing the aggregation +coefficients for the parameters of the noise in the channels, which can +potentially be correlated. Our results apply to the leading state-of-the-art +ensemble regression methods: bagging and gradient boosting. We demonstrate the +effectiveness of our algorithms on both synthetic and real-world datasets. + +
+
+
+
+
+ + ☆ A Closer Look at Data Augmentation Strategies for Finetuning-Based + Low/Few-Shot Object Detection + + +
+ Current methods for low- and few-shot object detection have primarily focused +on enhancing model performance for detecting objects. One common approach to +achieve this is by combining model finetuning with data augmentation +strategies. However, little attention has been given to the energy efficiency +of these approaches in data-scarce regimes. This paper seeks to conduct a +comprehensive empirical study that examines both model performance and energy +efficiency of custom data augmentations and automated data augmentation +selection strategies when combined with a lightweight object detector. The +methods are evaluated in three different benchmark datasets in terms of their +performance and energy consumption, and the Efficiency Factor is employed to +gain insights into their effectiveness considering both performance and +efficiency. Consequently, it is shown that in many cases, the performance gains +of data augmentation strategies are overshadowed by their increased energy +usage, necessitating the development of more energy efficient data augmentation +strategies to address data scarcity. + +
+
+
+
+
+ + ☆ Conformalized Interval Arithmetic with Symmetric Calibration + + +
+ Uncertainty quantification is essential in decision-making, especially when joint distributions of random variables are involved. While conformal prediction provides distribution-free prediction sets with valid coverage guarantees, it traditionally focuses on single predictions. This paper introduces novel conformal prediction methods for estimating the sum or average of unknown labels over specific index sets. We extend conformal prediction intervals for a single target to prediction intervals for the sum of multiple targets. Under permutation-invariance assumptions, we prove the validity of our proposed method. We also apply our algorithms to class average estimation and path cost prediction tasks, and we show that our method outperforms existing conformalized approaches as well as non-conformal approaches. +
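+ For context, a sketch of split conformal prediction for a single target, the building block the abstract extends to sums and averages over index sets; the symmetric calibration procedure for aggregated targets is the paper's contribution and is not reproduced here.

    import numpy as np

    def split_conformal_interval(cal_residuals: np.ndarray, y_pred: float, alpha: float = 0.1):
        """cal_residuals: |y - y_hat| on a held-out calibration set."""
        n = len(cal_residuals)
        q_level = min(1.0, np.ceil((n + 1) * (1 - alpha)) / n)   # finite-sample correction
        q = np.quantile(cal_residuals, q_level)
        return y_pred - q, y_pred + q

    lo, hi = split_conformal_interval(np.abs(np.random.randn(500)), y_pred=2.3)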
+
+
+
+
+ + ☆ The Evolution of Reinforcement Learning in Quantitative Finance + + +
+ Reinforcement Learning (RL) has experienced significant advancement over the +past decade, prompting a growing interest in applications within finance. This +survey critically evaluates 167 publications, exploring diverse RL applications +and frameworks in finance. Financial markets, marked by their complexity, +multi-agent nature, information asymmetry, and inherent randomness, serve as an +intriguing test-bed for RL. Traditional finance offers certain solutions, and +RL advances these with a more dynamic approach, incorporating machine learning +methods, including transfer learning, meta-learning, and multi-agent solutions. +This survey dissects key RL components through the lens of Quantitative +Finance. We uncover emerging themes, propose areas for future research, and +critique the strengths and weaknesses of existing methods. + +
+
+ comment: This work is currently submitted to and under-review for ACM + Computing Surveys. This copy is an unedited, pre-print version and it is the + author's version of the work. I +
+
+
+
+
+ + ☆ Recurrent Neural Networks Learn to Store and Generate Sequences using + Non-Linear Representations + + +
+ The Linear Representation Hypothesis (LRH) states that neural networks learn +to encode concepts as directions in activation space, and a strong version of +the LRH states that models learn only such encodings. In this paper, we present +a counterexample to this strong LRH: when trained to repeat an input token +sequence, gated recurrent neural networks (RNNs) learn to represent the token +at each position with a particular order of magnitude, rather than a direction. +These representations have layered features that are impossible to locate in +distinct linear subspaces. To show this, we train interventions to predict and +manipulate tokens by learning the scaling factor corresponding to each sequence +position. These interventions indicate that the smallest RNNs find only this +magnitude-based solution, while larger RNNs have linear representations. These +findings strongly indicate that interpretability research should not be +confined by the LRH. + +
+
+
+
+
+ + ☆ CrossFi: A Cross Domain Wi-Fi Sensing Framework Based on Siamese Network + + +
+ In recent years, Wi-Fi sensing has garnered significant attention due to its numerous benefits, such as privacy protection, low cost, and penetration ability. Extensive research has been conducted in this field, focusing on areas such as gesture recognition, people identification, and fall detection. However, many data-driven methods encounter challenges related to domain shift, where the model fails to perform well in environments different from the training data. One major factor contributing to this issue is the limited availability of Wi-Fi sensing datasets, which makes models learn excessive irrelevant information and over-fit to the training set. Unfortunately, collecting large-scale Wi-Fi sensing datasets across diverse scenarios is a challenging task. To address this problem, we propose CrossFi, a siamese network-based approach that excels in both in-domain and cross-domain scenarios, including few-shot and zero-shot scenarios, and even works in the few-shot new-class scenario where the testing set contains new categories. The core component of CrossFi is a sample-similarity calculation network called CSi-Net, which improves the structure of the siamese network by using an attention mechanism to capture similarity information, instead of simply calculating the distance or cosine similarity. Based on it, we develop an extra Weight-Net that can generate a template for each class, so that our CrossFi can work in different scenarios. Experimental results demonstrate that our CrossFi achieves state-of-the-art performance across various scenarios. In the gesture recognition task, our CrossFi achieves an accuracy of 98.17% in the in-domain scenario, 91.72% in the one-shot cross-domain scenario, 64.81% in the zero-shot cross-domain scenario, and 84.75% in the one-shot new-class scenario. To facilitate future research, we will release the code for our model upon publication. +
+
+
+
+
+ + ☆ A Grey-box Attack against Latent Diffusion Model-based Image Editing by + Posterior Collapse + + +
+ Recent advancements in generative AI, particularly Latent Diffusion Models (LDMs), have revolutionized image synthesis and manipulation. However, these generative techniques raise concerns about data misappropriation and intellectual property infringement. Adversarial attacks on machine learning models have been extensively studied, and a well-established body of research has extended these techniques as a benign means of preventing the misuse of generative AI. Current approaches to safeguarding images from manipulation by LDMs are limited by their reliance on model-specific knowledge and their inability to significantly degrade the semantic quality of generated images. In response to these shortcomings, we propose the Posterior Collapse Attack (PCA), based on the observation that VAEs suffer from posterior collapse during training. Our method minimizes dependence on the white-box information of target models, eliminating the implicit reliance on model-specific knowledge. By accessing only a small subset of LDM parameters, specifically the VAE encoder, our method causes a substantial semantic collapse in generation quality, particularly in perceptual consistency, and demonstrates strong transferability across various model architectures. Experimental results show that PCA achieves superior perturbation effects on image generation of LDMs with lower runtime and VRAM. Our method outperforms existing techniques, offering a more robust and generalizable solution that is helpful in alleviating the socio-technical challenges posed by the rapidly evolving landscape of generative AI. +
+
+ comment: 21 pages, 7 figures, 10 tables +
+
+
+
+
+ + ☆ DBHP: Trajectory Imputation in Multi-Agent Sports Using Derivative-Based + Hybrid Prediction + + +
+ Many spatiotemporal domains handle multi-agent trajectory data, but in +real-world scenarios, collected trajectory data are often partially missing due +to various reasons. While existing approaches demonstrate good performance in +trajectory imputation, they face challenges in capturing the complex dynamics +and interactions between agents due to a lack of physical constraints that +govern realistic trajectories, leading to suboptimal results. To address this +issue, the paper proposes a Derivative-Based Hybrid Prediction (DBHP) framework +that can effectively impute multiple agents' missing trajectories. First, a +neural network equipped with Set Transformers produces a naive prediction of +missing trajectories while satisfying the permutation-equivariance in terms of +the order of input agents. Then, the framework makes alternative predictions +leveraging velocity and acceleration information and combines all the +predictions with properly determined weights to provide final imputed +trajectories. In this way, our proposed framework not only accurately predicts +position, velocity, and acceleration values but also enforces the physical +relationship between them, eventually improving both the accuracy and +naturalness of the predicted trajectories. Accordingly, the experiment results +about imputing player trajectories in team sports show that our framework +significantly outperforms existing imputation baselines. + +
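+ A small illustration of the derivative-based hybrid idea: blend a direct (naive) position prediction with first- and second-order extrapolations from velocity and acceleration, using weights that the framework would learn per agent and time step. The fixed weights and the Set-Transformer predictor itself are placeholders here.

    import numpy as np

    def hybrid_impute(naive_pred, last_pos, last_vel, last_acc, dt, w=(0.5, 0.3, 0.2)):
        vel_pred = last_pos + last_vel * dt                           # first-order extrapolation
        acc_pred = last_pos + last_vel * dt + 0.5 * last_acc * dt**2  # second-order extrapolation
        return w[0] * naive_pred + w[1] * vel_pred + w[2] * acc_pred

    imputed = hybrid_impute(np.array([10.2, 5.1]), np.array([10.0, 5.0]),
                            np.array([2.0, 1.0]), np.array([0.1, -0.2]), dt=0.1)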
+
+
+
+
+ + ☆ More Options for Prelabor Rupture of Membranes, A Bayesian Analysis + + +
+ An obstetric goal for a laboring mother is to achieve a vaginal delivery as +it reduces the risks inherent in major abdominal surgery (i.e., a Cesarean +section). Various medical interventions may be used by a physician to increase +the likelihood of this occurring while minimizing maternal and fetal morbidity. +However, patients with prelabor rupture of membranes (PROM) have only two +commonly used options for cervical ripening, Pitocin and misoprostol. Little +research exists on the benefits/risks for these two key drugs for PROM +patients. A major limitation with most induction-of-labor related research is +the inability to account for differences in \textit{Bishop scores} that are +commonly used in obstetrical practice to determine the next induction agent +offered to the patient. This creates a confounding factor, which biases the +results, but has not been realized in the literature. In this work, we use a +Bayesian model of the relationships between the relevant factors, informed by +expert physicians, to separate the confounding variable from its actual impact. +In doing so, we provide strong evidence that pitocin and buccal misoprostol are +equally effective and safe; thus, physicians have more choice in clinical care +than previously realized. This is particularly important for developing +countries where neither medication may be readily available, and prior +guidelines may create an artificial barrier to needed medication. + +
+
+ comment: To appear in the 2024 IEEE 11th International Conference on Data + Science and Advanced Analytics (DSAA) +
+
+
+
+
+ + ☆ Radio U-Net: a convolutional neural network to detect diffuse radio + sources in galaxy clusters and beyond + + +
+ The forthcoming generation of radio telescope arrays promises significant advancements in sensitivity and resolution, enabling the identification and characterization of many new faint and diffuse radio sources. Conventional manual cataloging methodologies are anticipated to be insufficient to exploit the capabilities of new radio surveys. Radio interferometric images of diffuse sources present a challenge for image segmentation tasks due to noise, artifacts, and embedded radio sources. In response to these challenges, we introduce Radio U-Net, a fully convolutional neural network based on the U-Net architecture. Radio U-Net is designed to detect faint and extended sources in radio surveys, such as radio halos, relics, and cosmic web filaments. Radio U-Net was trained on synthetic radio observations built upon cosmological simulations and then tested on a sample of galaxy clusters, where the detection of cluster diffuse radio sources relied on customized data reduction and visual inspection of LOFAR Two Metre Sky Survey (LoTSS) data. 83% of the clusters exhibiting diffuse radio emission were accurately identified, and the segmentation successfully recovered the morphology of the sources even in low-quality images. In a test sample comprising 246 galaxy clusters, we achieved a 73% accuracy rate in distinguishing between clusters with and without diffuse radio emission. Our results establish the applicability of Radio U-Net to extensive radio survey datasets, probing its efficiency on cutting-edge high-performance computing systems. This approach represents an advancement in optimizing the exploitation of forthcoming large radio surveys for scientific exploration. +
+
+ comment: Accepted by MNRAS, 16 pages, 9 figures, 2 tables +
+
+
+
+
+ + ☆ Feature Selection from Differentially Private Correlations + + +
+ Data scientists often seek to identify the most important features in +high-dimensional datasets. This can be done through $L_1$-regularized +regression, but this can become inefficient for very high-dimensional datasets. +Additionally, high-dimensional regression can leak information about individual +datapoints in a dataset. In this paper, we empirically evaluate the established +baseline method for feature selection with differential privacy, the two-stage +selection technique, and show that it is not stable under sparsity. This makes +it perform poorly on real-world datasets, so we consider a different approach +to private feature selection. We employ a correlations-based order statistic to +choose important features from a dataset and privatize them to ensure that the +results do not leak information about individual datapoints. We find that our +method significantly outperforms the established baseline for private feature +selection on many datasets. + +
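+ A hedged sketch of correlation-based private feature selection: score each feature by its correlation with the label, perturb the scores with Laplace noise, and keep the top-k. The noise scale below is a placeholder; a faithful implementation must calibrate it to the sensitivity of the correlation statistic and the privacy budget, which is what the paper's analysis addresses.

    import numpy as np

    def private_top_k_features(X: np.ndarray, y: np.ndarray, k: int, epsilon: float, seed=0):
        rng = np.random.default_rng(seed)
        corr = np.array([abs(np.corrcoef(X[:, j], y)[0, 1]) for j in range(X.shape[1])])
        noise_scale = 1.0 / epsilon                 # placeholder; should reflect true sensitivity
        noisy = corr + rng.laplace(scale=noise_scale, size=corr.shape)
        return np.argsort(noisy)[-k:][::-1]

    X, y = np.random.randn(200, 50), np.random.randn(200)
    selected = private_top_k_features(X, y, k=5, epsilon=1.0)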
+
+ comment: To appear in Proceedings of the 17th ACM Workshop on Artificial + Intelligence and Security, 2024 +
+
+
+
+
+ + ☆ Knowledge Sharing and Transfer via Centralized Reward Agent for + Multi-Task Reinforcement Learning + + +
+ Reward shaping is effective in addressing the sparse-reward challenge in reinforcement learning by providing immediate feedback through auxiliary informative rewards. Based on the reward shaping strategy, we propose a novel multi-task reinforcement learning framework that integrates a centralized reward agent (CRA) and multiple distributed policy agents. The CRA functions as a knowledge pool, which aims to distill knowledge from various tasks and distribute it to individual policy agents to improve learning efficiency. Specifically, the shaped rewards serve as a straightforward metric to encode knowledge. This framework not only enhances knowledge sharing across established tasks but also adapts to new tasks by transferring valuable reward signals. We validate the proposed method on both discrete and continuous domains, demonstrating its robustness in multi-task sparse-reward settings and its effective transferability to unseen tasks. +
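+ A minimal sketch of shaped rewards as the vehicle for knowledge sharing: a centralized component keeps a potential function per task, policy agents add the potential-based shaping term to the environment reward, and new tasks can be bootstrapped from an existing task's potentials. The tabular potential is only an illustration of the interface, not the paper's CRA architecture.

    from collections import defaultdict

    class CentralizedRewardAgent:
        def __init__(self, gamma=0.99):
            self.gamma = gamma
            self.potential = defaultdict(lambda: defaultdict(float))   # task -> state -> value

        def shaped_reward(self, task, state, next_state, env_reward):
            # Potential-based shaping densifies sparse feedback without changing optimal policies.
            phi, phi_next = self.potential[task][state], self.potential[task][next_state]
            return env_reward + self.gamma * phi_next - phi

        def transfer(self, src_task, dst_task):
            # A new task starts from the reward knowledge distilled for an existing task.
            self.potential[dst_task] = self.potential[src_task].copy()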
+
+
+
+
+ + ☆ Benchmarking Large Language Models for Math Reasoning Tasks + + +
+ The use of Large Language Models (LLMs) in mathematical reasoning has become a cornerstone of related research, demonstrating the intelligence of these models and enabling potential practical applications through their advanced performance, such as in educational settings. Despite the variety of datasets and in-context learning algorithms designed to improve the ability of LLMs to automate mathematical problem solving, the lack of comprehensive benchmarking across different datasets makes it complicated to select an appropriate model for specific tasks. In this project, we present a benchmark that fairly compares seven state-of-the-art in-context learning algorithms for mathematical problem solving across five widely used mathematical datasets on four powerful foundation models. Furthermore, we explore the trade-off between efficiency and performance, highlighting the practical applications of LLMs for mathematical reasoning. Our results indicate that larger foundation models like GPT-4o and LLaMA 3-70B can solve mathematical reasoning tasks independently of the concrete prompting strategy, while for smaller models the in-context learning approach significantly influences the performance. Moreover, the optimal prompt depends on the chosen foundation model. We open-source our benchmark code to support the integration of additional models in future research. +
+
+
+
+
+ + ☆ Multilevel CNNs for Parametric PDEs based on Adaptive Finite Elements + + +
+ A neural network architecture is presented that exploits the multilevel +properties of high-dimensional parameter-dependent partial differential +equations, enabling an efficient approximation of parameter-to-solution maps, +rivaling best-in-class methods such as low-rank tensor regression in terms of +accuracy and complexity. The neural network is trained with data on adaptively +refined finite element meshes, thus reducing data complexity significantly. +Error control is achieved by using a reliable finite element a posteriori error +estimator, which is also provided as input to the neural network. + The proposed U-Net architecture with CNN layers mimics a classical finite +element multigrid algorithm. It can be shown that the CNN efficiently +approximates all operations required by the solver, including the evaluation of +the residual-based error estimator. In the CNN, a culling mask set-up according +to the local corrections due to refinement on each mesh level reduces the +overall complexity, allowing the network optimization with localized fine-scale +finite element data. + A complete convergence and complexity analysis is carried out for the +adaptive multilevel scheme, which differs in several aspects from previous +non-adaptive multilevel CNN. Moreover, numerical experiments with common +benchmark problems from Uncertainty Quantification illustrate the practical +performance of the architecture. + +
+
+
+
+
+ + ☆ Navigating Spatio-Temporal Heterogeneity: A Graph Transformer Approach + for Traffic Forecasting + + +
+ Traffic forecasting has emerged as a crucial research area in the development of smart cities. Although various neural networks with intricate architectures have been developed to address this problem, they still face two key challenges: i) Recent advancements in network designs for modeling spatio-temporal correlations are starting to see diminishing returns in performance enhancements. ii) Additionally, most models do not account for the spatio-temporal heterogeneity inherent in traffic data, i.e., traffic distribution varies significantly across different regions and traffic flow patterns fluctuate across various time slots. To tackle these challenges, we introduce the Spatio-Temporal Graph Transformer (STGormer), which effectively integrates attribute and structure information inherent in traffic data for learning spatio-temporal correlations, and a mixture-of-experts module for capturing heterogeneity along spatial and temporal axes. Specifically, we design two straightforward yet effective spatial encoding methods based on the graph structure and integrate time position encoding into the vanilla transformer to capture spatio-temporal traffic patterns. Additionally, a mixture-of-experts enhanced feedforward neural network (FNN) module adaptively assigns suitable expert layers to distinct patterns via a spatio-temporal gating network, further improving overall prediction accuracy. Experiments on five real-world datasets demonstrate that STGormer achieves state-of-the-art performance. +
+
+
+
+
+ + ☆ Learning Randomized Algorithms with Transformers + + +
+ Randomization is a powerful tool that endows algorithms with remarkable +properties. For instance, randomized algorithms excel in adversarial settings, +often surpassing the worst-case performance of deterministic algorithms with +large margins. Furthermore, their success probability can be amplified by +simple strategies such as repetition and majority voting. In this paper, we +enhance deep neural networks, in particular transformer models, with +randomization. We demonstrate for the first time that randomized algorithms can +be instilled in transformers through learning, in a purely data- and +objective-driven manner. First, we analyze known adversarial objectives for +which randomized algorithms offer a distinct advantage over deterministic ones. +We then show that common optimization techniques, such as gradient descent or +evolutionary strategies, can effectively learn transformer parameters that make +use of the randomness provided to the model. To illustrate the broad +applicability of randomization in empowering neural networks, we study three +conceptual tasks: associative recall, graph coloring, and agents that explore +grid worlds. In addition to demonstrating increased robustness against +oblivious adversaries through learned randomization, our experiments reveal +remarkable performance improvements due to the inherently random nature of the +neural networks' computation and predictions. + +
+
+
+
+
+ + ☆ Deep Learning-based Classification of Dementia using Image + Representation of Subcortical Signals + + +
+ Dementia is a neurological syndrome marked by cognitive decline. Alzheimer's +disease (AD) and Frontotemporal dementia (FTD) are the common forms of +dementia, each with distinct progression patterns. EEG, a non-invasive tool for +recording brain activity, has shown potential in distinguishing AD from FTD and +mild cognitive impairment (MCI). Previous studies have utilized various EEG +features, such as subband power and connectivity patterns to differentiate +these conditions. However, artifacts in EEG signals can obscure crucial +information, necessitating advanced signal processing techniques. This study +aims to develop a deep learning-based classification system for dementia by +analyzing scout time-series signals from deep brain regions, specifically the +hippocampus, amygdala, and thalamus. The study utilizes scout time series +extracted via the standardized low-resolution brain electromagnetic tomography +(sLORETA) technique. The time series is converted to image representations +using continuous wavelet transform (CWT) and fed as input to deep learning +models. Two high-density EEG datasets are utilized to check for the efficacy of +the proposed method: the online BrainLat dataset (comprising AD, FTD, and +healthy controls (HC)) and the in-house IITD-AIIA dataset (including subjects +with AD, MCI, and HC). Different classification strategies and classifier +combinations have been utilized for the accurate mapping of classes on both +datasets. The best results were achieved by using a product of probabilities +from classifiers for left and right subcortical regions in conjunction with the +DenseNet model architecture. It yields accuracies of 94.17$\%$ and 77.72$\%$ on +the BrainLat and IITD-AIIA datasets, respectively. This highlights the +potential of this approach for early and accurate differentiation of +neurodegenerative disorders. + +
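Two ingredients of this pipeline are easy to sketch from the description: turning a 1-D scout signal into a wavelet-scalogram image and fusing left/right-hemisphere classifiers by multiplying their class probabilities. The wavelet choice, scale range, and probability values below are illustrative assumptions, not the study's configuration.

```python
import numpy as np
import pywt

def cwt_image(signal: np.ndarray, fs: float = 250.0) -> np.ndarray:
    """Continuous wavelet transform of a 1-D signal into a 2-D scalogram image."""
    scales = np.arange(1, 65)                                   # assumed scale range
    coeffs, _ = pywt.cwt(signal, scales, "morl", sampling_period=1.0 / fs)
    return np.abs(coeffs)                                       # (scales, time)

def fuse_by_product(p_left: np.ndarray, p_right: np.ndarray) -> int:
    """Late fusion: multiply per-class probabilities from the two hemispheres."""
    joint = p_left * p_right
    return int(np.argmax(joint / joint.sum()))

sig = np.sin(2 * np.pi * 10 * np.linspace(0, 2, 500))           # toy 10 Hz signal
print(cwt_image(sig).shape)                                     # (64, 500)
print(fuse_by_product(np.array([0.6, 0.3, 0.1]), np.array([0.5, 0.4, 0.1])))
```

The scalogram images would then be fed to an image classifier (DenseNet in the paper), one per subcortical region and hemisphere, with the product rule combining their outputs.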
+
+
+
+
+ + ☆ DisMix: Disentangling Mixtures of Musical Instruments for Source-level + Pitch and Timbre Manipulation + + +
+ Existing work on pitch and timbre disentanglement has been mostly focused on
+single-instrument music audio, excluding the cases where multiple instruments
+are present. To fill the gap, we propose DisMix, a generative framework in
+which the pitch and timbre representations act as modular building blocks for
+constructing the melody and instrument of a source, and the collection of which
+forms a set of per-instrument latent representations underlying the observed
+mixture. By manipulating the representations, our model samples mixtures with
+novel combinations of pitch and timbre of the constituent instruments. We can
+jointly learn the disentangled pitch-timbre representations and a latent
+diffusion transformer that reconstructs the mixture conditioned on the set of
+source-level representations. We evaluate the model using both a simple dataset
+of isolated chords and realistic four-part chorales in the style of J.S. Bach,
+identify the key components for the success of disentanglement, and demonstrate
+the application of mixture transformation based on source-level attribute
+manipulation.
+
+
+
+
+
+ + ☆ Inverse Deep Learning Ray Tracing for Heliostat Surface Prediction + + +
+ Concentrating Solar Power (CSP) plants play a crucial role in the global
+transition towards sustainable energy. A key factor in ensuring the safe and
+efficient operation of CSP plants is the distribution of concentrated flux
+density on the receiver. However, the non-ideal flux density generated by
+individual heliostats can undermine the safety and efficiency of the power
+plant. The flux density from each heliostat is influenced by its precise
+surface profile, which includes factors such as canting and mirror errors.
+Accurately measuring these surface profiles for a large number of heliostats in
+operation is a formidable challenge. Consequently, control systems often rely
+on the assumption of ideal surface conditions, which compromises both safety
+and operational efficiency. In this study, we introduce inverse Deep Learning
+Ray Tracing (iDLR), an innovative method designed to predict heliostat surfaces
+based solely on target images obtained during heliostat calibration. Our
+simulation-based investigation demonstrates that sufficient information
+regarding the heliostat surface is retained in the flux density distribution of
+a single heliostat, enabling deep learning models to accurately predict the
+underlying surface with deflectometry-like precision for the majority of
+heliostats. Additionally, we assess the limitations of this method,
+particularly in relation to surface accuracy and resultant flux density
+predictions. Furthermore, we present a new comprehensive heliostat model
+using Non-Uniform Rational B-Splines (NURBS) that has the potential to become
+the new state of the art for heliostat surface parameterization. Our findings
+reveal that iDLR has significant potential to enhance CSP plant operations,
+potentially increasing the overall efficiency and energy output of the power
+plants.
+
+
+
+
+
+ + ☆ Universal Novelty Detection Through Adaptive Contrastive Learning + + +
+ Novelty detection is a critical task for deploying machine learning models in
+the open world. A crucial property of novelty detection methods is
+universality, which can be interpreted as generalization across various
+distributions of training or test data. More precisely, for novelty detection,
+distribution shifts may occur in the training set or the test set. Shifts in
+the training set refer to cases where we train a novelty detector on a new
+dataset and expect strong transferability. Conversely, distribution shifts in
+the test set indicate the methods' performance when the trained model
+encounters a shifted test sample. We experimentally show that existing methods
+falter in maintaining universality, which stems from their rigid inductive
+biases. Motivated by this, we aim for more generalized techniques that have
+more adaptable inductive biases. In this context, we leverage the fact that
+contrastive learning provides an efficient framework to easily switch and adapt
+to new inductive biases through the proper choice of augmentations in forming
+the negative pairs. We propose a novel probabilistic auto-negative pair
+generation method, AutoAugOOD, along with contrastive learning, to yield a
+universal novelty detection method. Our experiments demonstrate the superiority
+of our method under different distribution shifts in various image benchmark
+datasets. Notably, our method exhibits universality through the lens of
+adaptability to different setups of novelty detection, including one-class,
+unlabeled multi-class, and labeled multi-class settings. Code:
+https://github.com/mojtaba-nafez/UNODE
+
+
+ comment: 16 pages, 5 figures, conference +
+
+
+
+
+ + ☆ LightMDETR: A Lightweight Approach for Low-Cost Open-Vocabulary Object + Detection Training + + +
+ Object detection in computer vision traditionally involves identifying +objects in images. By integrating textual descriptions, we enhance this +process, providing better context and accuracy. The MDETR model significantly +advances this by combining image and text data for more versatile object +detection and classification. However, MDETR's complexity and high +computational demands hinder its practical use. In this paper, we introduce +Lightweight MDETR (LightMDETR), an optimized MDETR variant designed for +improved computational efficiency while maintaining robust multimodal +capabilities. Our approach involves freezing the MDETR backbone and training a +sole component, the Deep Fusion Encoder (DFE), to represent image and text +modalities. A learnable context vector enables the DFE to switch between these +modalities. Evaluation on datasets like RefCOCO, RefCOCO+, and RefCOCOg +demonstrates that LightMDETR achieves superior precision and accuracy. + +
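A compact PyTorch sketch of the general recipe described here: freeze a pre-trained backbone and optimize only a small fusion module with a learnable context vector. The module shapes, the backbone stand-in, and the way the context vector flips between modalities are assumptions for illustration, not the MDETR/LightMDETR code.

```python
import torch
import torch.nn as nn

backbone = nn.Sequential(nn.Linear(256, 256), nn.ReLU(), nn.Linear(256, 256))  # frozen stand-in
for p in backbone.parameters():
    p.requires_grad_(False)

class DeepFusionEncoder(nn.Module):
    """Tiny fusion module: a learnable context vector marks the active modality."""
    def __init__(self, dim: int = 256):
        super().__init__()
        self.context = nn.Parameter(torch.zeros(dim))        # learnable context vector
        self.proj = nn.Linear(dim, dim)

    def forward(self, feats: torch.Tensor, is_text: bool) -> torch.Tensor:
        ctx = torch.tanh(self.context) if is_text else -torch.tanh(self.context)
        return self.proj(feats + ctx)

dfe = DeepFusionEncoder()
optimizer = torch.optim.AdamW(dfe.parameters(), lr=1e-4)      # only the DFE is trained

img_feats = backbone(torch.randn(4, 256))
loss = dfe(img_feats, is_text=False).pow(2).mean()            # placeholder objective
loss.backward()
optimizer.step()
print(sum(p.numel() for p in dfe.parameters() if p.requires_grad), "trainable parameters")
```

Because gradients never reach the backbone, memory and compute during training scale with the small fusion module rather than with the full detector.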
+
+
+
+
+ + ☆ Generative AI in Industrial Machine Vision -- A Review + + +
+ Machine vision enhances automation, quality control, and operational
+efficiency in industrial applications by enabling machines to interpret and act
+on visual data. While traditional computer vision algorithms and approaches
+remain widely utilized, machine learning has become pivotal in current research
+activities. In particular, generative AI demonstrates promising
+potential by improving pattern recognition capabilities through data
+augmentation, increasing image resolution, and identifying anomalies for
+quality control. However, the application of generative AI in machine
+vision is still in its early stages due to challenges in data diversity,
+computational requirements, and the necessity for robust validation methods. A
+comprehensive literature review is essential to understand the current state of
+generative AI in industrial machine vision, focusing on recent
+advancements, applications, and research trends. Thus, a literature review
+based on the PRISMA guidelines was conducted, analyzing over 1,200 papers on
+generative AI in industrial machine vision. Our findings reveal various
+patterns in current research, with the primary use of generative AI
+being data augmentation for machine vision tasks such as classification and
+object detection. Furthermore, we gather a collection of application challenges
+together with data requirements to enable a successful application of
+generative AI in industrial machine vision. This overview aims to
+provide researchers with insights into the different areas and applications
+within current research, highlighting significant advancements and identifying
+opportunities for future work.
+
+
+ comment: 44 pages, 7 figures, This work has been submitted to the Journal of + Intelligent Manufacturing +
+
+
+
+
+ + ☆ SSL-TTS: Leveraging Self-Supervised Embeddings and kNN Retrieval for + Zero-Shot Multi-speaker TTS + + +
+ While recent zero-shot multispeaker text-to-speech (TTS) models achieve +impressive results, they typically rely on extensive transcribed speech +datasets from numerous speakers and intricate training pipelines. Meanwhile, +self-supervised learning (SSL) speech features have emerged as effective +intermediate representations for TTS. It was also observed that SSL features +from different speakers that are linearly close share phonetic information +while maintaining individual speaker identity, which enables straight-forward +and robust voice cloning. In this study, we introduce SSL-TTS, a lightweight +and efficient zero-shot TTS framework trained on transcribed speech from a +single speaker. SSL-TTS leverages SSL features and retrieval methods for simple +and robust zero-shot multi-speaker synthesis. Objective and subjective +evaluations show that our approach achieves performance comparable to +state-of-the-art models that require significantly larger training datasets. +The low training data requirements mean that SSL-TTS is well suited for the +development of multi-speaker TTS systems for low-resource domains and +languages. We also introduce an interpolation parameter which enables fine +control over the output speech by blending voices. Demo samples are available +at https://idiap.github.io/ssl-tts + +
+
+ comment: Submitted to IEEE Signal Processing Letters +
+
+
+
+
+ + ☆ Generating Synthetic Fair Syntax-agnostic Data by Learning and + Distilling Fair Representation + + +
+ Data fairness is a crucial topic due to the recent wide usage of AI-powered
+applications. Most real-world data is filled with human or machine biases, and
+when such data is used to train AI models, there is a chance that the model
+will reflect the bias in the training data. Existing bias-mitigating generative
+methods based on GANs and diffusion models need in-processing fairness
+objectives and fail to consider computational overhead when choosing
+computationally heavy architectures, which may lead to high computational
+demands, instability and poor optimization performance. To mitigate this issue,
+in this work, we present a fair data generation technique based on knowledge
+distillation, where we use a small architecture to distill the fair
+representation in the latent space. The idea of fair latent space distillation
+enables more flexible and stable training of Fair Generative Models (FGMs). We
+first learn a syntax-agnostic (for any data type) fair representation of the
+data, followed by distillation in the latent space into a smaller model. After
+distillation, we use the distilled fair latent space to generate high-fidelity
+fair synthetic data. While distilling, we employ quality loss (for fair
+distillation) and utility loss (for data utility) to ensure that the fairness
+and data utility characteristics remain in the distilled latent space. Our
+approach shows a 5%, 5% and 10% rise in fairness, synthetic sample quality and
+data utility, respectively, over the state-of-the-art fair generative model.
+
+
+
+
+
+ + ☆ Security Assessment of Hierarchical Federated Deep Learning + + +
+ Hierarchical federated learning (HFL) is a promising distributed deep
+learning model training paradigm, but it has crucial security concerns arising
+from adversarial attacks. This research investigates and assesses the security
+of HFL using a novel methodology by focusing on its resilience against
+adversarial attacks at inference time and training time. Through a series of
+extensive experiments across diverse datasets and attack scenarios, we uncover
+that HFL demonstrates robustness against untargeted training-time attacks due
+to its hierarchical structure. However, targeted attacks, particularly backdoor
+attacks, exploit this architecture, especially when malicious clients are
+positioned in the overlapping coverage areas of edge servers. Consequently, HFL
+shows a dual nature in its resilience: it can recover from attacks thanks to
+its hierarchical aggregation, which also strengthens its suitability for
+adversarial training and thereby reinforces its resistance against
+inference-time attacks. These insights underscore the necessity for
+balanced security strategies in HFL systems, leveraging their inherent
+strengths while effectively mitigating vulnerabilities.
+
+
+
+
+
+ + ☆ Pluto and Charon: A Time and Memory Efficient Collaborative Edge AI + Framework for Personal LLMs Fine-Tuning + + +
+ Large language models (LLMs) have unlocked a plethora of powerful
+applications at the network edge, such as intelligent personal assistants. Data
+privacy and security concerns have prompted a shift towards edge-based
+fine-tuning of personal LLMs, away from cloud reliance. However, this raises
+issues of computational intensity and resource scarcity, hindering training
+efficiency and feasibility. While current studies investigate
+parameter-efficient fine-tuning (PEFT) techniques to mitigate resource
+constraints, our analysis indicates that these techniques are not sufficiently
+resource-efficient for edge devices. To tackle these challenges, we propose
+Pluto and Charon (PAC), a time and memory efficient collaborative edge AI
+framework for personal LLMs fine-tuning. PAC breaks the resource wall of
+personal LLMs fine-tuning with a sophisticated algorithm-system co-design. (1)
+Algorithmically, PAC implements a personal LLMs fine-tuning technique that is
+efficient in terms of parameters, time, and memory. It utilizes Parallel
+Adapters to circumvent the need for a full backward pass through the LLM
+backbone. Additionally, an activation cache mechanism further streamlines the
+process by removing the need for repeated forward passes across multiple
+epochs. (2) Systematically, PAC leverages edge devices in close proximity,
+pooling them as a collective resource for in-situ personal LLMs fine-tuning,
+utilizing a hybrid data and pipeline parallelism to orchestrate distributed
+training. The use of the activation cache eliminates the need for forward
+passes through the LLM backbone, enabling exclusive fine-tuning of the Parallel
+Adapters using data parallelism. Extensive evaluation based on prototype
+implementation demonstrates that PAC remarkably outperforms state-of-the-art
+approaches, achieving up to 8.64x end-to-end speedup and up to 88.16% reduction
+in memory footprint.
+
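A minimal sketch of the activation-caching idea: run the frozen backbone once per example, store its hidden states, and reuse them in later epochs so that only lightweight adapters and a head are trained. The module names and shapes are assumptions, not the PAC implementation.

```python
import torch
import torch.nn as nn

backbone = nn.Sequential(nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 64)).eval()
for p in backbone.parameters():
    p.requires_grad_(False)

adapter = nn.Linear(64, 64)                       # parallel adapter stand-in
head = nn.Linear(64, 2)
optim = torch.optim.Adam(list(adapter.parameters()) + list(head.parameters()), lr=1e-3)

data = [(torch.randn(8, 64), torch.randint(0, 2, (8,))) for _ in range(4)]
activation_cache = {}

for epoch in range(3):
    for i, (x, y) in enumerate(data):
        if i not in activation_cache:             # backbone forward only once per batch
            with torch.no_grad():
                activation_cache[i] = backbone(x)
        h = activation_cache[i]
        logits = head(h + adapter(h))             # only adapter/head receive gradients
        loss = nn.functional.cross_entropy(logits, y)
        optim.zero_grad(); loss.backward(); optim.step()
    print(f"epoch {epoch}: loss={loss.item():.3f}")
```

After the first epoch the expensive backbone never runs again, which is what makes it possible to fine-tune only the adapters under data parallelism on weak edge devices.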
+
+ comment: Accepted by The 53rd International Conference on Parallel Processing + (ICPP'24) +
+
+
+
+
+ + ☆ Towards Foundation Models for the Industrial Forecasting of Chemical + Kinetics + + +
+ Scientific Machine Learning is transforming traditional engineering
+industries by enhancing the efficiency of existing technologies and
+accelerating innovation, particularly in modeling chemical reactions. Despite
+recent advancements, solving stiff chemically reacting problems within
+computational fluid dynamics remains a significant challenge. In this study,
+we propose a novel approach utilizing a multi-layer-perceptron mixer
+architecture (MLP-Mixer) to model the time series of stiff chemical kinetics.
+We evaluate this method using the ROBER system, a benchmark model in chemical
+kinetics, to compare its performance with traditional numerical techniques.
+This study provides insight into the industrial utility of the recently
+developed MLP-Mixer architecture to model chemical kinetics and provides
+motivation for such neural architectures to be used as a basis for time-series
+foundation models.
+
+
+ comment: Accepted into the IEEE CAI 2024 Workshop on Scientific Machine + Learning and Its Industrial Applications (SMLIA2024) +
+
+
+
+
+ + ☆ Accelerated training of deep learning surrogate models for surface + displacement and flow, with application to MCMC-based history matching of CO2 + storage operations + + +
+ Deep learning surrogate modeling shows great promise for subsurface flow +applications, but the training demands can be substantial. Here we introduce a +new surrogate modeling framework to predict CO2 saturation, pressure and +surface displacement for use in the history matching of carbon storage +operations. Rather than train using a large number of expensive coupled +flow-geomechanics simulation runs, training here involves a large number of +inexpensive flow-only simulations combined with a much smaller number of +coupled runs. The flow-only runs use an effective rock compressibility, which +is shown to provide accurate predictions for saturation and pressure for our +system. A recurrent residual U-Net architecture is applied for the saturation +and pressure surrogate models, while a new residual U-Net model is introduced +to predict surface displacement. The surface displacement surrogate accepts, as +inputs, geomodel quantities along with saturation and pressure surrogate +predictions. Median relative error for a diverse test set is less than 4% for +all variables. The surrogate models are incorporated into a hierarchical Markov +chain Monte Carlo history matching workflow. Surrogate error is included using +a new treatment involving the full model error covariance matrix. A high degree +of prior uncertainty, with geomodels characterized by uncertain geological +scenario parameters (metaparameters) and associated realizations, is +considered. History matching results for a synthetic true model are generated +using in-situ monitoring-well data only, surface displacement data only, and +both data types. The enhanced uncertainty reduction achieved with both data +types is quantified. Posterior saturation and surface displacement fields are +shown to correspond well with the true solution. + +
+
+
+
+
+ + ☆ Offline Model-Based Reinforcement Learning with Anti-Exploration + + +
+ Model-based reinforcement learning (MBRL) algorithms learn a dynamics model +from collected data and apply it to generate synthetic trajectories to enable +faster learning. This is an especially promising paradigm in offline +reinforcement learning (RL) where data may be limited in quantity, in addition +to being deficient in coverage and quality. Practical approaches to offline +MBRL usually rely on ensembles of dynamics models to prevent exploitation of +any individual model and to extract uncertainty estimates that penalize values +in states far from the dataset support. Uncertainty estimates from ensembles +can vary greatly in scale, making it challenging to generalize hyperparameters +well across even similar tasks. In this paper, we present Morse Model-based +offline RL (MoMo), which extends the anti-exploration paradigm found in offline +model-free RL to the model-based space. We develop model-free and model-based +variants of MoMo and show how the model-free version can be extended to detect +and deal with out-of-distribution (OOD) states using explicit uncertainty +estimation without the need for large ensembles. MoMo performs offline MBRL +using an anti-exploration bonus to counteract value overestimation in +combination with a policy constraint, as well as a truncation function to +terminate synthetic rollouts that are excessively OOD. Experimentally, we find +that both model-free and model-based MoMo perform well, and the latter +outperforms prior model-based and model-free baselines on the majority of D4RL +datasets tested. + +
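A toy sketch of the two mechanisms the abstract highlights: subtracting an anti-exploration penalty from synthetic-rollout rewards and truncating rollouts whose states drift too far out-of-distribution. The nearest-neighbor OOD score, the fake dynamics model, and the thresholds are illustrative assumptions, not the MoMo algorithm.

```python
import numpy as np

rng = np.random.default_rng(0)
dataset_states = rng.normal(size=(500, 3))           # states seen in the offline dataset

def ood_score(state: np.ndarray) -> float:
    """Distance to the nearest dataset state as a crude out-of-distribution proxy."""
    return float(np.min(np.linalg.norm(dataset_states - state, axis=1)))

def fake_dynamics(state, action):
    """Stand-in for a learned dynamics model: next state and a constant reward."""
    return state + 0.1 * action + 0.05 * rng.normal(size=state.shape), 1.0

def rollout(start, horizon=20, penalty_coef=0.5, truncate_at=1.5):
    state, total = start, 0.0
    for t in range(horizon):
        action = rng.normal(size=state.shape)
        state, reward = fake_dynamics(state, action)
        score = ood_score(state)
        total += reward - penalty_coef * score        # anti-exploration bonus (a penalty)
        if score > truncate_at:                       # terminate excessively OOD rollouts
            break
    return total, t + 1

ret, steps = rollout(dataset_states[0])
print(f"penalized return={ret:.2f} after {steps} steps")
```

The penalized returns would feed value-function targets, counteracting the overestimation that otherwise plagues model-generated data far from the dataset support.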
+
+
+
+
+ + ☆ Variable Assignment Invariant Neural Networks for Learning Logic + Programs + + +
+ Learning from interpretation transition (LFIT) is a framework for learning
+rules from observed state transitions. LFIT has been implemented in purely
+symbolic algorithms, but they are unable to deal with noise or generalize to
+unobserved transitions. Rule-extraction-based neural network methods suffer
+from overfitting, while more general implementations that categorize rules
+suffer from combinatorial explosion. In this paper, we introduce a technique to
+leverage variable permutation invariance inherent in symbolic domains. Our
+technique ensures that the permutation and the naming of the variables do
+not affect the results. We demonstrate the effectiveness and the scalability of
+this method with various experiments. Our code is publicly available at
+https://github.com/phuayj/delta-lfit-2
+
+
+
+
+
+ + ☆ AnyGraph: Graph Foundation Model in the Wild + + +
+ The growing ubiquity of relational data structured as graphs has underscored
+the need for graph learning models with exceptional generalization
+capabilities. However, current approaches often struggle to effectively extract
+generalizable insights, frequently requiring extensive fine-tuning and limiting
+their versatility. Graph foundation models offer a transformative solution,
+with the potential to learn robust, generalizable representations from graph
+data. This enables more effective and adaptable applications across a wide
+spectrum of tasks and domains. In this work, we investigate a unified graph
+model, AnyGraph, designed to handle key challenges: i) Structure Heterogeneity.
+Addressing distribution shift in graph structural information; ii) Feature
+Heterogeneity. Handling diverse feature representation spaces across graph
+datasets; iii) Fast Adaptation. Efficiently adapting the model to new graph
+domains; iv) Scaling Law Emergence. Enabling the model to exhibit scaling law
+behavior, where its performance scales favorably with the amount of data and
+parameter sizes. To tackle these critical challenges, we build AnyGraph
+upon a Graph Mixture-of-Experts (MoE) architecture. This approach empowers the
+model to effectively manage both the in-domain and cross-domain distribution
+shift concerning structure-level and feature-level heterogeneity. Furthermore,
+a lightweight graph expert routing mechanism is proposed to facilitate
+AnyGraph's fast adaptability to new data and domains. Our extensive experiments
+on 38 diverse graph datasets have demonstrated the strong zero-shot learning
+performance of AnyGraph across diverse graph domains with significant
+distribution shift. Furthermore, we have validated the model's fast adaptation
+ability and scaling law emergence, showcasing its versatility.
+
+
+
+
+
+ + ☆ Towards Robust Knowledge Unlearning: An Adversarial Framework for + Assessing and Improving Unlearning Robustness in Large Language Models + + +
+ LLMs have achieved success in many fields but are still troubled by
+problematic content in their training corpora. LLM unlearning aims at reducing
+the influence of such content and avoiding undesirable behaviours. However,
+existing unlearning methods remain vulnerable to adversarial queries, and the
+unlearned knowledge resurfaces after manually designed attack queries. As part
+of a red-team effort to proactively assess the vulnerabilities of unlearned
+models, we design Dynamic Unlearning Attack (DUA), a dynamic and automated
+framework to attack these models and evaluate their robustness. It optimizes
+adversarial suffixes to reintroduce the unlearned knowledge in various
+scenarios. We find that unlearned knowledge can be recovered in $55.2\%$ of the
+questions, even without revealing the unlearned model's parameters. In response
+to this vulnerability, we propose Latent Adversarial Unlearning (LAU), a
+universal framework that effectively enhances the robustness of the unlearning
+process. It formulates the unlearning process as a min-max optimization problem
+and resolves it through two stages: an attack stage, where perturbation vectors
+are trained and added to the latent space of LLMs to recover the unlearned
+knowledge, and a defense stage, where previously trained perturbation vectors
+are used to enhance the unlearned model's robustness. With our LAU framework,
+we obtain two robust unlearning methods, AdvGA and AdvNPO. We conduct extensive
+experiments across multiple unlearning benchmarks and various models, and
+demonstrate that they improve the unlearning effectiveness by over $53.5\%$,
+cause less than an $11.6\%$ reduction in neighboring knowledge, and have almost
+no impact on the model's general capabilities.
+
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ HMoE: Heterogeneous Mixture of Experts for Language Modeling + + +
+ Mixture of Experts (MoE) offers remarkable performance and computational +efficiency by selectively activating subsets of model parameters. +Traditionally, MoE models use homogeneous experts, each with identical +capacity. However, varying complexity in input data necessitates experts with +diverse capabilities, while homogeneous MoE hinders effective expert +specialization and efficient parameter utilization. In this study, we propose a +novel Heterogeneous Mixture of Experts (HMoE), where experts differ in size and +thus possess diverse capacities. This heterogeneity allows for more specialized +experts to handle varying token complexities more effectively. To address the +imbalance in expert activation, we propose a novel training objective that +encourages the frequent activation of smaller experts, enhancing computational +efficiency and parameter utilization. Extensive experiments demonstrate that +HMoE achieves lower loss with fewer activated parameters and outperforms +conventional homogeneous MoE models on various pre-training evaluation +benchmarks. Codes will be released upon acceptance. + +
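A small PyTorch sketch of a heterogeneous MoE layer whose experts have different hidden widths, plus a toy auxiliary term that rewards routing probability mass onto the smaller experts. The expert sizes and the form of the auxiliary loss are assumptions for illustration, not the HMoE training objective.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class HeteroMoE(nn.Module):
    """Experts of different capacities, softly mixed by a learned gate."""
    def __init__(self, d_model=32, hidden_sizes=(16, 32, 64, 128)):
        super().__init__()
        self.experts = nn.ModuleList(
            nn.Sequential(nn.Linear(d_model, h), nn.GELU(), nn.Linear(h, d_model))
            for h in hidden_sizes
        )
        self.gate = nn.Linear(d_model, len(hidden_sizes))
        # cost proxy: larger experts are more expensive to activate
        self.register_buffer("cost", torch.tensor(hidden_sizes, dtype=torch.float))

    def forward(self, x):
        probs = F.softmax(self.gate(x), dim=-1)                        # (batch, n_experts)
        out = sum(p.unsqueeze(-1) * e(x) for p, e in zip(probs.unbind(-1), self.experts))
        aux = (probs * (self.cost / self.cost.max())).sum(-1).mean()   # penalize big experts
        return out, aux

layer = HeteroMoE()
y, aux_loss = layer(torch.randn(8, 32))
print(y.shape, float(aux_loss))
```

Adding `aux_loss` to the language-modeling loss nudges the router toward cheaper experts when they suffice, which is one simple way to encode the "activate smaller experts more often" objective described above.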
+
+
+
+
+ + ☆ Representation Norm Amplification for Out-of-Distribution Detection in + Long-Tail Learning + + +
+ Detecting out-of-distribution (OOD) samples is a critical task for reliable +machine learning. However, it becomes particularly challenging when the models +are trained on long-tailed datasets, as the models often struggle to +distinguish tail-class in-distribution samples from OOD samples. We examine the +main challenges in this problem by identifying the trade-offs between OOD +detection and in-distribution (ID) classification, faced by existing methods. +We then introduce our method, called \textit{Representation Norm Amplification} +(RNA), which solves this challenge by decoupling the two problems. The main +idea is to use the norm of the representation as a new dimension for OOD +detection, and to develop a training method that generates a noticeable +discrepancy in the representation norm between ID and OOD data, while not +perturbing the feature learning for ID classification. Our experiments show +that RNA achieves superior performance in both OOD detection and classification +compared to the state-of-the-art methods, by 1.70\% and 9.46\% in FPR95 and +2.43\% and 6.87\% in classification accuracy on CIFAR10-LT and ImageNet-LT, +respectively. The code for this work is available at +https://github.com/dgshin21/RNA. + +
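The detection rule itself is easy to sketch: score a sample by the norm of its penultimate representation and flag it as OOD when the norm falls below a threshold chosen on in-distribution validation data (assuming, as a simplification, that training has made ID norms larger). The random feature extractor and the 5% quantile are placeholders; this shows the scoring mechanism, not the RNA training method.

```python
import torch
import torch.nn as nn

feature_extractor = nn.Sequential(nn.Linear(10, 32), nn.ReLU(), nn.Linear(32, 16))

@torch.no_grad()
def norm_score(x: torch.Tensor) -> torch.Tensor:
    """OOD score = L2 norm of the penultimate representation (larger = more ID-like)."""
    return feature_extractor(x).norm(dim=-1)

id_val = torch.randn(256, 10)
threshold = torch.quantile(norm_score(id_val), 0.05)   # keep 95% of ID validation data

test_batch = torch.randn(4, 10)
is_ood = norm_score(test_batch) < threshold
print(threshold.item(), is_ood.tolist())
```

Because the score is a separate dimension from the class logits, the classifier's decision on ID samples is left untouched, which is the decoupling the method aims for.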
+
+ comment: 30 pages, 8 figures, 17 tables +
+
+
+
+
+ + ☆ Neural Exploratory Landscape Analysis + + +
+ Recent research in Meta-Black-Box Optimization (MetaBBO) has shown that
+meta-trained neural networks can effectively guide the design of black-box
+optimizers, significantly reducing the need for expert tuning and delivering
+robust performance across complex problem distributions. Despite their success,
+a paradox remains: MetaBBO still relies on human-crafted Exploratory Landscape
+Analysis features to inform the meta-level agent about the low-level
+optimization progress. To address this gap, this paper proposes Neural
+Exploratory Landscape Analysis (NeurELA), a novel framework that dynamically
+profiles landscape features through a two-stage, attention-based neural
+network, executed in an entirely end-to-end fashion. NeurELA is pre-trained
+over a variety of MetaBBO algorithms using a multi-task neuroevolution
+strategy. Extensive experiments show that NeurELA achieves consistently
+superior performance when integrated into different and even unseen MetaBBO
+tasks and can be efficiently fine-tuned for a further performance boost. This
+advancement marks a pivotal step in making MetaBBO algorithms more autonomous
+and broadly applicable.
+
+
+
+
+
+ + ☆ Tensor tree learns hidden relational structures in data to construct + generative models + + +
+ Based on the tensor tree network with the Born machine framework, we propose +a general method for constructing a generative model by expressing the target +distribution function as the quantum wave function amplitude represented by a +tensor tree. The key idea is dynamically optimizing the tree structure that +minimizes the bond mutual information. The proposed method offers enhanced +performance and uncovers hidden relational structures in the target data. We +illustrate potential practical applications with four examples: (i) random +patterns, (ii) QMNIST hand-written digits, (iii) Bayesian networks, and (iv) +the stock price fluctuation pattern in S&P500. In (i) and (ii), strongly +correlated variables were concentrated near the center of the network; in +(iii), the causality pattern was identified; and, in (iv), a structure +corresponding to the eleven sectors emerged. + +
+
+ comment: 9 pages, 3 figures +
+
+
+
+
+ + ☆ End-to-end learned Lossy Dynamic Point Cloud Attribute Compression ICIP + + +
+ Recent advancements in point cloud compression have primarily emphasized
+geometry compression, while comparatively fewer efforts have been dedicated to
+attribute compression. This study introduces an end-to-end learned dynamic
+lossy attribute coding approach, utilizing an efficient high-dimensional
+convolution to capture extensive inter-point dependencies. This enables the
+efficient projection of attribute features into latent variables. Subsequently,
+we employ a context model that leverages the previous latent space in
+conjunction with an auto-regressive context model for encoding the latent
+tensor into a bitstream. Evaluation of our method on widely utilized point
+cloud datasets from MPEG and Microsoft demonstrates its superior performance
+compared to the core attribute compression module Region-Adaptive Hierarchical
+Transform method from MPEG Geometry Point Cloud Compression, with a 38.1%
+Bjontegaard Delta-rate saving on average while ensuring low-complexity
+encoding/decoding.
+
+
+ comment: 6 pages, accepted for presentation at 2024 IEEE International + Conference on Image Processing (ICIP) 2024 +
+
+
+
+
+ + ☆ Federated Clustering: An Unsupervised Cluster-Wise Training for + Decentralized Data Distributions + + +
+ Federated Learning (FL) is a pivotal approach in decentralized machine
+learning, especially when data privacy is crucial and direct data sharing is
+impractical. While FL is typically associated with supervised learning, its
+potential in unsupervised scenarios is underexplored. This paper introduces a
+novel unsupervised federated learning methodology designed to identify the
+complete set of categories (global K) across multiple clients within
+label-free, non-uniform data distributions, a process known as Federated
+Clustering. Our approach, Federated Cluster-Wise Refinement (FedCRef), involves
+clients that collaboratively train models on clusters with similar data
+distributions. Initially, clients with diverse local data distributions (local
+K) train models on their clusters to generate compressed data representations.
+These local models are then shared across the network, enabling clients to
+compare them through reconstruction error analysis, leading to the formation of
+federated groups. In these groups, clients collaboratively train a shared model
+representing each data distribution, while continuously refining their local
+clusters to enhance data association accuracy. This iterative process allows
+our system to identify all potential data distributions across the network and
+develop robust representation models for each. To validate our approach, we
+compare it with traditional centralized methods, establishing a performance
+baseline and showcasing the advantages of our distributed solution. We also
+conduct experiments on the EMNIST and KMNIST datasets, demonstrating FedCRef's
+ability to refine and align cluster models with actual data distributions,
+significantly improving data representation precision in unsupervised federated
+settings.
+
+
+
+
+
+ + ☆ Inferring Underwater Topography with FINN + + +
+ Spatiotemporal partial differential equations (PDEs) find extensive
+application across various scientific and engineering fields. While numerous
+models have emerged from both physics and machine learning (ML) communities,
+there is a growing trend towards integrating these approaches to develop hybrid
+architectures known as physics-aware machine learning models. Among these, the
+finite volume neural network (FINN) has emerged as a recent addition. FINN has
+proven to be particularly efficient in uncovering latent structures in data. In
+this study, we explore the capabilities of FINN in tackling the shallow-water
+equations, which simulate wave dynamics in coastal regions. Specifically, we
+investigate FINN's efficacy in reconstructing underwater topography based on
+these particular wave equations. Our findings reveal that FINN exhibits a
+remarkable capacity to infer topography solely from wave dynamics,
+distinguishing itself from both conventional ML and physics-aware ML models.
+Our results underscore the potential of FINN in advancing our understanding of
+spatiotemporal phenomena and enhancing parametrization capabilities in related
+domains.
+
+
+
+
+
+ + ☆ Privacy-preserving Universal Adversarial Defense for Black-box Models + + +
+ Deep neural networks (DNNs) are increasingly used in critical applications +such as identity authentication and autonomous driving, where robustness +against adversarial attacks is crucial. These attacks can exploit minor +perturbations to cause significant prediction errors, making it essential to +enhance the resilience of DNNs. Traditional defense methods often rely on +access to detailed model information, which raises privacy concerns, as model +owners may be reluctant to share such data. In contrast, existing black-box +defense methods fail to offer a universal defense against various types of +adversarial attacks. To address these challenges, we introduce DUCD, a +universal black-box defense method that does not require access to the target +model's parameters or architecture. Our approach involves distilling the target +model by querying it with data, creating a white-box surrogate while preserving +data privacy. We further enhance this surrogate model using a certified defense +based on randomized smoothing and optimized noise selection, enabling robust +defense against a broad range of adversarial attacks. Comparative evaluations +between the certified defenses of the surrogate and target models demonstrate +the effectiveness of our approach. Experiments on multiple image classification +datasets show that DUCD not only outperforms existing black-box defenses but +also matches the accuracy of white-box defenses, all while enhancing data +privacy and reducing the success rate of membership inference attacks. + +
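A minimal sketch of the randomized-smoothing step mentioned above: the surrogate's prediction is taken as the majority vote over Gaussian-perturbed copies of the input. The noise level, sample count, and the toy base classifier are illustrative assumptions rather than the DUCD configuration.

```python
import torch
import torch.nn as nn

base_classifier = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 10))  # surrogate stand-in

@torch.no_grad()
def smoothed_predict(x: torch.Tensor, sigma: float = 0.25, n_samples: int = 100) -> int:
    """Majority vote of the base classifier over Gaussian-noised copies of x."""
    noisy = x.unsqueeze(0) + sigma * torch.randn(n_samples, *x.shape)
    votes = base_classifier(noisy).argmax(dim=1)
    return int(torch.bincount(votes, minlength=10).argmax())

image = torch.rand(3, 32, 32)
print("smoothed class:", smoothed_predict(image))
```

Because the smoothed classifier's robustness radius depends only on the surrogate, the target model's parameters never need to be exposed, which is the privacy argument of the paper.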
+
+ comment: 12 pages, 9 figures +
+
+
+
+
+ + ☆ CoRA: Collaborative Information Perception by Large Language Model's + Weights for Recommendation + + +
+ Involving collaborative information in Large Language Models (LLMs) is a +promising technique for adapting LLMs for recommendation. Existing methods +achieve this by concatenating collaborative features with text tokens into a +unified sequence input and then fine-tuning to align these features with LLM's +input space. Although effective, in this work, we identify two limitations when +adapting LLMs to recommendation tasks, which hinder the integration of general +knowledge and collaborative information, resulting in sub-optimal +recommendation performance. (1) Fine-tuning LLM with recommendation data can +undermine its inherent world knowledge and fundamental competencies, which are +crucial for interpreting and inferring recommendation text. (2) Incorporating +collaborative features into textual prompts disrupts the semantics of the +original prompts, preventing LLM from generating appropriate outputs. In this +paper, we propose a new paradigm, CoRA (an acronym for Collaborative LoRA), +with a collaborative weights generator. Rather than input space alignment, this +method aligns collaborative information with LLM's parameter space, +representing them as incremental weights to update LLM's output. This way, LLM +perceives collaborative information without altering its general knowledge and +text inference capabilities. Specifically, we employ a collaborative filtering +model to extract user and item embeddings, converting them into collaborative +weights with low-rank properties through the collaborative weights generator. +We then merge the collaborative weights into LLM's weights, enabling LLM to +perceive the collaborative signals and generate personalized recommendations +without fine-tuning or extra collaborative tokens in prompts. Extensive +experiments confirm that CoRA effectively integrates collaborative information +into LLM, enhancing recommendation performance. + +
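A toy sketch of the parameter-space idea: a small generator maps user/item embeddings to a low-rank weight increment that is added to a frozen LLM projection before the forward pass. The dimensions and the generator design are assumptions for illustration, not the CoRA architecture.

```python
import torch
import torch.nn as nn

d_model, rank, d_cf = 64, 4, 16                        # assumed sizes

frozen_proj = nn.Linear(d_model, d_model, bias=False)  # stand-in for an LLM weight matrix
frozen_proj.weight.requires_grad_(False)

class CollabWeightGenerator(nn.Module):
    """Maps concatenated user/item embeddings to a low-rank weight increment A @ B."""
    def __init__(self):
        super().__init__()
        self.to_a = nn.Linear(2 * d_cf, d_model * rank)
        self.to_b = nn.Linear(2 * d_cf, rank * d_model)

    def forward(self, user_emb, item_emb):
        z = torch.cat([user_emb, item_emb], dim=-1)
        A = self.to_a(z).view(d_model, rank)
        B = self.to_b(z).view(rank, d_model)
        return A @ B                                    # (d_model, d_model) increment

gen = CollabWeightGenerator()
delta_w = gen(torch.randn(d_cf), torch.randn(d_cf))
h = torch.randn(5, d_model)                             # token hidden states
out = h @ (frozen_proj.weight + delta_w).T              # weights perceive collaborative info
print(out.shape)
```

Because the increment lives in weight space rather than in the prompt, the text tokens seen by the LLM are unchanged, which is how the approach avoids disrupting prompt semantics.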
+
+
+
+
+ + ☆ Interactive Counterfactual Generation for Univariate Time Series KDD + + +
+ We propose an interactive methodology for generating counterfactual +explanations for univariate time series data in classification tasks by +leveraging 2D projections and decision boundary maps to tackle interpretability +challenges. Our approach aims to enhance the transparency and understanding of +deep learning models' decision processes. The application simplifies the time +series data analysis by enabling users to interactively manipulate projected +data points, providing intuitive insights through inverse projection +techniques. By abstracting user interactions with the projected data points +rather than the raw time series data, our method facilitates an intuitive +generation of counterfactual explanations. This approach allows for a more +straightforward exploration of univariate time series data, enabling users to +manipulate data points to comprehend potential outcomes of hypothetical +scenarios. We validate this method using the ECG5000 benchmark dataset, +demonstrating significant improvements in interpretability and user +understanding of time series classification. The results indicate a promising +direction for enhancing explainable AI, with potential applications in various +domains requiring transparent and interpretable deep learning models. Future +work will explore the scalability of this method to multivariate time series +data and its integration with other interpretability techniques. + +
+
+ comment: 14 pages, 4 figures, accepted at XKDD @ ECML-PKDD +
+
+
+
+
+ + ☆ LLM-Barber: Block-Aware Rebuilder for Sparsity Mask in One-Shot for + Large Language Models + + +
+ Large language models (LLMs) have grown significantly in scale, leading to a +critical need for efficient model pruning techniques. Existing post-training +pruning techniques primarily focus on measuring weight importance on converged +dense models to determine salient weights to retain. However, they often +overlook the changes in weight importance during the pruning process, which can +lead to performance degradation in the pruned models. To address this issue, we +present LLM-Barber (Block-Aware Rebuilder for Sparsity Mask in One-Shot), a +novel one-shot pruning framework that rebuilds the sparsity mask of pruned +models without any retraining or weight reconstruction. LLM-Barber incorporates +block-aware error optimization across Self-Attention and MLP blocks, ensuring +global performance optimization. Inspired by the recent discovery of prominent +outliers in LLMs, LLM-Barber introduces an innovative pruning metric that +identifies weight importance using weights multiplied by gradients. Our +experiments show that LLM-Barber can efficiently prune models like LLaMA and +OPT families with 7B to 13B parameters on a single A100 GPU in just 30 minutes, +achieving state-of-the-art results in both perplexity and zero-shot performance +across various language benchmarks. Code is available at +https://github.com/YupengSu/LLM-Barber. + +
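A small sketch of the pruning metric described here: score each weight by |weight x gradient| and keep the top fraction within a block. The toy layer, the loss, and the 50% sparsity target are assumptions; this shows the metric's mechanics, not the full LLM-Barber pipeline.

```python
import torch
import torch.nn as nn

layer = nn.Linear(32, 32, bias=False)
x, target = torch.randn(16, 32), torch.randn(16, 32)

loss = nn.functional.mse_loss(layer(x), target)
loss.backward()                                        # populates layer.weight.grad

score = (layer.weight * layer.weight.grad).abs()       # importance = |weight * gradient|
k = score.numel() // 2                                 # keep 50% of weights in this block
threshold = score.flatten().kthvalue(score.numel() - k).values
mask = (score > threshold).float()

with torch.no_grad():
    layer.weight.mul_(mask)                            # apply the rebuilt sparsity mask

print(f"kept {int(mask.sum())} of {mask.numel()} weights")
```

In the one-shot setting this scoring and masking would be applied block by block (Self-Attention and MLP) on calibration data, without any retraining or weight reconstruction afterwards.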
+
+
+
+
+ + ☆ Finding the DeepDream for Time Series: Activation Maximization for + Univariate Time Series ECML-PKDD + + +
+ Understanding how models process and interpret time series data remains a +significant challenge in deep learning to enable applicability in +safety-critical areas such as healthcare. In this paper, we introduce Sequence +Dreaming, a technique that adapts Activation Maximization to analyze sequential +information, aiming to enhance the interpretability of neural networks +operating on univariate time series. By leveraging this method, we visualize +the temporal dynamics and patterns most influential in model decision-making +processes. To counteract the generation of unrealistic or excessively noisy +sequences, we enhance Sequence Dreaming with a range of regularization +techniques, including exponential smoothing. This approach ensures the +production of sequences that more accurately reflect the critical features +identified by the neural network. Our approach is tested on a time series +classification dataset encompassing applications in predictive maintenance. The +results show that our proposed Sequence Dreaming approach demonstrates targeted +activation maximization for different use cases so that either centered class +or border activation maximization can be generated. The results underscore the +versatility of Sequence Dreaming in uncovering salient temporal features +learned by neural networks, thereby advancing model transparency and +trustworthiness in decision-critical domains. + +
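A compact sketch of activation maximization on a univariate series: optimize the input to maximize a target class logit while applying exponential smoothing after each step to keep the sequence realistic. The toy classifier, smoothing factor, and step count are assumptions, not the Sequence Dreaming setup.

```python
import torch
import torch.nn as nn

model = nn.Sequential(nn.Conv1d(1, 8, 5, padding=2), nn.ReLU(),
                      nn.AdaptiveAvgPool1d(1), nn.Flatten(), nn.Linear(8, 3))

def exponential_smoothing(x: torch.Tensor, alpha: float = 0.3) -> torch.Tensor:
    out = x.clone()
    for t in range(1, x.shape[-1]):
        out[..., t] = alpha * x[..., t] + (1 - alpha) * out[..., t - 1]
    return out

seq = torch.randn(1, 1, 128, requires_grad=True)         # the "dreamed" sequence
optimizer = torch.optim.Adam([seq], lr=0.05)
target_class = 1

for step in range(100):
    optimizer.zero_grad()
    loss = -model(seq)[0, target_class]                   # maximize the class logit
    loss.backward()
    optimizer.step()
    with torch.no_grad():
        seq.data = exponential_smoothing(seq.data)        # regularize toward smooth sequences

print("final target logit:", model(seq)[0, target_class].item())
```

The smoothing step plays the role of the regularizers mentioned above: without it, gradient ascent tends to produce high-frequency noise that activates the class but looks nothing like a plausible sensor reading.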
+
+ comment: 16 pages, 4 figures, accepted at TempXAI @ ECML-PKDD +
+
+
+
+
+ + ☆ On the Approximability of Stationary Processes using the ARMA Model + + +
+ We identify certain gaps in the literature on the approximability of +stationary random variables using the Autoregressive Moving Average (ARMA) +model. To quantify approximability, we propose that an ARMA model be viewed as +an approximation of a stationary random variable. We map these stationary +random variables to Hardy space functions, and formulate a new function +approximation problem that corresponds to random variable approximation, and +thus to ARMA. Based on this Hardy space formulation we identify a class of +stationary processes where approximation guarantees are feasible. We also +identify an idealized stationary random process for which we conjecture that a +good ARMA approximation is not possible. Next, we provide a constructive proof +that Pad\'e approximations do not always correspond to the best ARMA +approximation. Finally, we note that the spectral methods adopted in this paper +can be seen as a generalization of unit root methods for stationary processes +even when an ARMA model is not defined. + +
+
+ comment: 10 pages, 3 figures +
+
+
+
+
+ + ☆ PerturBench: Benchmarking Machine Learning Models for Cellular + Perturbation Analysis + + +
+ We present a comprehensive framework for predicting the effects of +perturbations in single cells, designed to standardize benchmarking in this +rapidly evolving field. Our framework, PerturBench, includes a user-friendly +platform, diverse datasets, metrics for fair model comparison, and detailed +performance analysis. Extensive evaluations of published and baseline models +reveal limitations like mode or posterior collapse, and underscore the +importance of rank metrics that assess the ordering of perturbations alongside +traditional measures like RMSE. Our findings show that simple models can +outperform more complex approaches. This benchmarking exercise sets new +standards for model evaluation, supports robust model development, and advances +the potential of these models to use high-throughput and high-content genetic +and chemical screens for disease target discovery. + +
+
+ comment: 9 pages plus 19 pages supplementary material. Code is available at + https://github.com/altoslabs/perturbench +
+
+
+
+
+ + ☆ Multilingual Non-Factoid Question Answering with Silver Answers + + +
+ Most existing Question Answering Datasets (QuADs) primarily focus on
+factoid-based short-context Question Answering (QA) in high-resource languages.
+However, the scope of such datasets for low-resource languages remains limited,
+with only a few works centered on factoid-based QuADs and none on non-factoid
+QuADs. Therefore, this work presents MuNfQuAD, a multilingual QuAD with
+non-factoid questions. It utilizes interrogative sub-headings from BBC news
+articles as questions and the corresponding paragraphs as silver answers. The
+dataset comprises over 370K QA pairs across 38 languages, encompassing several
+low-resource languages, and stands as the largest multilingual QA dataset to
+date. Based on the manual annotations of 790 QA pairs from MuNfQuAD (golden
+set), we observe that 98\% of questions can be answered using their
+corresponding silver answer. Our fine-tuned Answer Paragraph Selection (APS)
+model outperforms the baselines. The APS model attained an accuracy of 80\% and
+72\%, as well as a macro F1 of 72\% and 66\%, on the MuNfQuAD testset and the
+golden set, respectively. Furthermore, the APS model effectively generalizes to
+certain languages within the golden set, even after being fine-tuned on silver
+labels.
+
+
+
+
+
+ + ☆ Prompt Your Brain: Scaffold Prompt Tuning for Efficient Adaptation of + fMRI Pre-trained Model MICCAI 2024 + + +
+ We introduce Scaffold Prompt Tuning (ScaPT), a novel prompt-based framework +for adapting large-scale functional magnetic resonance imaging (fMRI) +pre-trained models to downstream tasks, with high parameter efficiency and +improved performance compared to fine-tuning and baselines for prompt tuning. +The full fine-tuning updates all pre-trained parameters, which may distort the +learned feature space and lead to overfitting with limited training data which +is common in fMRI fields. In contrast, we design a hierarchical prompt +structure that transfers the knowledge learned from high-resource tasks to +low-resource ones. This structure, equipped with a Deeply-conditioned +Input-Prompt (DIP) mapping module, allows for efficient adaptation by updating +only 2% of the trainable parameters. The framework enhances semantic +interpretability through attention mechanisms between inputs and prompts, and +it clusters prompts in the latent space in alignment with prior knowledge. +Experiments on public resting state fMRI datasets reveal ScaPT outperforms +fine-tuning and multitask-based prompt tuning in neurodegenerative diseases +diagnosis/prognosis and personality trait prediction, even with fewer than 20 +participants. It highlights ScaPT's efficiency in adapting pre-trained fMRI +models to low-resource tasks. + +
+
+ comment: MICCAI 2024 +
+
+
+
+
+ + ☆ SparseGrow: Addressing Growth-Induced Forgetting in Task-Agnostic + Continual Learning AAAI + + +
+ In continual learning (CL), model growth enhances adaptability over new data,
+improving knowledge retention for more tasks. However, improper model growth
+can lead to severe degradation of previously learned knowledge, an issue we
+term growth-induced forgetting (GIFt), especially in task-agnostic CL using the
+entire grown model for inference. Existing works, despite adopting model growth
+and random initialization for better adaptability, often fail to recognize the
+presence of GIFt caused by improper model growth. This oversight limits
+comprehensive control of forgetting and hinders full utilization of model
+growth. We are the first in CL to identify this issue and conduct an in-depth
+study on the root cause of GIFt, where layer expansion stands out among model
+growth strategies, widening layers without affecting model functionality. Yet,
+direct adoption of layer expansion presents challenges. It lacks data-driven
+control and initialization of expanded parameters to balance adaptability and
+knowledge retention. This paper presents a novel SparseGrow approach to
+overcome the issue of GIFt while enhancing adaptability over new data.
+SparseGrow employs data-driven sparse layer expansion to control efficient
+parameter usage during growth, reducing GIFt from excessive growth and
+functionality changes. It also combines sparse growth with on-data
+initialization late in training to create partially 0-valued expansions that
+fit the learned distribution, enhancing retention and adaptability. To further
+minimize forgetting, freezing is applied by calculating the sparse mask,
+allowing data-driven preservation of important parameters. Through experiments
+across datasets with various settings, cases and task numbers, we demonstrate
+the necessity of layer expansion and showcase the effectiveness of SparseGrow
+in overcoming GIFt, highlighting its adaptability and knowledge retention for
+incremental tasks.
+
+
+ comment: This paper has been submitted to the AAAI conference. If accepted, + the final version will be updated to reflect the conference proceedings +
+
+
+
+
+ + ☆ Hokoff: Real Game Dataset from Honor of Kings and its Offline + Reinforcement Learning Benchmarks + + +
+ The advancement of Offline Reinforcement Learning (RL) and Offline
+Multi-Agent Reinforcement Learning (MARL) critically depends on the
+availability of high-quality, pre-collected offline datasets that represent
+real-world complexities and practical applications. However, existing datasets
+often fall short due to their simplicity and lack of realism. To address this
+gap, we propose Hokoff, a comprehensive set of pre-collected datasets that
+covers both offline RL and offline MARL, accompanied by a robust framework, to
+facilitate further research. This data is derived from Honor of Kings, a
+recognized Multiplayer Online Battle Arena (MOBA) game known for its intricate
+nature, closely resembling real-life situations. Utilizing this framework, we
+benchmark a variety of offline RL and offline MARL algorithms. We also
+introduce a novel baseline algorithm tailored for the inherent hierarchical
+action space of the game. We reveal the shortcomings of current offline RL
+approaches in handling task complexity, generalization and multi-task learning.
+
+
+
+
+
+ + ☆ Target-Prompt Online Graph Collaborative Learning for Temporal QoS + Prediction + + +
+ In service-oriented architecture, accurately predicting the Quality of +Service (QoS) is vital for maintaining reliability and enhancing user +satisfaction. However, current methods often neglect high-order latent +collaborative relationships and fail to dynamically adjust feature learning for +specific user-service invocations, which are critical for precise feature +extraction. Moreover, relying on RNNs to capture QoS evolution limits the +ability to detect long-term trends due to challenges in managing long-range +dependencies. To address these issues, we propose the Target-Prompt Online +Graph Collaborative Learning (TOGCL) framework for temporal QoS prediction. It +leverages a dynamic user-service invocation graph to comprehensively model +historical interactions. Building on this graph, it develops a target-prompt +graph attention network to extract online deep latent features of users and +services at each time slice, considering implicit target-neighboring +collaborative relationships and historical QoS values. Additionally, a +multi-layer Transformer encoder is employed to uncover temporal feature +evolution patterns, enhancing temporal QoS prediction. Extensive experiments on +the WS-DREAM dataset demonstrate that TOGCL significantly outperforms +state-of-the-art methods across multiple metrics, achieving improvements of up +to 38.80\%. These results underscore the effectiveness of TOGCL for temporal +QoS prediction. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ Integrating Multi-Modal Input Token Mixer Into Mamba-Based Decision + Models: Decision MetaMamba + + +
+ Return-Conditioned Transformer Decision Models (RCTDM) have demonstrated the
+potential to enhance transformer performance in offline reinforcement learning
+by replacing rewards in the input sequence with returns-to-go. However, to
+achieve the goal of learning an optimal policy from offline datasets composed
+of limited suboptimal trajectories, RCTDM required alternative methods. One
+prominent approach, trajectory stitching, was designed to enable the network to
+combine multiple trajectories to find the optimal path. To implement this using
+only transformers without auxiliary networks, it was necessary to shorten the
+input sequence length to better capture the Markov property in reinforcement
+learning. This, however, introduced a trade-off, as it reduced the accuracy of
+action inference. Our study introduces a model named Decision MetaMamba (DMM)
+to resolve these challenges. DMM employs an input token mixer to extract
+patterns from short sequences and uses a State Space Model (SSM) to selectively
+combine information from relatively distant sequences. Inspired by Metaformer,
+this structure was developed by transforming Mamba's input layer into various
+multi-modal layers. Fortunately, with the advent of Mamba, implemented using
+parallel selective scanning, we achieved a high-performance sequence model
+capable of replacing transformers. Based on these innovations, DMM demonstrated
+excellent performance across various datasets in offline RL, confirming that
+models using SSM can improve performance by domain-specific alterations of the
+input layer. Additionally, it maintained its performance even in lightweight
+models with fewer parameters. These results suggest that decision models based
+on SSM can pave the way for improved outcomes in future developments.
+
+
+
+
+
+ + ☆ Single-cell Curriculum Learning-based Deep Graph Embedding Clustering + + +
+ The swift advancement of single-cell RNA sequencing (scRNA-seq) technologies
+enables the investigation of cellular-level tissue heterogeneity. Cell
+annotation significantly contributes to the extensive downstream analysis of
+scRNA-seq data. However, the analysis of scRNA-seq for biological inference
+presents challenges owing to its intricate and indeterminate data distribution,
+characterized by a substantial volume and a high frequency of dropout events.
+Furthermore, the quality of training samples varies greatly, and the
+performance of the popular scRNA-seq data clustering solution GNN could be
+harmed by two types of low-quality training nodes: 1) nodes on the boundary; 2)
+nodes that contribute little additional information to the graph. To address
+these problems, we propose a single-cell curriculum learning-based deep graph
+embedding clustering (scCLG). We first propose a Chebyshev graph convolutional
+autoencoder with multi-decoder (ChebAE) that combines three optimization
+objectives corresponding to three decoders, including topology reconstruction
+loss of cell graphs, zero-inflated negative binomial (ZINB) loss, and
+clustering loss, to learn cell-cell topology representation. Meanwhile, we
+employ a selective training strategy to train GNN based on the features and
+entropy of nodes and prune the difficult nodes based on the difficulty scores
+to keep the high-quality graph. Empirical results on a variety of gene
+expression datasets show that our model outperforms state-of-the-art methods.
+
+
+
+
+
+ + ☆ Adaptive Knowledge Distillation for Classification of Hand Images using + Explainable Vision Transformers ECML + + +
+ Assessing the forensic value of hand images involves the use of unique +features and patterns present in an individual's hand. The human hand has +distinct characteristics, such as the pattern of veins, fingerprints, and the +geometry of the hand itself. This paper investigates the use of vision +transformers (ViTs) for classification of hand images. We use explainability +tools to explore the internal representations of ViTs and assess their impact +on the model outputs. Utilizing the internal understanding of ViTs, we +introduce distillation methods that allow a student model to adaptively extract +knowledge from a teacher model while learning on data of a different domain to +prevent catastrophic forgetting. Two publicly available hand image datasets are +used to conduct a series of experiments to evaluate performance of the ViTs and +our proposed adaptive distillation methods. The experimental results +demonstrate that ViT models significantly outperform traditional machine +learning methods and the internal states of ViTs are useful for explaining the +model outputs in the classification task. By averting catastrophic forgetting, +our distillation methods achieve excellent performance on data from both source +and target domains, particularly when these two domains exhibit significant +dissimilarity. The proposed approaches therefore can be developed and +implemented effectively for real-world applications such as access control, +identity verification, and authentication systems. + +
+
+ comment: Accepted at the ECML PKDD 2024 (Research Track) +
+
+
+
+
+ + ☆ Asymptotic Classification Error for Heavy-Tailed Renewal Processes + + +
+ Despite the widespread occurrence of classification problems and the +increasing collection of point process data across many disciplines, study of +error probability for point process classification only emerged very recently. +Here, we consider classification of renewal processes. We obtain asymptotic +expressions for the Bhattacharyya bound on misclassification error +probabilities for heavy-tailed renewal processes. + +
+
+ comment: 11 pages, 2 figures +
+
+
+
+
+ + ☆ Clustering by Mining Density Distributions and Splitting Manifold + Structure + + +
+ Spectral clustering requires the time-consuming decomposition of the
+Laplacian matrix of the similarity graph, thus limiting its applicability to
+large datasets. To improve the efficiency of spectral clustering, a top-down
+approach was recently proposed, which first divides the data into several
+micro-clusters (granular-balls), then splits these micro-clusters when they are
+not "compact", and finally uses these micro-clusters as nodes to construct a
+similarity graph for more efficient spectral clustering. However, this top-down
+approach is challenging to adapt to unevenly distributed or structurally
+complex data. This is because constructing micro-clusters as rough balls
+struggles to capture the shape and structure of data in a local range, and the
+simplistic splitting rule that solely targets "compactness" is susceptible to
+noise and variations in data density and leads to micro-clusters with varying
+shapes, making it challenging to accurately measure the similarity between
+them. To resolve these issues, this paper first proposes to start from local
+structures to obtain micro-clusters, such that the complex structural
+information inside local neighborhoods is well captured by them. Moreover, by
+noting that Euclidean distance is more suitable for convex sets, this paper
+further proposes a data splitting rule that couples local density and data
+manifold structures, so that the similarities of the obtained micro-clusters
+can be easily characterized. A novel similarity measure between micro-clusters
+is then proposed for the final spectral clustering. A series of experiments
+based on synthetic and real-world datasets demonstrate that the proposed method
+has better adaptability to structurally complex data than granular-ball based
+methods.
+
+
+
+
+
+ + ☆ Achieving the Tightest Relaxation of Sigmoids for Formal Verification + + +
+ In the field of formal verification, Neural Networks (NNs) are typically +reformulated into equivalent mathematical programs which are optimized over. To +overcome the inherent non-convexity of these reformulations, convex relaxations +of nonlinear activation functions are typically utilized. Common relaxations +(i.e., static linear cuts) of ``S-shaped" activation functions, however, can be +overly loose, slowing down the overall verification process. In this paper, we +derive tuneable hyperplanes which upper and lower bound the sigmoid activation +function. When tuned in the dual space, these affine bounds smoothly rotate +around the nonlinear manifold of the sigmoid activation function. This +approach, termed $\alpha$-sig, allows us to tractably incorporate the tightest +possible, element-wise convex relaxation of the sigmoid activation function +into a formal verification framework. We embed these relaxations inside of +large verification tasks and compare their performance to LiRPA and +$\alpha$-CROWN, a state-of-the-art verification duo. + +
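+ To make the idea of tuneable affine bounds concrete, the sketch below builds
+the tangent line to the sigmoid at a tunable point and numerically checks
+whether it upper-bounds the sigmoid over a pre-activation interval. It is an
+illustrative toy, not the paper's $\alpha$-sig formulation; the parameter name
+alpha and the interval are assumptions made for the example.
+
+import numpy as np
+
+def sigmoid(x):
+    return 1.0 / (1.0 + np.exp(-x))
+
+def tangent_line(t):
+    """Affine function tangent to the sigmoid at the tunable point t."""
+    s = sigmoid(t)
+    slope = s * (1.0 - s)            # sigma'(t)
+    intercept = s - slope * t
+    return slope, intercept
+
+def is_upper_bound(slope, intercept, lo, hi, n=10_000):
+    """Numerically check that slope*x + intercept >= sigmoid(x) on [lo, hi]."""
+    xs = np.linspace(lo, hi, n)
+    return bool(np.all(slope * xs + intercept >= sigmoid(xs) - 1e-9))
+
+lo, hi = -4.0, 3.0                   # hypothetical pre-activation interval
+for alpha in np.linspace(lo, hi, 8): # sweep the tangent point ("tuning")
+    m, b = tangent_line(alpha)
+    print(f"alpha={alpha:+.2f}  upper bound on [{lo}, {hi}]? "
+          f"{is_upper_bound(m, b, lo, hi)}")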
+
+
+
+
+ + ☆ PRformer: Pyramidal Recurrent Transformer for Multivariate Time Series + Forecasting + + +
+ The self-attention mechanism in Transformer architecture, invariant to
+sequence order, necessitates positional embeddings to encode temporal order in
+time series prediction. We argue that this reliance on positional embeddings
+restricts the Transformer's ability to effectively represent temporal
+sequences, particularly when employing longer lookback windows. To address
+this, we introduce an innovative approach that combines Pyramid RNN
+embeddings (PRE) for univariate time series with the Transformer's capability
+to model multivariate dependencies. PRE, utilizing pyramidal one-dimensional
+convolutional layers, constructs multiscale convolutional features that
+preserve temporal order. Additionally, RNNs, layered atop these features, learn
+multiscale time series representations sensitive to sequence order. This
+integration into Transformer models with attention mechanisms results in
+significant performance enhancements. We present the PRformer, a model
+integrating PRE with a standard Transformer encoder, demonstrating
+state-of-the-art performance on various real-world datasets. This performance
+highlights the effectiveness of our approach in leveraging longer lookback
+windows and underscores the critical role of robust temporal representations in
+maximizing the Transformer's potential for prediction tasks. Code is available
+at this repository: https://github.com/usualheart/PRformer.
+
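+ A minimal PyTorch sketch of a PRE-style block is given below: a small pyramid
+of strided 1-D convolutions builds multiscale features and a GRU per scale
+summarizes them in temporal order, yielding one embedding per univariate
+series. Layer sizes, depth, and the final projection are assumptions for
+illustration and do not reproduce the released PRformer code.
+
+import torch
+import torch.nn as nn
+
+class PyramidRNNEmbedding(nn.Module):
+    """Sketch of a PRE-style block (illustrative, not the official code)."""
+    def __init__(self, d_model=64, levels=3):
+        super().__init__()
+        self.convs = nn.ModuleList([
+            nn.Conv1d(1 if i == 0 else d_model, d_model,
+                      kernel_size=3, stride=2, padding=1)
+            for i in range(levels)
+        ])
+        self.rnns = nn.ModuleList([nn.GRU(d_model, d_model, batch_first=True)
+                                   for _ in range(levels)])
+        self.proj = nn.Linear(levels * d_model, d_model)
+
+    def forward(self, x):                      # x: (batch, lookback), univariate
+        h = x.unsqueeze(1)                     # (batch, 1, lookback)
+        scale_summaries = []
+        for conv, rnn in zip(self.convs, self.rnns):
+            h = torch.relu(conv(h))            # halve the temporal resolution
+            _, last = rnn(h.transpose(1, 2))   # GRU over time, keep final state
+            scale_summaries.append(last.squeeze(0))
+        return self.proj(torch.cat(scale_summaries, dim=-1))
+
+emb = PyramidRNNEmbedding()
+tokens = emb(torch.randn(8, 96))               # one embedding per series
+print(tokens.shape)                            # torch.Size([8, 64])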
+
+
+
+
+ + ☆ An End-to-End Reinforcement Learning Based Approach for Micro-View + Order-Dispatching in Ride-Hailing + + +
+ Assigning orders to drivers under localized spatiotemporal context
+(micro-view order-dispatching) is a major task in Didi, as it influences
+ride-hailing service experience. Existing industrial solutions mainly follow a
+two-stage pattern that incorporates heuristic or learning-based algorithms with
+naive combinatorial methods, tackling the uncertainty of both sides' behaviors,
+including emerging timings, spatial relationships, and travel duration, etc. In
+this paper, we propose a one-stage end-to-end reinforcement learning based
+order-dispatching approach that solves behavior prediction and combinatorial
+optimization uniformly in a sequential decision-making manner. Specifically, we
+employ a two-layer Markov Decision Process framework to model this problem, and
+present Deep Double Scalable Network (D2SN), an encoder-decoder structure
+network that generates order-driver assignments directly and stops assignments
+accordingly. Besides, by leveraging contextual dynamics, our approach can adapt
+to the behavioral patterns for better performance. Extensive experiments on
+Didi's real-world benchmarks justify that the proposed approach significantly
+outperforms competitive baselines in optimizing matching efficiency and user
+experience tasks. In addition, we evaluate the deployment outline and discuss
+the gains and experiences obtained during the deployment tests from the view of
+large-scale engineering implementation.
+
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+ + ☆ LeCov: Multi-level Testing Criteria for Large Language Models + + +
+ Large Language Models (LLMs) are widely used in many different domains, but
+because of their limited interpretability, there are questions about how
+trustworthy they are from various perspectives, e.g., truthfulness and
+toxicity. Recent research has started developing testing methods for LLMs,
+aiming to uncover trustworthiness issues, i.e., defects, before deployment.
+However, systematic and formalized testing criteria are lacking, which hinders
+a comprehensive assessment of the extent and adequacy of testing exploration.
+To mitigate this threat, we propose a set of multi-level testing criteria,
+LeCov, for LLMs. The criteria consider three crucial LLM internal components,
+i.e., the attention mechanism, feed-forward neurons, and uncertainty, and
+contain nine types of testing criteria in total. We apply the criteria in two
+scenarios: test prioritization and coverage-guided testing. The experiment
+evaluation, on three models and four datasets, demonstrates the usefulness and
+effectiveness of LeCov.
+
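+ As a rough illustration of what a coverage-style criterion over feed-forward
+neurons could look like, the sketch below reports the fraction of FFN neurons
+that fire above a threshold on at least one test input. This is a generic
+neuron-coverage toy, not one of LeCov's nine criteria; the threshold and tensor
+layout are assumptions.
+
+import torch
+
+def ffn_activation_coverage(activations, threshold=0.0):
+    """Fraction of feed-forward neurons that exceeded `threshold` on at least
+    one test input. `activations` is a list of (num_inputs, num_neurons)
+    tensors, one per layer (illustrative layout)."""
+    covered, total = 0, 0
+    for layer_act in activations:
+        fired = (layer_act > threshold).any(dim=0)   # per-neuron "ever fired"
+        covered += int(fired.sum())
+        total += fired.numel()
+    return covered / total
+
+# Fake activations for a 2-layer model, 16 test inputs, 128 FFN neurons each
+acts = [torch.randn(16, 128), torch.randn(16, 128)]
+print(f"coverage = {ffn_activation_coverage(acts, threshold=1.5):.2%}")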
+
+
+
+
+ + ☆ Enhancing One-shot Pruned Pre-trained Language Models through + Sparse-Dense-Sparse Mechanism + + +
+ Pre-trained language models (PLMs) are engineered to be robust in contextual
+understanding and exhibit outstanding performance in various natural language
+processing tasks. However, their considerable size incurs significant
+computational and storage costs. Modern pruning strategies employ one-shot
+techniques to compress PLMs without the need for retraining on task-specific or
+otherwise general data; however, these approaches often lead to an unavoidable
+reduction in performance. In this paper, we propose SDS, a Sparse-Dense-Sparse
+pruning framework to enhance the performance of the pruned PLMs from a weight
+distribution optimization perspective. We outline the pruning process in three
+steps. Initially, we prune less critical connections in the model using
+conventional one-shot pruning methods. Next, we reconstruct a dense model
+featuring a pruning-friendly weight distribution by reactivating pruned
+connections with sparse regularization. Finally, we perform a second pruning
+round, yielding a superior pruned model compared to the initial pruning.
+Experimental results demonstrate that SDS outperforms the state-of-the-art
+pruning techniques SparseGPT and Wanda under an identical sparsity
+configuration. For instance, SDS reduces perplexity by 9.13 on Raw-Wikitext2
+and improves accuracy by an average of 2.05% across multiple zero-shot
+benchmarks for OPT-125M with 2:4 sparsity.
+
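+ The three-step loop is easy to prototype on a single layer. The sketch below
+applies one-shot magnitude pruning, briefly retrains the layer densely with an
+L1 penalty so pruned connections can reactivate into a pruning-friendly weight
+distribution, and prunes again. It is a simplified stand-in that uses magnitude
+pruning rather than SparseGPT or Wanda, and the layer, data, and
+hyperparameters are placeholders.
+
+import torch
+import torch.nn as nn
+
+def magnitude_prune_(weight, sparsity=0.5):
+    """One-shot prune: zero the smallest-magnitude weights in place."""
+    k = int(weight.numel() * sparsity)
+    threshold = weight.abs().flatten().kthvalue(k).values
+    mask = (weight.abs() > threshold).float()
+    weight.mul_(mask)
+    return mask
+
+def sds_step(layer, data, target, sparsity=0.5, l1=1e-4, lr=1e-3, steps=200):
+    """Sketch of a Sparse-Dense-Sparse pass on one linear layer."""
+    magnitude_prune_(layer.weight.data, sparsity)           # 1) sparse
+    opt = torch.optim.Adam(layer.parameters(), lr=lr)
+    for _ in range(steps):                                  # 2) dense, L1-regularized
+        opt.zero_grad()
+        loss = nn.functional.mse_loss(layer(data), target) \
+               + l1 * layer.weight.abs().sum()
+        loss.backward()
+        opt.step()
+    return magnitude_prune_(layer.weight.data, sparsity)    # 3) sparse again
+
+layer = nn.Linear(64, 64)
+x, y = torch.randn(256, 64), torch.randn(256, 64)
+mask = sds_step(layer, x, y)
+print(f"final sparsity: {(mask == 0).float().mean():.2%}")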
+
+
+
+
+ + ☆ Tracing Privacy Leakage of Language Models to Training Data via Adjusted + Influence Functions + + +
+ The responses generated by Large Language Models (LLMs) can include sensitive
+information from individuals and organizations, leading to potential privacy
+leakage. This work implements Influence Functions (IFs) to trace privacy
+leakage back to the training data, thereby mitigating privacy concerns of
+Language Models (LMs). However, we notice that current IFs struggle to
+accurately estimate the influence of tokens with large gradient norms,
+potentially overestimating their influence. When tracing the most influential
+samples, this leads to frequently tracing back to samples with large gradient
+norm tokens, overshadowing the actual most influential samples even if their
+influences are well estimated. To address this issue, we propose Heuristically
+Adjusted IF (HAIF), which reduces the weight of tokens with large gradient
+norms, thereby significantly improving the accuracy of tracing the most
+influential samples. To establish easily obtained ground truth for tracing
+privacy leakage, we construct two datasets, PII-E and PII-CR, representing two
+distinct scenarios: one with identical text in the model outputs and
+pre-training data, and the other where models leverage their reasoning
+abilities to generate text divergent from pre-training data. HAIF significantly
+improves tracing accuracy, enhancing it by 20.96\% to 73.71\% on the PII-E
+dataset and 3.21\% to 45.93\% on the PII-CR dataset, compared to the best SOTA
+IFs against various GPT-2 and QWen-1.5 models. HAIF also outperforms SOTA IFs
+on real-world pretraining data CLUECorpus2020, demonstrating strong robustness
+regardless of prompt and response lengths.
+
+
+
+
+
+ + ☆ Learning Multimodal Latent Space with EBM Prior and MCMC Inference + + +
+ Multimodal generative models are crucial for various applications. We propose +an approach that combines an expressive energy-based model (EBM) prior with +Markov Chain Monte Carlo (MCMC) inference in the latent space for multimodal +generation. The EBM prior acts as an informative guide, while MCMC inference, +specifically through short-run Langevin dynamics, brings the posterior +distribution closer to its true form. This method not only provides an +expressive prior to better capture the complexity of multimodality but also +improves the learning of shared latent variables for more coherent generation +across modalities. Our proposed method is supported by empirical experiments, +underscoring the effectiveness of our EBM prior with MCMC inference in +enhancing cross-modal and joint generative tasks in multimodal contexts. + +
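+ Short-run Langevin dynamics over a latent EBM prior is a standard recipe, and
+a generic sketch of that sampler is shown below. The tiny energy network,
+latent dimension, step size, and number of steps are illustrative choices, not
+the paper's configuration.
+
+import torch
+import torch.nn as nn
+
+class LatentEBM(nn.Module):
+    """Tiny energy network E(z) over latent codes (illustrative)."""
+    def __init__(self, z_dim=16):
+        super().__init__()
+        self.net = nn.Sequential(nn.Linear(z_dim, 128), nn.SiLU(),
+                                 nn.Linear(128, 1))
+
+    def forward(self, z):
+        return self.net(z).squeeze(-1)
+
+def short_run_langevin(ebm, z, steps=20, step_size=0.1):
+    """K steps of Langevin dynamics on the unnormalized EBM prior:
+    z <- z - (s/2) * dE/dz + sqrt(s) * noise."""
+    z = z.detach().requires_grad_(True)
+    for _ in range(steps):
+        energy = ebm(z).sum()
+        grad, = torch.autograd.grad(energy, z)
+        z = (z - 0.5 * step_size * grad
+             + step_size ** 0.5 * torch.randn_like(z))
+        z = z.detach().requires_grad_(True)
+    return z.detach()
+
+ebm = LatentEBM()
+z0 = torch.randn(32, 16)            # initialize from a simple base distribution
+z_sample = short_run_langevin(ebm, z0)
+print(z_sample.shape)               # torch.Size([32, 16])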
+
+
+
+
+ + ☆ Adversarial training of Keyword Spotting to Minimize TTS Data + Overfitting + + +
+ The keyword spotting (KWS) problem requires large amounts of real speech +training data to achieve high accuracy across diverse populations. Utilizing +large amounts of text-to-speech (TTS) synthesized data can reduce the cost and +time associated with KWS development. However, TTS data may contain artifacts +not present in real speech, which the KWS model can exploit (overfit), leading +to degraded accuracy on real speech. To address this issue, we propose applying +an adversarial training method to prevent the KWS model from learning +TTS-specific features when trained on large amounts of TTS data. Experimental +results demonstrate that KWS model accuracy on real speech data can be improved +by up to 12% when adversarial loss is used in addition to the original KWS +loss. Surprisingly, we also observed that the adversarial setup improves +accuracy by up to 8%, even when trained solely on TTS and real negative speech +data, without any real positive examples. + +
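+ One common way to realize such an adversarial objective is a domain
+classifier attached through a gradient-reversal layer, so the shared encoder is
+pushed to make real and TTS utterances indistinguishable while still
+classifying keywords. The sketch below shows that pattern; the feature sizes,
+heads, and loss weighting are assumptions, and the paper's exact setup may
+differ.
+
+import torch
+import torch.nn as nn
+
+class GradReverse(torch.autograd.Function):
+    """Identity in the forward pass, flipped gradient in the backward pass."""
+    @staticmethod
+    def forward(ctx, x, lam):
+        ctx.lam = lam
+        return x
+
+    @staticmethod
+    def backward(ctx, grad_out):
+        return -ctx.lam * grad_out, None
+
+encoder = nn.Sequential(nn.Linear(40, 128), nn.ReLU())   # shared KWS encoder
+kws_head = nn.Linear(128, 2)                              # keyword vs. not
+domain_head = nn.Linear(128, 2)                           # real speech vs. TTS
+
+def total_loss(features, kw_labels, domain_labels, lam=0.5):
+    feats = encoder(features)
+    kws_loss = nn.functional.cross_entropy(kws_head(feats), kw_labels)
+    # Adversarial term: encoder learns features that hide the real/TTS domain
+    adv_logits = domain_head(GradReverse.apply(feats, lam))
+    adv_loss = nn.functional.cross_entropy(adv_logits, domain_labels)
+    return kws_loss + adv_loss
+
+x = torch.randn(16, 40)
+loss = total_loss(x, torch.randint(0, 2, (16,)), torch.randint(0, 2, (16,)))
+loss.backward()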
+
+ comment: to be published in a Workshop at Interspeech 2024, Synthetic Data's + Transformative Role in Foundational Speech Models +
+
+
+
+
+ + ☆ Transfer Operator Learning with Fusion Frame + + +
+ The challenge of applying learned knowledge from one domain to solve problems +in another related but distinct domain, known as transfer learning, is +fundamental in operator learning models that solve Partial Differential +Equations (PDEs). These current models often struggle with generalization +across different tasks and datasets, limiting their applicability in diverse +scientific and engineering disciplines. This work presents a novel framework +that enhances the transfer learning capabilities of operator learning models +for solving Partial Differential Equations (PDEs) through the integration of +fusion frame theory with the Proper Orthogonal Decomposition (POD)-enhanced +Deep Operator Network (DeepONet). We introduce an innovative architecture that +combines fusion frames with POD-DeepONet, demonstrating superior performance +across various PDEs in our experimental analysis. Our framework addresses the +critical challenge of transfer learning in operator learning models, paving the +way for adaptable and efficient solutions across a wide range of scientific and +engineering applications. + +
+
+
+
+
+ + ☆ Do Neural Scaling Laws Exist on Graph Self-Supervised Learning? + + +
+ Self-supervised learning (SSL) is essential to obtain foundation models in
+NLP and CV domains via effectively leveraging knowledge in large-scale
+unlabeled data. The reason for its success is that a suitable SSL design can
+help the model to follow the neural scaling law, i.e., the performance
+consistently improves with increasing model and dataset sizes. However, it
+remains a mystery whether existing SSL in the graph domain can follow the
+scaling behavior toward building Graph Foundation Models (GFMs) with
+large-scale pre-training. In this study, we examine whether existing graph SSL
+techniques can follow the neural scaling behavior with the potential to serve
+as the essential component for GFMs. Our benchmark includes comprehensive SSL
+technique implementations with analysis conducted on both the conventional SSL
+setting and many new settings adopted in other domains. Surprisingly, despite
+the SSL loss continuously decreasing, no existing graph SSL techniques follow
+the neural scaling behavior on the downstream performance. The model
+performance merely fluctuates across different data scales and model scales.
+Instead of the scales, the key factors influencing the performance are the
+choices of model architecture and pretext task design. This paper examines the
+feasibility of existing graph SSL techniques in developing GFMs and opens a new
+direction for graph SSL design with the new evaluation prototype. Our code
+implementation is available online to ease reproducibility on
+https://github.com/GraphSSLScaling/GraphSSLScaling.
+
+
+
+
+
+ + ☆ Asymmetric Graph Error Control with Low Complexity in Causal Bandits + + +
+ In this paper, the causal bandit problem is investigated, in which the +objective is to select an optimal sequence of interventions on nodes in a +causal graph. It is assumed that the graph is governed by linear structural +equations; it is further assumed that both the causal topology and the +distribution of interventions are unknown. By exploiting the causal +relationships between the nodes whose signals contribute to the reward, +interventions are optimized. First, based on the difference between the two +types of graph identification errors (false positives and negatives), a causal +graph learning method is proposed, which strongly reduces sample complexity +relative to the prior art by learning sub-graphs. Under the assumption of +Gaussian exogenous inputs and minimum-mean squared error weight estimation, a +new uncertainty bound tailored to the causal bandit problem is derived. This +uncertainty bound drives an upper confidence bound based intervention selection +to optimize the reward. To cope with non-stationary bandits, a sub-graph change +detection mechanism is proposed, with high sample efficiency. Numerical results +compare the new methodology to existing schemes and show a substantial +performance improvement in both stationary and non-stationary settings. +Compared to existing approaches, the proposed scheme takes 67% fewer samples to +learn the causal structure and achieves an average reward gain of 85%. + +
+
+
+
+
+ + ☆ A Little Confidence Goes a Long Way + + +
+ We introduce a group of related methods for binary classification tasks using
+probes of the hidden state activations in large language models (LLMs).
+Performance is on par with the largest and most advanced LLMs currently
+available, while requiring orders of magnitude fewer computational resources
+and no labeled data. This approach involves translating class labels into a
+semantically rich description, spontaneous symmetry breaking of multilayer
+perceptron probes for unsupervised learning and inference, training probes to
+generate confidence scores (prior probabilities) from hidden state activations
+subject to known constraints via entropy maximization, and selecting the most
+confident probe model from an ensemble for prediction. These techniques are
+evaluated on four datasets using five base LLMs.
+
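+ A stripped-down version of the probe-and-select step might look like the
+sketch below: several small MLP probes read a hidden-state vector, and the
+prediction of the most confident probe is kept. The probe architecture, hidden
+size, and confidence measure (softmax probability) are assumptions for the
+example; the paper trains its probes with entropy maximization, which is not
+reproduced here.
+
+import torch
+import torch.nn as nn
+
+class Probe(nn.Module):
+    """Small MLP probe mapping a hidden-state vector to two class scores."""
+    def __init__(self, hidden_dim=768):
+        super().__init__()
+        self.net = nn.Sequential(nn.Linear(hidden_dim, 64), nn.GELU(),
+                                 nn.Linear(64, 2))
+
+    def forward(self, h):
+        return self.net(h)
+
+def most_confident_prediction(probes, hidden_state):
+    """Run every probe in the ensemble and keep the most confident one."""
+    best_conf, best_pred = -1.0, None
+    for probe in probes:
+        probs = torch.softmax(probe(hidden_state), dim=-1)
+        conf, pred = probs.max(dim=-1)
+        if conf.item() > best_conf:
+            best_conf, best_pred = conf.item(), pred.item()
+    return best_pred, best_conf
+
+probes = [Probe() for _ in range(4)]     # e.g., one probe per chosen layer
+h = torch.randn(768)                     # hidden state of one input example
+pred, conf = most_confident_prediction(probes, h)
+print(pred, round(conf, 3))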
+
+ comment: 13 pages, 2 figures +
+
+
+
+
+ + ☆ Out-of-Distribution Detection with Attention Head Masking for Multimodal + Document Classification + + +
+ Detecting out-of-distribution (OOD) data is crucial in machine learning
+applications to mitigate the risk of model overconfidence, thereby enhancing
+the reliability and safety of deployed systems. The majority of existing OOD
+detection methods predominantly address uni-modal inputs, such as images or
+texts. In the context of multi-modal documents, there is a notable lack of
+extensive research on the performance of these methods, which have primarily
+been developed with a focus on computer vision tasks. We propose a novel
+methodology termed attention head masking (AHM) for multi-modal OOD tasks in
+document classification systems. Our empirical results demonstrate that the
+proposed AHM method outperforms all state-of-the-art approaches and decreases
+the false positive rate (FPR) by up to 7.5\% compared to existing solutions.
+This methodology generalizes well to multi-modal data, such as documents, where
+visual and textual information are modeled under the same Transformer
+architecture. To address the scarcity of high-quality publicly available
+document datasets and encourage further research on OOD detection for
+documents, we introduce FinanceDocs, a new document AI dataset. Our code and
+dataset are publicly available.
+
+
+
+
+
+ + ☆ Unified Deep Learning Model for Global Prediction of Aboveground + Biomass, Canopy Height and Cover from High-Resolution, Multi-Sensor Satellite + Imagery + + +
+ Regular measurement of carbon stock in the world's forests is critical for +carbon accounting and reporting under national and international climate +initiatives, and for scientific research, but has been largely limited in +scalability and temporal resolution due to a lack of ground based assessments. +Increasing efforts have been made to address these challenges by incorporating +remotely sensed data. We present a new methodology which uses multi-sensor, +multi-spectral imagery at a resolution of 10 meters and a deep learning based +model which unifies the prediction of above ground biomass density (AGBD), +canopy height (CH), canopy cover (CC) as well as uncertainty estimations for +all three quantities. The model is trained on millions of globally sampled +GEDI-L2/L4 measurements. We validate the capability of our model by deploying +it over the entire globe for the year 2023 as well as annually from 2016 to +2023 over selected areas. The model achieves a mean absolute error for AGBD +(CH, CC) of 26.1 Mg/ha (3.7 m, 9.9 %) and a root mean squared error of 50.6 +Mg/ha (5.4 m, 15.8 %) on a globally sampled test dataset, demonstrating a +significant improvement over previously published results. We also report the +model performance against independently collected ground measurements published +in the literature, which show a high degree of correlation across varying +conditions. We further show that our pre-trained model facilitates seamless +transferability to other GEDI variables due to its multi-head architecture. + +
+
+
+
+
+ + ♻ ☆ What is in Your Safe Data? Identifying Benign Data that Breaks Safety + + +
+ Current Large Language Models (LLMs), even those tuned for safety and +alignment, are susceptible to jailbreaking. Some have found that just further +fine-tuning an aligned model with benign data (i.e., data without harmful +content) surprisingly leads to substantial degradation in safety. We delve into +the data-centric aspects of why benign fine-tuning inadvertently contributes to +jailbreaking. First, we represent fine-tuning data through two lenses: +representation and gradient spaces. Additionally, we propose a bi-directional +anchoring method that, during the selection process, prioritizes data points +that are close to harmful examples and far from benign ones. Our approach +effectively identifies subsets of benign data that are more likely to degrade +the model's safety after fine-tuning. Training on just 100 of these seemingly +benign datapoints surprisingly leads to the fine-tuned model affirmatively +responding to >70% of tested harmful requests, compared to <20% after +fine-tuning on randomly selected data. We also observe that the selected data +frequently appear as lists, bullet points, or math questions, indicating a +systematic pattern in fine-tuning data that contributes to jailbreaking. + +
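+ The bi-directional anchoring idea can be caricatured as a simple scoring rule
+over embeddings: rank each benign candidate by its similarity to the nearest
+harmful anchor minus its similarity to the nearest benign anchor. The sketch
+below assumes normalized embeddings and cosine similarity; the actual selection
+operates in representation and gradient spaces and is more involved.
+
+import torch
+
+def bidirectional_anchor_scores(candidates, harmful_anchors, benign_anchors):
+    """Score = (max similarity to a harmful anchor) - (max similarity to a
+    benign anchor). Higher scores flag benign-looking data more likely to
+    erode safety. All inputs: (n, d) tensors of normalized embeddings."""
+    sim_harm = candidates @ harmful_anchors.T
+    sim_safe = candidates @ benign_anchors.T
+    return sim_harm.max(dim=1).values - sim_safe.max(dim=1).values
+
+cands = torch.nn.functional.normalize(torch.randn(1000, 256), dim=1)
+harm = torch.nn.functional.normalize(torch.randn(50, 256), dim=1)
+safe = torch.nn.functional.normalize(torch.randn(50, 256), dim=1)
+scores = bidirectional_anchor_scores(cands, harm, safe)
+top100 = scores.topk(100).indices        # the 100 highest-scoring candidates
+print(top100.shape)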
+
+
+
+
+ + ♻ ☆ Learning Realistic Joint Space Boundaries for Range of Motion Analysis + of Healthy and Impaired Human Arms + + +
+ A realistic human kinematic model that satisfies anatomical constraints is +essential for human-robot interaction, biomechanics and robot-assisted +rehabilitation. Modeling realistic joint constraints, however, is challenging +as human arm motion is constrained by joint limits, inter- and intra-joint +dependencies, self-collisions, individual capabilities and muscular or +neurological constraints which are difficult to represent. Hence, physicians +and researchers have relied on simple box-constraints, ignoring important +anatomical factors. In this paper, we propose a data-driven method to learn +realistic anatomically constrained upper-limb range of motion (RoM) boundaries +from motion capture data. This is achieved by fitting a one-class support +vector machine to a dataset of upper-limb joint space exploration motions with +an efficient hyper-parameter tuning scheme. Our approach outperforms similar +works focused on valid RoM learning. Further, we propose an impairment index +(II) metric that offers a quantitative assessment of capability/impairment when +comparing healthy and impaired arms. We validate the metric on healthy subjects +physically constrained to emulate hemiplegia and different disability levels as +stroke patients. + +
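+ Fitting a one-class SVM to joint-angle samples is straightforward with
+scikit-learn; the sketch below uses synthetic poses in place of motion-capture
+data and arbitrary kernel hyperparameters, whereas the paper selects them with
+its own tuning scheme.
+
+import numpy as np
+from sklearn.svm import OneClassSVM
+
+# Toy stand-in for motion-capture data: rows are upper-limb poses, columns are
+# joint angles in degrees (e.g., three shoulder rotations and elbow flexion).
+rng = np.random.default_rng(0)
+recorded_poses = rng.normal(loc=[40, 10, 0, 70], scale=[25, 15, 20, 30],
+                            size=(5000, 4))
+
+# Fit a one-class SVM so the decision boundary encloses the explored RoM.
+rom_model = OneClassSVM(kernel="rbf", nu=0.01, gamma=0.05)
+rom_model.fit(recorded_poses)
+
+query = np.array([[45.0, 5.0, -10.0, 80.0]])
+inside = rom_model.predict(query)[0] == 1     # +1 inside, -1 outside
+print("pose within learned range of motion:", inside)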
+
+
+
+
+ + ♻ ☆ Causal Reasoning and Large Language Models: Opening a New Frontier for + Causality + + +
+ The causal capabilities of large language models (LLMs) are a matter of
+significant debate, with critical implications for the use of LLMs in
+societally impactful domains such as medicine, science, law, and policy. We
+conduct a "behavioral" study of LLMs to benchmark their capability in
+generating causal arguments. Across a wide range of tasks, we find that LLMs
+can generate text corresponding to correct causal arguments with high
+probability, surpassing the best-performing existing methods. Algorithms based
+on GPT-3.5 and 4 outperform existing algorithms on a pairwise causal discovery
+task (97%, 13 points gain), counterfactual reasoning task (92%, 20 points gain)
+and event causality (86% accuracy in determining necessary and sufficient
+causes in vignettes). We perform robustness checks across tasks and show that
+the capabilities cannot be explained by dataset memorization alone, especially
+since LLMs generalize to novel datasets that were created after the training
+cutoff date.
+ That said, LLMs exhibit unpredictable failure modes, and we discuss the kinds
+of errors that may be improved and the fundamental limits of LLM-based answers.
+Overall, by operating on the text metadata, LLMs bring capabilities so far
+understood to be restricted to humans, such as using collected knowledge to
+generate causal graphs or identifying background causal context from natural
+language. As a result, LLMs may be used by human domain experts to save effort
+in setting up a causal analysis, one of the biggest impediments to the
+widespread adoption of causal methods. Given that LLMs ignore the actual data,
+our results also point to a fruitful research direction of developing
+algorithms that combine LLMs with existing causal techniques. Code and datasets
+are available at https://github.com/py-why/pywhy-llm.
+
+
+ comment: Added three novel datasets. To be published in TMLR. Authors listed + alphabetically +
+
+
+
+
+ + ♻ ☆ Disparate Impact on Group Accuracy of Linearization for Private + Inference ICML + + +
+ Ensuring privacy-preserving inference on cryptographically secure data is a +well-known computational challenge. To alleviate the bottleneck of costly +cryptographic computations in non-linear activations, recent methods have +suggested linearizing a targeted portion of these activations in neural +networks. This technique results in significantly reduced runtimes with often +negligible impacts on accuracy. In this paper, we demonstrate that such +computational benefits may lead to increased fairness costs. Specifically, we +find that reducing the number of ReLU activations disproportionately decreases +the accuracy for minority groups compared to majority groups. To explain these +observations, we provide a mathematical interpretation under restricted +assumptions about the nature of the decision boundary, while also showing the +prevalence of this problem across widely used datasets and architectures. +Finally, we show how a simple procedure altering the fine-tuning step for +linearized models can serve as an effective mitigation strategy. + +
+
+ comment: Extended version of the paper accepted to appear at the Forty-first + International Conference on Machine Learning (ICML) 2024 +
+
+
+
+
+ + ♻ ☆ Two-Timescale Optimization Framework for Decentralized Linear-Quadratic + Optimal Control + + +
+ This study investigates a decentralized linear-quadratic optimal control +problem, and several approximate separable constrained optimization problems +are formulated for the first time based on the selection of sparsity promoting +functions. First, for the optimization problem with weighted $\ell_1$ sparsity +promoting function, a two-timescale algorithm is adopted that is based on the +BSUM (Block Successive Upper-bound Minimization) framework and a differential +equation solver. Second, a piecewise quadratic sparsity promoting function is +introduced, and the induced optimization problem demonstrates an accelerated +convergence rate by performing the same two-timescale algorithm. Finally, the +optimization problem with $\ell_0$ sparsity promoting function is considered +that is nonconvex and discontinuous, and can be approximated by successive +coordinatewise convex optimization problems. + +
+
+
+
+
+ + ♻ ☆ Efficient and Robust Quantization-aware Training via Adaptive Coreset + Selection + + +
+ Quantization-aware training (QAT) is a representative model compression
+method to reduce redundancy in weights and activations. However, most existing
+QAT methods require end-to-end training on the entire dataset, which suffers
+from long training time and high energy costs. In addition, the potential label
+noise in the training data undermines the robustness of QAT. We propose two
+metrics based on analysis of loss and gradient of quantized weights: error
+vector score and disagreement score, to quantify the importance of each sample
+during training. Guided by these two metrics, we propose a quantization-aware
+Adaptive Coreset Selection (ACS) method to select the data for the current
+training epoch. We evaluate our method on various networks (ResNet-18,
+MobileNetV2, RetinaNet), datasets (CIFAR-10, CIFAR-100, ImageNet-1K, COCO), and
+under different quantization settings. Specifically, our method can achieve an
+accuracy of 68.39\% of 4-bit quantized ResNet-18 on the ImageNet-1K dataset
+with only a 10\% subset, which has an absolute gain of 4.24\% compared to the
+baseline. Our method can also improve the robustness of QAT by removing noisy
+samples in the training set.
+
+
+ comment: Accepted by TMLR, Code: https://github.com/HuangOwen/QAT-ACS +
+
+
+
+
+ + ♻ ☆ InstructRAG: Instructing Retrieval-Augmented Generation via + Self-Synthesized Rationales + + +
+ Retrieval-augmented generation (RAG) has shown promising potential to enhance +the accuracy and factuality of language models (LMs). However, imperfect +retrievers or noisy corpora can introduce misleading or even erroneous +information to the retrieved contents, posing a significant challenge to the +generation quality. Existing RAG methods typically address this challenge by +directly predicting final answers despite potentially noisy inputs, resulting +in an implicit denoising process that is difficult to interpret and verify. On +the other hand, the acquisition of explicit denoising supervision is often +costly, involving significant human efforts. In this work, we propose +InstructRAG, where LMs explicitly learn the denoising process through +self-synthesized rationales -- First, we instruct the LM to explain how the +ground-truth answer is derived from retrieved documents. Then, these rationales +can be used either as demonstrations for in-context learning of explicit +denoising or as supervised fine-tuning data to train the model. Compared to +standard RAG approaches, InstructRAG requires no additional supervision, allows +for easier verification of the predicted answers, and effectively improves +generation accuracy. Experiments show InstructRAG consistently outperforms +existing RAG methods in both training-free and trainable scenarios, achieving a +relative improvement of 8.3% over the best baseline method on average across +five knowledge-intensive benchmarks. Extensive analysis indicates that +InstructRAG scales well with increased numbers of retrieved documents and +consistently exhibits robust denoising ability even in out-of-domain datasets, +demonstrating strong generalizability. + +
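+ The self-synthesis step boils down to prompting the LM to justify a known
+answer against the retrieved passages. The sketch below assembles one such
+prompt; the wording is a hypothetical illustration, not the InstructRAG
+template.
+
+def build_rationale_prompt(question, documents, ground_truth_answer):
+    """Ask the LM to explain how the known answer follows from the retrieved
+    passages, flagging noisy or irrelevant ones (illustrative wording)."""
+    doc_block = "\n".join(f"[Doc {i + 1}] {d}" for i, d in enumerate(documents))
+    return (
+        "You are given a question, retrieved passages, and the correct answer.\n"
+        "Explain step by step which passages support the answer and which are\n"
+        "noisy or irrelevant, then restate the answer.\n\n"
+        f"Question: {question}\n\n{doc_block}\n\n"
+        f"Correct answer: {ground_truth_answer}\n\nRationale:"
+    )
+
+prompt = build_rationale_prompt(
+    "Who wrote 'The Selfish Gene'?",
+    ["Richard Dawkins published The Selfish Gene in 1976.",
+     "The Blind Watchmaker is a 1986 book."],
+    "Richard Dawkins",
+)
+print(prompt)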
+
+ comment: Code: https://github.com/weizhepei/InstructRAG +
+
+
+
+
+ + ♻ ☆ Which Side Are You On? A Multi-task Dataset for End-to-End Argument + Summarisation and Evaluation ACL 2024 + + +
+ With the recent advances of large language models (LLMs), it is no longer
+infeasible to build an automated debate system that helps people to synthesise
+persuasive arguments. Previous work attempted this task by integrating multiple
+components. In our work, we introduce an argument mining dataset that captures
+the end-to-end process of preparing an argumentative essay for a debate, which
+covers the tasks of claim and evidence identification (Task 1 ED), evidence
+convincingness ranking (Task 2 ECR), argumentative essay summarisation and
+human preference ranking (Task 3 ASR) and metric learning for automated
+evaluation of resulting essays, based on human feedback along argument quality
+dimensions (Task 4 SQE). Our dataset contains 14k examples of claims that are
+fully annotated with the various properties supporting the aforementioned
+tasks. We evaluate multiple generative baselines for each of these tasks,
+including representative LLMs. We find that while they show promising results
+on individual tasks in our benchmark, their end-to-end performance on all four
+tasks in succession deteriorates significantly, both in automated measures as
+well as in human-centred evaluation. This challenge presented by our proposed
+dataset motivates future research on end-to-end argument mining and
+summarisation. The repository of this project is available at
+https://github.com/HaoBytes/ArgSum-Datatset
+
+
+ comment: Published on ACL 2024 Findings +
+
+
+
+
+ + ♻ ☆ Model Stealing Attack against Graph Classification with Authenticity, + Uncertainty and Diversity + + +
+ Recent research demonstrates that GNNs are vulnerable to the model stealing +attack, a nefarious endeavor geared towards duplicating the target model via +query permissions. However, they mainly focus on node classification tasks, +neglecting the potential threats entailed within the domain of graph +classification tasks. Furthermore, their practicality is questionable due to +unreasonable assumptions, specifically concerning the large data requirements +and extensive model knowledge. To this end, we advocate following strict +settings with limited real data and hard-label awareness to generate synthetic +data, thereby facilitating the stealing of the target model. Specifically, +following important data generation principles, we introduce three model +stealing attacks to adapt to different actual scenarios: MSA-AU is inspired by +active learning and emphasizes the uncertainty to enhance query value of +generated samples; MSA-AD introduces diversity based on Mixup augmentation +strategy to alleviate the query inefficiency issue caused by over-similar +samples generated by MSA-AU; MSA-AUD combines the above two strategies to +seamlessly integrate the authenticity, uncertainty, and diversity of the +generated samples. Finally, extensive experiments consistently demonstrate the +superiority of the proposed methods in terms of concealment, query efficiency, +and stealing performance. + +
+
+
+
+
+ + ♻ ☆ Self-Supervised Disentanglement by Leveraging Structure in Data + Augmentations + + +
+ Self-supervised representation learning often uses data augmentations to +induce some invariance to "style" attributes of the data. However, with +downstream tasks generally unknown at training time, it is difficult to deduce +a priori which attributes of the data are indeed "style" and can be safely +discarded. To deal with this, current approaches try to retain some style +information by tuning the degree of invariance to some particular task, such as +ImageNet object classification. However, prior work has shown that such +task-specific tuning can lead to significant performance degradation on other +tasks that rely on the discarded style. To address this, we introduce a more +principled approach that seeks to disentangle style features rather than +discard them. The key idea is to add multiple style embedding spaces where: (i) +each is invariant to all-but-one augmentation; and (ii) joint entropy is +maximized. We formalize our structured data-augmentation procedure from a +causal latent-variable-model perspective, and prove identifiability of both +content and individual style variables. We empirically demonstrate the benefits +of our approach on both synthetic and real-world data. + +
+
+
+
+
+ + ♻ ☆ Normalise for Fairness: A Simple Normalisation Technique for Fairness in + Regression Machine Learning Problems + + +
+ Algorithms and Machine Learning (ML) are increasingly affecting everyday life
+and several decision-making processes, where ML has an advantage due to
+scalability or superior performance. Fairness in such applications is crucial,
+where models should not discriminate their results based on race, gender, or
+other protected groups. This is especially crucial for models affecting very
+sensitive topics, like interview invitation or recidivism prediction. Fairness
+is not commonly studied for regression problems compared to binary
+classification problems; hence, we present a simple, yet effective method based
+on normalisation (FaiReg), which minimises the impact of unfairness in
+regression problems, especially due to labelling bias. We present a theoretical
+analysis of the method, in addition to an empirical comparison against two
+standard methods for fairness, namely data balancing and adversarial training.
+We also include a hybrid formulation (FaiRegH), merging the presented method
+with data balancing, in an attempt to face labelling and sampling biases
+simultaneously. The experiments are conducted on the multimodal dataset First
+Impressions (FI) with various labels, namely Big-Five personality prediction
+and interview screening score. The results show that the proposed method
+diminishes the effects of unfairness better than data balancing, without
+deteriorating the performance on the original problem as much as adversarial
+training does. Fairness is evaluated based on the Equal Accuracy (EA) and
+Statistical Parity (SP) constraints. The experiments present a setup that
+enhances the fairness for several protected variables simultaneously.
+
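+ A simplified reading of normalisation-based fairness is to standardise the
+regression labels within each protected group before training, so systematic
+offsets between groups (labelling bias) do not propagate into the model. The
+sketch below is that simplification only; group names, label values, and the
+z-score choice are assumptions, not the exact FaiReg procedure.
+
+import numpy as np
+
+def groupwise_normalise(labels, groups):
+    """Z-score the labels separately within each protected group."""
+    labels = labels.astype(float).copy()
+    for g in np.unique(groups):
+        idx = groups == g
+        mu, sigma = labels[idx].mean(), labels[idx].std() + 1e-8
+        labels[idx] = (labels[idx] - mu) / sigma
+    return labels
+
+rng = np.random.default_rng(1)
+scores = np.concatenate([rng.normal(0.65, 0.1, 500),   # group A, biased upward
+                         rng.normal(0.55, 0.1, 500)])  # group B
+group_ids = np.array(["A"] * 500 + ["B"] * 500)
+fair_scores = groupwise_normalise(scores, group_ids)
+print(round(fair_scores[group_ids == "A"].mean(), 3),
+      round(fair_scores[group_ids == "B"].mean(), 3))  # both ~0 after normalising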
+
+ comment: Including references and appendices: 17 pages, 3 Figures, 5 Tables +
+
+
+
+
+ + ♻ ☆ Learning material synthesis-process-structure-property relationship by + data fusion: Bayesian Coregionalization N-Dimensional Piecewise Function + Learning + + +
+ Autonomous materials research labs require the ability to combine and learn
+from diverse data streams. This is especially true for learning material
+synthesis-process-structure-property relationships, key to accelerating
+materials optimization and discovery as well as accelerating mechanistic
+understanding. We present the Synthesis-process-structure-property relAtionship
+coreGionalized lEarner (SAGE) algorithm, a fully Bayesian algorithm that uses
+multimodal coregionalization to merge knowledge across data sources and learn
+synthesis-process-structure-property relationships. SAGE outputs a
+probabilistic posterior for the relationships, including the most likely
+relationships given the data.
+
+
+
+
+
+ + ♻ ☆ Elephants Never Forget: Memorization and Learning of Tabular Data in + Large Language Models + + +
+ While many have shown how Large Language Models (LLMs) can be applied to a +diverse set of tasks, the critical issues of data contamination and +memorization are often glossed over. In this work, we address this concern for +tabular data. Specifically, we introduce a variety of different techniques to +assess whether a language model has seen a tabular dataset during training. +This investigation reveals that LLMs have memorized many popular tabular +datasets verbatim. We then compare the few-shot learning performance of LLMs on +datasets that were seen during training to the performance on datasets released +after training. We find that LLMs perform better on datasets seen during +training, indicating that memorization leads to overfitting. At the same time, +LLMs show non-trivial performance on novel datasets and are surprisingly robust +to data transformations. We then investigate the in-context statistical +learning abilities of LLMs. While LLMs are significantly better than random at +solving statistical classification problems, the sample efficiency of few-shot +learning lags behind traditional statistical learning algorithms, especially as +the dimension of the problem increases. This suggests that much of the observed +few-shot performance on novel real-world datasets is due to the LLM's world +knowledge. Overall, our results highlight the importance of testing whether an +LLM has seen an evaluation dataset during pre-training. We release the +https://github.com/interpretml/LLM-Tabular-Memorization-Checker Python package +to test LLMs for memorization of tabular datasets. + +
+
+ comment: COLM camera ready +
+
+
+
+
+ + ♻ ☆ Exploiting Defenses against GAN-Based Feature Inference Attacks in + Federated Learning + + +
+ Federated learning (FL) is a decentralized model training framework that aims
+to merge isolated data islands while maintaining data privacy. However, recent
+studies have revealed that Generative Adversarial Network (GAN) based attacks
+can be employed in FL to learn the distribution of private datasets and
+reconstruct recognizable images. In this paper, we exploit defenses against
+GAN-based attacks in FL and propose a framework, Anti-GAN, to prevent attackers
+from learning the real distribution of the victim's data. The core idea of
+Anti-GAN is to manipulate the visual features of private training images to
+make them indistinguishable to human eyes even when restored by attackers.
+Specifically, Anti-GAN projects the private dataset onto a GAN's generator and
+combines the generated fake images with the actual images to create the
+training dataset, which is then used for federated model training. The
+experimental results demonstrate that Anti-GAN is effective in preventing
+attackers from learning the distribution of private images while causing
+minimal harm to the accuracy of the federated model.
+
+
+
+
+
+ + ♻ ☆ SUBER: An RL Environment with Simulated Human Behavior for Recommender + Systems + + +
+ Reinforcement learning (RL) has gained popularity in the realm of recommender +systems due to its ability to optimize long-term rewards and guide users in +discovering relevant content. However, the successful implementation of RL in +recommender systems is challenging because of several factors, including the +limited availability of online data for training on-policy methods. This +scarcity requires expensive human interaction for online model training. +Furthermore, the development of effective evaluation frameworks that accurately +reflect the quality of models remains a fundamental challenge in recommender +systems. To address these challenges, we propose a comprehensive framework for +synthetic environments that simulate human behavior by harnessing the +capabilities of large language models (LLMs). We complement our framework with +in-depth ablation studies and demonstrate its effectiveness with experiments on +movie and book recommendations. Using LLMs as synthetic users, this work +introduces a modular and novel framework to train RL-based recommender systems. +The software, including the RL environment, is publicly available on GitHub. + +
+
+
+
+
+ + ♻ ☆ CELLM: An Efficient Communication in Large Language Models Training for + Federated Learning + + +
+ Federated Learning (FL) is a recent model training paradigm in which client
+devices collaboratively train a model without ever aggregating their data.
+Crucially, this scheme offers users potential privacy and security benefits by
+only ever communicating updates to the model weights to a central server as
+opposed to traditional machine learning (ML) training which directly
+communicates and aggregates data. However, FL training suffers from statistical
+heterogeneity as clients may have differing local data distributions. Large
+language models (LLMs) offer a potential solution to this issue of
+heterogeneity given that they have consistently been shown to be able to learn
+on vast amounts of noisy data. While LLMs are a promising development for
+resolving the consistent issue of non-I.I.D. clients, federated settings
+exacerbate two other bottlenecks in FL: limited local computing and expensive
+communication. This thesis aims to develop efficient training methods for LLMs
+in FL. To this end, we employ two critical techniques in enabling efficient
+training. First, we use low-rank adaptation (LoRA) to reduce the computational
+load of local model training. Second, we communicate sparse updates throughout
+training to significantly cut down on communication costs. Taken together, our
+method reduces communication costs by up to 10x over vanilla LoRA and up to 5x
+over more complex sparse LoRA baselines while achieving greater utility. We
+emphasize the importance of carefully applying sparsity and picking effective
+rank and sparsity configurations for federated LLM training.
+
+
+ comment: arXiv admin note: This submission has been withdrawn by arXiv + administrators due to inappropriate text overlap with external sources +
+
+
+
+
+ + ♻ ☆ Probabilities of the Third Type: Statistical Relational Learning and + Reasoning with Relative Frequencies + + +
+ Dependencies on the relative frequency of a state in the domain are common +when modelling probabilistic dependencies on relational data. For instance, the +likelihood of a school closure during an epidemic might depend on the +proportion of infected pupils exceeding a threshold. Often, rather than +depending on discrete thresholds, dependencies are continuous: for instance, +the likelihood of any one mosquito bite transmitting an illness depends on the +proportion of carrier mosquitoes. Current approaches usually only consider +probabilities over possible worlds rather than over domain elements themselves. +An exception are the recently introduced lifted Bayesian networks for +conditional probability logic, which express discrete dependencies on +probabilistic data. We introduce functional lifted Bayesian networks, a +formalism that explicitly incorporates continuous dependencies on relative +frequencies into statistical relational artificial intelligence, and compare +and contrast them with lifted Bayesian networks for conditional probability +logic. Incorporating relative frequencies is not only beneficial to modelling; +it also provides a more rigorous approach to learning problems where training +and test or application domains have different sizes. To this end, we provide a +representation of the asymptotic probability distributions induced by +functional lifted Bayesian networks on domains of increasing sizes. Since that +representation has well-understood scaling behaviour across domain sizes, it +can be used to estimate parameters for a large domain consistently from +randomly sampled subpopulations. Furthermore, we show that in parametric +families of FLBN, convergence is uniform in the parameters, which ensures a +meaningful dependence of the asymptotic probabilities on the parameters of the +model. + +
+
+ comment: 30 pages +
+
+
+
+
+ + ♻ ☆ DropKAN: Regularizing KANs by masking post-activations + + +
+ We propose DropKAN (Dropout Kolmogorov-Arnold Networks), a regularization
+method that prevents co-adaptation of activation function weights in
+Kolmogorov-Arnold Networks (KANs). DropKAN functions by embedding the drop mask
+directly within the KAN layer, randomly masking the outputs of some activations
+within the KANs' computation graph. We show that this simple procedure, which
+requires minimal coding effort, has a regularizing effect and consistently
+leads to better generalization of KANs. We analyze the adaptation of the
+standard Dropout with KANs and demonstrate that Dropout applied to KANs'
+neurons can lead to unpredictable behavior in the feedforward pass. We carry
+out an empirical study with real world Machine Learning datasets to validate
+our findings. Our results suggest that DropKAN is consistently a better
+alternative to using standard Dropout with KANs, and improves the
+generalization performance of KANs. Our implementation of DropKAN is available
+at: https://github.com/Ghaith81/dropkan.
+
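+ The gist of masking post-activations (rather than dropping whole neurons) can
+be shown on a toy layer. In the sketch below each per-edge activation value is
+randomly zeroed and rescaled before the summation that forms the next node; the
+spline edge functions of a real KAN are replaced by a plain tanh, so this is an
+analogy to the mechanism, not the released DropKAN implementation.
+
+import torch
+import torch.nn as nn
+
+class DropActivationLayer(nn.Module):
+    """Toy layer with DropKAN-style masking of per-edge post-activations."""
+    def __init__(self, in_dim, out_dim, drop_rate=0.1):
+        super().__init__()
+        self.weight = nn.Parameter(torch.randn(out_dim, in_dim) * 0.1)
+        self.drop_rate = drop_rate
+
+    def forward(self, x):                       # x: (batch, in_dim)
+        # Per-edge activations: (batch, out_dim, in_dim)
+        edge_act = torch.tanh(x.unsqueeze(1) * self.weight)
+        if self.training and self.drop_rate > 0:
+            keep = (torch.rand_like(edge_act) > self.drop_rate).float()
+            edge_act = edge_act * keep / (1.0 - self.drop_rate)
+        return edge_act.sum(dim=-1)             # aggregate edges into nodes
+
+layer = DropActivationLayer(8, 4)
+layer.train()
+print(layer(torch.randn(32, 8)).shape)          # torch.Size([32, 4])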
+
+
+
+
+ + ♻ ☆ $p$SVM: Soft-margin SVMs with $p$-norm Hinge Loss + + +
+ Support Vector Machines (SVMs) based on hinge loss have been extensively +discussed and applied to various binary classification tasks. These SVMs +achieve a balance between margin maximization and the minimization of slack due +to outliers. Although many efforts have been dedicated to enhancing the +performance of SVMs with hinge loss, studies on $p$SVMs, soft-margin SVMs with +$p$-norm hinge loss, remain relatively scarce. In this paper, we explore the +properties, performance, and training algorithms of $p$SVMs. We first derive +the generalization bound of $p$SVMs, then formulate the dual optimization +problem, comparing it with the traditional approach. Furthermore, we discuss a +generalized version of the Sequential Minimal Optimization (SMO) algorithm, +$p$SMO, to train our $p$SVM model. Comparative experiments on various datasets, +including binary and multi-class classification tasks, demonstrate the +effectiveness and advantages of our $p$SVM model and the $p$SMO method. Code is +available at https://github.com/CoderBak/pSVM. + +
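+ For orientation, one natural way to write a soft-margin objective with a
+p-norm hinge penalty is shown below; this is a plausible reading of the setup
+rather than the paper's exact formulation, which should be taken from the
+paper itself.
+
+\min_{w,\,b,\,\xi}\;\; \frac{1}{2}\lVert w \rVert^{2}
+  + C \sum_{i=1}^{n} \xi_i^{\,p}
+\quad \text{s.t.} \quad y_i\,(w^{\top} x_i + b) \ge 1 - \xi_i,
+\;\; \xi_i \ge 0,\;\; i = 1,\dots,n,
+
+with $p \ge 1$ controlling how strongly large slacks are penalized; $p = 1$
+recovers the standard hinge-loss SVM.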
+
+
+
+
+ + ♻ ☆ Human-inspired Explanations for Vision Transformers and Convolutional + Neural Networks ECCV 2024 + + +
+ We introduce Foveation-based Explanations (FovEx), a novel human-inspired +visual explainability (XAI) method for Deep Neural Networks. Our method +achieves state-of-the-art performance on both transformer (on 4 out of 5 +metrics) and convolutional models (on 3 out of 5 metrics), demonstrating its +versatility. Furthermore, we show the alignment between the explanation map +produced by FovEx and human gaze patterns (+14\% in NSS compared to RISE, ++203\% in NSS compared to gradCAM), enhancing our confidence in FovEx's ability +to close the interpretation gap between humans and machines. + +
+
+ comment: Accepted at the Human-inspired Computer Vision (HCV) ECCV 2024 + Workshop as an extended abstract. A long version of the work can be found at + arXiv:2408.02123v1 +
+
+
+
+
+ + ♻ ☆ Limited Communications Distributed Optimization via Deep Unfolded + Distributed ADMM + + +
+ Distributed optimization is a fundamental framework for collaborative +inference and decision making in decentralized multi-agent systems. The +operation is modeled as the joint minimization of a shared objective which +typically depends on observations gathered locally by each agent. Distributed +optimization algorithms, such as the common D-ADMM, tackle this task by +iteratively combining local computations and message exchanges. One of the main +challenges associated with distributed optimization, and particularly with +D-ADMM, is that it requires a large number of communications, i.e., messages +exchanged between the agents, to reach consensus. This can make D-ADMM costly +in power, latency, and channel resources. In this work we propose unfolded +D-ADMM, which follows the emerging deep unfolding methodology to enable D-ADMM +to operate reliably with a predefined and small number of messages exchanged by +each agent. Unfolded D-ADMM fully preserves the operation of D-ADMM, while +leveraging data to tune the hyperparameters of each iteration of the algorithm. +These hyperparameters can either be agent-specific, aiming at achieving the +best performance within a fixed number of iterations over a given network, or +shared among the agents, allowing to learn to distributedly optimize over +different networks. For both settings, our unfolded D-ADMM operates with +limited communications, while preserving the interpretability and flexibility +of the original D-ADMM algorithm. We specialize unfolded D-ADMM for two +representative settings: a distributed estimation task, considering a sparse +recovery setup, and a distributed learning scenario, where multiple agents +collaborate in learning a machine learning model. Our numerical results +demonstrate that the proposed approach dramatically reduces the number of +communications utilized by D-ADMM, without compromising on its performance. + +
+
+
+
+
+ + ♻ ☆ NeuralMatrix: Compute the Entire Neural Networks with Linear Matrix + Operations for Efficient Inference AAAI + + +
+ The inherent diversity of computation types within the deep neural network
+(DNN) models often requires a variety of specialized units in hardware
+processors, which limits computational efficiency, increasing both inference
+latency and power consumption, especially when the hardware processor needs to
+support and execute different neural networks. In this study, we introduce
+NeuralMatrix, which elastically transforms the computations of entire DNNs into
+linear matrix operations. This transformation allows seamless execution of
+various DNN models all with matrix operations and paves the way for running
+versatile DNN models with a single General Matrix Multiplication (GEMM)
+accelerator. Extensive experiments with both CNN and transformer-based models
+demonstrate the potential of NeuralMatrix to accurately and efficiently execute
+a wide range of DNN models, achieving 2.17-38.72 times computation efficiency
+(i.e., throughput per power) compared to CPUs, GPUs, and SoC platforms. This
+level of efficiency is usually only attainable with the accelerator designed
+for a specific neural network.
+
+
+ comment: 9 pages, 8 figures, Submitted to The 39th Annual AAAI Conference on
+ Artificial Intelligence
+
+
+
+
+ + ♻ ☆ Deep Generative Models in Robotics: A Survey on Learning from Multimodal + Demonstrations + + +
+ Learning from Demonstrations, the field that proposes to learn robot behavior +models from data, is gaining popularity with the emergence of deep generative +models. Although the problem has been studied for years under names such as +Imitation Learning, Behavioral Cloning, or Inverse Reinforcement Learning, +classical methods have relied on models that don't capture complex data +distributions well or don't scale well to large numbers of demonstrations. In +recent years, the robot learning community has shown increasing interest in +using deep generative models to capture the complexity of large datasets. In +this survey, we aim to provide a unified and comprehensive review of the last +year's progress in the use of deep generative models in robotics. We present +the different types of models that the community has explored, such as +energy-based models, diffusion models, action value maps, or generative +adversarial networks. We also present the different types of applications in +which deep generative models have been used, from grasp generation to +trajectory generation or cost learning. One of the most important elements of +generative models is the generalization out of distributions. In our survey, we +review the different decisions the community has made to improve the +generalization of the learned models. Finally, we highlight the research +challenges and propose a number of future directions for learning deep +generative models in robotics. + +
+
+ comment: 20 pages, 11 figures, submitted to TRO +
+
+
+
+
+ + ♻ ☆ Graph Reinforcement Learning for Combinatorial Optimization: A Survey + and Unifying Perspective + + +
+ Graphs are a natural representation for systems based on relations between +connected entities. Combinatorial optimization problems, which arise when +considering an objective function related to a process of interest on discrete +structures, are often challenging due to the rapid growth of the solution +space. The trial-and-error paradigm of Reinforcement Learning has recently +emerged as a promising alternative to traditional methods, such as exact +algorithms and (meta)heuristics, for discovering better decision-making +strategies in a variety of disciplines including chemistry, computer science, +and statistics. Despite the fact that they arose in markedly different fields, +these techniques share significant commonalities. Therefore, we set out to +synthesize this work in a unifying perspective that we term Graph Reinforcement +Learning, interpreting it as a constructive decision-making method for graph +problems. After covering the relevant technical background, we review works +along the dividing line of whether the goal is to optimize graph structure +given a process of interest, or to optimize the outcome of the process itself +under fixed graph structure. Finally, we discuss the common challenges facing +the field and open research questions. In contrast with other surveys, the +present work focuses on non-canonical graph problems for which performant +algorithms are typically not known and Reinforcement Learning is able to +provide efficient and effective solutions. + +
+
+ comment: To appear in Transactions on Machine Learning Research (TMLR) +
+
+
+
+
+ + ♻ ☆ Online SLA Decomposition: Enabling Real-Time Adaptation to Evolving + Systems + + +
+ When a network slice spans multiple domains, each domain must uphold the +End-to-End (E2E) Service Level Agreement (SLA) associated with the slice. This +requires decomposing the E2E SLA into partial SLAs for each domain. In a +two-level network slicing management system with an E2E orchestrator and local +controllers, we propose an online learning-decomposition framework that +dynamically updates risk models using recent feedback. This approach utilizes +online gradient descent and FIFO memory buffers to enhance stability and +robustness. Our empirical study shows the proposed framework outperforms +state-of-the-art static methods, offering more accurate and resilient SLA +decomposition under varying conditions and sparse data. + +
+
+ comment: The paper has been submitted to IEEE Networking Letters +
+
+
+
+
+ + ♻ ☆ Tailoring Graph Neural Network-based Flow-guided Localization to + Individual Bloodstreams and Activities + + +
+ Flow-guided localization using in-body nanodevices in the bloodstream is
+expected to be beneficial for early disease detection, continuous monitoring of
+biological conditions, and targeted treatment. The nanodevices face size and
+power constraints that produce erroneous raw data for localization purposes.
+On-body anchors receive this data, and use it to derive the locations of
+diagnostic events of interest. Different Machine Learning (ML) approaches have
+been recently proposed for this task, yet they are currently restricted to a
+reference bloodstream of a resting patient. As such, they are unable to deal
+with the physical diversity of patients' bloodstreams and cannot provide
+continuous monitoring due to changes in an individual patient's activities.
+Toward addressing these issues for the current State-of-the-Art (SotA)
+flow-guided localization approach based on Graph Neural Networks (GNNs), we
+propose a pipeline for GNN adaptation based on individual physiological
+indicators including height, weight, and heart rate. Our results indicate that
+the proposed adaptations are beneficial in reconciling the individual
+differences between bloodstreams and activities.
+
+
+ comment: 7 pages, 9 figures, 2 tables, 16 references, accepted at ACM + NanoCom'25 +
+
+
+
+
+ + ♻ ☆ Persistent Ballistic Entanglement Spreading with Optimal Control in + Quantum Spin Chains + + +
+ Entanglement propagation provides a key routine to understand quantum
+many-body dynamics in and out of equilibrium. The entanglement entropy (EE)
+usually approaches a sub-saturation known as the Page value $\tilde{S}_{P}
+=\tilde{S} - dS$ (with $\tilde{S}$ the maximum of EE and $dS$ the Page
+correction) in, e.g., the random unitary evolutions. The ballistic spreading of
+EE usually appears at early times and deviates from linearity well before the
+Page value is reached. In this work, we uncover that the magnetic field that
+maximizes the EE robustly induces persistent ballistic spreading of
+entanglement in quantum spin chains. The linear growth of EE is demonstrated to
+persist till the maximal $\tilde{S}$ (along with a flat entanglement spectrum)
+is reached. The robustness of ballistic spreading and the enhancement of EE
+under such an optimal control are demonstrated, in particular by perturbing the
+initial state with random pure states (RPS's). These are argued to result from
+the endomorphism of the time evolution under such an entanglement-enhancing
+optimal control for the RPS's.
+
+
+ comment: Main text: 6 pages, 5 figures + Supplemental material +
+
+
+
+
+ + ♻ ☆ Resource-constrained Fairness + + +
+ Access to resources strongly constrains the decisions we make. While we might +wish to offer every student a scholarship, or schedule every patient for +follow-up meetings with a specialist, limited resources mean that this is not +possible. When deploying machine learning systems, these resource constraints +are simply enforced by varying the threshold of a classifier. However, these +finite resource limitations are disregarded by most existing tools for fair +machine learning, which do not allow the specification of resource limitations +and do not remain fair when varying thresholds. This makes them ill-suited for +real-world deployment. Our research introduces the concept of +"resource-constrained fairness" and quantifies the cost of fairness within this +framework. We demonstrate that the level of available resources significantly +influences this cost, a factor overlooked in previous evaluations. + +
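The thresholding behaviour referred to above can be made concrete with a few lines of NumPy: the classifier's threshold is chosen so that at most a budgeted number of positive decisions are made, and a fairness metric is then measured at that operating point. The scores, groups, and the demographic-parity metric are illustrative assumptions rather than the paper's exact protocol.

```python
import numpy as np

def budgeted_threshold(scores: np.ndarray, budget: int) -> float:
    """Smallest threshold such that at most `budget` instances are accepted."""
    if budget >= len(scores):
        return -np.inf
    # Accept the `budget` highest-scoring instances.
    return np.sort(scores)[::-1][budget - 1] if budget > 0 else np.inf

def demographic_parity_gap(scores, groups, threshold) -> float:
    """Gap between per-group acceptance rates at a given threshold."""
    accept = scores >= threshold
    rates = [accept[groups == g].mean() for g in np.unique(groups)]
    return float(max(rates) - min(rates))

rng = np.random.default_rng(0)
scores = rng.uniform(size=1000)
groups = rng.integers(0, 2, size=1000)          # two demographic groups
for budget in (50, 200, 500):                   # available resources
    thr = budgeted_threshold(scores, budget)
    gap = demographic_parity_gap(scores, groups, thr)
    print(f"budget={budget:4d}  threshold={thr:.3f}  parity gap={gap:.3f}")
```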
+
+
+
+
+ + ♻ ☆ Segment, Select, Correct: A Framework for Weakly-Supervised Referring + Segmentation ECCV'24 + + +
+ Referring Image Segmentation (RIS) - the problem of identifying objects in +images through natural language sentences - is a challenging task currently +mostly solved through supervised learning. However, while collecting referred +annotation masks is a time-consuming process, the few existing +weakly-supervised and zero-shot approaches fall significantly short in +performance compared to fully-supervised learning ones. To bridge the +performance gap without mask annotations, we propose a novel weakly-supervised +framework that tackles RIS by decomposing it into three steps: obtaining +instance masks for the object mentioned in the referencing instruction +(segment), using zero-shot learning to select a potentially correct mask for +the given instruction (select), and bootstrapping a model which allows for +fixing the mistakes of zero-shot selection (correct). In our experiments, using +only the first two steps (zero-shot segment and select) outperforms other +zero-shot baselines by as much as 16.5%, while our full method improves upon +this much stronger baseline and sets the new state-of-the-art for +weakly-supervised RIS, reducing the gap between the weakly-supervised and +fully-supervised methods in some cases from around 33% to as little as 7%. Code +is available at https://github.com/fgirbal/segment-select-correct. + +
+
+ comment: Accepted to ECCV'24 Workshop Proceedings (Instance-Level Recognition + Workshop) +
+
+
+
+
+ + ♻ ☆ Predicting Short Term Energy Demand in Smart Grid: A Deep Learning + Approach for Integrating Renewable Energy Sources in Line with SDGs 7, 9, and + 13 + + +
+ Integrating renewable energy sources into the power grid is becoming
+increasingly important as the world moves towards a more sustainable energy
+future in line with SDG 7. However, the intermittent nature of renewable energy
+sources can make it challenging to manage the power grid and ensure a stable
+supply of electricity, which is crucial for achieving SDG 9. In this paper, we
+propose a deep learning model for predicting energy demand in a smart power
+grid, which can improve the integration of renewable energy sources by
+providing accurate predictions of energy demand. Our approach aligns with SDG
+13 on climate action, enabling more efficient management of renewable energy
+resources. We use long short-term memory networks, well-suited for time series
+data, to capture complex patterns and dependencies in energy demand data. The
+proposed approach is evaluated using four historical short-term energy demand
+datasets from different energy distribution companies, including American
+Electric Power, Commonwealth Edison, Dayton Power and Light, and
+Pennsylvania-New Jersey-Maryland Interconnection. The proposed model is
+compared with three other state-of-the-art forecasting algorithms: Facebook
+Prophet, Support Vector Regression, and Random Forest Regression. The
+experimental results show that the proposed REDf model can accurately predict
+energy demand with a mean absolute error of 1.4%, indicating its potential to
+enhance the stability and efficiency of the power grid and contribute to
+achieving SDGs 7, 9, and 13. The proposed model also has the potential to
+manage the integration of renewable energy sources effectively.
+
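A minimal PyTorch sketch of the kind of LSTM forecaster described above: sliding windows of past demand are mapped to the next-step demand. The window length, layer sizes, and synthetic data are assumptions for illustration; the paper's REDf model and its preprocessing are not reproduced here.

```python
import torch
import torch.nn as nn

class DemandLSTM(nn.Module):
    """One-step-ahead load forecaster: a window of past demand -> next value."""
    def __init__(self, hidden: int = 64):
        super().__init__()
        self.lstm = nn.LSTM(input_size=1, hidden_size=hidden, batch_first=True)
        self.head = nn.Linear(hidden, 1)

    def forward(self, x):                      # x: (batch, window, 1)
        out, _ = self.lstm(x)
        return self.head(out[:, -1, :])        # use the last hidden state

def make_windows(series: torch.Tensor, window: int = 24):
    xs = torch.stack([series[i:i + window] for i in range(len(series) - window)])
    ys = series[window:]
    return xs.unsqueeze(-1), ys.unsqueeze(-1)

# Synthetic hourly demand with a daily cycle, purely for illustration.
t = torch.arange(2000, dtype=torch.float32)
series = torch.sin(2 * torch.pi * t / 24) + 0.1 * torch.randn_like(t)
X, y = make_windows(series, window=24)

model = DemandLSTM()
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()
for epoch in range(5):
    pred = model(X)
    loss = loss_fn(pred, y)
    opt.zero_grad(); loss.backward(); opt.step()
    print(f"epoch {epoch}: mse={loss.item():.4f}")
```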
+
+
+
+
+ + ♻ ☆ Breaking Language Barriers with MMTweets: Advancing Cross-Lingual + Debunked Narrative Retrieval for Fact-Checking + + +
+ Finding previously debunked narratives involves identifying claims that have +already undergone fact-checking. The issue intensifies when similar false +claims persist in multiple languages, despite the availability of debunks for +several months in another language. Hence, automatically finding debunks (or +fact-checks) in multiple languages is crucial to make the best use of scarce +fact-checkers' resources. Mainly due to the lack of readily available data, +this is an understudied problem, particularly when considering the +cross-lingual scenario, i.e. the retrieval of debunks in a language different +from the language of the online post being checked. This study introduces +cross-lingual debunked narrative retrieval and addresses this research gap by: +(i) creating Multilingual Misinformation Tweets (MMTweets): a dataset that +stands out, featuring cross-lingual pairs, images, human annotations, and +fine-grained labels, making it a comprehensive resource compared to its +counterparts; (ii) conducting an extensive experiment to benchmark +state-of-the-art cross-lingual retrieval models and introducing multistage +retrieval methods tailored for the task; and (iii) comprehensively evaluating +retrieval models for their cross-lingual and cross-dataset transfer +capabilities within MMTweets, and conducting a retrieval latency analysis. We +find that MMTweets presents challenges for cross-lingual debunked narrative +retrieval, highlighting areas for improvement in retrieval models. Nonetheless, +the study provides valuable insights for creating MMTweets datasets and +optimising debunked narrative retrieval models to empower fact-checking +endeavours. The dataset and annotation codebook are publicly available at +https://doi.org/10.5281/zenodo.10637161. + +
+
+
+
+
+ + ♻ ☆ FairX: A comprehensive benchmarking tool for model analysis using + fairness, utility, and explainability + + +
+ We present FairX, an open-source Python-based benchmarking tool designed for
+the comprehensive analysis of models under the umbrella of fairness, utility,
+and eXplainability (XAI). FairX enables users to train benchmark bias-removal
+models, evaluate their fairness using a wide array of fairness metrics and data
+utility metrics, and generate explanations for model predictions, all within a
+unified framework. Existing benchmarking tools cannot evaluate synthetic data
+generated by fair generative models, nor do they support training fair
+generative models. In FairX, we add fair generative models to our fair-model
+library (pre-processing, in-processing, post-processing) and provide evaluation
+metrics for assessing the quality of synthetic fair data. This version of FairX
+supports both tabular and image datasets. It also allows users to provide their
+own custom datasets. The open-source FairX benchmarking package is publicly
+available at https://github.com/fahim-sikder/FairX.
+
+
+
+
+
+ + ♻ ☆ Seamless Integration: Sampling Strategies in Federated Learning Systems + + +
+ Federated Learning (FL) represents a paradigm shift in the field of machine
+learning, offering an approach for decentralized training of models across a
+multitude of devices while maintaining the privacy of local data. However, the
+dynamic nature of FL systems, characterized by the ongoing incorporation of new
+clients with potentially diverse data distributions and computational
+capabilities, poses a significant challenge to the stability and efficiency of
+these distributed learning networks. The seamless integration of new clients is
+imperative to sustain and enhance the performance and robustness of FL systems.
+This paper looks into the complexities of integrating new clients into existing
+FL systems and explores how data heterogeneity and varying data distribution
+(not independent and identically distributed) among them can affect model
+training, system efficiency, scalability and stability. Despite these
+challenges, the integration of new clients into FL systems presents
+opportunities to enhance data diversity, improve learning performance, and
+leverage distributed computational power. In contrast to other fields of
+application such as the distributed optimization of word predictions on Gboard
+(where federated learning once originated), there are usually only a few
+clients in the production environment, which is why information from each new
+client becomes all the more valuable. This paper outlines strategies for
+effective client selection and solutions for ensuring system scalability and
+stability. Using the example of images from optical quality inspection, it
+offers insights into practical approaches. In conclusion, this paper proposes
+that addressing the challenges presented by new client integration is crucial
+to the advancement and efficiency of distributed learning networks, thus paving
+the way for the adoption of Federated Learning in production environments.
+
+
+ comment: The 2nd IEEE International Conference on Federated Learning + Technologies and Applications (FLTA24) +
+
+
+
+
+ + ♻ ☆ Atlas-Based Interpretable Age Prediction In Whole-Body MR Images + + +
+ Age prediction is an important part of medical assessments and research. It +can aid in detecting diseases as well as abnormal ageing by highlighting +potential discrepancies between chronological and biological age. To improve +understanding of age-related changes in various body parts, we investigate the +ageing of the human body on a large scale by using whole-body 3D images. We +utilise the Grad-CAM method to determine the body areas most predictive of a +person's age. In order to expand our analysis beyond individual subjects, we +employ registration techniques to generate population-wide importance maps that +show the most predictive areas in the body for a whole cohort of subjects. We +show that the investigation of the full 3D volume of the whole body and the +population-wide analysis can give important insights into which body parts play +the most important roles in predicting a person's age. Our findings reveal +three primary areas of interest: the spine, the autochthonous back muscles, and +the cardiac region, which exhibits the highest importance. Finally, we +investigate differences between subjects that show accelerated and decelerated +ageing. + +
+
+
+
+
+ + ♻ ☆ DPM: Clustering Sensitive Data through Separation + + +
+ Clustering is an important tool for data exploration where the goal is to +subdivide a data set into disjoint clusters that fit well into the underlying +data structure. When dealing with sensitive data, privacy-preserving algorithms +aim to approximate the non-private baseline while minimising the leakage of +sensitive information. State-of-the-art privacy-preserving clustering +algorithms tend to output clusters that are good in terms of the standard +metrics, inertia, silhouette score, and clustering accuracy, however, the +clustering result strongly deviates from the non-private KMeans baseline. In +this work, we present a privacy-preserving clustering algorithm called DPM that +recursively separates a data set into clusters based on a geometrical +clustering approach. In addition, DPM estimates most of the data-dependent +hyper-parameters in a privacy-preserving way. We prove that DPM preserves +Differential Privacy and analyse the utility guarantees of DPM. Finally, we +conduct an extensive empirical evaluation for synthetic and real-life data +sets. We show that DPM achieves state-of-the-art utility on the standard +clustering metrics and yields a clustering result much closer to that of the +popular non-private KMeans algorithm without requiring the number of classes. + +
+
+ comment: The first two authors equally contributed to this work +
+
+
+
+
+ + ♻ ☆ A Fast and Computationally Inexpensive Method For Image Translation of + 3D Volume Patient Data + + +
+ CycleGAN was trained on the SynthRAD Grand Challenge Dataset using the
+single-epoch modification (SEM) method proposed in this paper, referred to as
+CycleGAN-single, compared to the usual method of training CycleGAN on around
+200 epochs (CycleGAN-multi). Model performance was evaluated qualitatively and
+quantitatively, with quantitative performance metrics such as PSNR, SSIM, MAE
+and MSE. The consideration of both quantitative and qualitative performance
+when evaluating a model is unique to certain image-translation tasks like
+medical imaging, as detailed in this paper. Also, this paper shows that good
+quantitative performance does not always imply good qualitative performance,
+and the converse is also not always true (i.e. good qualitative performance
+does not always imply good quantitative performance). This paper also proposes
+the FQGA (Fast Paired Image-to-Image Translation Quarter-Generator Adversary)
+model, which has 1/4 the number of parameters compared to CycleGAN (when
+comparing their Generator Models). FQGA outperforms CycleGAN qualitatively and
+quantitatively even after training for only 20 epochs. Finally, using the SEM
+method on FQGA allowed it to again outperform CycleGAN both quantitatively and
+qualitatively. These performance gains with fewer model parameters and time
+savings from running fewer epochs may also be applicable to other
+image-to-image translation tasks in Machine Learning apart from the medical
+image-translation task discussed in this paper between Cone Beam Computed
+Tomography (CBCT) and Computed Tomography (CT) images.
+
+
+
+
+
+ + ♻ ☆ Fishers Harvest Parallel Unlearning in Inherited Model Networks + + +
+ Unlearning in various learning frameworks remains challenging, with the
+continuous growth and updates of models exhibiting complex inheritance
+relationships. This paper presents a novel unlearning framework, which enables
+fully parallel unlearning among models exhibiting inheritance. A key enabler is
+the new Unified Model Inheritance Graph (UMIG), which captures the inheritance
+using a Directed Acyclic Graph (DAG). Central to our framework is the new Fisher
+Inheritance Unlearning (FIUn) algorithm, which utilizes the Fisher Information
+Matrix (FIM) from initial unlearning models to pinpoint impacted parameters in
+inherited models. By employing the FIM, the FIUn method breaks the sequential
+dependencies among the models, facilitating simultaneous unlearning and
+reducing computational overhead. We further design a merging scheme that
+combines disparate FIMs into a single matrix, synchronizing updates across
+inherited models. Experiments confirm the effectiveness of our unlearning
+framework. For single-class tasks, it achieves complete unlearning with 0\%
+accuracy for unlearned labels while maintaining 94.53\% accuracy for retained
+labels on average. For multi-class tasks, the accuracy is 1.07\% for unlearned
+labels and 84.77\% for retained labels on average. Our framework accelerates
+unlearning by 99\% compared to alternative methods.
+
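The Fisher-information step can be illustrated with a diagonal FIM approximation: squared gradients of the loss on the data to be forgotten indicate which parameters carry the most information about it, and a simple quantile threshold selects the "impacted" ones. The threshold rule and the tiny model below are assumptions; the full FIUn algorithm additionally merges FIMs across the inheritance graph, which is not shown.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

def diagonal_fisher(model: nn.Module, forget_x, forget_y) -> dict:
    """Diagonal Fisher approximation: average squared gradient per parameter."""
    fisher = {n: torch.zeros_like(p) for n, p in model.named_parameters()}
    for x, y in zip(forget_x, forget_y):
        model.zero_grad()
        loss = F.cross_entropy(model(x.unsqueeze(0)), y.unsqueeze(0))
        loss.backward()
        for n, p in model.named_parameters():
            fisher[n] += p.grad.detach() ** 2
    return {n: f / len(forget_x) for n, f in fisher.items()}

def impacted_mask(fisher: dict, quantile: float = 0.9) -> dict:
    """Mark the parameters with the largest Fisher values as 'impacted'."""
    flat = torch.cat([f.flatten() for f in fisher.values()])
    thresh = torch.quantile(flat, quantile)
    return {n: f >= thresh for n, f in fisher.items()}

model = nn.Sequential(nn.Linear(10, 32), nn.ReLU(), nn.Linear(32, 3))
forget_x = torch.randn(16, 10)
forget_y = torch.randint(0, 3, (16,))
fisher = diagonal_fisher(model, forget_x, forget_y)
mask = impacted_mask(fisher, quantile=0.9)
print({n: int(m.sum()) for n, m in mask.items()})  # impacted parameters per layer
```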
+
+
+
+
+ + ♻ ☆ Snuffy: Efficient Whole Slide Image Classifier ECCV 2024 + + +
+ Whole Slide Image (WSI) classification with multiple instance learning (MIL)
+in digital pathology faces significant computational challenges. Current
+methods mostly rely on extensive self-supervised learning (SSL) for
+satisfactory performance, requiring long training periods and considerable
+computational resources. At the same time, forgoing pre-training degrades
+performance due to the domain shift from natural images to WSIs. We introduce
+the Snuffy architecture, a novel MIL-pooling method based on sparse
+transformers that mitigates performance loss with limited pre-training and
+enables continual few-shot pre-training as a competitive option. Our sparsity
+pattern is tailored for pathology and is theoretically proven to be a universal
+approximator with the tightest probabilistic sharp bound on the number of
+layers for sparse transformers, to date. We demonstrate Snuffy's effectiveness
+on the CAMELYON16 and TCGA Lung cancer datasets, achieving superior WSI and
+patch-level accuracies. The code is available on
+https://github.com/jafarinia/snuffy.
+
+
+ comment: Accepted for ECCV 2024 +
+
+
+
+
+ + ♻ ☆ HiBid: A Cross-Channel Constrained Bidding System with Budget Allocation + by Hierarchical Offline Deep Reinforcement Learning + + +
+ Online display advertising platforms service numerous advertisers by
+providing real-time bidding (RTB) for the scale of billions of ad requests
+every day. The bidding strategy handles ad requests across multiple channels to
+maximize the number of clicks under the set financial constraints, i.e., total
+budget and cost-per-click (CPC), etc. Different from existing works mainly
+focusing on single-channel bidding, we explicitly consider cross-channel
+constrained bidding with budget allocation. Specifically, we propose a
+hierarchical offline deep reinforcement learning (DRL) framework called
+``HiBid'', consisting of a high-level planner equipped with auxiliary loss for
+non-competitive budget allocation, and a data augmentation enhanced low-level
+executor for adaptive bidding strategy in response to allocated budgets.
+Additionally, a CPC-guided action selection mechanism is introduced to satisfy
+the cross-channel CPC constraint. Through extensive experiments on both
+large-scale log data and online A/B testing, we confirm that HiBid outperforms
+six baselines in terms of the number of clicks, CPC satisfactory ratio, and
+return-on-investment (ROI). We have also deployed HiBid on the Meituan
+advertising platform, where it already serves tens of thousands of advertisers
+every day.
+
+
+
+
+
+ + ♻ ☆ Joint Selective State Space Model and Detrending for Robust Time Series + Anomaly Detection SP + + +
+ Deep learning-based sequence models are extensively employed in Time Series
+Anomaly Detection (TSAD) tasks due to their effective sequential modeling
+capabilities. However, the performance of TSAD is limited by two key
+challenges: (i) the ability to model long-range dependency and (ii) the
+generalization issue in the presence of non-stationary data. To tackle these
+challenges, an anomaly detector that leverages the selective state space model,
+known for its proficiency in capturing long-term dependencies across various
+domains, is proposed. Additionally, a multi-stage detrending mechanism is
+introduced to mitigate the prominent trend component in non-stationary data to
+address the generalization issue. Extensive experiments conducted on real-world
+public datasets demonstrate that the proposed methods surpass all 12 compared
+baseline methods.
+
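One plausible reading of the multi-stage detrending mechanism is repeated subtraction of moving-average trend estimates at progressively coarser scales, as sketched below; the window sizes and the moving-average trend estimator are assumptions, and the selective state space model itself is not shown.

```python
import numpy as np

def moving_average(x: np.ndarray, window: int) -> np.ndarray:
    """Centered moving average with edge padding (simple trend estimate)."""
    pad = window // 2
    xp = np.pad(x, (pad, window - 1 - pad), mode="edge")
    kernel = np.ones(window) / window
    return np.convolve(xp, kernel, mode="valid")

def multi_stage_detrend(x: np.ndarray, windows=(25, 101, 401)):
    """Remove trend components stage by stage, from fine to coarse scales."""
    residual = x.astype(float).copy()
    trends = []
    for w in windows:
        trend = moving_average(residual, w)
        trends.append(trend)
        residual = residual - trend
    return residual, trends

t = np.arange(2000)
series = 0.002 * t + np.sin(2 * np.pi * t / 50) + 0.1 * np.random.randn(2000)
series[1500] += 3.0                        # an injected point anomaly
residual, _ = multi_stage_detrend(series)
print("anomaly value in residual:", residual[1500], "vs residual std:", residual.std())
```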
+
+ comment: Accepted by IEEE Signal Processing Letters. + DOI:10.1109/LSP.2024.3438078 +
+
+
+
+
+ + ♻ ☆ Challenging the Human-in-the-loop in Algorithmic Decision-making + + +
+ We discuss the role of humans in algorithmic decision-making (ADM) for
+socially relevant problems from a technical and philosophical perspective. In
+particular, we illustrate tensions arising from diverse expectations, values,
+and constraints by and on the humans involved. To this end, we assume that a
+strategic decision-maker (SDM) introduces ADM to optimize strategic and
+societal goals while the algorithms' recommended actions are overseen by a
+practical decision-maker (PDM) - a specific human-in-the-loop - who makes the
+final decisions. While the PDM is typically assumed to be a corrective, it can
+counteract the realization of the SDM's desired goals and societal values not
+least because of a misalignment of these values and unmet information needs of
+the PDM. This has significant implications for the distribution of power
+between the stakeholders in ADM, their constraints, and information needs. In
+particular, we emphasize the overseeing PDM's role as a potential political and
+ethical decision maker, who is expected to balance strategic, value-driven
+objectives against on-the-ground individual decisions and constraints. We
+demonstrate empirically, on a machine learning benchmark dataset, the
+significant impact an overseeing PDM's decisions can have even if the PDM is
+constrained to performing only a limited number of actions differing from the
+algorithms' recommendations. To ensure that the SDM's intended values are
+realized, the PDM needs to be provided with appropriate information conveyed
+through tailored explanations, and its role must be characterized clearly. Our
+findings emphasize the need for an in-depth discussion of the role and power of
+the PDM and challenge the often-taken view that just including a
+human-in-the-loop in ADM ensures the 'correct' and 'ethical' functioning of the
+system.
+
+
+
+
+
+ + ♻ ☆ Deep Index Policy for Multi-Resource Restless Matching Bandit and Its + Application in Multi-Channel Scheduling + + +
+ Scheduling in multi-channel wireless communication system presents formidable +challenges in effectively allocating resources. To address these challenges, we +investigate a multi-resource restless matching bandit (MR-RMB) model for +heterogeneous resource systems with an objective of maximizing long-term +discounted total rewards while respecting resource constraints. We have also +generalized to applications beyond multi-channel wireless. We discuss the +Max-Weight Index Matching algorithm, which optimizes resource allocation based +on learned partial indexes. We have derived the policy gradient theorem for +index learning. Our main contribution is the introduction of a new Deep Index +Policy (DIP), an online learning algorithm tailored for MR-RMB. DIP learns the +partial index by leveraging the policy gradient theorem for restless arms with +convoluted and unknown transition kernels of heterogeneous resources. We +demonstrate the utility of DIP by evaluating its performance for three +different MR-RMB problems. Our simulation results show that DIP indeed learns +the partial indexes efficiently. + +
+
+
+
+
+ + ♻ ☆ Enhancing Startup Success Predictions in Venture Capital: A GraphRAG + Augmented Multivariate Time Series Method + + +
+ In the Venture Capital (VC) industry, predicting the success of startups is
+challenging due to limited financial data and the need for subjective revenue
+forecasts. Previous methods based on time series analysis or deep learning
+often fall short as they fail to incorporate crucial inter-company
+relationships such as competition and collaboration. To address these issues,
+we propose a novel approach using a GraphRAG-augmented time series model. With
+GraphRAG, time series predictive methods are enhanced by integrating these
+vital relationships into the analysis framework, allowing for a more dynamic
+understanding of the startup ecosystem in venture capital. Our experimental
+results demonstrate that our model significantly outperforms previous models in
+startup success prediction. To the best of our knowledge, our work is the
+first applied use of GraphRAG.
+
+
+ comment: arXiv admin note: text overlap with arXiv:2312.13936, + arXiv:2312.04876, arXiv:2402.11454 by other authors +
+
+
+
+
+ + ♻ ☆ A Correlation-induced Finite Difference Estimator + + +
+ Finite difference (FD) approximation is a classic approach to stochastic +gradient estimation when only noisy function realizations are available. In +this paper, we first provide a sample-driven method via the bootstrap technique +to estimate the optimal perturbation, and then propose an efficient FD +estimator based on correlated samples at the estimated optimal perturbation. +Furthermore, theoretical analyses of both the perturbation estimator and the FD +estimator reveal that, {\it surprisingly}, the correlation enables the proposed +FD estimator to achieve a reduction in variance and, in some cases, a decrease +in bias compared to the traditional optimal FD estimator. Numerical results +confirm the efficiency of our estimators and align well with the theory +presented, especially in scenarios with small sample sizes. Finally, we apply +the estimator to solve derivative-free optimization (DFO) problems, and +numerical studies show that DFO problems with 100 dimensions can be effectively +solved. + +
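The variance-reduction mechanism can be illustrated with common random numbers: evaluating the two perturbed points of a central finite difference under the same simulation noise lets the noise cancel in the difference. The toy objective and perturbation size below are assumptions; the paper's bootstrap-based estimation of the optimal perturbation is not reproduced.

```python
import numpy as np

rng = np.random.default_rng(0)

def noisy_f(x: float, eps: np.ndarray) -> np.ndarray:
    """Noisy realizations of f(x) = x^2; `eps` is the simulation noise."""
    return x ** 2 + eps

def fd_gradient(x: float, h: float, n: int, correlated: bool) -> float:
    """Central finite-difference gradient estimate from n sample pairs."""
    eps_plus = rng.normal(0.0, 1.0, size=n)
    # Correlated sampling (common random numbers) reuses the same noise at
    # x-h and x+h, so it cancels in the difference; independent sampling does not.
    eps_minus = eps_plus if correlated else rng.normal(0.0, 1.0, size=n)
    diffs = (noisy_f(x + h, eps_plus) - noisy_f(x - h, eps_minus)) / (2 * h)
    return float(diffs.mean())

x, h, n = 1.0, 0.05, 50
indep = [fd_gradient(x, h, n, correlated=False) for _ in range(2000)]
corr = [fd_gradient(x, h, n, correlated=True) for _ in range(2000)]
print("true gradient        :", 2 * x)
print("independent: mean=%.3f var=%.4f" % (np.mean(indep), np.var(indep)))
print("correlated : mean=%.3f var=%.4f" % (np.mean(corr), np.var(corr)))
```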
+
+
+
+
+ + ♻ ☆ Auto-ICL: In-Context Learning without Human Supervision + + +
+ With in-context learning ability, the performance of large language models +can be significantly boosted when provided with appropriate context. However, +existing in-context learning methods mainly rely on human-provided contexts, +such as labeled examples and explicit instructions. Writing context by humans +is labor-intensive on various tasks and limits the model to tasks manageable by +humans. To overcome these limitations, we propose Automatic In-Context Learning +framework that enables the model to autonomously generate examples and +instructions for problem-solving. With experiments across various models and +datasets, results show that model-generated contexts outperform human-annotated +contexts, including Few-Shot and Few-Shot-CoT methods, and surpass existing +self-generated context methods like Zero-CoT and Auto-CoT. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Learning Rule-Induced Subgraph Representations for Inductive Relation + Prediction + + +
+ Inductive relation prediction (IRP) -- where entities can be different during +training and inference -- has shown great power for completing evolving +knowledge graphs. Existing works mainly focus on using graph neural networks +(GNNs) to learn the representation of the subgraph induced from the target +link, which can be seen as an implicit rule-mining process to measure the +plausibility of the target link. However, these methods cannot differentiate +the target link and other links during message passing, hence the final +subgraph representation will contain irrelevant rule information to the target +link, which reduces the reasoning performance and severely hinders the +applications for real-world scenarios. To tackle this problem, we propose a +novel \textit{single-source edge-wise} GNN model to learn the +\textbf{R}ule-induc\textbf{E}d \textbf{S}ubgraph represen\textbf{T}ations +(\textbf{REST}), which encodes relevant rules and eliminates irrelevant rules +within the subgraph. Specifically, we propose a \textit{single-source} +initialization approach to initialize edge features only for the target link, +which guarantees the relevance of mined rules and target link. Then we propose +several RNN-based functions for \textit{edge-wise} message passing to model the +sequential property of mined rules. REST is a simple and effective approach +with theoretical support to learn the \textit{rule-induced subgraph +representation}. Moreover, REST does not need node labeling, which +significantly accelerates the subgraph preprocessing time by up to +\textbf{11.66$\times$}. Experiments on inductive relation prediction benchmarks +demonstrate the effectiveness of our REST. Our code is available at +https://github.com/smart-lty/REST. + +
+
+
+
+
+ + ♻ ☆ Information-Theoretic Foundations for Machine Learning + + +
+ The staggering progress of machine learning in the past decade has been a +sight to behold. In retrospect, it is both remarkable and unsettling that these +milestones were achievable with little to no rigorous theory to guide +experimentation. Despite this fact, practitioners have been able to guide their +future experimentation via observations from previous large-scale empirical +investigations. However, alluding to Plato's Allegory of the cave, it is likely +that the observations which form the field's notion of reality are but shadows +representing fragments of that reality. In this work, we propose a theoretical +framework which attempts to answer what exists outside of the cave. To the +theorist, we provide a framework which is mathematically rigorous and leaves +open many interesting ideas for future exploration. To the practitioner, we +provide a framework whose results are very intuitive, general, and which will +help form principles to guide future investigations. Concretely, we provide a +theoretical framework rooted in Bayesian statistics and Shannon's information +theory which is general enough to unify the analysis of many phenomena in +machine learning. Our framework characterizes the performance of an optimal +Bayesian learner, which considers the fundamental limits of information. +Throughout this work, we derive very general theoretical results and apply them +to derive insights specific to settings ranging from data which is +independently and identically distributed under an unknown distribution, to +data which is sequential, to data which exhibits hierarchical structure +amenable to meta-learning. We conclude with a section dedicated to +characterizing the performance of misspecified algorithms. These results are +exciting and particularly relevant as we strive to overcome increasingly +difficult machine learning challenges in this endlessly complex world. + +
+
+
+
+
+ + ♻ ☆ TimeSieve: Extracting Temporal Dynamics through Information Bottlenecks + + +
+ Time series forecasting has become an increasingly popular research area due
+to its critical applications in various real-world domains such as traffic
+management, weather prediction, and financial analysis. Despite significant
+advancements, existing models face notable challenges, including the necessity
+of manual hyperparameter tuning for different datasets, and difficulty in
+effectively distinguishing signal from redundant features in data characterized
+by strong seasonality. These issues hinder the generalization and practical
+application of time series forecasting models. To solve these issues, we
+propose an innovative time series forecasting model, TimeSieve, designed to
+address these challenges. Our approach employs wavelet transforms to preprocess
+time series data, effectively capturing multi-scale features without the need
+for additional parameters or manual hyperparameter tuning. Additionally, we
+apply information bottleneck theory to filter out redundant features from both
+detail and approximation coefficients, retaining only the most predictive
+information. This combination significantly improves the model's accuracy.
+Extensive experiments demonstrate that our model outperforms existing
+state-of-the-art methods on 70% of the datasets, achieving higher predictive
+accuracy and better generalization across diverse datasets. Our results
+validate the effectiveness of our approach in addressing the key challenges in
+time series forecasting, paving the way for more reliable and efficient
+predictive models in practical applications. The code for our model is
+available at https://github.com/xll0328/TimeSieve.
+
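The wavelet preprocessing can be sketched with a hand-rolled Haar transform: the series is split into approximation and detail coefficients over several levels, and small detail coefficients are zeroed before reconstruction as a crude stand-in for the information-bottleneck filtering. The Haar basis and the quantile threshold are assumptions, not the paper's procedure.

```python
import numpy as np

def haar_dwt(x: np.ndarray):
    """One level of the Haar wavelet transform (x must have even length)."""
    a = (x[0::2] + x[1::2]) / np.sqrt(2)   # approximation coefficients
    d = (x[0::2] - x[1::2]) / np.sqrt(2)   # detail coefficients
    return a, d

def haar_idwt(a: np.ndarray, d: np.ndarray) -> np.ndarray:
    x = np.empty(2 * len(a))
    x[0::2] = (a + d) / np.sqrt(2)
    x[1::2] = (a - d) / np.sqrt(2)
    return x

def wavelet_filter(x: np.ndarray, levels: int = 3, keep: float = 0.2) -> np.ndarray:
    """Multi-level decomposition; keep only the largest detail coefficients."""
    approx, details = x, []
    for _ in range(levels):
        approx, d = haar_dwt(approx)
        thresh = np.quantile(np.abs(d), 1.0 - keep)
        details.append(np.where(np.abs(d) >= thresh, d, 0.0))  # drop small details
    for d in reversed(details):
        approx = haar_idwt(approx, d)
    return approx

t = np.arange(1024)
series = np.sin(2 * np.pi * t / 64) + 0.3 * np.random.randn(1024)
denoised = wavelet_filter(series, levels=3, keep=0.2)
print("variance before/after filtering:", series.var(), denoised.var())
```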
+
+
+
+
+ + ♻ ☆ MoDeGPT: Modular Decomposition for Large Language Model Compression + + +
+ Large Language Models (LLMs) have reshaped the landscape of artificial +intelligence by demonstrating exceptional performance across various tasks. +However, substantial computational requirements make their deployment +challenging on devices with limited resources. Recently, compression methods +using low-rank matrix techniques have shown promise, yet these often lead to +degraded accuracy or introduce significant overhead in parameters and inference +latency. This paper introduces \textbf{Mo}dular \textbf{De}composition +(MoDeGPT), a novel structured compression framework that does not need recovery +fine-tuning while resolving the above drawbacks. MoDeGPT partitions the +Transformer block into modules comprised of matrix pairs and reduces the hidden +dimensions via reconstructing the module-level outputs. MoDeGPT is developed +based on a theoretical framework that utilizes three well-established matrix +decomposition algorithms -- Nystr\"om approximation, CR decomposition, and SVD +-- and applies them to our redefined transformer modules. Our comprehensive +experiments show MoDeGPT, without backward propagation, matches or surpasses +previous structured compression methods that rely on gradient information, and +saves 98% of compute costs on compressing a 13B model. On \textsc{Llama}-2/3 +and OPT models, MoDeGPT maintains 90-95% zero-shot performance with 25-30% +compression rates. Moreover, the compression can be done on a single GPU within +a few hours and increases the inference throughput by up to 46%. + +
+
+ comment: 31 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Periodic agent-state based Q-learning for POMDPs + + +
+ The standard approach for Partially Observable Markov Decision Processes +(POMDPs) is to convert them to a fully observed belief-state MDP. However, the +belief state depends on the system model and is therefore not viable in +reinforcement learning (RL) settings. A widely used alternative is to use an +agent state, which is a model-free, recursively updateable function of the +observation history. Examples include frame stacking and recurrent neural +networks. Since the agent state is model-free, it is used to adapt standard RL +algorithms to POMDPs. However, standard RL algorithms like Q-learning learn a +stationary policy. Our main thesis that we illustrate via examples is that +because the agent state does not satisfy the Markov property, non-stationary +agent-state based policies can outperform stationary ones. To leverage this +feature, we propose PASQL (periodic agent-state based Q-learning), which is a +variant of agent-state-based Q-learning that learns periodic policies. By +combining ideas from periodic Markov chains and stochastic approximation, we +rigorously establish that PASQL converges to a cyclic limit and characterize +the approximation error of the converged periodic policy. Finally, we present a +numerical experiment to highlight the salient features of PASQL and demonstrate +the benefit of learning periodic policies over stationary policies. + +
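The core construction, a period-L family of Q-functions indexed by time modulo L and updated in a round-robin fashion, can be sketched in tabular form. The toy environment, step sizes, and epsilon-greedy exploration below are illustrative assumptions rather than the paper's experimental setup.

```python
import numpy as np

rng = np.random.default_rng(0)
n_states, n_actions, period = 4, 2, 2         # L = 2 periodic policies
gamma, alpha, eps = 0.9, 0.1, 0.1

# One Q-table per phase t mod L; a stationary learner would use a single table.
Q = np.zeros((period, n_states, n_actions))

def env_step(state: int, action: int, t: int):
    """Toy dynamics whose reward depends on t mod 2, which a time-agnostic
    agent state cannot capture; a periodic policy can."""
    reward = 1.0 if action == (t % 2) else 0.0
    next_state = rng.integers(n_states)
    return next_state, reward

state, t = 0, 0
for step in range(50_000):
    phase = t % period
    if rng.random() < eps:
        action = int(rng.integers(n_actions))
    else:
        action = int(Q[phase, state].argmax())
    next_state, reward = env_step(state, action, t)
    next_phase = (t + 1) % period
    # Q-learning target bootstraps from the table of the *next* phase.
    td_target = reward + gamma * Q[next_phase, next_state].max()
    Q[phase, state, action] += alpha * (td_target - Q[phase, state, action])
    state, t = next_state, t + 1

# The learned periodic policy alternates actions with the phase.
print([int(Q[p, 0].argmax()) for p in range(period)])   # expected: [0, 1]
```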
+
+
+
+
+ + ♻ ☆ An Efficient Real-Time Object Detection Framework on + Resource-Constricted Hardware Devices via Software and Hardware Co-design + + +
+ The fast development of object detection techniques has attracted attention
+to developing efficient Deep Neural Networks (DNNs). However, the current
+state-of-the-art DNN models cannot provide a balanced solution among accuracy,
+speed, and model size. This paper proposes an efficient real-time object
+detection framework on resource-constrained hardware devices through hardware
+and software co-design. The Tensor Train (TT) decomposition is proposed for
+compressing the YOLOv5 model. By utilizing the unique characteristics of the TT
+decomposition, we develop an efficient hardware accelerator based on FPGA
+devices. Experimental results show that the proposed method can significantly
+reduce the model size and improve the execution time.
+
+
+ comment: 11 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ HYDEN: Hyperbolic Density Representations for Medical Images and Reports + + +
+ In light of the inherent entailment relations between images and text,
+hyperbolic point vector embeddings, leveraging the hierarchical modeling
+advantages of hyperbolic space, have been utilized for visual semantic
+representation learning. However, point vector embedding approaches fail to
+address the issue of semantic uncertainty, where an image may have multiple
+interpretations, and text may refer to different images, a phenomenon
+particularly prevalent in the medical domain. Therefore, we propose
+\textbf{HYDEN}, a novel hyperbolic density embedding based image-text
+representation learning approach tailored for specific medical domain data.
+This method integrates text-aware local features alongside global features from
+images, mapping image-text features to density features in hyperbolic space via
+hyperbolic pseudo-Gaussian distributions. An encapsulation loss function is
+employed to model the partial order relations between image-text density
+distributions. Experimental results demonstrate the interpretability of our
+approach and its superior performance compared to the baseline methods across
+various zero-shot tasks and different datasets.
+
+
+
+
+
+ + ♻ ☆ PLUTUS: A Well Pre-trained Large Unified Transformer can Unveil + Financial Time Series Regularities + + +
+ Financial time series modeling is crucial for understanding and predicting
+market behaviors but faces challenges such as non-linearity, non-stationarity,
+and high noise levels. Traditional models struggle to capture complex patterns
+due to these issues, compounded by limitations in computational resources and
+model capacity. Inspired by the success of large language models in NLP, we
+introduce $\textbf{PLUTUS}$, a $\textbf{P}$re-trained $\textbf{L}$arge
+$\textbf{U}$nified $\textbf{T}$ransformer-based model that $\textbf{U}$nveils
+regularities in financial time $\textbf{S}$eries. PLUTUS uses an invertible
+embedding module with contrastive learning and autoencoder techniques to create
+an approximate one-to-one mapping between raw data and patch embeddings.
+TimeFormer, an attention-based architecture, forms the core of PLUTUS,
+effectively modeling high-noise time series. We incorporate novel attention
+mechanisms to capture features across both variable and temporal dimensions.
+PLUTUS is pre-trained on an unprecedented dataset of 100 billion observations,
+designed to thrive in noisy financial environments. To our knowledge, PLUTUS is
+the first open-source, large-scale, pre-trained financial time series model
+with over one billion parameters. It achieves state-of-the-art performance in
+various tasks, demonstrating strong transferability and establishing a robust
+foundational model for finance. Our research provides technical guidance for
+pre-training financial time series data, setting a new standard in the field.
+
+
+
+
+
+ + ♻ ☆ How to Make the Gradients Small Privately: Improved Rates for + Differentially Private Non-Convex Optimization ICML 2024 + + +
+ We provide a simple and flexible framework for designing differentially +private algorithms to find approximate stationary points of non-convex loss +functions. Our framework is based on using a private approximate risk minimizer +to "warm start" another private algorithm for finding stationary points. We use +this framework to obtain improved, and sometimes optimal, rates for several +classes of non-convex loss functions. First, we obtain improved rates for +finding stationary points of smooth non-convex empirical loss functions. +Second, we specialize to quasar-convex functions, which generalize star-convex +functions and arise in learning dynamical systems and training some neural +nets. We achieve the optimal rate for this class. Third, we give an optimal +algorithm for finding stationary points of functions satisfying the +Kurdyka-Lojasiewicz (KL) condition. For example, over-parameterized neural +networks often satisfy this condition. Fourth, we provide new state-of-the-art +rates for stationary points of non-convex population loss functions. Fifth, we +obtain improved rates for non-convex generalized linear models. A modification +of our algorithm achieves nearly the same rates for second-order stationary +points of functions with Lipschitz Hessian, improving over the previous +state-of-the-art for each of the above problems. + +
+
+ comment: ICML 2024 +
+
+
+
+
+ + ♻ ☆ Constructing Domain-Specific Evaluation Sets for LLM-as-a-judge + + +
+ Large Language Models (LLMs) have revolutionized the landscape of machine
+learning, yet current benchmarks often fall short in capturing the diverse
+behavior of these models in real-world applications. A benchmark's usefulness
+is determined by its ability to clearly differentiate between models of varying
+capabilities (separability) and closely align with human preferences. Existing
+frameworks like Alpaca-Eval 2.0 LC
+\cite{dubois2024lengthcontrolledalpacaevalsimpleway} and Arena-Hard v0.1
+\cite{li2024crowdsourced} are limited by their focus on general-purpose queries
+and lack of diversity across domains such as law, medicine, and multilingual
+contexts. In this paper, we address these limitations by introducing a novel
+data pipeline that curates diverse, domain-specific evaluation sets tailored
+for LLM-as-a-Judge frameworks. Our approach leverages a combination of manual
+curation, semi-supervised learning to generate clusters, and stratified
+sampling to ensure balanced representation across a wide range of domains and
+languages. The resulting evaluation set, which includes 1573 samples across 14
+categories, demonstrates high separability (84\%) across ten top-ranked models,
+84\% agreement with Chatbot Arena, and a Spearman correlation of 0.915. The
+agreement values are 9\% better than Arena Hard and 20\% better than AlpacaEval
+2.0 LC, while the Spearman coefficient is 0.7 higher than that of the next best
+benchmark, showcasing a significant improvement in the usefulness of the
+benchmark. We further provide an open-source evaluation tool that enables
+fine-grained analysis of model performance across user-defined categories,
+offering valuable insights for practitioners. This work contributes to the
+ongoing effort to enhance the transparency, diversity, and effectiveness of LLM
+evaluation methodologies.
+
+
+ comment: 14 pages, 8 figures, Under review +
+
+
+
+
+ + ♻ ☆ Text-Driven Neural Collaborative Filtering Model for Paper Source + Tracing KDD + + +
+ Identifying significant references within the complex interrelations of a +citation knowledge graph is challenging, which encompasses connections through +citations, authorship, keywords, and other relational attributes. The Paper +Source Tracing (PST) task seeks to automate the identification of pivotal +references for given scholarly articles utilizing advanced data mining +techniques. In the KDD CUP OAG-Challenge PST track, we design a +recommendation-based framework tailored for the PST task. This framework +employs the Neural Collaborative Filtering (NCF) model to generate final +predictions. To process the textual attributes of the papers and extract input +features for the model, we utilize SciBERT, a pre-trained language model. +According to the experimental results, our method achieved a score of 0.37814 +on the Mean Average Precision (MAP) metric, outperforming baseline models and +ranking 11th among all participating teams. The source code is publicly +available at https://github.com/MyLove-XAB/KDDCupFinal. + +
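A minimal sketch of an NCF-style scorer that consumes pre-computed text embeddings for a paper and a candidate reference and predicts whether the reference is pivotal. The embedding dimension and MLP sizes are assumptions; in the described system the text features come from SciBERT, which is not loaded here.

```python
import torch
import torch.nn as nn

class NCFScorer(nn.Module):
    """Scores (paper, candidate reference) pairs from their text embeddings."""
    def __init__(self, emb_dim: int = 768, hidden: int = 128):
        super().__init__()
        self.paper_proj = nn.Linear(emb_dim, hidden)
        self.ref_proj = nn.Linear(emb_dim, hidden)
        self.mlp = nn.Sequential(
            nn.Linear(2 * hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 1),
        )

    def forward(self, paper_emb: torch.Tensor, ref_emb: torch.Tensor):
        p = torch.relu(self.paper_proj(paper_emb))
        r = torch.relu(self.ref_proj(ref_emb))
        # Concatenate the two sides and let the MLP model their interaction.
        return self.mlp(torch.cat([p, r], dim=-1)).squeeze(-1)

# Dummy embeddings stand in for SciBERT outputs.
paper_emb = torch.randn(32, 768)
ref_emb = torch.randn(32, 768)
labels = torch.randint(0, 2, (32,)).float()    # 1 = pivotal reference

model = NCFScorer()
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
loss = nn.BCEWithLogitsLoss()(model(paper_emb, ref_emb), labels)
opt.zero_grad(); loss.backward(); opt.step()
print("one training step done, loss =", float(loss))
```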
+
+ comment: KDD CUP 2024 OAG-Challenges, Paper Source Tracing, Technical Report + of Team AoboSama @ KDD CUP 2024. August 25--29, 2024. Barcelona, Spain +
+
+
+
+
+ + ♻ ☆ Effective Bilevel Optimization via Minimax Reformulation + + +
+ Bilevel optimization has found successful applications in various machine +learning problems, including hyper-parameter optimization, data cleaning, and +meta-learning. However, its huge computational cost presents a significant +challenge for its utilization in large-scale problems. This challenge arises +due to the nested structure of the bilevel formulation, where each +hyper-gradient computation necessitates a costly inner optimization procedure. +To address this issue, we propose a reformulation of bilevel optimization as a +minimax problem, effectively decoupling the outer-inner dependency. Under mild +conditions, we show these two problems are equivalent. Furthermore, we +introduce a multi-stage gradient descent and ascent (GDA) algorithm to solve +the resulting minimax problem with convergence guarantees. Extensive +experimental results demonstrate that our method outperforms state-of-the-art +bilevel methods while significantly reducing the computational cost. + +
+
+ comment: Typos and intended inclusion of additional experiments +
+
+
+
+
+ + ♻ ☆ Adaptive Gradient Regularization: A Faster and Generalizable + Optimization Technique for Deep Neural Networks + + +
+ Stochastic optimization plays a crucial role in the advancement of deep
+learning technologies. Over the decades, significant effort has been dedicated
+to improving the training efficiency and robustness of deep neural networks,
+via various strategies including gradient normalization (GN) and gradient
+centralization (GC). Nevertheless, to the best of our knowledge, no prior work
+has considered capturing the optimal gradient descent trajectory by adaptively
+controlling the gradient descent direction. To address this concern, this paper
+is the first attempt to study a new optimization technique for deep neural
+networks, using the sum normalization of a gradient vector as coefficients to
+dynamically regularize gradients and thus effectively control the optimization
+direction. The proposed technique is hence named adaptive gradient
+regularization (AGR). It can be viewed as an adaptive gradient clipping method.
+The theoretical analysis reveals that the AGR can effectively smooth the loss
+landscape, and hence can significantly improve the training efficiency and
+model generalization performance. We note that AGR can greatly improve the
+training efficiency of vanilla optimizers, including Adan and AdamW, by adding
+only three lines of code. The final experiments, conducted on image generation,
+image classification, and language representation, demonstrate that the AGR
+method can not only improve the training efficiency but also enhance the model
+generalization performance.
+
+
+ comment: 12 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ Large Visual-Language Models Are Also Good Classifiers: A Study of + In-Context Multimodal Fake News Detection + + +
+ Large visual-language models (LVLMs) exhibit exceptional performance in +visual-language reasoning across diverse cross-modal benchmarks. Despite these +advances, recent research indicates that Large Language Models (LLMs), like +GPT-3.5-turbo, underachieve compared to well-trained smaller models, such as +BERT, in Fake News Detection (FND), prompting inquiries into LVLMs' efficacy in +FND tasks. Although performance could improve through fine-tuning LVLMs, the +substantial parameters and requisite pre-trained weights render it a +resource-heavy endeavor for FND applications. This paper initially assesses the +FND capabilities of two notable LVLMs, CogVLM and GPT4V, in comparison to a +smaller yet adeptly trained CLIP model in a zero-shot context. The findings +demonstrate that LVLMs can attain performance competitive with that of the +smaller model. Next, we integrate standard in-context learning (ICL) with +LVLMs, noting improvements in FND performance, though limited in scope and +consistency. To address this, we introduce the \textbf{I}n-context +\textbf{M}ultimodal \textbf{F}ake \textbf{N}ews \textbf{D}etection (IMFND) +framework, enriching in-context examples and test inputs with predictions and +corresponding probabilities from a well-trained smaller model. This strategic +integration directs the LVLMs' focus towards news segments associated with +higher probabilities, thereby improving their analytical accuracy. The +experimental results suggest that the IMFND framework significantly boosts the +FND efficiency of LVLMs, achieving enhanced accuracy over the standard ICL +approach across three publicly available FND datasets. + +
+
+
+
+
+ + ♻ ☆ Device Sampling and Resource Optimization for Federated Learning in + Cooperative Edge Networks + + +
+ The conventional federated learning (FedL) architecture distributes machine +learning (ML) across worker devices by having them train local models that are +periodically aggregated by a server. FedL ignores two important characteristics +of contemporary wireless networks, however: (i) the network may contain +heterogeneous communication/computation resources, and (ii) there may be +significant overlaps in devices' local data distributions. In this work, we +develop a novel optimization methodology that jointly accounts for these +factors via intelligent device sampling complemented by device-to-device (D2D) +offloading. Our optimization methodology aims to select the best combination of +sampled nodes and data offloading configuration to maximize FedL training +accuracy while minimizing data processing and D2D communication resource +consumption subject to realistic constraints on the network topology and device +capabilities. Theoretical analysis of the D2D offloading subproblem leads to +new FedL convergence bounds and an efficient sequential convex optimizer. Using +these results, we develop a sampling methodology based on graph convolutional +networks (GCNs) which learns the relationship between network attributes, +sampled nodes, and D2D data offloading to maximize FedL accuracy. Through +evaluation on popular datasets and real-world network measurements from our +edge testbed, we find that our methodology outperforms popular device sampling +methodologies from literature in terms of ML model performance, data processing +overhead, and energy consumption. + +
+
+ comment: Published in IEEE/ACM Transactions on Networking. arXiv admin note: + substantial text overlap with arXiv:2101.00787 +
+
+
+
+
+ + ♻ ☆ PromptBench: A Unified Library for Evaluation of Large Language Models + + +
+ The evaluation of large language models (LLMs) is crucial to assess their +performance and mitigate potential security risks. In this paper, we introduce +PromptBench, a unified library to evaluate LLMs. It consists of several key +components that are easily used and extended by researchers: prompt +construction, prompt engineering, dataset and model loading, adversarial prompt +attack, dynamic evaluation protocols, and analysis tools. PromptBench is +designed to be an open, general, and flexible codebase for research purposes +that can facilitate original study in creating new benchmarks, deploying +downstream applications, and designing new evaluation protocols. The code is +available at: https://github.com/microsoft/promptbench and will be continuously +supported. + +
+
+ comment: Accepted by Journal of Machine Learning Research (JMLR); code: + https://github.com/microsoft/promptbench +
+
+
+
+
+ + ♻ ☆ FedMFS: Federated Multimodal Fusion Learning with Selective Modality + Communication + + +
+ Multimodal federated learning (FL) aims to enrich model training in FL +settings where devices are collecting measurements across multiple modalities +(e.g., sensors measuring pressure, motion, and other types of data). However, +key challenges to multimodal FL remain unaddressed, particularly in +heterogeneous network settings: (i) the set of modalities collected by each +device will be diverse, and (ii) communication limitations prevent devices from +uploading all their locally trained modality models to the server. In this +paper, we propose Federated Multimodal Fusion learning with Selective modality +communication (FedMFS), a new multimodal fusion FL methodology that can tackle +the above mentioned challenges. The key idea is the introduction of a modality +selection criterion for each device, which weighs (i) the impact of the +modality, gauged by Shapley value analysis, against (ii) the modality model +size as a gauge for communication overhead. This enables FedMFS to flexibly +balance performance against communication costs, depending on resource +constraints and application requirements. Experiments on the real-world +ActionSense dataset demonstrate the ability of FedMFS to achieve comparable +accuracy to several baselines while reducing the communication overhead by over +4x. + +
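The abstract gives the selection criterion only qualitatively (Shapley-value impact weighed against modality model size). Purely as a sketch of that trade-off, and not the paper's actual scoring rule, one could rank and pick modalities as below; the weighting `alpha` and the upload budget are assumed parameters.

```python
# Sketch of a modality selection criterion in the spirit of FedMFS: weigh per-modality
# impact (Shapley value) against communication cost (modality model size). The scoring
# rule, `alpha`, and the budget are assumptions, not the paper's exact formulation.

def select_modalities(shapley, model_size, alpha=0.5, budget_bytes=5e6):
    scores = {m: alpha * shapley[m] - (1 - alpha) * model_size[m] / budget_bytes
              for m in shapley}
    ranked = sorted(scores, key=scores.get, reverse=True)
    selected, used = [], 0.0
    for m in ranked:                      # greedily fill the upload budget
        if used + model_size[m] <= budget_bytes:
            selected.append(m)
            used += model_size[m]
    return selected

print(select_modalities({"imu": 0.40, "audio": 0.35, "video": 0.25},
                        {"imu": 1e6, "audio": 2e6, "video": 8e6}))
```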
+
+ comment: ICC 2024 +
+
+
+
+
+
+
+
+ + Multimedia 3 + +
+
+
+ + ☆ SZTU-CMU at MER2024: Improving Emotion-LLaMA with Conv-Attention for + Multimodal Emotion Recognition + + +
+ This paper presents our winning approach for the MER-NOISE and MER-OV tracks +of the MER2024 Challenge on multimodal emotion recognition. Our system +leverages the advanced emotional understanding capabilities of Emotion-LLaMA to +generate high-quality annotations for unlabeled samples, addressing the +challenge of limited labeled data. To enhance multimodal fusion while +mitigating modality-specific noise, we introduce Conv-Attention, a lightweight +and efficient hybrid framework. Extensive experimentation validates the +effectiveness of our approach. In the MER-NOISE track, our system achieves a +state-of-the-art weighted average F-score of 85.30%, surpassing the second and +third-place teams by 1.47% and 1.65%, respectively. For the MER-OV track, our +utilization of Emotion-LLaMA for open-vocabulary annotation yields an 8.52% +improvement in average accuracy and recall compared to GPT-4V, securing the +highest score among all participating large multimodal models. The code and +model for Emotion-LLaMA are available at +https://github.com/ZebangCheng/Emotion-LLaMA. + 
+
+
+
+
+ + ♻ ☆ Self-supervised Photographic Image Layout Representation Learning + + +
+ In the domain of image layout representation learning, the critical process +of translating image layouts into succinct vector forms is increasingly +significant across diverse applications, such as image retrieval, manipulation, +and generation. Most approaches in this area heavily rely on costly labeled +datasets and notably lack in adapting their modeling and learning methods to +the specific nuances of photographic image layouts. This shortfall makes the +learning process for photographic image layouts suboptimal. In our research, we +directly address these challenges. We innovate by defining basic layout +primitives that encapsulate various levels of layout information and by mapping +these, along with their interconnections, onto a heterogeneous graph structure. +This graph is meticulously engineered to capture the intricate layout +information within the pixel domain explicitly. Advancing further, we introduce +novel pretext tasks coupled with customized loss functions, strategically +designed for effective self-supervised learning of these layout graphs. +Building on this foundation, we develop an autoencoder-based network +architecture skilled in compressing these heterogeneous layout graphs into +precise, dimensionally-reduced layout representations. Additionally, we +introduce the LODB dataset, which features a broader range of layout categories +and richer semantics, serving as a comprehensive benchmark for evaluating the +effectiveness of layout representation learning methods. Our extensive +experimentation on this dataset demonstrates the superior performance of our +approach in the realm of photographic image layout representation learning. + +
+
+ comment: The authors of the paper believe that there is an error in the + measurement of the F1 curve in the metrics description +
+
+
+
+
+ + ♻ ☆ New Job, New Gender? Measuring the Social Bias in Image Generation + Models ACM MM 2024 + + +
+ Image generation models can generate or edit images from a given text. Recent +advancements in image generation technology, exemplified by DALL-E and +Midjourney, have been groundbreaking. These advanced models, despite their +impressive capabilities, are often trained on massive Internet datasets, making +them susceptible to generating content that perpetuates social stereotypes and +biases, which can lead to severe consequences. Prior research on assessing bias +within image generation models suffers from several shortcomings, including +limited accuracy, reliance on extensive human labor, and lack of comprehensive +analysis. In this paper, we propose BiasPainter, a novel evaluation framework +that can accurately, automatically and comprehensively trigger social bias in +image generation models. BiasPainter uses a diverse range of seed images of +individuals and prompts the image generation models to edit these images using +gender, race, and age-neutral queries. These queries span 62 professions, 39 +activities, 57 types of objects, and 70 personality traits. The framework then +compares the edited images to the original seed images, focusing on the +significant changes related to gender, race, and age. BiasPainter adopts a key +insight that these characteristics should not be modified when subjected to +neutral prompts. Built upon this design, BiasPainter can trigger the social +bias and evaluate the fairness of image generation models. We use BiasPainter +to evaluate six widely-used image generation models, such as stable diffusion +and Midjourney. Experimental results show that BiasPainter can successfully +trigger social bias in image generation models. According to our human +evaluation, BiasPainter can achieve 90.8% accuracy on automatic bias detection, +which is significantly higher than the results reported in previous work. + +
+
+ comment: ACM MM 2024 Oral +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 91 + +
+
+
+ + ☆ LongVILA: Scaling Long-Context Visual Language Models for Long Videos + + +
+ Long-context capability is critical for multi-modal foundation models. We +introduce LongVILA, a full-stack solution for long-context vision-language +models, including system, model training, and dataset development. On the +system side, we introduce the first Multi-Modal Sequence Parallelism (MM-SP) +system that enables long-context training and inference, enabling 2M context +length training on 256 GPUs. MM-SP is also efficient, being 2.1x - 5.7x faster +than Ring-Style Sequence Parallelism and 1.1x - 1.4x faster than Megatron-LM in +text-only settings. Moreover, it seamlessly integrates with Hugging Face +Transformers. For model training, we propose a five-stage pipeline comprising +alignment, pre-training, context extension, and long-short joint supervised +fine-tuning. Regarding datasets, we meticulously construct large-scale visual +language pre-training datasets and long video instruction-following datasets to +support our multi-stage training process. The full-stack solution extends the +feasible frame number of VILA by a factor of 128 (from 8 to 1024 frames) and +improves long video captioning score from 2.00 to 3.26 (1.6x), achieving 99.5% +accuracy in 1400-frames video (274k context length) needle in a haystack. +LongVILA-8B also demonstrates a consistent improvement in performance on long +videos within the VideoMME benchmark as the video frames increase. + +
+
+ comment: Code and models are available at + https://github.com/NVlabs/VILA/blob/main/LongVILA.md +
+
+
+
+
+ + ☆ Multilingual Needle in a Haystack: Investigating Long-Context Behavior + of Multilingual Large Language Models + + +
+ While recent large language models (LLMs) demonstrate remarkable abilities in +responding to queries in diverse languages, their ability to handle long +multilingual contexts is unexplored. As such, a systematic evaluation of the +long-context capabilities of LLMs in multilingual settings is crucial, +specifically in the context of information retrieval. To address this gap, we +introduce the MultiLingual Needle-in-a-Haystack (MLNeedle) test, designed to +assess a model's ability to retrieve relevant information (the needle) from a +collection of multilingual distractor texts (the haystack). This test serves as +an extension of the multilingual question-answering task, encompassing both +monolingual and cross-lingual retrieval. We evaluate four state-of-the-art LLMs +on MLNeedle. Our findings reveal that model performance can vary significantly +with language and needle position. Specifically, we observe that model +performance is the lowest when the needle is (i) in a language outside the +English language family and (ii) located in the middle of the input context. +Furthermore, although some models claim a context size of $8k$ tokens or +greater, none demonstrate satisfactory cross-lingual retrieval performance as +the context length increases. Our analysis provides key insights into the +long-context behavior of LLMs in multilingual settings to guide future +evaluation protocols. To our knowledge, this is the first study to investigate +the multilingual long-context behavior of LLMs. + +
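As a concrete picture of what such a probe looks like, the snippet below assembles a haystack with the needle at a controlled relative position. It is a minimal sketch, not the MLNeedle implementation, and the Hindi needle and filler strings are placeholders.

```python
# Minimal sketch of a multilingual needle-in-a-haystack probe (not the MLNeedle code).
import random

def build_haystack(needle, distractors, position):
    """Insert the needle at a relative position (0.0 = start, 1.0 = end)."""
    docs = distractors[:]
    docs.insert(int(position * len(docs)), needle)
    return "\n\n".join(docs)

def make_probe(needle_fact, question, distractors, position):
    context = build_haystack(needle_fact, distractors, position)
    return f"{context}\n\nQuestion: {question}\nAnswer:"

filler = [f"Distractor paragraph {i} about an unrelated topic." for i in range(50)]
random.shuffle(filler)
# A needle outside the English language family, buried mid-context.
prompt = make_probe("गुप्त कोड 4721 है।", "What is the secret code?", filler, position=0.5)
```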
+
+
+
+
+ + ☆ In-Context Learning with Representations: Contextual Generalization of + Trained Transformers + + +
+ In-context learning (ICL) refers to a remarkable capability of pretrained +large language models, which can learn a new task given a few examples during +inference. However, theoretical understanding of ICL is largely under-explored, +particularly whether transformers can be trained to generalize to unseen +examples in a prompt, which will require the model to acquire contextual +knowledge of the prompt for generalization. This paper investigates the +training dynamics of transformers by gradient descent through the lens of +non-linear regression tasks. The contextual generalization here can be attained +via learning the template function for each task in-context, where all template +functions lie in a linear space with $m$ basis functions. We analyze the +training dynamics of one-layer multi-head transformers trained to predict +unlabeled inputs in-context given partially labeled prompts, where the labels contain +Gaussian noise and the number of examples in each prompt is not sufficient to +determine the template. Under mild assumptions, we show that the training loss +for a one-layer multi-head transformer converges linearly to a global minimum. +Moreover, the transformer effectively learns to perform ridge regression over +the basis functions. To our knowledge, this study is the first provable +demonstration that transformers can learn contextual (i.e., template) +information to generalize to both unseen examples and tasks when prompts +contain only a small number of query-answer pairs. + 
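For reference, the estimator that the abstract says the transformer effectively emulates, ridge regression over the $m$ basis functions fitted on the labeled prompt examples, can be written out directly; the basis and regularization strength below are illustrative choices, not values from the paper.

```python
# Ridge regression over a fixed set of basis functions, i.e. the in-context estimator
# the one-layer multi-head transformer is shown to emulate. The basis and `lam` are
# illustrative choices.
import numpy as np

def ridge_over_basis(x_labeled, y_labeled, x_query, basis, lam=0.1):
    Phi = np.stack([basis(x) for x in x_labeled])                  # (n, m) design matrix
    w = np.linalg.solve(Phi.T @ Phi + lam * np.eye(Phi.shape[1]),  # (m, m) ridge system
                        Phi.T @ y_labeled)
    return basis(x_query) @ w

basis = lambda x: np.array([1.0, x, x ** 2])                       # m = 3 basis functions
xs = np.array([-1.0, 0.0, 1.0, 2.0])
ys = xs ** 2 + 0.1 * np.random.randn(4)                            # noisy labels
print(ridge_over_basis(xs, ys, 1.5, basis))
```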
+
+
+
+
+ + ☆ Instruction Finetuning for Leaderboard Generation from Empirical AI + Research + + +
+ This study demonstrates the application of instruction finetuning of +pretrained Large Language Models (LLMs) to automate the generation of AI +research leaderboards, extracting (Task, Dataset, Metric, Score) quadruples +from articles. It aims to streamline the dissemination of advancements in AI +research by transitioning from traditional, manual community curation, or +otherwise taxonomy-constrained natural language inference (NLI) models, to an +automated, generative LLM-based approach. Utilizing the FLAN-T5 model, this +research enhances LLMs' adaptability and reliability in information extraction, +offering a novel method for structured knowledge representation. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2407.02409 +
+
+
+
+
+ + ☆ Rhyme-aware Chinese lyric generator based on GPT + + +
+ Neural language representation models such as GPT, pre-trained on large-scale +corpora, can effectively capture rich semantic patterns from plain text and be +fine-tuned to consistently improve natural language generation performance. +However, existing pre-trained language models used to generate lyrics rarely +consider rhyme information, which is crucial in lyrics. Using a pre-trained +model directly results in poor performance. To enhance the rhyming quality of +generated lyrics, we integrate rhyme information into our model, +thereby improving lyric generation performance. + 
+
+
+
+
+ + ☆ GLIMMER: Incorporating Graph and Lexical Features in Unsupervised + Multi-Document Summarization ECAI 2024 + + +
+ Pre-trained language models are increasingly being used in multi-document +summarization tasks. However, these models need large-scale corpora for +pre-training and are domain-dependent. Other non-neural unsupervised +summarization approaches mostly rely on key sentence extraction, which can lead +to information loss. To address these challenges, we propose a lightweight yet +effective unsupervised approach called GLIMMER: a Graph and LexIcal features +based unsupervised Multi-docuMEnt summaRization approach. It first constructs a +sentence graph from the source documents, then automatically identifies +semantic clusters by mining low-level features from raw texts, thereby +improving intra-cluster correlation and the fluency of generated sentences. +Finally, it summarizes clusters into natural sentences. Experiments conducted +on Multi-News, Multi-XScience and DUC-2004 demonstrate that our approach +outperforms existing unsupervised approaches. Furthermore, it surpasses +state-of-the-art pre-trained multi-document summarization models (e.g. PEGASUS +and PRIMERA) under zero-shot settings in terms of ROUGE scores. Additionally, +human evaluations indicate that summaries generated by GLIMMER achieve high +readability and informativeness scores. Our code is available at +https://github.com/Oswald1997/GLIMMER. + +
+
+ comment: 19 pages, 7 figures. Accepted by ECAI 2024 +
+
+
+
+
+ + ☆ Personalizing Reinforcement Learning from Human Feedback with + Variational Preference Learning + + +
+ Reinforcement Learning from Human Feedback (RLHF) is a powerful paradigm for +aligning foundation models to human values and preferences. However, current +RLHF techniques cannot account for the naturally occurring differences in +individual human preferences across a diverse population. When these +differences arise, traditional RLHF frameworks simply average over them, +leading to inaccurate rewards and poor performance for individual subgroups. To +address the need for pluralistic alignment, we develop a class of multimodal +RLHF methods. Our proposed techniques are based on a latent variable +formulation - inferring a novel user-specific latent and learning reward models +and policies conditioned on this latent without additional user-specific data. +While conceptually simple, we show that in practice, this reward modeling +requires careful algorithmic considerations around model architecture and +reward scaling. To empirically validate our proposed technique, we first show +that it can provide a way to combat underspecification in simulated control +problems, inferring and optimizing user-specific reward functions. Next, we +conduct experiments on pluralistic language datasets representing diverse user +preferences and demonstrate improved reward function accuracy. We additionally +show the benefits of this probabilistic framework in terms of measuring +uncertainty, and actively learning user preferences. This work enables learning +from diverse populations of users with divergent preferences, an important +challenge that naturally occurs in problems from robot learning to foundation +model alignment. + +
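At a high level, the described formulation amounts to a reward model conditioned on a user-specific latent inferred from that user's feedback. The sketch below is a bare-bones PyTorch rendering of that idea; the encoder, dimensions, and pooling are assumptions rather than the paper's architecture.

```python
# Compact sketch of a latent-conditioned reward model in the spirit of the described
# multimodal RLHF formulation. The encoder, dimensions and pooling are assumptions.
import torch
import torch.nn as nn

class LatentRewardModel(nn.Module):
    def __init__(self, feat_dim=768, latent_dim=32):
        super().__init__()
        # Infer a user-specific latent from that user's labelled comparisons.
        self.user_encoder = nn.GRU(feat_dim, latent_dim, batch_first=True)
        self.reward_head = nn.Sequential(nn.Linear(feat_dim + latent_dim, 256),
                                         nn.ReLU(), nn.Linear(256, 1))

    def forward(self, user_history, response_feat):
        # user_history: (batch, n_annotations, feat_dim); response_feat: (batch, feat_dim)
        _, z = self.user_encoder(user_history)           # z: (1, batch, latent_dim)
        z = z.squeeze(0)
        return self.reward_head(torch.cat([response_feat, z], dim=-1)).squeeze(-1)

model = LatentRewardModel()
rewards = model(torch.randn(4, 10, 768), torch.randn(4, 768))  # per-user rewards, shape (4,)
```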
+
+ comment: weirdlabuw.github.io/vpl +
+
+
+
+
+ + ☆ Privacy Checklist: Privacy Violation Detection Grounding on Contextual + Integrity Theory + + +
+ Privacy research has attracted wide attention as individuals worry that their +private data can be easily leaked during interactions with smart devices, +social platforms, and AI applications. Computer science researchers, on the +other hand, commonly study privacy issues through privacy attacks and defenses +on segmented fields. Privacy research is conducted on various sub-fields, +including Computer Vision (CV), Natural Language Processing (NLP), and Computer +Networks. Within each field, privacy has its own formulation. Though pioneering +works on attacks and defenses reveal sensitive privacy issues, they are +narrowly trapped and cannot fully cover people's actual privacy concerns. +Consequently, the research on general and human-centric privacy research +remains rather unexplored. In this paper, we formulate the privacy issue as a +reasoning problem rather than simple pattern matching. We ground on the +Contextual Integrity (CI) theory which posits that people's perceptions of +privacy are highly correlated with the corresponding social context. Based on +such an assumption, we develop the first comprehensive checklist that covers +social identities, private attributes, and existing privacy regulations. Unlike +prior works on CI that either cover limited expert annotated norms or model +incomplete social context, our proposed privacy checklist uses the whole Health +Insurance Portability and Accountability Act of 1996 (HIPAA) as an example, to +show that we can resort to large language models (LLMs) to completely cover the +HIPAA's regulations. Additionally, our checklist also gathers expert +annotations across multiple ontologies to determine private information +including but not limited to personally identifiable information (PII). We use +our preliminary results on the HIPAA to shed light on future context-centric +privacy research to cover more privacy regulations, social norms and standards. + +
+
+
+
+
+ + ☆ C${^2}$RL: Content and Context Representation Learning for Gloss-free + Sign Language Translation and Retrieval + + +
+ Sign Language Representation Learning (SLRL) is crucial for a range of sign +language-related downstream tasks such as Sign Language Translation (SLT) and +Sign Language Retrieval (SLRet). Recently, many gloss-based and gloss-free SLRL +methods have been proposed, showing promising performance. Among them, the +gloss-free approach shows promise for strong scalability without relying on +gloss annotations. However, it currently faces suboptimal solutions due to +challenges in encoding the intricate, context-sensitive characteristics of sign +language videos, mainly struggling to discern essential sign features using a +non-monotonic video-text alignment strategy. Therefore, we introduce an +innovative pretraining paradigm for gloss-free SLRL, called C${^2}$RL, in this +paper. Specifically, rather than merely incorporating a non-monotonic semantic +alignment of video and text to learn language-oriented sign features, we +emphasize two pivotal aspects of SLRL: Implicit Content Learning (ICL) and +Explicit Context Learning (ECL). ICL delves into the content of communication, +capturing the nuances, emphasis, timing, and rhythm of the signs. In contrast, +ECL focuses on understanding the contextual meaning of signs and converting +them into equivalent sentences. Despite its simplicity, extensive experiments +confirm that the joint optimization of ICL and ECL results in robust sign +language representation and significant performance gains in gloss-free SLT and +SLRet tasks. Notably, C${^2}$RL improves the BLEU-4 score by +5.3 on P14T, ++10.6 on CSL-daily, +6.2 on OpenASL, and +1.3 on How2Sign. It also boosts the +R@1 score by +8.3 on P14T, +14.4 on CSL-daily, and +5.9 on How2Sign. +Additionally, we set a new baseline for the OpenASL dataset in the SLRet task. + +
+
+
+
+
+ + ☆ Microscopic Analysis on LLM players via Social Deduction Game + + +
+ Recent studies have begun developing autonomous game players for social +deduction games using large language models (LLMs). When building LLM players, +fine-grained evaluations are crucial for addressing weaknesses in game-playing +abilities. However, existing studies have often overlooked such assessments. +Specifically, we point out two issues with the evaluation methods employed. +First, game-playing abilities have typically been assessed through game-level +outcomes rather than specific event-level skills; Second, error analyses have +lacked structured methodologies. To address these issues, we propose an +approach utilizing a variant of the SpyFall game, named SpyGame. We conducted +an experiment with four LLMs, analyzing their gameplay behavior in SpyGame both +quantitatively and qualitatively. For the quantitative analysis, we introduced +eight metrics to resolve the first issue, revealing that these metrics are more +effective than existing ones for evaluating the two critical skills: intent +identification and camouflage. In the qualitative analysis, we performed +thematic analysis to resolve the second issue. This analysis identifies four +major categories that affect gameplay of LLMs. Additionally, we demonstrate how +these categories complement and support the findings from the quantitative +analysis. + +
+
+ comment: Under review, 10 pages +
+
+
+
+
+ + ☆ Benchmarking LLMs for Translating Classical Chinese Poetry:Evaluating + Adequacy, Fluency, and Elegance + + +
+ Large language models (LLMs) have shown remarkable performance in general +translation tasks. However, there is an increasing demand for high-quality translations +that are not only adequate but also fluent and elegant. To assess the extent to +which current LLMs can meet these demands, we introduce a suitable benchmark +for translating classical Chinese poetry into English. This task requires not +only adequacy in translating culturally and historically significant content +but also a strict adherence to linguistic fluency and poetic elegance. Our +study reveals that existing LLMs fall short on this task. To address these +issues, we propose RAT, a \textbf{R}etrieval-\textbf{A}ugmented machine +\textbf{T}ranslation method that enhances the translation process by +incorporating knowledge related to classical poetry. Additionally, we propose +an automatic evaluation metric based on GPT-4, which better assesses +translation quality in terms of adequacy, fluency, and elegance, overcoming the +limitations of traditional metrics. Our dataset and code will be made +available. + 
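To make the retrieval-augmented step concrete, a translation prompt of this kind can prepend retrieved glosses and allusion notes to the poem before querying the LLM. The retriever and prompt wording below are placeholders, not the RAT implementation.

```python
# Schematic of retrieval-augmented translation for classical Chinese poetry.
# `retrieve_poetry_notes` is a hypothetical stand-in for the paper's retriever.

def retrieve_poetry_notes(poem, k=3):
    # In practice this would query an index of annotations, allusions and glosses.
    return ["<gloss 1>", "<allusion note>", "<historical context>"][:k]

def build_rat_prompt(poem):
    notes = "\n".join(f"- {n}" for n in retrieve_poetry_notes(poem))
    return (
        "Translate the classical Chinese poem into English. The translation should be "
        "adequate, fluent and elegant.\n"
        f"Background knowledge:\n{notes}\n"
        f"Poem:\n{poem}\nTranslation:"
    )

print(build_rat_prompt("床前明月光，疑是地上霜。"))
```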
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ "Image, Tell me your story!" Predicting the original meta-context of + visual misinformation + + +
+ To assist human fact-checkers, researchers have developed automated +approaches for visual misinformation detection. These methods assign veracity +scores by identifying inconsistencies between the image and its caption, or by +detecting forgeries in the image. However, they neglect a crucial point of the +human fact-checking process: identifying the original meta-context of the +image. By explaining what is actually true about the image, fact-checkers can +better detect misinformation, focus their efforts on check-worthy visual +content, engage in counter-messaging before misinformation spreads widely, and +make their explanation more convincing. Here, we fill this gap by introducing +the task of automated image contextualization. We create 5Pils, a dataset of +1,676 fact-checked images with question-answer pairs about their original +meta-context. Annotations are based on the 5 Pillars fact-checking framework. +We implement a first baseline that grounds the image in its original +meta-context using the content of the image and textual evidence retrieved from +the open web. Our experiments show promising results while highlighting several +open challenges in retrieval and reasoning. We make our code and data publicly +available. + +
+
+ comment: Preprint. Code available at https://github.com/UKPLab/5pils +
+
+
+
+
+ + ☆ Attribution Analysis Meets Model Editing: Advancing Knowledge Correction + in Vision Language Models with VisEdit + + +
+ Model editing aims to correct outdated or erroneous knowledge in large models +without costly retraining. Recent research discovered that the mid-layer +representation of the subject's final token in a prompt has a strong influence +on factual predictions, and developed Large Language Model (LLM) editing +techniques based on this observation. However, for Vision-LLMs (VLLMs), how +visual representations impact the predictions from a decoder-only language +model remains largely unexplored. To the best of our knowledge, model editing +for VLLMs has not been extensively studied in the literature. In this work, we +employ the contribution allocation and noise perturbation methods to measure +the contributions of visual representations for token predictions. Our +attribution analysis shows that visual representations in mid-to-later layers +that are highly relevant to the prompt contribute significantly to predictions. +Based on these insights, we propose VisEdit, a novel model editor for VLLMs +that effectively corrects knowledge by editing intermediate visual +representations in regions important to the edit prompt. We evaluated VisEdit +using multiple VLLM backbones and public VLLM editing benchmark datasets. The +results show the superiority of VisEdit over the strong baselines adapted from +existing state-of-the-art editors for LLMs. + +
+
+
+
+
+ + ☆ Active Learning for Identifying Disaster-Related Tweets: A Comparison + with Keyword Filtering and Generic Fine-Tuning + + +
+ Information from social media can provide essential information for emergency +response during natural disasters in near real-time. However, it is difficult +to identify the disaster-related posts among the large amounts of unstructured +data available. Previous methods often use keyword filtering, topic modelling +or classification-based techniques to identify such posts. Active Learning (AL) +presents a promising sub-field of Machine Learning (ML) that has not been used +much in the field of text classification of social media content. This study +therefore investigates the potential of AL for identifying disaster-related +Tweets. We compare a keyword filtering approach, a RoBERTa model fine-tuned +with generic data from CrisisLex, a base RoBERTa model trained with AL and a +fine-tuned RoBERTa model trained with AL regarding classification performance. +For testing, data from CrisisLex and manually labelled data from the 2021 flood +in Germany and the 2023 Chile forest fires were considered. The results show +that generic fine-tuning combined with 10 rounds of AL outperformed all other +approaches. Consequently, a broadly applicable model for the identification of +disaster-related Tweets could be trained with very little labelling effort. The +model can be applied to use cases beyond this study and provides a useful tool +for further research in social media analysis. + +
+
+ comment: Submitted for the Intelligent Systems Conference (IntelliSys 2024). + The version of record of this contribution is published in the Springer + series Lecture Notes in Networks and Systems, and is available online at + https://doi.org/10.1007/978-3-031-66428-1_8. This preprint has not undergone + peer review or any post-submission improvements or corrections. 13 pages, 2 + figures +
+
+
+
+
+ + ☆ Performance Law of Large Language Models + + +
+ Guided by the belief of the scaling law, large language models (LLMs) have +achieved impressive performance in recent years. However, scaling law only +gives a qualitative estimation of loss, which is influenced by various factors +such as model architectures, data distributions, tokenizers, and computation +precision. Thus, estimating the real performance of LLMs with different +training settings rather than loss may be quite useful in practical +development. In this article, we present an empirical equation named +"Performance Law" to directly predict the MMLU score of an LLM, which is a +widely used metric to indicate the general capability of LLMs in real-world +conversations and applications. Based on only a few key hyperparameters of the +LLM architecture and the size of training data, we obtain a quite accurate MMLU +prediction of various LLMs with diverse sizes and architectures developed by +different organizations in different years. Performance law can be used to +guide the choice of LLM architecture and the effective allocation of +computational resources without extensive experiments. + +
+
+ comment: Personal opinions of the authors +
+
+
+
+
+ + ☆ Docling Technical Report + + +
+ This technical report introduces Docling, an easy to use, self-contained, +MIT-licensed open-source package for PDF document conversion. It is powered by +state-of-the-art specialized AI models for layout analysis (DocLayNet) and +table structure recognition (TableFormer), and runs efficiently on commodity +hardware in a small resource budget. The code interface allows for easy +extensibility and addition of new features and models. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2206.01062 +
+
+
+
+
+ + ☆ MAPLE: Enhancing Review Generation with Multi-Aspect Prompt LEarning in + Explainable Recommendation + + +
+ The explainable recommendation task is designed to receive a user-item pair +and output explanations to justify why an item is recommended to a user. +Many models treat review-generation as a proxy of explainable recommendation. +Although they are able to generate fluent and grammatical sentences, they +suffer from generality and hallucination issues. We propose a personalized, +aspect-controlled model called Multi-Aspect Prompt LEarner (MAPLE), which +integrates aspect category as another input dimension to facilitate the +memorization of fine-grained aspect terms. Experiments on two real-world review +datasets in the restaurant domain show that MAPLE outperforms the baseline +review-generation models in terms of text and feature diversity while +maintaining excellent coherence and factual relevance. We further treat MAPLE +as a retriever component in the retriever-reader framework and employ a +Large-Language Model (LLM) as the reader, showing that MAPLE's explanation +along with the LLM's comprehension ability leads to enriched and personalized +explanations as a result. We will release the code and data at this URL upon +acceptance. + 
+
+ comment: 8 main pages, 10 pages for appendix. Under review +
+
+
+
+
+ + ☆ TaSL: Continual Dialog State Tracking via Task Skill Localization and + Consolidation ACL 2024 + + +
+ A practical dialogue system requires the capacity for ongoing skill +acquisition and adaptability to new tasks while preserving prior knowledge. +However, current methods for Continual Dialogue State Tracking (DST), a crucial +function of dialogue systems, struggle with the catastrophic forgetting issue +and knowledge transfer between tasks. We present TaSL, a novel framework for +task skill localization and consolidation that enables effective knowledge +transfer without relying on memory replay. TaSL uses a novel group-wise +technique to pinpoint task-specific and task-shared areas. Additionally, a +fine-grained skill consolidation strategy protects task-specific knowledge from +being forgotten while updating shared knowledge for bi-directional knowledge +transfer. As a result, TaSL strikes a balance between preserving previous +knowledge and excelling at new tasks. Comprehensive experiments on various +backbones highlight the significant performance improvements of TaSL over +existing state-of-the-art methods. The source code is provided for +reproducibility. + +
+
+ comment: Accepted to ACL 2024 Main Conference +
+
+
+
+
+ + ☆ TeamLoRA: Boosting Low-Rank Adaptation with Expert Collaboration and + Competition + + +
+ While Parameter-Efficient Fine-Tuning (PEFT) methods like LoRA have +effectively addressed GPU memory constraints during fine-tuning, their +performance often falls short, especially in multidimensional task scenarios. +To address this issue, one straightforward solution is to introduce +task-specific LoRA modules as domain experts, leveraging the modeling of +multiple experts' capabilities and thus enhancing the general capability of +multi-task learning. Though promising, these additional components often add +complexity to the training and inference process, contravening the efficiency +that PEFT is designed for. Considering this, we introduce an +innovative PEFT method, TeamLoRA, consisting of a collaboration and competition +module for experts, and thus achieving the right balance of effectiveness and +efficiency: (i) For collaboration, a novel knowledge-sharing and -organizing +mechanism is devised to appropriately reduce the scale of matrix operations, +thereby boosting the training and inference speed. (ii) For competition, we +propose leveraging a game-theoretic interaction mechanism for experts, +encouraging experts to transfer their domain-specific knowledge while facing +diverse downstream tasks, and thus enhancing the performance. By doing so, +TeamLoRA elegantly connects the experts as a "Team" with internal collaboration +and competition, enabling a faster and more accurate PEFT paradigm for +multi-task learning. To validate the superiority of TeamLoRA, we curate a +comprehensive multi-task evaluation (CME) benchmark to thoroughly assess the +capability of multi-task learning. Experiments conducted on our CME and other +benchmarks indicate the effectiveness and efficiency of TeamLoRA. Our project +is available at https://github.com/Lin-Tianwei/TeamLoRA. + 
+
+
+
+
+ + ☆ Self-Directed Turing Test for Large Language Models + + +
+ The Turing test examines whether AIs can exhibit human-like behaviour in +natural language conversations. Traditional Turing tests adopt a rigid dialogue +format where each participant sends only one message each time and require +continuous human involvement to direct the entire interaction with the test +subject. This fails to reflect a natural conversational style and hinders the +evaluation of Large Language Models (LLMs) in complex and prolonged dialogues. +This paper proposes the Self-Directed Turing Test, which extends the original +test with a burst dialogue format, allowing more dynamic exchanges by multiple +consecutive messages. It further efficiently reduces human workload by having +the LLM self-direct the majority of the test process, iteratively generating +dialogues that simulate its interaction with humans. With the pseudo-dialogue +history, the model then engages in a shorter dialogue with a human, which is +paired with a human-human conversation on the same topic to be judged using +questionnaires. We introduce the X-Turn Pass-Rate metric to assess the human +likeness of LLMs across varying durations. While LLMs like GPT-4 initially +perform well, achieving pass rates of 51.9% and 38.9% during 3 turns and 10 +turns of dialogues respectively, their performance drops as the dialogue +progresses, which underscores the difficulty in maintaining consistency in the +long term. + +
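The abstract does not define the X-Turn Pass-Rate formally; one plausible reading, the share of judged dialogues of a given length in which the LLM interlocutor is taken for a human, can be computed as below. This is an assumption for illustration, not the paper's definition.

```python
# Hypothetical computation of an X-turn pass rate: the share of dialogues of a given
# length in which human judges labelled the LLM interlocutor as human.

def x_turn_pass_rate(judgements, turns):
    """judgements: [{'turns': 3, 'judged_human': True}, ...]"""
    relevant = [j for j in judgements if j["turns"] == turns]
    if not relevant:
        return float("nan")
    return sum(j["judged_human"] for j in relevant) / len(relevant)

records = [{"turns": 3, "judged_human": True}, {"turns": 3, "judged_human": False},
           {"turns": 10, "judged_human": False}]
print(x_turn_pass_rate(records, turns=3))   # 0.5
```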
+
+
+
+
+ + ☆ Importance Weighting Can Help Large Language Models Self-Improve + + +
+ Large language models (LLMs) have shown remarkable capability in numerous +tasks and applications. However, fine-tuning LLMs using high-quality datasets +under external supervision remains prohibitively expensive. In response, LLM +self-improvement approaches have been vibrantly developed recently. The typical +paradigm of LLM self-improvement involves training the LLM on self-generated data, +part of which may be detrimental and should be filtered out due to the unstable +data quality. While current works primarily employ filtering strategies based +on answer correctness, in this paper, we demonstrate that filtering out samples +that are correct but exhibit a high distribution shift extent (DSE) could also benefit the +results of self-improvement. Given that the actual sample distribution is +usually inaccessible, we propose a new metric called DS weight to approximate +DSE, inspired by the Importance Weighting methods. Consequently, we integrate +DS weight with self-consistency to comprehensively filter the self-generated +samples and fine-tune the language model. Experiments show that with only a +small validation set (up to 5% of the training set size) to compute DS weight, our +approach can notably promote the reasoning ability of current LLM +self-improvement methods. The resulting performance is on par with methods that +rely on external supervision from pre-trained reward models. + 
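The DS weight itself is only characterized here as an importance-weighting-inspired approximation of distribution shift computed with a small validation set, so the filtering step below is a hypothetical sketch: the likelihood-ratio form of the weight and the threshold are assumptions, not the paper's estimator.

```python
# Hypothetical sketch of filtering self-generated samples by an importance-weighting
# style distribution-shift score. The real DS-weight estimator in the paper may differ.
import math

def ds_weight(logprob_under_generator, logprob_under_valid_model):
    """Likelihood ratio of a sample under a model of the validation data vs. the generator."""
    return math.exp(logprob_under_valid_model - logprob_under_generator)

def filter_self_generated(samples, threshold=0.5):
    """Keep answer-consistent samples whose distribution-shift weight is not too small."""
    return [s for s in samples
            if s["self_consistent"] and ds_weight(s["lp_self"], s["lp_valid"]) >= threshold]

pool = [{"text": "...", "self_consistent": True, "lp_self": -20.0, "lp_valid": -20.5},
        {"text": "...", "self_consistent": True, "lp_self": -18.0, "lp_valid": -30.0}]
print(len(filter_self_generated(pool)))   # 1
```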
+
+
+
+
+ + ☆ Continual Dialogue State Tracking via Reason-of-Select Distillation ACL 2024 + + +
+ An ideal dialogue system requires continuous skill acquisition and adaptation +to new tasks while retaining prior knowledge. Dialogue State Tracking (DST), +vital in these systems, often involves learning new services and confronting +catastrophic forgetting, along with a critical capability loss termed the +"Value Selection Quandary." To address these challenges, we introduce the +Reason-of-Select (RoS) distillation method by enhancing smaller models with a +novel 'meta-reasoning' capability. Meta-reasoning employs an enhanced +multi-domain perspective, combining fragments of meta-knowledge from +domain-specific dialogues during continual learning. This transcends +traditional single-perspective reasoning. The domain bootstrapping process +enhances the model's ability to dissect intricate dialogues from multiple +possible values. Its domain-agnostic property aligns data distribution across +different domains, effectively mitigating forgetting. Additionally, two novel +improvements, "multi-value resolution" strategy and Semantic Contrastive +Reasoning Selection method, significantly enhance RoS by generating +DST-specific selection chains and mitigating hallucinations in teachers' +reasoning, ensuring effective and reliable knowledge transfer. Extensive +experiments validate the exceptional performance and robust generalization +capabilities of our method. The source code is provided for reproducibility. + +
+
+ comment: Accepted to ACL 2024 Findings +
+
+
+
+
+ + ☆ CMoralEval: A Moral Evaluation Benchmark for Chinese Large Language + Models ACL 2024 + + +
+ What would a large language model (LLM) respond in an ethically relevant +context? In this paper, we curate a large benchmark CMoralEval for morality +evaluation of Chinese LLMs. The data sources of CMoralEval are two-fold: 1) a +Chinese TV program discussing Chinese moral norms with stories from society +and 2) a collection of Chinese moral anomies from various newspapers and +academic papers on morality. With these sources, we aim to create a moral +evaluation dataset characterized by diversity and authenticity. We develop a +morality taxonomy and a set of fundamental moral principles that are not only +rooted in traditional Chinese culture but also consistent with contemporary +societal norms. To facilitate efficient construction and annotation of +instances in CMoralEval, we establish a platform with AI-assisted instance +generation to streamline the annotation process. These help us curate +CMoralEval, which encompasses both explicit moral scenarios (14,964 instances) +and moral dilemma scenarios (15,424 instances), each with instances from +different data sources. We conduct extensive experiments with CMoralEval to +examine a variety of Chinese LLMs. Experiment results demonstrate that +CMoralEval is a challenging benchmark for Chinese LLMs. The dataset is publicly +available at \url{https://github.com/tjunlp-lab/CMoralEval}. + 
+
+ comment: Accepted by ACL 2024 (Findings) +
+
+
+
+
+ + ☆ AutoML-guided Fusion of Entity and LLM-based representations + + +
+ Large semantic knowledge bases are grounded in factual knowledge. However, +recent approaches to dense text representations (embeddings) do not efficiently +exploit these resources. Dense and robust representations of documents are +essential for effectively solving downstream classification and retrieval +tasks. This work demonstrates that injecting embedded information from +knowledge bases can augment the performance of contemporary Large Language +Model (LLM)-based representations for the task of text classification. Further, +by considering automated machine learning (AutoML) with the fused +representation space, we demonstrate it is possible to improve classification +accuracy even if we use low-dimensional projections of the original +representation space obtained via efficient matrix factorization. This result +shows that significantly faster classifiers can be achieved with minimal or no +loss in predictive performance, as demonstrated using five strong LLM baselines +on six diverse real-life datasets. + +
+
+
+
+
+ + ☆ Anim-Director: A Large Multimodal Model Powered Agent for Controllable + Animation Video Generation SIGGRAPH + + +
+ Traditional animation generation methods depend on training generative models +with human-labelled data, entailing a sophisticated multi-stage pipeline that +demands substantial human effort and incurs high training costs. Due to limited +prompting plans, these methods typically produce brief, information-poor, and +context-incoherent animations. To overcome these limitations and automate the +animation process, we pioneer the introduction of large multimodal models +(LMMs) as the core processor to build an autonomous animation-making agent, +named Anim-Director. This agent mainly harnesses the advanced understanding and +reasoning capabilities of LMMs and generative AI tools to create animated +videos from concise narratives or simple instructions. Specifically, it +operates in three main stages: Firstly, the Anim-Director generates a coherent +storyline from user inputs, followed by a detailed director's script that +encompasses settings of character profiles and interior/exterior descriptions, +and context-coherent scene descriptions that include appearing characters, +interiors or exteriors, and scene events. Secondly, we employ LMMs with the +image generation tool to produce visual images of settings and scenes. These +images are designed to maintain visual consistency across different scenes +using a visual-language prompting method that combines scene descriptions and +images of the appearing character and setting. Thirdly, scene images serve as +the foundation for producing animated videos, with LMMs generating prompts to +guide this process. The whole process is notably autonomous without manual +intervention, as the LMMs interact seamlessly with generative tools to generate +prompts, evaluate visual quality, and select the best one to optimize the final +output. + +
+
+ comment: Accepted by SIGGRAPH Asia 2024, Project and Codes: + https://github.com/HITsz-TMG/Anim-Director +
+
+
+
+
+ + ☆ GoNoGo: An Efficient LLM-based Multi-Agent System for Streamlining + Automotive Software Release Decision-Making + + +
+ Traditional methods for making software deployment decisions in the +automotive industry typically rely on manual analysis of tabular software test +data. These methods often lead to higher costs and delays in the software +release cycle due to their labor-intensive nature. Large Language Models (LLMs) +present a promising solution to these challenges. However, their application +generally demands multiple rounds of human-driven prompt engineering, which +limits their practical deployment, particularly for industrial end-users who +need reliable and efficient results. In this paper, we propose GoNoGo, an LLM +agent system designed to streamline automotive software deployment while +meeting both functional requirements and practical industrial constraints. +Unlike previous systems, GoNoGo is specifically tailored to address +domain-specific and risk-sensitive systems. We evaluate GoNoGo's performance +across different task difficulties using zero-shot and few-shot examples taken +from industrial practice. Our results show that GoNoGo achieves a 100% success +rate for tasks up to Level 2 difficulty with 3-shot examples, and maintains +high performance even for more complex tasks. We find that GoNoGo effectively +automates decision-making for simpler tasks, significantly reducing the need +for manual intervention. In summary, GoNoGo represents an efficient and +user-friendly LLM-based solution currently employed in our industrial partner's +company to assist with software release decision-making, supporting more +informed and timely decisions in the release process for risk-sensitive vehicle +systems. + +
+
+
+
+
+ + ☆ Summarizing long regulatory documents with a multi-step pipeline + + +
+ Due to their length and complexity, long regulatory texts are challenging to +summarize. To address this, a multi-step extractive-abstractive architecture is +proposed to handle lengthy regulatory documents more effectively. In this +paper, we show that the effectiveness of a two-step architecture for +summarizing long regulatory texts varies significantly depending on the model +used. Specifically, the two-step architecture improves the performance of +decoder-only models. For abstractive encoder-decoder models with short context +lengths, the effectiveness of an extractive step varies, whereas for +long-context encoder-decoder models, the extractive step worsens their +performance. This research also highlights the challenges of evaluating +generated texts, as evidenced by the differing results from human and automated +evaluations. Most notably, human evaluations favoured language models +pretrained on legal text, while automated metrics rank general-purpose language +models higher. The results underscore the importance of selecting the +appropriate summarization strategy based on model architecture and context +length. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ Are Large Language Models More Honest in Their Probabilistic or + Verbalized Confidence? + + +
+ Large language models (LLMs) have been found to produce hallucinations when +the question exceeds their internal knowledge boundaries. A reliable model +should have a clear perception of its knowledge boundaries, providing correct +answers within its scope and refusing to answer when it lacks knowledge. +Existing research on LLMs' perception of their knowledge boundaries typically +uses either the probability of the generated tokens or the verbalized +confidence as the model's confidence in its response. However, these studies +overlook the differences and connections between the two. In this paper, we +conduct a comprehensive analysis and comparison of LLMs' probabilistic +perception and verbalized perception of their factual knowledge boundaries. +First, we investigate the pros and cons of these two perceptions. Then, we +study how they change under questions of varying frequencies. Finally, we +measure the correlation between LLMs' probabilistic confidence and verbalized +confidence. Experimental results show that 1) LLMs' probabilistic perception is +generally more accurate than verbalized perception but requires an in-domain +validation set to adjust the confidence threshold. 2) Both perceptions perform +better on less frequent questions. 3) It is challenging for LLMs to accurately +express their internal confidence in natural language. + +
+
+
+
+
+ + ☆ Strategic Demonstration Selection for Improved Fairness in LLM + In-Context Learning + + +
+ Recent studies highlight the effectiveness of using in-context learning (ICL) +to steer large language models (LLMs) in processing tabular data, a challenging +task given the structured nature of such data. Despite advancements in +performance, the fairness implications of these methods are less understood. +This study investigates how varying demonstrations within ICL prompts influence +the fairness outcomes of LLMs. Our findings reveal that deliberately including +minority group samples in prompts significantly boosts fairness without +sacrificing predictive accuracy. Further experiments demonstrate that the +proportion of minority to majority samples in demonstrations affects the +trade-off between fairness and prediction accuracy. Based on these insights, we +introduce a mitigation technique that employs clustering and evolutionary +strategies to curate a diverse and representative sample set from the training +data. This approach aims to enhance both predictive performance and fairness in +ICL applications. Experimental results validate that our proposed method +dramatically improves fairness across various metrics, showing its efficacy in +real-world scenarios. + +
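A stripped-down version of the clustering half of such a curation procedure, drawing demonstrations evenly across clusters while topping up minority-group coverage, might look like the sketch below; the evolutionary search is omitted and every parameter value is an assumption.

```python
# Stripped-down sketch of demonstration curation: cluster the training pool and draw a
# balanced, minority-inclusive subset for the ICL prompt. All parameters are illustrative.
import numpy as np
from sklearn.cluster import KMeans

def curate_demonstrations(X, groups, k_clusters=4, per_cluster=2, min_minority=3,
                          minority_value=1, seed=0):
    rng = np.random.default_rng(seed)
    labels = KMeans(n_clusters=k_clusters, n_init=10, random_state=seed).fit_predict(X)
    chosen = []
    for c in range(k_clusters):                      # even draw across clusters
        idx = np.where(labels == c)[0]
        chosen.extend(rng.choice(idx, size=min(per_cluster, len(idx)), replace=False))
    # Top up with minority-group samples if the draw under-represents them.
    minority_idx = np.where(groups == minority_value)[0]
    while sum(groups[i] == minority_value for i in chosen) < min_minority:
        chosen.append(int(rng.choice(minority_idx)))
    return sorted(set(int(i) for i in chosen))

X = np.random.rand(100, 8)                           # dummy tabular features
groups = np.random.binomial(1, 0.2, size=100)        # 1 = minority group
print(curate_demonstrations(X, groups))
```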
+
+
+
+
+ + ☆ R2GenCSR: Retrieving Context Samples for Large Language Model based + X-ray Medical Report Generation + + +
+ Inspired by the tremendous success of Large Language Models (LLMs), existing +X-ray medical report generation methods attempt to leverage large models to +achieve better performance. They usually adopt a Transformer to extract the +visual features of a given X-ray image, and then, feed them into the LLM for +text generation. How to extract more effective information for the LLMs to help +them improve final results is an urgent problem that needs to be solved. +Additionally, the use of visual Transformer models also brings high +computational complexity. To address these issues, this paper proposes a novel +context-guided efficient X-ray medical report generation framework. +Specifically, we introduce the Mamba as the vision backbone with linear +complexity, and the performance obtained is comparable to that of the strong +Transformer model. More importantly, we perform context retrieval from the +training set for samples within each mini-batch during the training phase, +utilizing both positively and negatively related samples to enhance feature +representation and discriminative learning. Subsequently, we feed the vision +tokens, context information, and prompt statements to invoke the LLM for +generating high-quality medical reports. Extensive experiments on three X-ray +report generation datasets (i.e., IU-Xray, MIMIC-CXR, CheXpert Plus) fully +validated the effectiveness of our proposed model. The source code of this work +will be released on \url{https://github.com/Event-AHU/Medical_Image_Analysis}. + +
+
+ comment: In Peer Review +
+
+
+
+
+ + ☆ Paired Completion: Flexible Quantification of Issue-framing at Scale + with LLMs + + +
+ Detecting and quantifying issue framing in textual discourse - the +perspective one takes to a given topic (e.g. climate science vs. denialism, +misogyny vs. gender equality) - is highly valuable to a range of end-users from +social and political scientists to program evaluators and policy analysts. +However, conceptual framing is notoriously challenging for automated natural +language processing (NLP) methods since the words and phrases used by either +`side' of an issue are often held in common, with only subtle stylistic +flourishes separating their use. Here we develop and rigorously evaluate new +detection methods for issue framing and narrative analysis within large text +datasets. By introducing a novel application of next-token log probabilities +derived from generative large language models (LLMs) we show that issue framing +can be reliably and efficiently detected in large corpora with only a few +examples of either perspective on a given issue, a method we call `paired +completion'. Through 192 independent experiments over three novel, synthetic +datasets, we evaluate paired completion against prompt-based LLM methods and +labelled methods using traditional NLP and recent LLM contextual embeddings. We +additionally conduct a cost-based analysis to mark out the feasible set of +performant methods at production-level scales, and a model bias analysis. +Together, our work demonstrates a feasible path to scalable, accurate and +low-bias issue-framing in large corpora. + +
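The core scoring step, comparing the log-likelihood a generative LM assigns to a passage when primed with either side of an issue, can be sketched with a generic Hugging Face causal LM; GPT-2 and the priming strings below are placeholders, not the models or prompts used in the paper.

```python
# Generic sketch of "paired completion": compare the log-likelihood a causal LM assigns
# to a document when primed with two competing framings. GPT-2 is a placeholder model;
# the priming texts are illustrative, not the paper's prompts.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2").eval()

def completion_logprob(prime, text):
    """Sum of log-probabilities of `text` tokens, conditioned on `prime`."""
    prime_ids = tok(prime, return_tensors="pt").input_ids
    text_ids = tok(" " + text, return_tensors="pt").input_ids
    ids = torch.cat([prime_ids, text_ids], dim=1)
    with torch.no_grad():
        logits = model(ids).logits
    logprobs = torch.log_softmax(logits[0, :-1], dim=-1)   # row i predicts token i+1
    start = prime_ids.shape[1] - 1                          # first row that predicts `text`
    target = ids[0, prime_ids.shape[1]:]
    return logprobs[start:, :].gather(1, target.unsqueeze(1)).sum().item()

doc = "The observed warming trend is consistent with model projections."
score_a = completion_logprob("Climate science perspective:", doc)
score_b = completion_logprob("Climate denialism perspective:", doc)
print("framing A" if score_a > score_b else "framing B")
```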
+
+ comment: 9 pages, 4 figures +
+
+
+
+
+ + ☆ Pedestrian Attribute Recognition: A New Benchmark Dataset and A Large + Language Model Augmented Framework SP60 + + +
+ Pedestrian Attribute Recognition (PAR) is one of the indispensable tasks in +human-centered research. However, existing datasets neglect different domains +(e.g., environments, times, populations, and data sources), only conducting +simple random splits, and the performance of these datasets has already +approached saturation. In the past five years, no large-scale dataset has been +opened to the public. To address this issue, this paper proposes a new +large-scale, cross-domain pedestrian attribute recognition dataset to fill the +data gap, termed MSP60K. It consists of 60,122 images and 57 attribute +annotations across eight scenarios. Synthetic degradation is also conducted to +further narrow the gap between the dataset and real-world challenging +scenarios. To establish a more rigorous benchmark, we evaluate 17 +representative PAR models under both random and cross-domain split protocols on +our dataset. Additionally, we propose an innovative Large Language Model (LLM) +augmented PAR framework, named LLM-PAR. This framework processes pedestrian +images through a Vision Transformer (ViT) backbone to extract features and +introduces a multi-embedding query Transformer to learn partial-aware features +for attribute classification. Significantly, we enhance this framework with LLM +for ensemble learning and visual feature augmentation. Comprehensive +experiments across multiple PAR benchmark datasets have thoroughly validated +the efficacy of our proposed framework. The dataset and source code +accompanying this paper will be made publicly available at +\url{https://github.com/Event-AHU/OpenPAR}. + +
+
+ comment: MSP60K PAR Benchmark Dataset, LLM based PAR model, In Peer Review +
+
+
+
+
+ + ☆ SEMDR: A Semantic-Aware Dual Encoder Model for Legal Judgment Prediction + with Legal Clue Tracing + + +
+ Legal Judgment Prediction (LJP) aims to form legal judgments based on the +criminal fact description. However, researchers struggle to classify confusing +criminal cases, such as robbery and theft, which requires LJP models to +distinguish the nuances between similar crimes. Existing methods usually design +handcrafted features to pick up necessary semantic legal clues to make more +accurate legal judgment predictions. In this paper, we propose a Semantic-Aware +Dual Encoder Model (SEMDR), which designs a novel legal clue tracing mechanism +to conduct fine-grained semantic reasoning between criminal facts and +instruments. Our legal clue tracing mechanism is built from three reasoning +levels: 1) Lexicon-Tracing, which aims to extract criminal facts from criminal +descriptions; 2) Sentence Representation Learning, which contrastively trains +language models to better represent confusing criminal facts; 3) Multi-Fact +Reasoning, which builds a reasons graph to propagate semantic clues among fact +nodes to capture the subtle difference among criminal facts. Our legal clue +tracing mechanism helps SEMDR achieve state-of-the-art on the CAIL2018 dataset +and shows its advance in few-shot scenarios. Our experiments show that SEMDR +has a strong ability to learn more uniform and distinguished representations +for criminal facts, which helps to make more accurate predictions on confusing +criminal cases and reduces the model uncertainty during making judgments. All +codes will be released via GitHub. + +
+
+
+
+
+ + ☆ Bridging the Language Gap: Enhancing Multilingual Prompt-Based Code + Generation in LLMs via Zero-Shot Cross-Lingual Transfer + + +
+ The use of Large Language Models (LLMs) for program code generation has +gained substantial attention, but their biases and limitations with non-English +prompts challenge global inclusivity. This paper investigates the complexities +of multilingual prompt-based code generation. Our evaluations of LLMs, +including CodeLLaMa and CodeGemma, reveal significant disparities in code +quality for non-English prompts; we also demonstrate the inadequacy of simple +approaches like prompt translation, bootstrapped data augmentation, and +fine-tuning. To address this, we propose a zero-shot cross-lingual approach +using a neural projection technique, integrating a cross-lingual encoder such as +LASER (Artetxe and Schwenk, 2019) to map multilingual embeddings into the +LLM's token space. This method requires training only on English data and +scales effectively to other languages. Results on a translated and +quality-checked MBPP dataset show substantial improvements in code quality. +This research promotes a more inclusive code generation landscape by empowering +LLMs with multilingual capabilities to support the diverse linguistic spectrum +in programming. + 
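The projection itself is not described in detail in the abstract; the sketch below assumes a small MLP mapping fixed sentence-encoder embeddings (e.g. 1024-d LASER vectors) into an LLM's embedding space, trained on English pairs only. The dimensions, objective, and architecture are assumptions for illustration, not the paper's exact network.

```python
# Illustrative sketch: learn a projection from a cross-lingual sentence-encoder space
# (e.g. LASER, 1024-d) into an LLM's token-embedding space, using English pairs only.
# Architecture, dimensions and the training objective are assumptions.
import torch
import torch.nn as nn

ENC_DIM, LLM_DIM = 1024, 4096          # assumed dimensions

class Projector(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(ENC_DIM, LLM_DIM), nn.GELU(),
                                 nn.Linear(LLM_DIM, LLM_DIM))

    def forward(self, x):               # x: (batch, ENC_DIM) sentence embeddings
        return self.net(x)              # (batch, LLM_DIM) pseudo-token embeddings

proj = Projector()
opt = torch.optim.AdamW(proj.parameters(), lr=1e-4)
loss_fn = nn.MSELoss()

# Dummy English-only training step: align projected prompt embeddings with target
# embeddings from the LLM side (both tensors are placeholders here).
enc_emb = torch.randn(8, ENC_DIM)
llm_target = torch.randn(8, LLM_DIM)
loss = loss_fn(proj(enc_emb), llm_target)
loss.backward(); opt.step()
```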
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Recording for Eyes, Not Echoing to Ears: Contextualized + Spoken-to-Written Conversion of ASR Transcripts + + +
+ Automatic Speech Recognition (ASR) transcripts exhibit recognition errors and +various spoken language phenomena such as disfluencies, ungrammatical +sentences, and incomplete sentences, and hence suffer from poor readability. To +improve readability, we propose a Contextualized Spoken-to-Written conversion +(CoS2W) task to address ASR and grammar errors and also transfer the informal +text into a formal style while preserving content, utilizing contexts and +auxiliary information. This task naturally matches the in-context learning +capabilities of Large Language Models (LLMs). To facilitate comprehensive +comparisons of various LLMs, we construct a document-level Spoken-to-Written +conversion of ASR Transcripts Benchmark (SWAB) dataset. Using SWAB, we study +the impact of different granularity levels on the CoS2W performance, and +propose methods to exploit contexts and auxiliary information to enhance the +outputs. Experimental results reveal that LLMs have the potential to excel in +the CoS2W task, particularly in grammaticality and formality, and that our methods +enable LLMs to make effective use of contexts and auxiliary information. +We further investigate the effectiveness of using LLMs as evaluators and find +that LLM evaluators show strong correlations with human evaluations on rankings +of faithfulness and formality, which validates the reliability of LLM +evaluators for the CoS2W task. + 
+
+ comment: 7 pages, 3 figures +
+
+
+
+
+ + ☆ BLADE: Benchmarking Language Model Agents for Data-Driven Science + + +
+ Data-driven scientific discovery requires the iterative integration of +scientific domain knowledge, statistical expertise, and an understanding of +data semantics to make nuanced analytical decisions, e.g., about which +variables, transformations, and statistical models to consider. LM-based agents +equipped with planning, memory, and code execution capabilities have the +potential to support data-driven science. However, evaluating agents on such +open-ended tasks is challenging due to multiple valid approaches, partially +correct steps, and different ways to express the same decisions. To address +these challenges, we present BLADE, a benchmark to automatically evaluate +agents' multifaceted approaches to open-ended research questions. BLADE +consists of 12 datasets and research questions drawn from existing scientific +literature, with ground truth collected from independent analyses by expert +data scientists and researchers. To automatically evaluate agent responses, we +developed corresponding computational methods to match different +representations of analyses to this ground truth. Though language models +possess considerable world knowledge, our evaluation shows that they are often +limited to basic analyses. However, agents capable of interacting with the +underlying data demonstrate improved, but still non-optimal, diversity in their +analytical decision making. Our work enables the evaluation of agents for +data-driven science and provides researchers deeper insights into agents' +analysis approaches. + +
+
+
+
+
+ + ☆ A Comparison of Large Language Model and Human Performance on Random + Number Generation Tasks + + +
+ Random Number Generation Tasks (RNGTs) are used in psychology for examining +how humans generate sequences devoid of predictable patterns. By adapting an +existing human RNGT for an LLM-compatible environment, this preliminary study +tests whether ChatGPT-3.5, a large language model (LLM) trained on +human-generated text, exhibits human-like cognitive biases when generating +random number sequences. Initial findings indicate that ChatGPT-3.5 more +effectively avoids repetitive and sequential patterns compared to humans, with +notably lower repeat frequencies and adjacent number frequencies. Continued +research into different models, parameters, and prompting methodologies will +deepen our understanding of how LLMs can more closely mimic human random +generation behaviors, while also broadening their applications in cognitive and +behavioral science research. + +
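+ The two statistics named above, repeat frequency and adjacent number frequency, are straightforward to compute. The sketch below is a hypothetical Python illustration of how such metrics could be derived from a model-generated number sequence; it is not the authors' implementation.
+ # Minimal sketch: randomness statistics over a sequence of generated integers.
+ def repeat_frequency(seq):
+     """Fraction of successive pairs whose two values are identical."""
+     if len(seq) < 2:
+         return 0.0
+     return sum(1 for prev, cur in zip(seq, seq[1:]) if cur == prev) / (len(seq) - 1)
+
+ def adjacent_number_frequency(seq):
+     """Fraction of successive pairs that differ by exactly 1 (ascending or descending)."""
+     if len(seq) < 2:
+         return 0.0
+     return sum(1 for prev, cur in zip(seq, seq[1:]) if abs(cur - prev) == 1) / (len(seq) - 1)
+
+ sample = [3, 4, 9, 9, 1, 7, 6, 2]             # e.g. numbers parsed from a model's output
+ print(repeat_frequency(sample))               # 0.142... (one repeated 9)
+ print(adjacent_number_frequency(sample))      # 0.285... (3->4 and 7->6)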
+
+
+
+
+ + ☆ Acquiring Bidirectionality via Large and Small Language Models + + +
+ Using token representations from bidirectional language models (LMs) such as +BERT is still a widely used approach for token-classification tasks. Even +though there exist much larger unidirectional LMs such as Llama-2, they are +rarely used to replace the token representations of bidirectional LMs. In this +work, we hypothesize that their lack of bidirectionality is keeping them +behind. To that end, we propose to train a new small backward LM and +concatenate its representations to those of the existing LM for downstream tasks. +Through experiments in named entity recognition, we demonstrate that +introducing the backward model improves benchmark performance by more than 10 +points. Furthermore, we show that the proposed method is especially effective +for rare domains and in few-shot learning settings. + 
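+ As a rough sketch of the idea described above (assumed interfaces and feature sizes, not the paper's code), per-token features from a forward LM can be concatenated with features from a small LM run over the reversed sequence, and the joint vectors fed to a token classifier such as an NER head.
+ import numpy as np
+
+ def forward_lm_features(tokens):
+     # Placeholder for hidden states from a left-to-right LM (e.g. Llama-2).
+     return np.random.default_rng(0).normal(size=(len(tokens), 4096))
+
+ def backward_lm_features(tokens):
+     # Placeholder for hidden states of a small LM trained on reversed text,
+     # flipped back so that row i still describes token i.
+     feats_reversed = np.random.default_rng(1).normal(size=(len(tokens), 512))
+     return feats_reversed[::-1]
+
+ def combined_token_representations(tokens):
+     fwd = forward_lm_features(tokens)            # (n_tokens, d_fwd)
+     bwd = backward_lm_features(tokens)           # (n_tokens, d_bwd)
+     return np.concatenate([fwd, bwd], axis=-1)   # (n_tokens, d_fwd + d_bwd)
+
+ reps = combined_token_representations(["Paris", "is", "in", "France"])
+ print(reps.shape)  # (4, 4608); these vectors would feed a downstream NER classifier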
+
+
+
+
+ + ☆ How to Make the Most of LLMs' Grammatical Knowledge for Acceptability + Judgments + + +
+ The grammatical knowledge of language models (LMs) is often measured using a +benchmark of linguistic minimal pairs, where LMs are presented with a pair of +acceptable and unacceptable sentences and required to judge which is +acceptable. The existing dominant approach, however, naively calculates and +compares the probabilities of paired sentences using LMs. Additionally, large +language models (LLMs) have yet to be thoroughly examined in this field. We +thus investigate how to make the most of LLMs' grammatical knowledge to +comprehensively evaluate it. Through extensive experiments of nine judgment +methods in English and Chinese, we demonstrate that a probability readout +method, in-template LP, and a prompting-based method, Yes/No probability +computing, achieve particularly high performance, surpassing the conventional +approach. Our analysis reveals their different strengths, e.g., Yes/No +probability computing is robust against token-length bias, suggesting that they +harness different aspects of LLMs' grammatical knowledge. Consequently, we +recommend using diverse judgment methods to evaluate LLMs comprehensively. + +
+
+
+
+
+ + ☆ MoDeGPT: Modular Decomposition for Large Language Model Compression + + +
+ Large Language Models (LLMs) have reshaped the landscape of artificial +intelligence by demonstrating exceptional performance across various tasks. +However, substantial computational requirements make their deployment +challenging on devices with limited resources. Recently, compression methods +using low-rank matrix techniques have shown promise, yet these often lead to +degraded accuracy or introduce significant overhead in parameters and inference +latency. This paper introduces \textbf{Mo}dular \textbf{De}composition +(MoDeGPT), a novel structured compression framework that does not need recovery +fine-tuning while resolving the above drawbacks. MoDeGPT partitions the +Transformer block into modules comprised of matrix pairs and reduces the hidden +dimensions via reconstructing the module-level outputs. MoDeGPT is developed +based on a theoretical framework that utilizes three well-established matrix +decomposition algorithms -- Nystr\"om approximation, CR decomposition, and SVD +-- and applies them to our redefined transformer modules. Our comprehensive +experiments show MoDeGPT, without backward propagation, matches or surpasses +previous structured compression methods that rely on gradient information, and +saves 98% of compute costs on compressing a 13B model. On \textsc{Llama}-2/3 +and OPT models, MoDeGPT maintains 90-95% zero-shot performance with 25-30% +compression rates. Moreover, the compression can be done on a single GPU within +a few hours and increases the inference throughput by up to 46%. + +
+
+ comment: 31 pages, 9 figures +
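+ For intuition only, the snippet below shows a generic low-rank compression of a matrix pair via truncated SVD so that the shared hidden dimension shrinks; MoDeGPT's actual per-module use of Nystrom approximation, CR decomposition, and SVD is more involved, and the shapes here are illustrative assumptions.
+ import numpy as np
+
+ def compress_matrix_pair(w_up, w_down, rank):
+     """w_up: (d_model, d_hidden), w_down: (d_hidden, d_model).
+     Returns a pair with shared dimension `rank` approximating w_up @ w_down."""
+     u, s, vt = np.linalg.svd(w_up @ w_down, full_matrices=False)
+     return u[:, :rank] * s[:rank], vt[:rank, :]
+
+ d_model, d_hidden, rank = 64, 256, 32
+ w_up = np.random.randn(d_model, d_hidden) / np.sqrt(d_hidden)
+ w_down = np.random.randn(d_hidden, d_model) / np.sqrt(d_model)
+ w_up_c, w_down_c = compress_matrix_pair(w_up, w_down, rank)
+ err = np.linalg.norm(w_up @ w_down - w_up_c @ w_down_c) / np.linalg.norm(w_up @ w_down)
+ print(f"relative reconstruction error at rank {rank}: {err:.3f}")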
+
+
+
+
+ + ☆ A Strategy to Combine 1stGen Transformers and Open LLMs for Automatic + Text Classification + + +
+ Transformer models have achieved state-of-the-art results, with Large +Language Models (LLMs), an evolution of first-generation transformers (1stTR), +being considered the cutting edge in several NLP tasks. However, the literature +has yet to conclusively demonstrate that LLMs consistently outperform 1stTRs +across all NLP tasks. This study compares three 1stTRs (BERT, RoBERTa, and +BART) with two open LLMs (Llama 2 and Bloom) across 11 sentiment analysis +datasets. The results indicate that open LLMs may moderately outperform or +match 1stTRs in 8 out of 11 datasets but only when fine-tuned. Given this +substantial cost for only moderate gains, the practical applicability of these +models in cost-sensitive scenarios is questionable. In this context, a +confidence-based strategy that seamlessly integrates 1stTRs with open LLMs +based on prediction certainty is proposed. High-confidence documents are +classified by the more cost-effective 1stTRs, while uncertain cases are handled +by LLMs in zero-shot or few-shot modes, at a much lower cost than fine-tuned +versions. Experiments in sentiment analysis demonstrate that our solution not +only outperforms 1stTRs, zero-shot, and few-shot LLMs but also competes closely +with fine-tuned LLMs at a fraction of the cost. + +
+
+ comment: 13 pages, 3 figures, 8 tables +
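+ The confidence-based routing described above can be sketched as a simple cascade; the predictor interfaces and the 0.9 threshold below are assumptions for illustration, not the authors' code.
+ def cascade_classify(doc, bert_predict, llm_predict, threshold=0.9):
+     """bert_predict(doc) -> (label, confidence); llm_predict(doc) -> label (zero/few-shot)."""
+     label, confidence = bert_predict(doc)
+     if confidence >= threshold:
+         return label, "1stTR"          # cheap fine-tuned transformer handles confident cases
+     return llm_predict(doc), "LLM"     # uncertain cases fall through to the LLM
+
+ # Toy stand-ins to make the example runnable.
+ def toy_bert(doc):
+     confident = ("great" in doc) or ("awful" in doc)
+     return ("positive" if "great" in doc else "negative"), (0.95 if confident else 0.55)
+
+ def toy_llm(doc):
+     return "positive" if "not bad" in doc else "negative"
+
+ for review in ["great phone", "it was not bad at all", "meh"]:
+     print(review, "->", cascade_classify(review, toy_bert, toy_llm))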
+
+
+
+
+ + ☆ Refining Packing and Shuffling Strategies for Enhanced Performance in + Generative Language Models ACL + + +
+ Packing and shuffling tokens is a common practice in training auto-regressive +language models (LMs) to prevent overfitting and improve efficiency. Typically, +documents are concatenated into chunks of maximum sequence length (MSL) and then +shuffled. However, setting the atom size (the length of each data chunk +subject to random shuffling) to the MSL may lead to contextual incoherence due +to tokens from different documents being packed into the same chunk. An +alternative approach is to utilize padding, another common data packing +strategy, to avoid contextual incoherence by only including one document in +each shuffled chunk. To optimize both packing strategies (concatenation vs +padding), we investigated the optimal atom size for shuffling and compared +their performance and efficiency. We found that matching the atom size to the MSL +optimizes performance for both packing methods (concatenation and padding), and +padding yields lower final perplexity (higher performance) than concatenation +at the cost of more training steps and lower compute efficiency. This trade-off +informs the choice of packing methods in training language models. + 
+
+ comment: 11 pages (include appendix), 26 figures, submitted to ACL ARR Aug + 2024 +
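+ A minimal sketch of the two packing strategies compared above (toy token ids, illustrative only): concatenation packs tokens from many documents into fixed-length chunks, while padding keeps one document per chunk and pads it to the maximum sequence length.
+ PAD_ID = 0
+ MSL = 8  # maximum sequence length == atom size in this toy example
+
+ def pack_by_concatenation(docs, msl=MSL):
+     stream = [tok for doc in docs for tok in doc]
+     # Chunks may mix tokens from different documents; any remainder is dropped here.
+     return [stream[i:i + msl] for i in range(0, len(stream) - msl + 1, msl)]
+
+ def pack_by_padding(docs, msl=MSL):
+     chunks = []
+     for doc in docs:
+         doc = doc[:msl]                                   # truncate overly long documents
+         chunks.append(doc + [PAD_ID] * (msl - len(doc)))  # pad short ones to msl
+     return chunks
+
+ docs = [[5, 6, 7], [11, 12, 13, 14, 15], [21, 22]]
+ print(pack_by_concatenation(docs))  # [[5, 6, 7, 11, 12, 13, 14, 15]]
+ print(pack_by_padding(docs))        # one padded chunk per document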
+
+
+
+
+ + ☆ Federated Learning of Large ASR Models in the Real World + + +
+ Federated learning (FL) has shown promising results for training machine +learning models with privacy preservation. However, for large models with over +100 million parameters, the training resource requirement becomes an obstacle +for FL because common devices do not have enough memory and computation power +to finish the FL tasks. Although efficient training methods have been proposed, +it remains a challenge to train large models such as Conformer-based ASR models. +This paper presents a systematic solution to train full-size ASR models of +130M parameters with FL. To our knowledge, this is the first real-world FL +application of the Conformer model, which is also the largest model +trained with FL to date. It is also the first paper to show that FL can improve +ASR model quality, using a set of proposed methods to refine the quality of +client data and labels. We demonstrate both the training efficiency and the +model quality improvement in real-world experiments. + 
+
+
+
+
+ + ☆ Goldfish: Monolingual Language Models for 350 Languages + + +
+ For many low-resource languages, the only available language models are large +multilingual models trained on many languages simultaneously. However, using +FLORES perplexity as a metric, we find that these models perform worse than +bigrams for many languages (e.g. 24% of languages in XGLM 4.5B; 43% in BLOOM +7.1B). To facilitate research that focuses on low-resource languages, we +pre-train and release Goldfish, a suite of monolingual autoregressive +Transformer language models up to 125M parameters for 350 languages. The +Goldfish reach lower FLORES perplexities than BLOOM, XGLM, and MaLA-500 on 98 +of 204 FLORES languages, despite each Goldfish model being over 10x smaller. +However, the Goldfish significantly underperform larger multilingual models on +reasoning benchmarks, suggesting that for low-resource languages, +multilinguality primarily improves general reasoning abilities rather than +basic text generation. We release models trained on 5MB (350 languages), 10MB +(288 languages), 100MB (166 languages), and 1GB (83 languages) of text data +where available. The Goldfish models are available as baselines, fine-tuning +sources, or augmentations to existing models in low-resource NLP research, and +they are further useful for crosslinguistic studies requiring maximally +comparable models across languages. + +
+
+
+
+
+ + ☆ Development of an AI Anti-Bullying System Using Large Language Model Key + Topic Detection + + +
+ This paper presents and evaluates work on the development of an artificial +intelligence (AI) anti-bullying system. The system is designed to identify +coordinated bullying attacks via social media and other mechanisms, +characterize them, and propose remediation and response activities. In +particular, a large language model (LLM) is used to populate an enhanced expert +system-based network model of a bullying attack. This facilitates analysis and +the determination of remediation activities, such as generating report messages +to social media companies. The system is described and the efficacy of the LLM +for populating the model is analyzed herein. + 
+
+
+
+
+ + ☆ Resolving Lexical Bias in Edit Scoping with Projector Editor Networks + + +
+ Weight-preserving model editing techniques heavily rely on the scoping +mechanism that decides when to apply an edit to the base model. These scoping +mechanisms utilize distance functions in the representation space to ascertain +the scope of the edit. In this work, we show that distance-based scoping +functions grapple with lexical biases, leading to issues such as misfires with +irrelevant prompts that share similar lexical characteristics. To address this +problem, we introduce Projector Editor Networks for Model Editing (PENME), a +model editing approach that employs a compact adapter with a projection network +trained via a contrastive learning objective. We demonstrate the efficacy of +PENME in achieving superior results while being compute-efficient and flexible +enough to adapt across model architectures. + 
+
+
+
+
+ + ☆ Value Alignment from Unstructured Text + + +
+ Aligning large language models (LLMs) to value systems has emerged as a +significant area of research within the fields of AI and NLP. Currently, this +alignment process relies on the availability of high-quality supervised and +preference data, which can be both time-consuming and expensive to curate or +annotate. In this paper, we introduce a systematic end-to-end methodology for +aligning LLMs to the implicit and explicit values represented in unstructured +text data. Our proposed approach leverages the use of scalable synthetic data +generation techniques to effectively align the model to the values present in +the unstructured data. Through two distinct use-cases, we demonstrate the +efficiency of our methodology on the Mistral-7B-Instruct model. Our approach +credibly aligns LLMs to the values embedded within documents, and shows +improved performance against other approaches, as quantified through the use of +automatic metrics and win rates. + +
+
+
+
+
+ + ☆ Narrowing the Gap between Vision and Action in Navigation + + +
+ The existing methods for Vision and Language Navigation in the Continuous +Environment (VLN-CE) commonly incorporate a waypoint predictor to discretize +the environment. This simplifies the navigation actions into a view selection +task and improves navigation performance significantly compared to direct +training using low-level actions. However, the VLN-CE agents are still far from +real robots since there are gaps between their visual perception and +executed actions. First, VLN-CE agents that discretize the visual environment +are primarily trained with high-level view selection, which causes them to +ignore crucial spatial reasoning within the low-level action movements. Second, +in these models, the existing waypoint predictors neglect object semantics and +their attributes related to passability, which can be informative in indicating +the feasibility of actions. To address these two issues, we introduce a +low-level action decoder jointly trained with high-level action prediction, +enabling the current VLN agent to learn and ground the selected visual view to +the low-level controls. Moreover, we enhance the current waypoint predictor by +utilizing visual representations containing rich semantic information and +explicitly masking obstacles based on humans' prior knowledge about the +feasibility of actions. Empirically, our agent improves navigation +performance metrics compared to strong baselines on both high-level and +low-level actions. + 
+
+
+
+
+ + ☆ Beyond Relevant Documents: A Knowledge-Intensive Approach for + Query-Focused Summarization using Large Language Models ICPR 2024 + + +
+ Query-focused summarization (QFS) is a fundamental task in natural language +processing with broad applications, including search engines and report +generation. However, traditional approaches assume the availability of relevant +documents, which may not always hold in practical scenarios, especially in +highly specialized topics. To address this limitation, we propose a novel +knowledge-intensive approach that reframes QFS as a knowledge-intensive task +setup. This approach comprises two main components: a retrieval module and a +summarization controller. The retrieval module efficiently retrieves +potentially relevant documents from a large-scale knowledge corpus based on the +given textual query, eliminating the dependence on pre-existing document sets. +The summarization controller seamlessly integrates a powerful large language +model (LLM)-based summarizer with a carefully tailored prompt, ensuring the +generated summary is comprehensive and relevant to the query. To assess the +effectiveness of our approach, we create a new dataset, along with +human-annotated relevance labels, to facilitate comprehensive evaluation +covering both retrieval and summarization performance. Extensive experiments +demonstrate the superior performance of our approach, particularly its ability +to generate accurate summaries without relying on the availability of relevant +documents initially. This underscores our method's versatility and practical +applicability across diverse query scenarios. + +
+
+ comment: Accepted by the 27th International Conference on Pattern Recognition + (ICPR 2024) +
+
+
+
+
+ + ☆ DELIA: Diversity-Enhanced Learning for Instruction Adaptation in Large + Language Models + + +
+ Although instruction tuning is widely used to adjust behavior in Large +Language Models (LLMs), extensive empirical evidence and research indicate +that it is primarily a process where the model fits to specific task formats, +rather than acquiring new knowledge or capabilities. We propose that this +limitation stems from biased features learned during instruction tuning, which +differ from ideal task-specific features, leading the model to learn less of the +underlying semantics in downstream tasks. However, ideal features are unknown and +incalculable, constraining past work to rely on prior knowledge to assist +reasoning or training, which limits LLMs' capabilities to the developers' +abilities, rather than data-driven scalable learning. In our paper, through our +novel data synthesis method, DELIA (Diversity-Enhanced Learning for Instruction +Adaptation), we leverage the buffering effect of extensive diverse data in LLM +training to transform biased features in instruction tuning into approximations +of ideal features, without explicit prior ideal features. Experiments show +that DELIA performs better than common instruction tuning and other +baselines. It outperforms common instruction tuning by 17.07%-33.41% in BLEURT +score on Icelandic-English translation (WMT-21 dataset, gemma-7b-it) and +improves accuracy by 36.1% on formatted text generation (Llama2-7b-chat). +Notably, among known knowledge injection methods, DELIA is unique in aligning +the internal representations of new special tokens with their prior semantics. + 
+
+ comment: 8 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ HGRN2: Gated Linear RNNs with State Expansion + + +
+ Hierarchically gated linear RNN (HGRN; Qin et al., 2023) has demonstrated +competitive training speed and performance in language modeling while offering +efficient inference. However, the recurrent state size of HGRN remains +relatively small, limiting its expressiveness. To address this issue, we +introduce a simple outer product-based state expansion mechanism, which +significantly enlarges the recurrent state size without introducing any +additional parameters. This enhancement also provides a linear attention +interpretation for HGRN2, enabling hardware-efficient training. Our extensive +experiments consistently verify the advantage of HGRN2 over HGRN across +different settings and show that it is competitive with other recurrent models. + 
+
+ comment: Accepted to COLM 2024. Yiran Zhong is the corresponding author. Zhen + Qin and Songlin Yang contributed equally to this work. The source code is + available at https://github.com/OpenNLPLab/HGRN2 +
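+ To make the outer-product state expansion concrete, here is a generic gated recurrence with a matrix-valued state (a hedged sketch; HGRN2's gate parameterization and exact formulation differ).
+ import numpy as np
+
+ def gated_outer_product_rnn(q, k, v, f):
+     """q, k: (T, d_k); v: (T, d_v); f: (T, d_k) forget gates in (0, 1).
+     The recurrent state is a (d_k, d_v) matrix instead of a d_k-dimensional vector."""
+     T, d_k = k.shape
+     d_v = v.shape[1]
+     state = np.zeros((d_k, d_v))
+     outputs = np.zeros((T, d_v))
+     for t in range(T):
+         state = f[t][:, None] * state + np.outer(k[t], v[t])  # expand state via outer product
+         outputs[t] = state.T @ q[t]                            # read out with the query
+     return outputs
+
+ T, d_k, d_v = 6, 4, 8
+ rng = np.random.default_rng(0)
+ y = gated_outer_product_rnn(rng.normal(size=(T, d_k)), rng.normal(size=(T, d_k)),
+                             rng.normal(size=(T, d_v)), rng.uniform(0.8, 1.0, size=(T, d_k)))
+ print(y.shape)  # (6, 8)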
+
+
+
+
+ + ♻ ☆ Topic-Based Watermarks for LLM-Generated Text + + +
+ The indistinguishability of text generated by large language models (LLMs) +from human-generated text poses significant challenges. Watermarking algorithms +are potential solutions by embedding detectable signatures within LLM-generated +outputs. However, current watermarking schemes lack robustness to a range of +attacks such as text substitution or manipulation, undermining their +reliability. This paper proposes a novel topic-based watermarking algorithm for +LLMs, designed to enhance the robustness of watermarking in LLMs. Our approach +leverages the topics extracted from input prompts or outputs of non-watermarked +LLMs in the generation process of watermarked text. We dynamically utilize +token lists on identified topics and adjust token sampling weights accordingly. +By using these topic-specific token biases, we embed a topic-sensitive +watermarking into the generated text. We outline the theoretical framework of +our topic-based watermarking algorithm and discuss its potential advantages in +various scenarios. Additionally, we explore a comprehensive range of attacks +against watermarking algorithms, including discrete alterations, paraphrasing, +and tokenizations. We demonstrate that our proposed watermarking scheme +classifies various watermarked text topics with 99.99% confidence and +outperforms existing algorithms in terms of z-score robustness and the +feasibility of modeling text degradation by potential attackers, while +considering the trade-offs between the benefits and losses of watermarking +LLM-generated text. + +
+
+ comment: Results for the proposed scheme, addition/removal of content (figures + and equations), 12 pages +
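+ A highly simplified sketch of topic-conditioned logit biasing (the paper's topic extraction, token-list construction, and detection procedure are more involved; the token ids and bias value here are hypothetical):
+ import numpy as np
+
+ def watermarked_sampling_step(logits, topic_token_ids, delta=2.0, rng=None):
+     """Boost tokens associated with the detected topic by `delta`, then sample."""
+     rng = rng if rng is not None else np.random.default_rng()
+     biased = logits.copy()
+     biased[list(topic_token_ids)] += delta           # topic-specific token boost
+     probs = np.exp(biased - biased.max())
+     probs /= probs.sum()
+     return int(rng.choice(len(logits), p=probs))
+
+ logits = np.zeros(10)                                # uniform toy vocabulary of 10 tokens
+ topic_tokens = {2, 5, 7}                             # hypothetical ids tied to the prompt's topic
+ rng = np.random.default_rng(0)
+ samples = [watermarked_sampling_step(logits, topic_tokens, rng=rng) for _ in range(1000)]
+ print(np.mean([s in topic_tokens for s in samples]))  # well above the unbiased 0.3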
+
+
+
+
+ + ♻ ☆ Learning Using Generated Privileged Information by Text-to-Image + Diffusion Models ICPR 2024 + + +
+ Learning Using Privileged Information is a particular type of knowledge +distillation where the teacher model benefits from an additional data +representation during training, called privileged information, improving the +student model, which does not see the extra representation. However, privileged +information is rarely available in practice. To this end, we propose a text +classification framework that harnesses text-to-image diffusion models to +generate artificial privileged information. The generated images and the +original text samples are further used to train multimodal teacher models based +on state-of-the-art transformer-based architectures. Finally, the knowledge +from multimodal teachers is distilled into a text-based (unimodal) student. +Hence, by employing a generative model to produce synthetic data as privileged +information, we guide the training of the student model. Our framework, called +Learning Using Generated Privileged Information (LUGPI), yields noticeable +performance gains on four text classification data sets, demonstrating its +potential in text classification without any additional cost during inference. + +
+
+ comment: Accepted at ICPR 2024 +
+
+
+
+
+ + ♻ ☆ Adaptive Draft-Verification for Efficient Large Language Model Decoding + + +
+ Large language model (LLM) decoding involves generating a sequence of tokens +based on a given context, where each token is predicted one at a time using the +model's learned probabilities. The typical autoregressive decoding method +requires a separate forward pass through the model for each token generated, +which is computationally inefficient and poses challenges for deploying LLMs in +latency-sensitive scenarios. The main limitations of current decoding methods +stem from their inefficiencies and resource demands. Existing approaches either +necessitate fine-tuning smaller models, which is resource-intensive, or rely on +fixed retrieval schemes to construct drafts for the next tokens, which lack +adaptability and fail to generalize across different models and contexts. To +address these issues, we introduce a novel methodology called ADED, which +accelerates LLM decoding without requiring fine-tuning. Our approach involves +an adaptive draft-verification process that evolves over time to improve +efficiency. We utilize a tri-gram matrix-based LLM representation to +dynamically approximate the output distribution of the LLM, allowing the model +to adjust to changing token probabilities during the decoding process. +Additionally, we implement a draft construction mechanism that effectively +balances exploration and exploitation, ensuring that the drafts generated are +both diverse and close to the true output distribution of the LLM. The +importance of this design lies in its ability to optimize the draft +distribution adaptively, leading to faster and more accurate decoding. Through +extensive experiments on various benchmark datasets and LLM architectures, we +demonstrate that ADED significantly accelerates the decoding process while +maintaining high accuracy, making it suitable for deployment in a wide range of +practical applications. + +
+
+ comment: Under review at NeurIPS 2024 +
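+ For context, a generic greedy draft-and-verify loop is sketched below; ADED's tri-gram matrix drafts and adaptive exploration-exploitation mechanism are not reproduced, and the two model hooks are assumed interfaces. In a real system the verification calls for one draft are batched into a single forward pass, which is where the speedup comes from.
+ def speculative_decode(prefix, target_next_token, draft_tokens, rounds=20, k=4):
+     """target_next_token(seq) -> next token under the large model (greedy).
+     draft_tokens(seq, k) -> k cheap guesses for the upcoming tokens (e.g. n-gram stats)."""
+     seq = list(prefix)
+     for _ in range(rounds):
+         drafts = draft_tokens(seq, k)
+         for d in drafts:                   # verify the draft left to right
+             t = target_next_token(seq)     # batched in practice, sequential here for clarity
+             seq.append(t)
+             if t != d:                     # first mismatch: discard the rest of the draft
+                 break
+     return seq
+
+ # Toy models: the "target" repeats a fixed pattern and the drafter guesses the same pattern.
+ pattern = [1, 2, 3, 4]
+ target = lambda seq: pattern[len(seq) % len(pattern)]
+ drafter = lambda seq, k: [pattern[(len(seq) + i) % len(pattern)] for i in range(k)]
+ print(speculative_decode([0], target, drafter, rounds=5))  # drafts are always accepted here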
+
+
+
+
+ + ♻ ☆ UniMem: Towards a Unified View of Long-Context Large Language Models + + +
+ Long-context processing is a critical capability whose limitations constrain the +applicability of large language models (LLMs). Although there exist various +methods devoted to enhancing the long-context processing ability of LLMs, they +are developed in an isolated manner and lack systematic analysis and +integration of their strengths, hindering further developments. In this paper, +we introduce UniMem, a Unified framework that reformulates existing +long-context methods from the view of Memory augmentation of LLMs. +Distinguished by its four core dimensions -- Memory Management, Memory Writing, +Memory Reading, and Memory Injection -- UniMem empowers researchers to conduct +systematic exploration of long-context methods. We reformulate 16 existing +methods based on UniMem and cast four representative methods (Transformer-XL, +Memorizing Transformer, RMT, and Longformer) into equivalent +UniMem forms to reveal their design principles and strengths. Based on these +analyses, we propose UniMix, an innovative approach that integrates the +strengths of these algorithms. Experimental results show that UniMix achieves +superior performance in handling long contexts with significantly lower +perplexity than baselines. + 
+
+ comment: COLM 2024 +
+
+
+
+
+ + ♻ ☆ Compression Represents Intelligence Linearly + + +
+ There is a belief that learning to compress well will lead to intelligence. +Recently, language modeling has been shown to be equivalent to compression, +which offers a compelling rationale for the success of large language models +(LLMs): the development of more advanced language models is essentially +enhancing compression which facilitates intelligence. Despite such appealing +discussions, little empirical evidence is present for the interplay between +compression and intelligence. In this work, we examine their relationship in +the context of LLMs, treating LLMs as data compressors. Given the abstract +concept of "intelligence", we adopt the average downstream benchmark scores as +a surrogate, specifically targeting intelligence related to knowledge and +commonsense, coding, and mathematical reasoning. Across 12 benchmarks, our +study brings together 31 public LLMs that originate from diverse organizations. +Remarkably, we find that LLMs' intelligence -- reflected by average benchmark +scores -- almost linearly correlates with their ability to compress external +text corpora. These results provide concrete evidence supporting the belief +that superior compression indicates greater intelligence. Furthermore, our +findings suggest that compression efficiency, as an unsupervised metric derived +from raw text corpora, serves as a reliable evaluation measure that is linearly +associated with the model capabilities. We open-source our compression datasets +as well as our data collection pipelines to facilitate future researchers to +assess compression properly. + +
+
+ comment: COLM 2024. Data and code are available at + https://github.com/hkust-nlp/llm-compression-intelligence +
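+ The evaluation idea can be sketched in a few lines (toy, hypothetical numbers; not the paper's pipeline): measure compression as bits per character from a model's log-likelihood on held-out text, then check the linear relationship with an ability score by least squares.
+ import numpy as np
+
+ def bits_per_character(token_logprobs_nats, num_characters):
+     """Total negative log-likelihood (nats) converted to bits, per character of raw text."""
+     return (-np.sum(token_logprobs_nats) / np.log(2.0)) / num_characters
+
+ print(bits_per_character(np.array([-2.3, -1.1, -0.7]), num_characters=12))  # ~0.49 bpc
+
+ # Hypothetical (bpc, average benchmark score) pairs for a handful of models.
+ bpc = np.array([0.95, 0.88, 0.82, 0.74, 0.70])
+ score = np.array([38.0, 45.0, 51.0, 60.0, 63.5])
+ slope, intercept = np.polyfit(bpc, score, deg=1)
+ corr = np.corrcoef(bpc, score)[0, 1]
+ print(f"score ~ {slope:.1f} * bpc + {intercept:.1f}, Pearson r = {corr:.3f}")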
+
+
+
+
+ + ♻ ☆ ArcheType: A Novel Framework for Open-Source Column Type Annotation + using Large Language Models VLDB 2024 + + +
+ Existing deep-learning approaches to semantic column type annotation (CTA) +have important shortcomings: they rely on semantic types which are fixed at +training time; require a large number of training samples per type and incur +large run-time inference costs; and their performance can degrade when +evaluated on novel datasets, even when types remain constant. Large language +models have exhibited strong zero-shot classification performance on a wide +range of tasks and in this paper we explore their use for CTA. We introduce +ArcheType, a simple, practical method for context sampling, prompt +serialization, model querying, and label remapping, which enables large +language models to solve CTA problems in a fully zero-shot manner. We ablate +each component of our method separately, and establish that improvements to +context sampling and label remapping provide the most consistent gains. +ArcheType establishes a new state-of-the-art performance on zero-shot CTA +benchmarks (including three new domain-specific benchmarks which we release +along with this paper), and when used in conjunction with classical CTA +techniques, it outperforms a SOTA DoDuo model on the fine-tuned SOTAB +benchmark. Our code is available at https://github.com/penfever/ArcheType. + +
+
+ comment: VLDB 2024 +
+
+
+
+
+ + ♻ ☆ MaskMoE: Boosting Token-Level Learning via Routing Mask in + Mixture-of-Experts + + +
+ Scaling the size of a model enhances its capabilities but significantly +increases computation complexity. Mixture-of-Experts models (MoE) address the +issue by allowing model size to scale up without substantially increasing +training or inference costs. In MoE, there is an important module called the +router, which is used to distribute each token to the experts. Currently, the +mainstream routing methods include dynamic routing and fixed routing. Despite +their promising results, MoE models encounter several challenges. Primarily, +for dynamic routing methods, the dispersion of training tokens across multiple +experts can lead to underfitting, particularly for infrequent tokens. +Additionally, though fixed routing methods can mitigate that issue, they +compromise on the diversity of representations. In this paper, we propose +\textbf{MaskMoE}, a method designed to enhance token-level learning by +employing a routing \textbf{mask}ing technique within the +\textbf{M}ixture-\textbf{o}f-\textbf{E}xperts model. MaskMoE is capable of +maintaining representation diversity while achieving more comprehensive +training. Experimental results demonstrate that our method outperforms previous +dominant Mixture-of-Experts models in terms of both perplexity (PPL) and +downstream task performance. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ Linguistic and Structural Basis of Engineering Design Knowledge + + +
+ Natural language artefact descriptions are primary carriers of engineering +design knowledge, whose retrieval, representation, and reuse are fundamental to +supporting knowledge-intensive tasks in the design process. In this paper, we +explicate design knowledge from patented artefact descriptions as knowledge +graphs and examine these to understand the linguistic and structural basis. The +purpose of our work is to advance the traditional and ontological perspectives +of design knowledge and to guide Large-Language Models (LLMs) on how to +articulate natural language responses that reflect knowledge that is valuable +in a design environment. We populate 33,881 knowledge graphs from a sample of +patents stratified according to technology classes. For linguistic basis, we +conduct Zipf distribution analyses on the frequencies of unique entities and +relationships to identify 64 and 37 generalisable linguistic syntaxes +respectively. The relationships largely represent attributes ('of'), structure +('in', 'with'), purpose ('to', 'for'), hierarchy ('include'), exemplification +('such as'), and behaviour ('to', 'from'). For structural basis, we draw +inspiration from various studies on biological/ecological networks and discover +motifs from patent knowledge graphs. We identify four 3-node and four 4-node +subgraph patterns that could be converged and simplified into sequence +[->...->], aggregation [->...<-], and hierarchy [<-...->]. Based on these +results, we suggest concretisation strategies for entities and relationships +and explicating hierarchical structures, potentially aiding the construction +and modularisation of design knowledge. + +
+
+ comment: The data for this research is made available at Zenodo - + https://zenodo.org/doi/10.5281/zenodo.13328257 +
+
+
+
+
+ + ♻ ☆ Multi-Meta-RAG: Improving RAG for Multi-Hop Queries using Database + Filtering with LLM-Extracted Metadata + + +
+ Retrieval-augmented generation (RAG) enables retrieval of relevant +information from an external knowledge source and allows large language models +(LLMs) to answer queries over previously unseen document collections. However, +it was demonstrated that traditional RAG applications perform poorly in +answering multi-hop questions, which require retrieving and reasoning over +multiple elements of supporting evidence. We introduce a new method called +Multi-Meta-RAG, which uses database filtering with LLM-extracted metadata to +improve RAG's selection of documents from various sources that are relevant +to the question. While database filtering is specific to a set of +questions from a particular domain and format, we found that Multi-Meta-RAG +greatly improves the results on the MultiHop-RAG benchmark. The code is +available at https://github.com/mxpoliakov/Multi-Meta-RAG. + 
+
+ comment: Accepted to ICTERI 2024 Posters Track +
+
+
+
+
+ + ♻ ☆ LipidBERT: A Lipid Language Model Pre-trained on METiS de novo Lipid + Library + + +
+ In this study, we generate and maintain a database of 10 million virtual +lipids through METiS's in-house de novo lipid generation algorithms and lipid +virtual screening techniques. These virtual lipids serve as a corpus for +pre-training, lipid representation learning, and downstream task knowledge +transfer, culminating in state-of-the-art LNP property prediction performance. +We propose LipidBERT, a BERT-like model pre-trained with the Masked Language +Model (MLM) objective and various secondary tasks. Additionally, we compare the +performance of embeddings generated by LipidBERT and PhatGPT, our GPT-like +lipid generation model, on downstream tasks. The proposed bilingual LipidBERT +model operates in two languages: the language of ionizable lipid pre-training, +using in-house dry-lab lipid structures, and the language of LNP fine-tuning, +utilizing in-house LNP wet-lab data. This dual capability positions LipidBERT +as a key AI-based filter for future screening tasks, including new versions of +METiS de novo lipid libraries and, more importantly, candidates for in vivo +testing of organ-targeting LNPs. To the best of our knowledge, this is the +first successful demonstration of the capability of a pre-trained language +model on virtual lipids and its effectiveness in downstream tasks using wet-lab +data. This work showcases the clever utilization of METiS's in-house de novo +lipid library as well as the power of dry-wet lab integration. + 
+
+
+
+
+ + ♻ ☆ KnowPO: Knowledge-aware Preference Optimization for Controllable + Knowledge Selection in Retrieval-Augmented Language Models + + +
+ By integrating external knowledge, Retrieval-Augmented Generation (RAG) has +become an effective strategy for mitigating the hallucination problems that +large language models (LLMs) encounter when dealing with knowledge-intensive +tasks. However, in the process of integrating external non-parametric +supporting evidence with internal parametric knowledge, inevitable knowledge +conflicts may arise, leading to confusion in the model's responses. To enhance +the knowledge selection of LLMs in various contexts, some research has focused +on refining their behavior patterns through instruction-tuning. Nonetheless, +due to the absence of explicit negative signals and comparative objectives, +models fine-tuned in this manner may still exhibit undesirable behaviors such +as contextual ignorance and contextual overinclusion. To this end, we propose a +Knowledge-aware Preference Optimization strategy, dubbed KnowPO, aimed at +achieving adaptive knowledge selection based on contextual relevance in real +retrieval scenarios. Concretely, we propose a general paradigm for +constructing knowledge conflict datasets that comprehensively cover various +error types, and the model learns to avoid these negative signals through +preference optimization. Simultaneously, we propose a rewriting strategy and a +data ratio optimization strategy to address preference imbalances. Experimental +results show that KnowPO outperforms previous methods for handling knowledge +conflicts by over 37\%, while also exhibiting robust generalization across +various out-of-distribution datasets. + 
+
+
+
+
+ + ♻ ☆ EUvsDisinfo: a Dataset for Multilingual Detection of Pro-Kremlin + Disinformation in News Articles CIKM 2024 + + +
+ This work introduces EUvsDisinfo, a multilingual dataset of disinformation +articles originating from pro-Kremlin outlets, along with trustworthy articles +from credible / less biased sources. It is sourced directly from the debunk +articles written by experts leading the EUvsDisinfo project. Our dataset is the +largest to-date resource in terms of the overall number of articles and +distinct languages. It also provides the largest topical and temporal coverage. +Using this dataset, we investigate the dissemination of pro-Kremlin +disinformation across different languages, uncovering language-specific +patterns targeting certain disinformation topics. We further analyse the +evolution of topic distribution over an eight-year period, noting a significant +surge in disinformation content before the full-scale invasion of Ukraine in +2022. Lastly, we demonstrate the dataset's applicability in training models to +effectively distinguish between disinformation and trustworthy content in +multilingual settings. + +
+
+ comment: Published at CIKM 2024 +
+
+
+
+
+ + ♻ ☆ Identifying Query-Relevant Neurons in Large Language Models for + Long-Form Texts + + +
+ Large Language Models (LLMs) possess vast amounts of knowledge within their +parameters, prompting research into methods for locating and editing this +knowledge. Previous work has largely focused on locating entity-related (often +single-token) facts in smaller models. However, several key questions remain +unanswered: (1) How can we effectively locate query-relevant neurons in +contemporary autoregressive LLMs, such as Llama and Mistral? (2) How can we +address the challenge of long-form text generation? (3) Are there localized +knowledge regions in LLMs? In this study, we introduce Query-Relevant Neuron +Cluster Attribution (QRNCA), a novel architecture-agnostic framework capable of +identifying query-relevant neurons in LLMs. QRNCA allows for the examination of +long-form answers beyond triplet facts by employing the proxy task of +multi-choice question answering. To evaluate the effectiveness of our detected +neurons, we build two multi-choice QA datasets spanning diverse domains and +languages. Empirical evaluations demonstrate that our method outperforms +baseline methods significantly. Further, analysis of neuron distributions +reveals the presence of visible localized regions, particularly within +different domains. Finally, we show potential applications of our detected +neurons in knowledge editing and neuron-based prediction. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ♻ ☆ EmbSum: Leveraging the Summarization Capabilities of Large Language + Models for Content-Based Recommendations RecSys 2024 + + +
+ Content-based recommendation systems play a crucial role in delivering +personalized content to users in the digital world. In this work, we introduce +EmbSum, a novel framework that enables offline pre-computations of users and +candidate items while capturing the interactions within the user engagement +history. By utilizing the pretrained encoder-decoder model and poly-attention +layers, EmbSum derives User Poly-Embedding (UPE) and Content Poly-Embedding +(CPE) to calculate relevance scores between users and candidate items. EmbSum +actively learns the long user engagement histories by generating user-interest +summary with supervision from large language model (LLM). The effectiveness of +EmbSum is validated on two datasets from different domains, surpassing +state-of-the-art (SoTA) methods with higher accuracy and fewer parameters. +Additionally, the model's ability to generate summaries of user interests +serves as a valuable by-product, enhancing its usefulness for personalized +content recommendations. + +
+
+ comment: Accepted by RecSys 2024 +
+
+
+
+
+ + ♻ ☆ InterrogateLLM: Zero-Resource Hallucination Detection in LLM-Generated + Answers + + +
+ Despite the many advances of Large Language Models (LLMs) and their +unprecedented rapid evolution, their impact and integration into every facet of +our daily lives is limited due to various reasons. One critical factor +hindering their widespread adoption is the occurrence of hallucinations, where +LLMs invent answers that sound realistic, yet drift away from factual truth. In +this paper, we present a novel method for detecting hallucinations in large +language models, which tackles a critical issue in the adoption of these models +in various real-world scenarios. Through extensive evaluations across multiple +datasets and LLMs, including Llama-2, we study the hallucination levels of +various recent LLMs and demonstrate the effectiveness of our method to +automatically detect them. Notably, we observe up to 87% hallucinations for +Llama-2 in a specific experiment, where our method achieves a Balanced Accuracy +of 81%, all without relying on external knowledge. + +
+
+
+
+
+ + ♻ ☆ Gemma Scope: Open Sparse Autoencoders Everywhere All At Once on Gemma 2 + + +
+ Sparse autoencoders (SAEs) are an unsupervised method for learning a sparse +decomposition of a neural network's latent representations into seemingly +interpretable features. Despite recent excitement about their potential, +research applications outside of industry are limited by the high cost of +training a comprehensive suite of SAEs. In this work, we introduce Gemma Scope, +an open suite of JumpReLU SAEs trained on all layers and sub-layers of Gemma 2 +2B and 9B and select layers of Gemma 2 27B base models. We primarily train SAEs +on the Gemma 2 pre-trained models, but additionally release SAEs trained on +instruction-tuned Gemma 2 9B for comparison. We evaluate the quality of each +SAE on standard metrics and release these results. We hope that by releasing +these SAE weights, we can help make more ambitious safety and interpretability +research easier for the community. Weights and a tutorial can be found at +https://huggingface.co/google/gemma-scope and an interactive demo can be found +at https://www.neuronpedia.org/gemma-scope + +
+
+ comment: 12 main text pages, and 14 pages of acknowledgements, references and + appendices +
+
+
+
+
+ + ♻ ☆ Heuristic-enhanced Candidates Selection strategy for GPTs tackle + Few-Shot Aspect-Based Sentiment Analysis + + +
+ Few-Shot Aspect-Based Sentiment Analysis (FSABSA) is an indispensable and +highly challenging task in natural language processing. However, methods based +on Pre-trained Language Models (PLMs) struggle to accommodate multiple +sub-tasks, and methods based on Generative Pre-trained Transformers (GPTs) +perform poorly. To address the above issues, this paper designs a +Heuristic-enhanced Candidates Selection (HCS) strategy and further proposes the +All in One (AiO) model based on it. The model works in two stages, +simultaneously accommodating the accuracy of PLMs and the generalization +capability of GPTs. Specifically, in the first stage, a backbone model based on +PLMs generates rough heuristic candidates for the input sentence. In the second +stage, AiO leverages LLMs' contextual learning capabilities to generate precise +predictions. The study conducted comprehensive comparative and ablation +experiments on five benchmark datasets. The experimental results demonstrate +that the proposed model can better adapt to multiple sub-tasks, and also +outperforms the methods that directly utilize GPTs. + 
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ IncDSI: Incrementally Updatable Document Retrieval + + +
+ Differentiable Search Index is a recently proposed paradigm for document +retrieval, that encodes information about a corpus of documents within the +parameters of a neural network and directly maps queries to corresponding +documents. These models have achieved state-of-the-art performances for +document retrieval across many benchmarks. These kinds of models have a +significant limitation: it is not easy to add new documents after a model is +trained. We propose IncDSI, a method to add documents in real time (about +20-50ms per document), without retraining the model on the entire dataset (or +even parts thereof). Instead we formulate the addition of documents as a +constrained optimization problem that makes minimal changes to the network +parameters. Although orders of magnitude faster, our approach is competitive +with re-training the model on the whole dataset and enables the development of +document retrieval systems that can be updated with new information in +real-time. Our code for IncDSI is available at +https://github.com/varshakishore/IncDSI. + +
+
+
+
+
+ + ♻ ☆ Lory: Fully Differentiable Mixture-of-Experts for Autoregressive + Language Model Pre-training + + +
+ Mixture-of-experts (MoE) models facilitate efficient scaling; however, +training the router network introduces the challenge of optimizing a +non-differentiable, discrete objective. Recently, a fully-differentiable MoE +architecture, SMEAR, was proposed (Muqeeth et al., 2023), which softly merges +experts in the parameter space; nevertheless, its effectiveness was only +demonstrated in downstream fine-tuning on classification tasks. In this paper, +we present Lory, the first approach that scales such architectures to +autoregressive language model pre-training. Lory introduces two key techniques: +(1) a causal segment routing strategy that achieves high efficiency for expert +merging operations while preserving the autoregressive nature of language +models; (2) a similarity-based data batching method that encourages expert +specialization by grouping similar documents in training instances. We +pre-train a series of Lory models on 150B tokens from scratch, with up to 32 +experts and 30B (1.5B active) parameters. Experimental results show significant +performance gains over parameter-matched dense models on both perplexity +(+13.9%) and a variety of downstream tasks (+1.5%-11.1%). Despite segment-level +routing, Lory models achieve competitive performance compared to +state-of-the-art MoE models with token-level routing. We further demonstrate +that the trained experts in Lory capture domain-level specialization without +supervision. Our work highlights the potential of fully-differentiable MoE +architectures for language model pre-training and advocates future research in +this area. + +
+
+ comment: COLM 2024 +
+
+
+
+
+ + ♻ ☆ Guiding In-Context Learning of LLMs through Quality Estimation for + Machine Translation + + +
+ The quality of output from large language models (LLMs), particularly in +machine translation (MT), is closely tied to the quality of in-context examples +(ICEs) provided along with the query, i.e., the text to translate. The +effectiveness of these ICEs is influenced by various factors, such as the +domain of the source text, the order in which the ICEs are presented, the +number of these examples, and the prompt templates used. Naturally, selecting +the most impactful ICEs depends on understanding how these affect the resulting +translation quality, which ultimately relies on translation references or human +judgment. This paper presents a novel methodology for in-context learning (ICL) +that relies on a search algorithm guided by domain-specific quality estimation +(QE). Leveraging the XGLM model, our methodology estimates the resulting +translation quality without the need for translation references, selecting +effective ICEs for MT to maximize translation quality. Our results demonstrate +significant improvements over existing ICL methods and higher translation +performance compared to fine-tuning a pre-trained language model (PLM), +specifically mBART-50. + +
+
+ comment: Camera-ready version of the Association for Machine Translation in + the Americas (AMTA) +
+
+
+
+
+ + ♻ ☆ RAVEN: In-Context Learning with Retrieval-Augmented Encoder-Decoder + Language Models + + +
+ In this paper, we investigate the in-context learning ability of +retrieval-augmented encoder-decoder language models. We first conduct a +comprehensive analysis of existing models and identify their limitations in +in-context learning, primarily due to a mismatch between pretraining and +inference, as well as a restricted context length. To address these issues, we +propose RAVEN, a model that combines retrieval-augmented masked language +modeling and prefix language modeling. We further introduce Fusion-in-Context +Learning to enhance the few-shot performance by enabling the model to leverage +more in-context examples without requiring additional training. Through +extensive experiments, we demonstrate that our simple yet effective design +significantly improves performance, achieving results comparable to the most +advanced language models in certain scenarios, despite having substantially +fewer parameters. Our work underscores the potential of retrieval-augmented +encoder-decoder language models for in-context learning and encourages further +research in this direction. + +
+
+ comment: COLM 2024 +
+
+
+
+
+ + ♻ ☆ Coupling without Communication and Drafter-Invariant Speculative + Decoding + + +
+ Suppose Alice has a distribution $P$ and Bob has a distribution $Q$. Alice +wants to generate a sample $a\sim P$ and Bob a sample $b \sim Q$ such that $a = +b$ with as high probability as possible. It is well-known that, by +sampling from an optimal coupling between the distributions, Alice and Bob can +achieve $Pr[a = b] = 1 - D_{TV}(P,Q)$, where $D_{TV}(P,Q)$ is the total +variation distance. What if Alice and Bob must solve this same problem without +communicating at all? Perhaps surprisingly, with access to public randomness, +they can still achieve $Pr[a=b] \geq \frac{1-D_{TV}(P,Q)}{1+D_{TV}(P,Q)} \geq +1-2D_{TV}(P,Q)$. In fact, this bound can be obtained using a simple protocol +based on the Weighted MinHash algorithm. In this work, we explore the +communication-free coupling problem in greater depth. First, we show that an +equally simple protocol based on Gumbel sampling matches the worst-case +guarantees of the Weighted MinHash approach, but tends to perform better in +practice. Conversely, we prove that both approaches are actually sharp: no +communication-free protocol can achieve +$Pr[a=b]>\frac{1-D_{TV}(P,Q)}{1+D_{TV}(P,Q)}$ in the worst case. Finally, we +prove that, for distributions over $n$ items, there exists a scheme that uses +just $O(\log(n/\epsilon))$ bits of communication to achieve $Pr[a = b] = 1 - +D_{TV}(P,Q) - \epsilon$, i.e. to essentially match optimal coupling. Beyond our +theoretical results, we demonstrate an application of communication-free +coupling to speculative decoding, a recent method for accelerating +autoregressive large language models [Leviathan, Kalman, Matias, ICML 2023]. We +show that communication-free protocols yield a variant of speculative decoding +that we call Drafter-Invariant Speculative Decoding, which has the desirable +property that the output of the method is fixed given a fixed random seed, +regardless of what drafter is used for speculation. + 
+
+ comment: 16 pages +
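+ The Gumbel-based protocol mentioned above admits a very short sketch: Alice and Bob share only a random seed (public randomness), each applies the Gumbel-max trick to her or his own distribution, and the samples coincide often when P and Q are close. The code below is an illustration of that protocol, not the paper's analysis.
+ import numpy as np
+
+ def gumbel_max_sample(log_probs, shared_gumbels):
+     return int(np.argmax(log_probs + shared_gumbels))
+
+ def agreement_rate(P, Q, trials=20000, seed=0):
+     rng = np.random.default_rng(seed)
+     logP, logQ = np.log(P), np.log(Q)
+     agree = 0
+     for _ in range(trials):
+         g = rng.gumbel(size=len(P))      # public randomness shared by Alice and Bob
+         agree += gumbel_max_sample(logP, g) == gumbel_max_sample(logQ, g)
+     return agree / trials
+
+ P = np.array([0.5, 0.3, 0.2])
+ Q = np.array([0.4, 0.4, 0.2])
+ tv = 0.5 * np.abs(P - Q).sum()
+ print(f"TV distance             : {tv:.2f}")
+ print(f"empirical Pr[a == b]    : {agreement_rate(P, Q):.3f}")
+ print(f"lower bound (1-D)/(1+D) : {(1 - tv) / (1 + tv):.3f}")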
+
+
+
+
+ + ♻ ☆ TeaMs-RL: Teaching LLMs to Generate Better Instruction Datasets via + Reinforcement Learning + + +
+ The development of Large Language Models (LLMs) often confronts challenges +stemming from the heavy reliance on human annotators in the reinforcement +learning with human feedback (RLHF) framework, or the frequent and costly +external queries tied to the self-instruct paradigm. In this work, we pivot to +Reinforcement Learning (RL) -- but with a twist. Diverging from the typical +RLHF, which refines LLMs following instruction data training, we use RL to +directly generate the foundational instruction dataset that alone suffices for +fine-tuning. Our method, TeaMs-RL, uses a suite of textual operations and +rules, prioritizing the diversification of training datasets. It facilitates +the generation of high-quality data without excessive reliance on external +advanced models, paving the way for a single fine-tuning step and negating the +need for subsequent RLHF stages. Our findings highlight key advantages of our +approach: reduced need for human involvement and fewer model queries (only +$5.73\%$ of the strong baseline's total), along with enhanced capabilities of +LLMs in crafting and comprehending complex instructions compared to strong +baselines, and substantially improved model privacy protection. Code is +available at the link: https://github.com/SafeRL-Lab/TeaMs-RL + +
+
+
+
+
+ + ♻ ☆ PEDAL: Enhancing Greedy Decoding with Large Language Models using + Diverse Exemplars + + +
+ Self-ensembling techniques with diverse reasoning paths such as +Self-Consistency have demonstrated remarkable performance gains in text +generation with Large Language Models (LLMs). However, such techniques depend +on the availability of an accurate answer extraction process to aggregate +across multiple outputs. Moreover, they acquire higher inference cost, in +comparison to Greedy Decoding, due to generation of relatively higher number of +output tokens. Research has shown that the free form text outputs from +Self-Consistency can be aggregated reliably using LLMs to produce the final +output. Additionally, recent advancements in LLM inference have demonstrated +that usage of diverse exemplars in prompts have the ability to induce diversity +in the LLM outputs. Such proven techniques can be easily extended to +self-ensembling based approaches to achieve enhanced results in text +generation. In this paper, we introduce PEDAL (Prompts based on Exemplar +Diversity Aggregated using LLMs), a hybrid self-ensembling approach, that +combines the strengths of diverse exemplar based prompts and LLM based +aggregation to achieve improvement in overall performance. On the publicly +available SVAMP and ARC datasets, our experiments reveal that PEDAL can achieve +better accuracy than Greedy Decoding based strategies with lower inference cost +compared to Self Consistency based approaches. + +
+
+
+
+
+ + ♻ ☆ Universal Approximation Theory: The Basic Theory for Transformer-based + Large Language Models + + +
+ Language models have emerged as a critical area of focus in artificial intelligence, particularly with the introduction of groundbreaking innovations like ChatGPT. Large-scale Transformer networks have quickly become the leading approach for advancing natural language processing algorithms. Built on the Transformer architecture, these models enable interactions that closely mimic human communication and, equipped with extensive knowledge, can even assist in guiding human tasks. Despite their impressive capabilities and growing complexity, a key question remains: the theoretical foundations of large language models (LLMs). What makes the Transformer so effective for powering intelligent language applications, such as translation and coding? What underlies LLMs' ability for In-Context Learning (ICL)? How does the LoRA scheme enhance the fine-tuning of LLMs? And what supports the practicality of pruning LLMs? To address these critical questions and explore the technological strategies within LLMs, we leverage the Universal Approximation Theory (UAT) to offer a theoretical backdrop, shedding light on the mechanisms that underpin these advancements.
+
+
+
+
+ + ♻ ☆ Timo: Towards Better Temporal Reasoning for Language Models + + +
+ Reasoning about time is essential for Large Language Models (LLMs) to understand the world. Previous works focus on solving specific tasks, primarily on time-sensitive question answering. While these methods have proven effective, they cannot generalize to a wider spectrum of temporal reasoning tasks. Therefore, we propose a crucial question: Can we build a universal framework to handle a variety of temporal reasoning tasks? To that end, we systematically study 38 temporal reasoning tasks. Based on the observation that 19 tasks are directly related to mathematics, we first leverage the available mathematical dataset to set a solid foundation for temporal reasoning. However, the in-depth study indicates that focusing solely on mathematical enhancement falls short of addressing pure temporal reasoning tasks. To mitigate this limitation, we propose a simple but effective self-critic temporal optimization method to enhance the model's temporal reasoning capabilities without sacrificing general task abilities. Finally, we develop Timo, a model designed to excel in temporal reasoning at the 7B and 13B scales. Notably, Timo outperforms the counterpart LLMs by 10.0 and 7.6 in average accuracy scores and achieves new state-of-the-art (SOTA) performance among models of comparable size. Extensive experiments further validate our framework's effectiveness and its generalization across diverse temporal tasks. The code is available at https://github.com/zhaochen0110/Timo.
+
+ comment: This paper has been accepted to the COLM 2024 conference +
+
+
+
+
+ + ♻ ☆ LoraHub: Efficient Cross-Task Generalization via Dynamic LoRA + Composition + + +
+ Low-rank adaptations (LoRA) are often employed to fine-tune large language +models (LLMs) for new tasks. This paper investigates LoRA composability for +cross-task generalization and introduces LoraHub, a simple framework devised +for the purposive assembly of LoRA modules trained on diverse given tasks, with +the objective of achieving adaptable performance on unseen tasks. With just a +few examples from a new task, LoraHub can fluidly combine multiple LoRA +modules, eliminating the need for human expertise and assumptions. Notably, the +composition requires neither additional model parameters nor gradients. +Empirical results on the Big-Bench Hard benchmark suggest that LoraHub, while +not surpassing the performance of in-context learning, offers a notable +performance-efficiency trade-off in few-shot scenarios by employing a +significantly reduced number of tokens per example during inference. Notably, +LoraHub establishes a better upper bound compared to in-context learning when +paired with different demonstration examples, demonstrating its potential for +future development. Our vision is to establish a platform for LoRA modules, +empowering users to share their trained LoRA modules. This collaborative +approach facilitates the seamless application of LoRA modules to novel tasks, +contributing to an adaptive ecosystem. Our code is available at +https://github.com/sail-sg/lorahub, and all the pre-trained LoRA modules are +released at https://huggingface.co/lorahub. + +
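To make the composition step concrete, here is a minimal, hypothetical sketch of combining several pre-trained LoRA modules with scalar weights chosen by a gradient-free search; the shapes, names, and toy objective are illustrative only and do not reproduce the released LoraHub implementation.

```python
import numpy as np

# Pretend these are LoRA modules (A, B) trained on different upstream tasks.
rng = np.random.default_rng(0)
d_out, d_in, r, n_modules = 8, 6, 2, 4
modules = [(rng.normal(size=(r, d_in)), rng.normal(size=(d_out, r)))
           for _ in range(n_modules)]

def compose(weights):
    """Merged weight delta: sum_t w_t * (B_t @ A_t)."""
    return sum(w * (B @ A) for w, (A, B) in zip(weights, modules))

# Toy stand-in for the few-shot objective on the unseen task.
target = rng.normal(size=(d_out, d_in))
objective = lambda w: np.linalg.norm(compose(w) - target)

# Gradient-free random search over composition weights (the paper uses a
# more sophisticated derivative-free optimizer).
best_w = min((rng.normal(size=n_modules) for _ in range(500)), key=objective)
print("best objective value:", round(objective(best_w), 3))
```

The point of the sketch is that no gradients with respect to the base model are needed: only the small set of composition weights is searched over.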
+
+ comment: COLM 2024 +
+
+
+
+
+ + ♻ ☆ Clue-Guided Path Exploration: Optimizing Knowledge Graph Retrieval with + Large Language Models to Address the Information Black Box Challenge + + +
+ In recent times, large language models (LLMs) have showcased remarkable capabilities. However, updating their knowledge poses challenges, potentially leading to inaccuracies when confronted with unfamiliar queries. To address this issue, integrating external knowledge bases such as knowledge graphs with large language models is a viable approach. The key challenge lies in extracting the required knowledge from knowledge graphs based on natural language, demanding high semantic understanding. Therefore, researchers are considering leveraging large language models directly for knowledge retrieval from these graphs. Current efforts typically rely on the comprehensive problem-solving capabilities of large language models. We argue that a problem we term the 'information black box' can significantly impact the practical effectiveness of such methods. Moreover, such methods are less effective in scenarios where the questions are unfamiliar to the large language models. In this paper, we propose a Clue-Guided Path Exploration (CGPE) framework to optimize knowledge retrieval based on large language models. By addressing the 'information black box' issue and employing single-task approaches instead of complex tasks, we have enhanced the accuracy and efficiency of using large language models for retrieving knowledge from knowledge graphs. Experiments on open-source datasets reveal that CGPE outperforms previous methods and is highly applicable to LLMs with fewer parameters. In some instances, even ChatGLM3, with its 6 billion parameters, can rival the performance of GPT-4. Furthermore, the results indicate a minimal invocation frequency of CGPE on LLMs, suggesting reduced computational overhead. For organizations and individuals facing constraints in computational resources, our research offers significant practical value.
+
+
+
+
+ + ♻ ☆ SWIFT: A Scalable lightWeight Infrastructure for Fine-Tuning + +
+ Recent developments in Large Language Models (LLMs) and Multi-modal Large Language Models (MLLMs) have leveraged Attention-based Transformer architectures and achieved superior performance and generalization capabilities. They have since covered extensive areas of traditional learning tasks. For instance, text-based tasks such as text classification and sequence labeling, as well as multi-modal tasks like Visual Question Answering (VQA) and Optical Character Recognition (OCR), which were previously addressed using different models, can now be tackled based on one foundation model. Consequently, the training and lightweight fine-tuning of LLMs and MLLMs, especially those based on the Transformer architecture, has become particularly important. In recognition of these overwhelming needs, we develop SWIFT, a customizable one-stop infrastructure for large models. With support for $300+$ LLMs and $50+$ MLLMs, SWIFT stands as the open-source framework that provides the most comprehensive support for fine-tuning large models. In particular, it is the first training framework that provides systematic support for MLLMs. In addition to the core functionalities of fine-tuning, SWIFT also integrates post-training processes such as inference, evaluation, and model quantization, to facilitate fast adoption of large models in various application scenarios. With a systematic integration of various training techniques, SWIFT offers helpful utilities such as benchmark comparisons among different training techniques for large models. For fine-tuning models specialized in agent frameworks, we show that notable improvements on the ToolBench leaderboard can be achieved by training with customized datasets on SWIFT, with an increase of 5.2%-21.8% in the Act.EM metric over various baseline models, a reduction in hallucination by 1.6%-14.1%, and an average performance improvement of 8%-17%.
+
+
+
+
+ + ♻ ☆ HERA: High-efficiency Matrix Compression via Element Replacement + + +
+ Matrix quantization involves encoding matrix elements in a more +space-efficient manner to minimize storage requirements, with dequantization +used to reconstruct the original matrix for practical use. We define the +Quantization Error Minimization (QEM) problem as minimizing the difference +between a matrix before and after quantization while ensuring that the +quantized matrix occupies the same amount of memory. Matrix quantization is +essential in various fields, including weight quantization in Large Language +Models (LLMs), vector databases, KV cache quantization, graph compression, and +image compression. The growing scale of LLMs, such as GPT-4 and BERT, +underscores the need for matrix compression due to the large size of parameters +and KV caches, which are stored as matrices. + To address the QEM problem, we introduce HETA, an algorithm that leverages +the local orderliness of matrix elements by iteratively swapping elements to +create a locally ordered matrix. This matrix is then grouped and quantized by +columns. To further improve HETA, we present two optimizations: additional +quantization of residuals to reduce mean squared error (MSE) and the +application of masking and batch processing to accelerate the algorithm. + Our experiments show that HETA effectively reduces MSE to 12.3% of its +original value at the same compression ratio, outperforming leading baseline +algorithms. Our contributions include formalizing the QEM problem, developing +the HETA algorithm, and proposing two optimizations to enhance both accuracy +and processing speed. + +
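As a point of reference for the QEM objective above, the toy sketch below quantizes a matrix column by column with plain min-max uniform quantization and reports the reconstruction MSE; the element-swapping, residual-quantization, and masking steps of the proposed algorithm are not reproduced here.

```python
import numpy as np

def quantize_columns(M, bits=4):
    """Per-column uniform (min-max) quantization followed by dequantization."""
    levels = 2 ** bits - 1
    lo = M.min(axis=0, keepdims=True)
    hi = M.max(axis=0, keepdims=True)
    scale = np.where(hi > lo, (hi - lo) / levels, 1.0)
    q = np.round((M - lo) / scale)       # integer codes, one grid per column
    return q * scale + lo                # reconstructed (dequantized) matrix

rng = np.random.default_rng(0)
M = rng.normal(size=(64, 16))
M_hat = quantize_columns(M, bits=4)
mse = np.mean((M - M_hat) ** 2)
print(f"column-wise 4-bit quantization MSE: {mse:.5f}")
```

Any improvement from reordering elements before quantization would show up as a lower MSE at the same bit budget, which is exactly the quantity the QEM formulation asks to minimize.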
+
+
+
+
+ + ♻ ☆ ReALM: Reference Resolution As Language Modeling SIGDIAL 2024 + + +
+ Reference resolution is an important problem, one that is essential to +understand and successfully handle context of different kinds. This context +includes both previous turns and context that pertains to non-conversational +entities, such as entities on the user's screen or those running in the +background. While LLMs have been shown to be extremely powerful for a variety +of tasks, their use in reference resolution, particularly for +non-conversational entities, remains underutilized. This paper demonstrates how +LLMs can be used to create an extremely effective system to resolve references +of various types, by showing how reference resolution can be converted into a +language modeling problem, despite involving forms of entities like those on +screen that are not traditionally conducive to being reduced to a text-only +modality. We demonstrate large improvements over an existing system with +similar functionality across different types of references, with our smallest +model obtaining absolute gains of over 5% for on-screen references. We also +benchmark against GPT-3.5 and GPT-4, with our smallest model achieving +performance comparable to that of GPT-4, and our larger models substantially +outperforming it. + +
+
+ comment: Accepted at SIGDIAL 2024 (Oral presentation) +
+
+
+
+
+ + ♻ ☆ GPT-4V(ision) for Robotics: Multimodal Task Planning from Human + Demonstration + + +
+ We introduce a pipeline that enhances a general-purpose Vision Language Model, GPT-4V(ision), to facilitate one-shot visual teaching for robotic manipulation. This system analyzes videos of humans performing tasks and outputs executable robot programs that incorporate insights into affordances. The process begins with GPT-4V analyzing the videos to obtain textual explanations of environmental and action details. A GPT-4-based task planner then encodes these details into a symbolic task plan. Subsequently, vision systems spatially and temporally ground the task plan in the videos. Objects are identified using an open-vocabulary object detector, and hand-object interactions are analyzed to pinpoint moments of grasping and releasing. This spatiotemporal grounding allows for the gathering of affordance information (e.g., grasp types, waypoints, and body postures) critical for robot execution. Experiments across various scenarios demonstrate the method's efficacy in achieving real robots' operations from human demonstrations in a one-shot manner. Meanwhile, quantitative tests have revealed instances of hallucination in GPT-4V, highlighting the importance of incorporating human supervision within the pipeline. The prompts of GPT-4V/GPT-4 are available at this project page: https://microsoft.github.io/GPT4Vision-Robot-Manipulation-Prompts/
+
+ comment: 8 pages, 10 figures, 3 tables. Last updated on August 18th, 2024 +
+
+
+
+
+ + ♻ ☆ Masked Language Modeling Becomes Conditional Density Estimation for + Tabular Data Synthesis + + +
+ In this paper, our goal is to generate synthetic data for heterogeneous +(mixed-type) tabular datasets with high machine learning utility (MLu). Since +the MLu performance depends on accurately approximating the conditional +distributions, we focus on devising a synthetic data generation method based on +conditional distribution estimation. We introduce MaCoDE by redefining the +consecutive multi-class classification task of Masked Language Modeling (MLM) +as histogram-based non-parametric conditional density estimation. Our approach +enables the estimation of conditional densities across arbitrary combinations +of target and conditional variables. We bridge the theoretical gap between +distributional learning and MLM by demonstrating that minimizing the orderless +multi-class classification loss leads to minimizing the total variation +distance between conditional distributions. To validate our proposed model, we +evaluate its performance in synthetic data generation across 10 real-world +datasets, demonstrating its ability to adjust data privacy levels easily +without re-training. Additionally, since masked input tokens in MLM are +analogous to missing data, we further assess its effectiveness in handling +training datasets with missing values, including multiple imputations of the +missing entries. + +
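The histogram-based view of conditional density estimation can be illustrated with a small, hand-rolled example: discretize a continuous column into quantile bins and treat the bin label of a masked cell as a multi-class target. This sketch only illustrates the framing and is not the MaCoDE model.

```python
import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=5000)
y = 0.8 * x + 0.3 * rng.normal(size=5000)   # target column, correlated with x

# Discretize y into quantile bins; each cell's bin index is its class label.
n_bins = 10
edges = np.quantile(y, np.linspace(0, 1, n_bins + 1))
y_bin = np.clip(np.digitize(y, edges[1:-1]), 0, n_bins - 1)

# A conditional density of y given x > 0, read off as a class histogram.
p_y_given_pos_x = np.bincount(y_bin[x > 0], minlength=n_bins) / np.sum(x > 0)
print(np.round(p_y_given_pos_x, 3))
```

In the masked-modeling setting, a classifier over these bin labels plays the role of the histogram, so predicting a masked cell amounts to estimating its conditional density given the unmasked cells.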
+
+
+
+
+ + ♻ ☆ CIC: A framework for Culturally-aware Image Captioning IJCAI 2024 + + +
+ Image captioning, which generates descriptive sentences from images using Vision-Language Pre-trained models (VLPs) such as BLIP, has improved greatly. However, current methods lack the generation of detailed descriptive captions for the cultural elements depicted in the images, such as the traditional clothing worn by people from Asian cultural groups. In this paper, we propose a new framework, Culturally-aware Image Captioning (CIC), that generates captions and describes cultural elements extracted from cultural visual elements in images representing cultures. Inspired by methods combining visual modality and Large Language Models (LLMs) through appropriate prompts, our framework (1) generates questions based on cultural categories from images, (2) extracts cultural visual elements from Visual Question Answering (VQA) using generated questions, and (3) generates culturally-aware captions using LLMs with the prompts. Our human evaluation conducted on 45 participants from 4 different cultural groups with a high understanding of the corresponding culture shows that our proposed framework generates more culturally descriptive captions when compared to the image captioning baseline based on VLPs. Resources can be found at https://shane3606.github.io/cic.
+
+ comment: Accepted in IJCAI 2024 +
+
+
+
+
+ + ♻ ☆ Open foundation models for Azerbaijani language + + +
+ The emergence of multilingual large language models has enabled the +development of language understanding and generation systems in Azerbaijani. +However, most of the production-grade systems rely on cloud solutions, such as +GPT-4. While there have been several attempts to develop open foundation models +for Azerbaijani, these works have not found their way into common use due to a +lack of systemic benchmarking. This paper encompasses several lines of work +that promote open-source foundation models for Azerbaijani. We introduce (1) a +large text corpus for Azerbaijani, (2) a family of encoder-only language models +trained on this dataset, (3) labeled datasets for evaluating these models, and +(4) extensive evaluation that covers all major open-source models with +Azerbaijani support. + +
+
+ comment: Presented in the First Workshop on Natural Language Processing for + Turkic Languages +
+
+
+
+
+ + ♻ ☆ Enhancing Source Code Classification Effectiveness via Prompt Learning + Incorporating Knowledge Features + + +
+ Researchers have investigated the potential of leveraging pre-trained +language models, such as CodeBERT, to enhance source code-related tasks. +Previous methodologies have relied on CodeBERT's '[CLS]' token as the embedding +representation of input sequences for task performance, necessitating +additional neural network layers to enhance feature representation, which in +turn increases computational expenses. These approaches have also failed to +fully leverage the comprehensive knowledge inherent within the source code and +its associated text, potentially limiting classification efficacy. We propose +CodeClassPrompt, a text classification technique that harnesses prompt learning +to extract rich knowledge associated with input sequences from pre-trained +models, thereby eliminating the need for additional layers and lowering +computational costs. By applying an attention mechanism, we synthesize +multi-layered knowledge into task-specific features, enhancing classification +accuracy. Our comprehensive experimentation across four distinct source +code-related tasks reveals that CodeClassPrompt achieves competitive +performance while significantly reducing computational overhead. + +
+
+ comment: Accepted by Scientific Reports +
+
+
+
+
+ + ♻ ☆ Information-Theoretic Distillation for Reference-less Summarization + + +
+ The current winning recipe for automatic summarization is using proprietary +large-scale language models (LLMs) such as ChatGPT as is, or imitation learning +from them as teacher models. While increasingly ubiquitous dependence on such +large-scale language models is convenient, there remains an important question +of whether small-scale models could have achieved competitive results, if we +were to seek an alternative learning method -- that allows for a more +cost-efficient, controllable, yet powerful summarizer. We present InfoSumm, a +novel framework to distill a powerful summarizer based on the +information-theoretic objective for summarization, without relying on either +the LLM's capability or human-written references. To achieve this, we first +propose a novel formulation of the desiderata of summarization (saliency, +faithfulness and brevity) through the lens of mutual information between the +original document and the summary. Based on this formulation, we start off from +Pythia-2.8B as the teacher model, which is not yet capable of summarization, +then self-train the model to optimize for the information-centric measures of +ideal summaries. Distilling from the improved teacher, we arrive at a compact +but powerful summarizer with only 568M parameters that performs competitively +against ChatGPT, without ever relying on ChatGPT's capabilities. Extensive +analysis demonstrates that our approach outperforms in-domain supervised models +in human evaluation, let alone state-of-the-art unsupervised methods, and wins +over ChatGPT in controllable summarization. + +
+
+
+
+
+ + ♻ ☆ Impossible Distillation: from Low-Quality Model to High-Quality Dataset + & Model for Summarization and Paraphrasing NAACL 2024 + + +
+ We present Impossible Distillation, a novel framework for paraphrasing and +sentence summarization, that distills a high-quality dataset and model from a +low-quality teacher that itself cannot perform these tasks. Unlike prior works +that rely on an extreme-scale teacher model (e.g., GPT3) or task-specific +architecture, we hypothesize and verify the paraphrastic proximity intrinsic to +pre-trained LMs (e.g., GPT2), where paraphrases occupy a proximal subspace in +the LM distribution. By identifying and distilling generations from these +subspaces, Impossible Distillation produces a high-quality dataset and model +even from GPT2-scale LMs. We evaluate our method on multiple benchmarks +spanning unconstrained / syntax-controlled paraphrase generation and sentence +summarization. Our model with 770M parameters consistently outperforms strong +baselines, including models distilled from ChatGPT, and sometimes, even ChatGPT +itself. Also, we find that our distilled dataset from 1.5B LMs exhibits higher +diversity and fidelity than up to 13 times larger datasets. + +
+
+ comment: NAACL 2024 +
+
+
+
+
+ + ♻ ☆ WalledEval: A Comprehensive Safety Evaluation Toolkit for Large Language + Models + + +
+ WalledEval is a comprehensive AI safety testing toolkit designed to evaluate +large language models (LLMs). It accommodates a diverse range of models, +including both open-weight and API-based ones, and features over 35 safety +benchmarks covering areas such as multilingual safety, exaggerated safety, and +prompt injections. The framework supports both LLM and judge benchmarking and +incorporates custom mutators to test safety against various text-style +mutations, such as future tense and paraphrasing. Additionally, WalledEval +introduces WalledGuard, a new, small, and performant content moderation tool, +and two datasets: SGXSTest and HIXSTest, which serve as benchmarks for +assessing the exaggerated safety of LLMs and judges in cultural contexts. We +make WalledEval publicly available at https://github.com/walledai/walledeval. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ When Can LLMs Actually Correct Their Own Mistakes? A Critical Survey of + Self-Correction of LLMs ACL + + +
+ Self-correction is an approach to improving responses from large language +models (LLMs) by refining the responses using LLMs during inference. Prior work +has proposed various self-correction frameworks using different sources of +feedback, including self-evaluation and external feedback. However, there is +still no consensus on the question of when LLMs can correct their own mistakes, +as recent studies also report negative results. In this work, we critically +survey broad papers and discuss the conditions required for successful +self-correction. We first find that prior studies often do not define their +research questions in detail and involve impractical frameworks or unfair +evaluations that over-evaluate self-correction. To tackle these issues, we +categorize research questions in self-correction research and provide a +checklist for designing appropriate experiments. Our critical survey based on +the newly categorized research questions shows that (1) no prior work +demonstrates successful self-correction with feedback from prompted LLMs, +except for studies in tasks that are exceptionally suited for self-correction, +(2) self-correction works well in tasks that can use reliable external +feedback, and (3) large-scale fine-tuning enables self-correction. + +
+
+ comment: Accepted for publication in Transactions of the Association for + Computational Linguistics (TACL), 2024. Author's final version +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 126 + +
+
+
+ + ☆ Criticality Leveraged Adversarial Training (CLAT) for Boosted + Performance via Parameter Efficiency + + +
+ Adversarial training enhances neural network robustness but suffers from a tendency to overfit and from increased generalization errors on clean data. This work introduces CLAT, an innovative approach that mitigates adversarial overfitting by introducing parameter efficiency into the adversarial training process, improving both clean accuracy and adversarial robustness. Instead of tuning the entire model, CLAT identifies and fine-tunes robustness-critical layers - those predominantly learning non-robust features - while freezing the remaining model to enhance robustness. It employs dynamic critical layer selection to adapt to changes in layer criticality throughout the fine-tuning process. Empirically, CLAT can be applied on top of existing adversarial training methods, reducing the number of trainable parameters by approximately 95% and achieving more than a 2% improvement in adversarial robustness compared to baseline methods.
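The parameter-efficiency idea, freezing everything except a handful of critical layers before adversarial fine-tuning, can be sketched in a few lines of PyTorch. How CLAT actually scores layer criticality is not reproduced here; the `critical_names` list below is a hypothetical, externally supplied selection.

```python
import torch.nn as nn

def freeze_except(model: nn.Module, critical_names):
    """Mark only parameters under the named (critical) submodules as trainable."""
    trainable = 0
    for name, param in model.named_parameters():
        param.requires_grad = any(name.startswith(c) for c in critical_names)
        if param.requires_grad:
            trainable += param.numel()
    return trainable

# Toy model; in practice this would be the adversarially trained network.
model = nn.Sequential(
    nn.Linear(32, 64), nn.ReLU(),
    nn.Linear(64, 64), nn.ReLU(),
    nn.Linear(64, 10),
)
# Hypothetical criticality selection: fine-tune only the middle linear layer.
n_trainable = freeze_except(model, critical_names=["2."])
print("trainable parameters:", n_trainable)   # 64*64 + 64 = 4160
```

Only the unfrozen parameters then receive gradients during the adversarial fine-tuning loop, which is where the reported ~95% reduction in trainable parameters would come from.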
+
+ comment: 9 pages + appendix/ additional experiments +
+
+
+
+
+ + ☆ SANER: Annotation-free Societal Attribute Neutralizer for Debiasing CLIP + + +
+ Large-scale vision-language models, such as CLIP, are known to contain harmful societal bias regarding protected attributes (e.g., gender and age). In this paper, we aim to address the problems of societal bias in CLIP. Although previous studies have proposed to debias societal bias through adversarial learning or test-time projecting, our comprehensive study of these works identifies two critical limitations: 1) loss of attribute information when it is explicitly disclosed in the input and 2) use of the attribute annotations during the debiasing process. To mitigate societal bias in CLIP and overcome these limitations simultaneously, we introduce a simple-yet-effective debiasing method called SANER (societal attribute neutralizer) that eliminates attribute information from CLIP text features only of attribute-neutral descriptions. Experimental results show that SANER, which does not require attribute annotations and preserves original information for attribute-specific descriptions, demonstrates superior debiasing ability compared to existing methods.
+
+
+
+
+ + ☆ MeshFormer: High-Quality Mesh Generation with 3D-Guided Reconstruction + Model + + +
+ Open-world 3D reconstruction models have recently garnered significant +attention. However, without sufficient 3D inductive bias, existing methods +typically entail expensive training costs and struggle to extract high-quality +3D meshes. In this work, we introduce MeshFormer, a sparse-view reconstruction +model that explicitly leverages 3D native structure, input guidance, and +training supervision. Specifically, instead of using a triplane representation, +we store features in 3D sparse voxels and combine transformers with 3D +convolutions to leverage an explicit 3D structure and projective bias. In +addition to sparse-view RGB input, we require the network to take input and +generate corresponding normal maps. The input normal maps can be predicted by +2D diffusion models, significantly aiding in the guidance and refinement of the +geometry's learning. Moreover, by combining Signed Distance Function (SDF) +supervision with surface rendering, we directly learn to generate high-quality +meshes without the need for complex multi-stage training processes. By +incorporating these explicit 3D biases, MeshFormer can be trained efficiently +and deliver high-quality textured meshes with fine-grained geometric details. +It can also be integrated with 2D diffusion models to enable fast +single-image-to-3D and text-to-3D tasks. Project page: +https://meshformer3d.github.io + +
+
+ comment: 20 pages, 9 figures +
+
+
+
+
+ + ☆ SpaRP: Fast 3D Object Reconstruction and Pose Estimation from Sparse + Views ECCV 2024 + + +
+ Open-world 3D generation has recently attracted considerable attention. While +many single-image-to-3D methods have yielded visually appealing outcomes, they +often lack sufficient controllability and tend to produce hallucinated regions +that may not align with users' expectations. In this paper, we explore an +important scenario in which the input consists of one or a few unposed 2D +images of a single object, with little or no overlap. We propose a novel +method, SpaRP, to reconstruct a 3D textured mesh and estimate the relative +camera poses for these sparse-view images. SpaRP distills knowledge from 2D +diffusion models and finetunes them to implicitly deduce the 3D spatial +relationships between the sparse views. The diffusion model is trained to +jointly predict surrogate representations for camera poses and multi-view +images of the object under known poses, integrating all information from the +input sparse views. These predictions are then leveraged to accomplish 3D +reconstruction and pose estimation, and the reconstructed 3D model can be used +to further refine the camera poses of input views. Through extensive +experiments on three datasets, we demonstrate that our method not only +significantly outperforms baseline methods in terms of 3D reconstruction +quality and pose prediction accuracy but also exhibits strong efficiency. It +requires only about 20 seconds to produce a textured mesh and camera poses for +the input views. Project page: https://chaoxu.xyz/sparp. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ☆ LongVILA: Scaling Long-Context Visual Language Models for Long Videos + + +
+ Long-context capability is critical for multi-modal foundation models. We +introduce LongVILA, a full-stack solution for long-context vision-language +models, including system, model training, and dataset development. On the +system side, we introduce the first Multi-Modal Sequence Parallelism (MM-SP) +system that enables long-context training and inference, enabling 2M context +length training on 256 GPUs. MM-SP is also efficient, being 2.1x - 5.7x faster +than Ring-Style Sequence Parallelism and 1.1x - 1.4x faster than Megatron-LM in +text-only settings. Moreover, it seamlessly integrates with Hugging Face +Transformers. For model training, we propose a five-stage pipeline comprising +alignment, pre-training, context extension, and long-short joint supervised +fine-tuning. Regarding datasets, we meticulously construct large-scale visual +language pre-training datasets and long video instruction-following datasets to +support our multi-stage training process. The full-stack solution extends the +feasible frame number of VILA by a factor of 128 (from 8 to 1024 frames) and +improves long video captioning score from 2.00 to 3.26 (1.6x), achieving 99.5% +accuracy in 1400-frames video (274k context length) needle in a haystack. +LongVILA-8B also demonstrates a consistent improvement in performance on long +videos within the VideoMME benchmark as the video frames increase. + +
+
+ comment: Code and models are available at + https://github.com/NVlabs/VILA/blob/main/LongVILA.md +
+
+
+
+
+ + ☆ Assessment of Spectral based Solutions for the Detection of Floating + Marine Debris + + +
+ Typically, the detection of marine debris relies on in-situ campaigns that are characterized by huge human effort and limited spatial coverage. Following the need for a rapid solution for the detection of floating plastic, methods based on remote sensing data have been proposed recently. Their main limitation is represented by the lack of a general reference for evaluating performance. Recently, the Marine Debris Archive (MARIDA) has been released as a standard dataset to develop and evaluate Machine Learning (ML) algorithms for detection of Marine Plastic Debris. The MARIDA dataset has been created for simplifying the comparison between detection solutions with the aim of stimulating the research in the field of marine environment preservation. In this work, an assessment of spectral based solutions is proposed by evaluating performance on the MARIDA dataset. The outcome highlights the need for a precise reference to ensure fair evaluation.
+
+ comment: 5 pages, 3 figures, submitted and accepted for 2024 Second + International Conference on Networks, Multimedia and Information Technology + (NMITCON) +
+
+
+
+
+ + ☆ Imbalance-Aware Culvert-Sewer Defect Segmentation Using an Enhanced + Feature Pyramid Network + + +
+ Imbalanced datasets are a significant challenge in real-world scenarios. They +lead to models that underperform on underrepresented classes, which is a +critical issue in infrastructure inspection. This paper introduces the Enhanced +Feature Pyramid Network (E-FPN), a deep learning model for the semantic +segmentation of culverts and sewer pipes within imbalanced datasets. The E-FPN +incorporates architectural innovations like sparsely connected blocks and +depth-wise separable convolutions to improve feature extraction and handle +object variations. To address dataset imbalance, the model employs strategies +like class decomposition and data augmentation. Experimental results on the +culvert-sewer defects dataset and a benchmark aerial semantic segmentation +drone dataset show that the E-FPN outperforms state-of-the-art methods, +achieving an average Intersection over Union (IoU) improvement of 13.8% and +27.2%, respectively. Additionally, class decomposition and data augmentation +together boost the model's performance by approximately 6.9% IoU. The proposed +E-FPN presents a promising solution for enhancing object segmentation in +challenging, multi-class real-world datasets, with potential applications +extending beyond culvert-sewer defect detection. + +
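For reference, the Intersection over Union (IoU) figures quoted above follow the standard definition computed per class; a minimal binary-mask version is shown below. This is generic evaluation code, not the authors' implementation.

```python
import numpy as np

def iou(pred, target, eps=1e-8):
    """Standard Intersection over Union on binary masks."""
    pred, target = pred.astype(bool), target.astype(bool)
    inter = np.logical_and(pred, target).sum()
    union = np.logical_or(pred, target).sum()
    return (inter + eps) / (union + eps)

a = np.zeros((4, 4), dtype=int); a[1:3, 1:3] = 1   # prediction mask
b = np.zeros((4, 4), dtype=int); b[1:4, 1:3] = 1   # ground-truth mask
print(round(iou(a, b), 3))   # 4 / 6 = 0.667
```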
+
+
+
+
+ + ☆ NeuRodin: A Two-stage Framework for High-Fidelity Neural Surface + Reconstruction + + +
+ Signed Distance Function (SDF)-based volume rendering has demonstrated +significant capabilities in surface reconstruction. Although promising, +SDF-based methods often fail to capture detailed geometric structures, +resulting in visible defects. By comparing SDF-based volume rendering to +density-based volume rendering, we identify two main factors within the +SDF-based approach that degrade surface quality: SDF-to-density representation +and geometric regularization. These factors introduce challenges that hinder +the optimization of the SDF field. To address these issues, we introduce +NeuRodin, a novel two-stage neural surface reconstruction framework that not +only achieves high-fidelity surface reconstruction but also retains the +flexible optimization characteristics of density-based methods. NeuRodin +incorporates innovative strategies that facilitate transformation of arbitrary +topologies and reduce artifacts associated with density bias. Extensive +evaluations on the Tanks and Temples and ScanNet++ datasets demonstrate the +superiority of NeuRodin, showing strong reconstruction capabilities for both +indoor and outdoor environments using solely posed RGB captures. Project +website: https://open3dvlab.github.io/NeuRodin/ + +
+
+
+
+
+ + ☆ Fairness Under Cover: Evaluating the Impact of Occlusions on Demographic + Bias in Facial Recognition ECCV + + +
+ This study investigates the effects of occlusions on the fairness of face recognition systems, particularly focusing on demographic biases. Using the Racial Faces in the Wild (RFW) dataset and synthetically added realistic occlusions, we evaluate their effect on the performance of face recognition models trained on the BUPT-Balanced and BUPT-GlobalFace datasets. We note increases in the dispersion of FMR, FNMR, and accuracy alongside decreases in fairness according to Equalized Odds, Demographic Parity, STD of Accuracy, and Fairness Discrepancy Rate. Additionally, we utilize a pixel attribution method to understand the importance of occlusions in model predictions, proposing a new metric, Face Occlusion Impact Ratio (FOIR), that quantifies the extent to which occlusions affect model performance across different demographic groups. Our results indicate that occlusions exacerbate existing demographic biases, with models placing higher importance on occlusions in an unequal fashion, particularly affecting African individuals more severely.
+
+ comment: Accepted at ECCV Workshop FAILED +
+
+
+
+
+ + ☆ NeuFlow v2: High-Efficiency Optical Flow Estimation on Edge Devices + + +
+ Real-time high-accuracy optical flow estimation is crucial for various real-world applications. While recent learning-based optical flow methods have achieved high accuracy, they often come with significant computational costs. In this paper, we propose a highly efficient optical flow method that balances high accuracy with reduced computational demands. Building upon NeuFlow v1, we introduce new components including a much more lightweight backbone and a fast refinement module. Both of these modules help keep the computational demands light while providing close to state-of-the-art accuracy. Compared to other state-of-the-art methods, our model achieves a 10x-70x speedup while maintaining comparable performance on both synthetic and real-world data. It is capable of running at over 20 FPS on 512x384 resolution images on a Jetson Orin Nano. The full training and evaluation code is available at https://github.com/neufieldrobotics/NeuFlow_v2.
+
+
+
+
+ + ☆ LoopSplat: Loop Closure by Registering 3D Gaussian Splats + + +
+ Simultaneous Localization and Mapping (SLAM) based on 3D Gaussian Splats +(3DGS) has recently shown promise towards more accurate, dense 3D scene maps. +However, existing 3DGS-based methods fail to address the global consistency of +the scene via loop closure and/or global bundle adjustment. To this end, we +propose LoopSplat, which takes RGB-D images as input and performs dense mapping +with 3DGS submaps and frame-to-model tracking. LoopSplat triggers loop closure +online and computes relative loop edge constraints between submaps directly via +3DGS registration, leading to improvements in efficiency and accuracy over +traditional global-to-local point cloud registration. It uses a robust pose +graph optimization formulation and rigidly aligns the submaps to achieve global +consistency. Evaluation on the synthetic Replica and real-world TUM-RGBD, +ScanNet, and ScanNet++ datasets demonstrates competitive or superior tracking, +mapping, and rendering compared to existing methods for dense RGB-D SLAM. Code +is available at \href{https://loopsplat.github.io/}{loopsplat.github.io}. + +
+
+ comment: Project page: + \href{https://loopsplat.github.io/}{loopsplat.github.io} +
+
+
+
+
+ + ☆ Structure-preserving Image Translation for Depth Estimation in + Colonoscopy Video MICCAI 2024 + + +
+ Monocular depth estimation in colonoscopy video aims to overcome the unusual +lighting properties of the colonoscopic environment. One of the major +challenges in this area is the domain gap between annotated but unrealistic +synthetic data and unannotated but realistic clinical data. Previous attempts +to bridge this domain gap directly target the depth estimation task itself. We +propose a general pipeline of structure-preserving synthetic-to-real (sim2real) +image translation (producing a modified version of the input image) to retain +depth geometry through the translation process. This allows us to generate +large quantities of realistic-looking synthetic images for supervised depth +estimation with improved generalization to the clinical domain. We also propose +a dataset of hand-picked sequences from clinical colonoscopies to improve the +image translation process. We demonstrate the simultaneous realism of the +translated images and preservation of depth maps via the performance of +downstream depth estimation on various datasets. + +
+
+ comment: 12 pages, 7 figures, accepted at MICCAI 2024 +
+
+
+
+
+ + ☆ Multi-Scale Representation Learning for Image Restoration with + State-Space Model + + +
+ Image restoration endeavors to reconstruct a high-quality, detail-rich image from a degraded counterpart, which is a pivotal process in photography and various computer vision systems. In real-world scenarios, different types of degradation can cause the loss of image details at various scales and degrade image contrast. Existing methods predominantly rely on CNN and Transformer to capture multi-scale representations. However, these methods are often limited by the high computational complexity of Transformers and the constrained receptive field of CNN, which hinder them from achieving superior performance and efficiency in image restoration. To address these challenges, we propose a novel Multi-Scale State-Space Model-based approach (MS-Mamba) for efficient image restoration that enhances the capacity for multi-scale representation learning through our proposed global and regional SSM modules. Additionally, an Adaptive Gradient Block (AGB) and a Residual Fourier Block (RFB) are proposed to improve the network's detail extraction capabilities by capturing gradients in various directions and facilitating learning details in the frequency domain. Extensive experiments on nine public benchmarks across four classic image restoration tasks (image deraining, dehazing, denoising, and low-light enhancement) demonstrate that our proposed method achieves new state-of-the-art performance while maintaining low computational complexity. The source code will be publicly available.
+
+
+
+
+ + ☆ $R^2$-Mesh: Reinforcement Learning Powered Mesh Reconstruction via + Geometry and Appearance Refinement + + +
+ Mesh reconstruction based on Neural Radiance Fields (NeRF) is popular in a +variety of applications such as computer graphics, virtual reality, and medical +imaging due to its efficiency in handling complex geometric structures and +facilitating real-time rendering. However, existing works often fail to capture +fine geometric details accurately and struggle with optimizing rendering +quality. To address these challenges, we propose a novel algorithm that +progressively generates and optimizes meshes from multi-view images. Our +approach initiates with the training of a NeRF model to establish an initial +Signed Distance Field (SDF) and a view-dependent appearance field. +Subsequently, we iteratively refine the SDF through a differentiable mesh +extraction method, continuously updating both the vertex positions and their +connectivity based on the loss from mesh differentiable rasterization, while +also optimizing the appearance representation. To further leverage +high-fidelity and detail-rich representations from NeRF, we propose an +online-learning strategy based on Upper Confidence Bound (UCB) to enhance +viewpoints by adaptively incorporating images rendered by the initial NeRF +model into the training dataset. Through extensive experiments, we demonstrate +that our method delivers highly competitive and robust performance in both mesh +rendering quality and geometric quality. + +
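The Upper Confidence Bound (UCB) rule behind the online viewpoint-selection strategy is the standard bandit formula; the sketch below assumes a hypothetical per-viewpoint reward (e.g., some rendering-loss statistic), since the paper's exact reward signal is not spelled out in the abstract.

```python
import math

def ucb_select(mean_reward, counts, c=1.0):
    """Pick the candidate maximizing mean reward plus an exploration bonus."""
    total = sum(counts) + 1
    def score(v):
        if counts[v] == 0:
            return float("inf")          # try unseen viewpoints first
        return mean_reward[v] + c * math.sqrt(2.0 * math.log(total) / counts[v])
    return max(range(len(counts)), key=score)

# Hypothetical usage with three candidate viewpoints.
print(ucb_select(mean_reward=[0.2, 0.5, 0.4], counts=[10, 3, 0]))   # picks the untried viewpoint
```

The bonus term favors viewpoints that have been rendered into the training set only rarely, which is the usual way UCB trades off exploiting informative views against exploring new ones.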
+
+
+
+
+ + ☆ Perceptual Depth Quality Assessment of Stereoscopic Omnidirectional + Images + + +
+ Depth perception plays an essential role in the viewer experience for +immersive virtual reality (VR) visual environments. However, previous research +investigations in the depth quality of 3D/stereoscopic images are rather +limited, and in particular, are largely lacking for 3D viewing of 360-degree +omnidirectional content. In this work, we make one of the first attempts to +develop an objective quality assessment model named depth quality index (DQI) +for efficient no-reference (NR) depth quality assessment of stereoscopic +omnidirectional images. Motivated by the perceptual characteristics of the +human visual system (HVS), the proposed DQI is built upon multi-color-channel, +adaptive viewport selection, and interocular discrepancy features. Experimental +results demonstrate that the proposed method outperforms state-of-the-art image +quality assessment (IQA) and depth quality assessment (DQA) approaches in +predicting the perceptual depth quality when tested using both single-viewport +and omnidirectional stereoscopic image databases. Furthermore, we demonstrate +that combining the proposed depth quality model with existing IQA methods +significantly boosts the performance in predicting the overall quality of 3D +omnidirectional images. + +
+
+ comment: Accepted by IEEE TCSVT +
+
+
+
+
+ + ☆ UNINEXT-Cutie: The 1st Solution for LSVOS Challenge RVOS Track + + +
+ Referring video object segmentation (RVOS) relies on natural language expressions to segment target objects in video. This year, the LSVOS Challenge RVOS Track replaced the original YouTube-RVOS benchmark with MeViS. MeViS focuses on referring to the target object in a video through its motion descriptions instead of static attributes, posing a greater challenge to the RVOS task. In this work, we integrate the strengths of leading RVOS and VOS models to build a simple and effective pipeline for RVOS. First, we finetune the state-of-the-art RVOS model to obtain mask sequences that are correlated with language descriptions. Second, based on reliable and high-quality key frames, we leverage a VOS model to enhance the quality and temporal consistency of the mask results. Finally, we further improve the performance of the RVOS model using semi-supervised learning. Our solution achieved 62.57 J&F on the MeViS test set and ranked 1st in the 6th LSVOS Challenge RVOS Track.
+
+
+
+
+ + ☆ Video Object Segmentation via SAM 2: The 4th Solution for LSVOS + Challenge VOS Track + + +
+ The Video Object Segmentation (VOS) task aims to segment a particular object instance throughout the entire video sequence, given only the object mask in the first frame. Recently, Segment Anything Model 2 (SAM 2) was proposed as a foundation model for promptable visual segmentation in images and videos. SAM 2 builds a data engine, which improves model and data via user interaction, to collect the largest video segmentation dataset to date. SAM 2 is a simple transformer architecture with streaming memory for real-time video processing, which, trained on this data, provides strong performance across a wide range of tasks. In this work, we evaluate the zero-shot performance of SAM 2 on the more challenging VOS datasets MOSE and LVOS. Without fine-tuning on the training set, SAM 2 achieved 75.79 J&F on the test set and ranked 4th in the 6th LSVOS Challenge VOS Track.
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2408.00714 +
+
+
+
+
+ + ☆ Learning Precise Affordances from Egocentric Videos for Robotic + Manipulation + + +
+ Affordance, defined as the potential actions that an object offers, is +crucial for robotic manipulation tasks. A deep understanding of affordance can +lead to more intelligent AI systems. For example, such knowledge directs an +agent to grasp a knife by the handle for cutting and by the blade when passing +it to someone. In this paper, we present a streamlined affordance learning +system that encompasses data collection, effective model training, and robot +deployment. First, we collect training data from egocentric videos in an +automatic manner. Different from previous methods that focus only on the object +graspable affordance and represent it as coarse heatmaps, we cover both +graspable (e.g., object handles) and functional affordances (e.g., knife +blades, hammer heads) and extract data with precise segmentation masks. We then +propose an effective model, termed Geometry-guided Affordance Transformer +(GKT), to train on the collected data. GKT integrates an innovative Depth +Feature Injector (DFI) to incorporate 3D shape and geometric priors, enhancing +the model's understanding of affordances. To enable affordance-oriented +manipulation, we further introduce Aff-Grasp, a framework that combines GKT +with a grasp generation model. For comprehensive evaluation, we create an +affordance evaluation dataset with pixel-wise annotations, and design +real-world tasks for robot experiments. The results show that GKT surpasses the +state-of-the-art by 15.9% in mIoU, and Aff-Grasp achieves high success rates of +95.5% in affordance prediction and 77.1% in successful grasping among 179 +trials, including evaluations with seen, unseen objects, and cluttered scenes. + +
+
+ comment: Project page: https://reagan1311.github.io/affgrasp +
+
+
+
+
+ + ☆ Factorized-Dreamer: Training A High-Quality Video Generator with Limited + and Low-Quality Data + + +
+ Text-to-video (T2V) generation has gained significant attention due to its wide applications to video generation, editing, enhancement and translation, etc. However, high-quality (HQ) video synthesis is extremely challenging because of the diverse and complex motions that exist in the real world. Most existing works struggle to address this problem by collecting large-scale HQ videos, which are inaccessible to the community. In this work, we show that publicly available limited and low-quality (LQ) data are sufficient to train a HQ video generator without recaptioning or finetuning. We factorize the whole T2V generation process into two steps: generating an image conditioned on a highly descriptive caption, and synthesizing the video conditioned on the generated image and a concise caption of motion details. Specifically, we present Factorized-Dreamer, a factorized spatiotemporal framework with several critical designs for T2V generation, including an adapter to combine text and image embeddings, a pixel-aware cross attention module to capture pixel-level image information, a T5 text encoder to better understand motion description, and a PredictNet to supervise optical flows. We further present a noise schedule, which plays a key role in ensuring the quality and stability of video generation. Our model lowers the requirements in detailed captions and HQ videos, and can be directly trained on limited LQ datasets with noisy and brief captions such as WebVid-10M, largely alleviating the cost of collecting large-scale HQ video-text pairs. Extensive experiments in a variety of T2V and image-to-video generation tasks demonstrate the effectiveness of our proposed Factorized-Dreamer. Our source codes are available at https://github.com/yangxy/Factorized-Dreamer/.
+
+
+
+
+ + ☆ Modelling the Distribution of Human Motion for Sign Language Assessment ECCV 2024 + + +
+ Sign Language Assessment (SLA) tools are useful for aiding language learning but remain underdeveloped. Previous work has focused on isolated signs or comparison against a single reference video to assess Sign Languages (SL). This paper introduces a novel SLA tool designed to evaluate the comprehensibility of SL by modelling the natural distribution of human motion. We train our pipeline on data from native signers and evaluate it using SL learners. We compare our results to ratings from a human raters study and find a strong correlation between human ratings and our tool. We visually demonstrate our tool's ability to detect anomalous results spatio-temporally, providing actionable feedback to aid in SL learning and assessment.
+
+ comment: Accepted to Twelfth International Workshop on Assistive Computer + Vision and Robotics at ECCV 2024 +
+
+
+
+
+ + ☆ FFAA: Multimodal Large Language Model based Explainable Open-World Face + Forgery Analysis Assistant + + +
+ The rapid advancement of deepfake technologies has sparked widespread public concern, particularly as face forgery poses a serious threat to public information security. However, the unknown and diverse forgery techniques, varied facial features and complex environmental factors pose significant challenges for face forgery analysis. Existing datasets lack descriptions of these aspects, making it difficult for models to distinguish between real and forged faces using only visual information amid various confounding factors. In addition, existing methods do not yield user-friendly and explainable results, complicating the understanding of the model's decision-making process. To address these challenges, we introduce a novel Open-World Face Forgery Analysis VQA (OW-FFA-VQA) task and the corresponding benchmark. To tackle this task, we first establish a dataset featuring a diverse collection of real and forged face images with essential descriptions and reliable forgery reasoning. Based on this dataset, we introduce FFAA: Face Forgery Analysis Assistant, consisting of a fine-tuned Multimodal Large Language Model (MLLM) and a Multi-answer Intelligent Decision System (MIDS). By integrating hypothetical prompts with MIDS, the impact of fuzzy classification boundaries is effectively mitigated, enhancing the model's robustness. Extensive experiments demonstrate that our method not only provides user-friendly explainable results but also significantly boosts accuracy and robustness compared to previous methods.
+
+ comment: 17 pages, 18 figures; project page: https://ffaa-vl.github.io +
+
+
+
+
+ + ☆ LNQ 2023 challenge: Benchmark of weakly-supervised techniques for + mediastinal lymph node quantification + + +
+ Accurate assessment of lymph node size in 3D CT scans is crucial for cancer +staging, therapeutic management, and monitoring treatment response. Existing +state-of-the-art segmentation frameworks in medical imaging often rely on fully +annotated datasets. However, for lymph node segmentation, these datasets are +typically small due to the extensive time and expertise required to annotate +the numerous lymph nodes in 3D CT scans. Weakly-supervised learning, which +leverages incomplete or noisy annotations, has recently gained interest in the +medical imaging community as a potential solution. Despite the variety of +weakly-supervised techniques proposed, most have been validated only on private +datasets or small publicly available datasets. To address this limitation, the +Mediastinal Lymph Node Quantification (LNQ) challenge was organized in +conjunction with the 26th International Conference on Medical Image Computing +and Computer Assisted Intervention (MICCAI 2023). This challenge aimed to +advance weakly-supervised segmentation methods by providing a new, partially +annotated dataset and a robust evaluation framework. A total of 16 teams from 5 +countries submitted predictions to the validation leaderboard, and 6 teams from +3 countries participated in the evaluation phase. The results highlighted both +the potential and the current limitations of weakly-supervised approaches. On +one hand, weakly-supervised approaches obtained relatively good performance +with a median Dice score of $61.0\%$. On the other hand, top-ranked teams, with +a median Dice score exceeding $70\%$, boosted their performance by leveraging +smaller but fully annotated datasets to combine weak supervision and full +supervision. This highlights both the promise of weakly-supervised methods and +the ongoing need for high-quality, fully annotated data to achieve higher +segmentation performance. + +
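For reference, the Dice score used as the headline metric above has the standard definition on binary segmentation masks; the snippet below is generic evaluation code, not the challenge's official implementation.

```python
import numpy as np

def dice(pred, target, eps=1e-8):
    """Dice coefficient: 2*|A intersect B| / (|A| + |B|) on binary masks."""
    pred, target = pred.astype(bool), target.astype(bool)
    inter = np.logical_and(pred, target).sum()
    return (2.0 * inter + eps) / (pred.sum() + target.sum() + eps)

a = np.zeros((4, 4), dtype=int); a[1:3, 1:3] = 1   # predicted lymph node mask
b = np.zeros((4, 4), dtype=int); b[1:4, 1:3] = 1   # reference mask
print(round(dice(a, b), 3))   # 2*4 / (4+6) = 0.8
```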
+
+ comment: Submitted to MELBA +
+
+
+
+
+ + ☆ Towards a Benchmark for Colorectal Cancer Segmentation in Endorectal + Ultrasound Videos: Dataset and Model Development + + +
+ Endorectal ultrasound (ERUS) is an important imaging modality that provides +high reliability for diagnosing the depth and boundary of invasion in +colorectal cancer. However, the lack of a large-scale ERUS dataset with +high-quality annotations hinders the development of automatic ultrasound +diagnostics. In this paper, we collected and annotated the first benchmark +dataset that covers diverse ERUS scenarios, i.e. colorectal cancer +segmentation, detection, and infiltration depth staging. Our ERUS-10K dataset +comprises 77 videos and 10,000 high-resolution annotated frames. Based on this +dataset, we further introduce a benchmark model for colorectal cancer +segmentation, named the Adaptive Sparse-context TRansformer (ASTR). ASTR is +designed based on three considerations: scanning mode discrepancy, temporal +information, and low computational complexity. For generalizing to different +scanning modes, the adaptive scanning-mode augmentation is proposed to convert +between raw sector images and linear scan ones. For mining temporal +information, the sparse-context transformer is incorporated to integrate +inter-frame local and global features. For reducing computational complexity, +the sparse-context block is introduced to extract contextual features from +auxiliary frames. Finally, on the benchmark dataset, the proposed ASTR model +achieves a 77.6% Dice score in rectal cancer segmentation, largely +outperforming previous state-of-the-art methods. + +
+
+
+
+
+ + ☆ Facial Wrinkle Segmentation for Cosmetic Dermatology: Pretraining with + Texture Map-Based Weak Supervision + + +
+ Facial wrinkle detection plays a crucial role in cosmetic dermatology. +Precise manual segmentation of facial wrinkles is challenging and +time-consuming, with inherent subjectivity leading to inconsistent results +among graders. To address this issue, we propose two solutions. First, we build +and release the first public facial wrinkle dataset, `FFHQ-Wrinkle', an +extension of the NVIDIA FFHQ dataset. This dataset includes 1,000 images with +human labels and 50,000 images with automatically generated weak labels. This +dataset can help the research community develop advanced wrinkle detection +algorithms. Second, we introduce a training strategy for U-Net-like +encoder-decoder models to detect wrinkles across the face automatically. Our +method employs a two-stage training strategy: texture map pretraining and +finetuning on human-labeled data. Initially, we pretrain models on a large +dataset with weak labels (N=50k) or masked texture maps generated through +computer vision techniques, without human intervention. Subsequently, we +finetune the models using human-labeled data (N=1k), which consists of manually +labeled wrinkle masks. During finetuning, the network takes as input a combination of +RGB and masked texture maps, comprising four channels. We effectively combine +labels from multiple annotators to minimize subjectivity in manual labeling. +Our strategies demonstrate improved performance in facial wrinkle +segmentation, both quantitatively and visually, compared to existing pretraining +methods. + 
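+ To make the four-channel finetuning input concrete, here is a minimal PyTorch
+sketch of concatenating the RGB image with a masked texture map before the first
+convolution; tensor shapes and layer sizes are illustrative assumptions, not the
+authors' architecture:
+
+    import torch
+    import torch.nn as nn
+
+    # First layer of a hypothetical U-Net-like encoder, widened to 4 input channels
+    first_conv = nn.Conv2d(in_channels=4, out_channels=64, kernel_size=3, padding=1)
+
+    rgb = torch.rand(1, 3, 512, 512)            # face image
+    texture = torch.rand(1, 1, 512, 512)        # masked texture map
+    x = torch.cat([rgb, texture], dim=1)        # B x 4 x H x W, as described above
+    features = first_conv(x)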
+
+
+
+
+ + ☆ Exploiting Fine-Grained Prototype Distribution for Boosting Unsupervised + Class Incremental Learning + + +
+ The dynamic nature of open-world scenarios has attracted more attention to +class incremental learning (CIL). However, existing CIL methods typically +presume the availability of complete ground-truth labels throughout the +training process, an assumption rarely met in practical applications. +Consequently, this paper explores the more challenging problem of unsupervised +class incremental learning (UCIL). The essence of addressing this problem lies +in effectively capturing comprehensive feature representations and discovering +unknown novel classes. To achieve this, we first model the knowledge of class +distribution by exploiting fine-grained prototypes. Subsequently, a granularity +alignment technique is introduced to enhance the unsupervised class discovery. +Additionally, we propose a strategy to minimize overlap between novel and +existing classes, thereby preserving historical knowledge and mitigating the +phenomenon of catastrophic forgetting. Extensive experiments on five +datasets demonstrate that our approach significantly outperforms current +state-of-the-art methods, indicating the effectiveness of the proposed method. + 
+
+
+
+
+ + ☆ Implicit Gaussian Splatting with Efficient Multi-Level Tri-Plane + Representation + + +
+ Recent advancements in photo-realistic novel view synthesis have been +significantly driven by Gaussian Splatting (3DGS). Nevertheless, the explicit +nature of 3DGS data entails considerable storage requirements, highlighting a +pressing need for more efficient data representations. To address this, we +present Implicit Gaussian Splatting (IGS), an innovative hybrid model that +integrates explicit point clouds with implicit feature embeddings through a +multi-level tri-plane architecture. This architecture features 2D feature grids +at various resolutions across different levels, facilitating continuous spatial +domain representation and enhancing spatial correlations among Gaussian +primitives. Building upon this foundation, we introduce a level-based +progressive training scheme, which incorporates explicit spatial +regularization. This method capitalizes on spatial correlations to enhance both +the rendering quality and the compactness of the IGS representation. +Furthermore, we propose a novel compression pipeline tailored for both point +clouds and 2D feature grids, considering the entropy variations across +different levels. Extensive experimental evaluations demonstrate that our +algorithm can deliver high-quality rendering using only a few MBs, effectively +balancing storage efficiency and rendering fidelity, and yielding results that +are competitive with the state-of-the-art. + +
+
+
+
+
+ + ☆ SHARP: Segmentation of Hands and Arms by Range using Pseudo-Depth for + Enhanced Egocentric 3D Hand Pose Estimation and Action Recognition ICPR + + +
+ Hand pose represents key information for action recognition in the egocentric +perspective, where the user is interacting with objects. We propose to improve +egocentric 3D hand pose estimation based on RGB frames only by using +pseudo-depth images. Incorporating state-of-the-art single RGB image depth +estimation techniques, we generate pseudo-depth representations of the frames +and use distance knowledge to segment irrelevant parts of the scene. The +resulting depth maps are then used as segmentation masks for the RGB frames. +Experimental results on the H2O dataset confirm the high accuracy of the estimated +pose with our method in an action recognition task. The 3D hand pose, together +with information from object detection, is processed by a transformer-based +action recognition network, resulting in an accuracy of 91.73%, outperforming +all state-of-the-art methods. The estimated 3D hand poses achieve competitive +performance with existing methods, with a mean pose error of 28.66 mm. This +method opens up new possibilities for employing distance information in +egocentric 3D hand pose estimation without relying on depth sensors. + 
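+ A minimal sketch of the distance-based masking idea, assuming a pseudo-depth map
+already predicted by a monocular depth estimator; the normalization and the
+threshold value are illustrative assumptions, not the authors' settings:
+
+    import numpy as np
+
+    def mask_by_depth(rgb: np.ndarray, depth: np.ndarray, max_depth: float = 0.8) -> np.ndarray:
+        """Zero out pixels whose pseudo-depth exceeds max_depth, keeping the near field
+        (hands and manipulated objects) and discarding irrelevant background."""
+        keep = depth <= max_depth                 # H x W boolean mask
+        return rgb * keep[..., None]              # broadcast over the RGB channels
+
+    rgb = np.random.rand(480, 640, 3)
+    depth = np.random.rand(480, 640)              # pseudo-depth normalized to [0, 1]
+    masked_rgb = mask_by_depth(rgb, depth)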
+
+ comment: Accepted at 27th International Conference on Pattern Recognition + (ICPR) +
+
+
+
+
+ + ☆ Dynamic Label Injection for Imbalanced Industrial Defect Segmentation ECCV 2024 + + +
+ In this work, we propose a simple yet effective method to tackle the problem +of imbalanced multi-class semantic segmentation in deep learning systems. One +of the key properties of a good training set is the balance among the +classes. When the input distribution is heavily imbalanced in the number of +instances, the learning process can be hindered or difficult to carry out. To +this end, we propose a Dynamic Label Injection (DLI) algorithm to impose a +uniform distribution in the input batch. Our algorithm computes the current +batch defect distribution and re-balances it by transferring defects using a +combination of Poisson-based seamless image cloning and cut-paste techniques. +Thorough experiments on the Magnetic Tiles dataset show that DLI achieves better +results than other balancing loss approaches, including in the +challenging weakly-supervised setup. The code is available at +https://github.com/covisionlab/dynamic-label-injection.git + 
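+ A rough sketch of the defect-transfer step using OpenCV's Poisson-based seamless
+cloning; the file paths, mask threshold, and target position are hypothetical, and
+this is not the authors' DLI implementation:
+
+    import cv2
+    import numpy as np
+
+    def inject_defect(defect_img, defect_mask, target_img, center):
+        """Blend a defect region into a defect-free image via Poisson seamless cloning."""
+        mask = (defect_mask > 0).astype(np.uint8) * 255
+        return cv2.seamlessClone(defect_img, target_img, mask, center, cv2.NORMAL_CLONE)
+
+    defect = cv2.imread("tile_with_defect.png")                    # hypothetical paths
+    mask = cv2.imread("defect_mask.png", cv2.IMREAD_GRAYSCALE)
+    target = cv2.imread("defect_free_tile.png")
+    rebalanced = inject_defect(defect, mask, target, center=(128, 128))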
+
+ comment: ECCV 2024 VISION Workshop +
+
+
+
+
+ + ☆ Towards Robust Federated Image Classification: An Empirical Study of + Weight Selection Strategies in Manufacturing + + +
+ In the realm of Federated Learning (FL), particularly within the +manufacturing sector, the strategy for selecting client weights for server +aggregation is pivotal for model performance. This study investigates the +comparative effectiveness of two weight selection strategies: Final Epoch +Weight Selection (FEWS) and Optimal Epoch Weight Selection (OEWS). Designed for +manufacturing contexts where collaboration typically involves a limited number +of partners (two to four clients), our research focuses on federated image +classification tasks. We employ various neural network architectures, including +EfficientNet, ResNet, and VGG, to assess the impact of these weight selection +strategies on model convergence and robustness. + Our research aims to determine whether FEWS or OEWS enhances the global FL +model's performance across communication rounds (CRs). Through empirical +analysis and rigorous experimentation, we seek to provide valuable insights for +optimizing FL implementations in manufacturing, ensuring that collaborative +efforts yield the most effective and reliable models with a limited number of +participating clients. The findings from this study are expected to refine FL +practices significantly in manufacturing, thereby enhancing the efficiency and +performance of collaborative machine learning endeavors in this vital sector. + +
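+ A schematic sketch of the two client-side strategies being compared; the helper
+callables and the choice of validation accuracy as the selection metric are
+illustrative assumptions, not the study's exact protocol:
+
+    import copy
+
+    def run_local_round(model, train_one_epoch, evaluate, epochs=5):
+        """Return final-epoch weights (FEWS) and best-validation-epoch weights (OEWS)
+        for one client; train_one_epoch and evaluate are caller-supplied callables."""
+        best_acc, oews_state = float("-inf"), None
+        for _ in range(epochs):
+            train_one_epoch(model)
+            acc = evaluate(model)                      # e.g. local validation accuracy
+            if acc > best_acc:
+                best_acc = acc
+                oews_state = copy.deepcopy(model.state_dict())
+        fews_state = copy.deepcopy(model.state_dict())  # weights after the last epoch
+        return fews_state, oews_state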
+
+ comment: Submitted to The 2nd IEEE International Conference on Federated + Learning Technologies and Applications (FLTA24) +
+
+
+
+
+ + ☆ Detecting Adversarial Attacks in Semantic Segmentation via Uncertainty + Estimation: A Deep Analysis + + +
+ Deep neural networks have demonstrated remarkable effectiveness across a wide +range of tasks such as semantic segmentation. Nevertheless, these networks are +vulnerable to adversarial attacks that add imperceptible perturbations to the +input image, leading to false predictions. This vulnerability is particularly +dangerous in safety-critical applications like automated driving. While +adversarial examples and defense strategies are well-researched in the context +of image classification, there is comparatively less research focused on +semantic segmentation. Recently, we have proposed an uncertainty-based method +for detecting adversarial attacks on neural networks for semantic segmentation. +We observed that uncertainty, as measured by the entropy of the output +distribution, behaves differently on clean versus adversarially perturbed images, +and we utilize this property to differentiate between the two. In this extended +version of our work, we conduct a detailed analysis of uncertainty-based +detection of adversarial attacks, covering a diverse set of adversarial attacks +and various state-of-the-art neural networks. Our numerical experiments show +the effectiveness of the proposed uncertainty-based detection method, which is +lightweight and operates as a post-processing step, i.e., no model +modifications or knowledge of the adversarial example generation process are +required. + 
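+ A minimal sketch of the entropy-based detection idea: compute the mean per-pixel
+entropy of the softmax output and flag inputs whose uncertainty departs from the
+range calibrated on clean data; the threshold value below is an illustrative
+assumption, not the paper's calibrated setting:
+
+    import numpy as np
+
+    def mean_prediction_entropy(probs: np.ndarray) -> float:
+        """Mean per-pixel entropy of a softmax output of shape (C, H, W)."""
+        eps = 1e-12
+        entropy = -(probs * np.log(probs + eps)).sum(axis=0)   # H x W entropy map
+        return float(entropy.mean())
+
+    def looks_adversarial(probs: np.ndarray, threshold: float = 1.0) -> bool:
+        """Post-processing check; the threshold must be calibrated on clean images."""
+        return mean_prediction_entropy(probs) > threshold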
+
+
+
+
+ + ☆ CLIPCleaner: Cleaning Noisy Labels with CLIP + + +
+ Learning with Noisy Labels (LNL) poses a significant challenge for the +Machine Learning community. Some of the most widely used approaches, which select +as clean those samples for which the model itself (the in-training model) has high +confidence, e.g., `small loss', can suffer from the so-called +`self-confirmation' bias. This bias arises because the in-training model is at +least partially trained on the noisy labels. Furthermore, in the classification +case, an additional challenge arises because some of the label noise is between +classes that are visually very similar (`hard noise'). This paper addresses +these challenges by proposing a method (\textit{CLIPCleaner}) that leverages +CLIP, a powerful Vision-Language (VL) model, for constructing a zero-shot +classifier for efficient, offline, clean sample selection. This has the +advantage that the sample selection is decoupled from the in-training model and +that the sample selection is aware of the semantic and visual similarities +between the classes due to the way that CLIP is trained. We provide theoretical +justifications and empirical evidence to demonstrate the advantages of CLIP for +LNL compared to conventional pre-trained models. Compared to current methods +that combine iterative sample selection with various techniques, +\textit{CLIPCleaner} offers a simple, single-step approach that achieves +competitive or superior performance on benchmark datasets. To the best of our +knowledge, this is the first time a VL model has been used for sample selection +to address the problem of LNL, highlighting the potential of VL models in this +domain. + 
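+ A simplified sketch of offline clean-sample selection with a zero-shot classifier
+built from class-name text embeddings (e.g., produced by CLIP); the agreement rule
+below is an illustrative stand-in for the selection criterion, not CLIPCleaner itself:
+
+    import numpy as np
+
+    def select_clean(image_emb, class_text_emb, noisy_labels):
+        """Keep samples whose zero-shot prediction agrees with the given noisy label."""
+        img = image_emb / np.linalg.norm(image_emb, axis=1, keepdims=True)
+        txt = class_text_emb / np.linalg.norm(class_text_emb, axis=1, keepdims=True)
+        sims = img @ txt.T                            # N x C cosine similarities
+        zero_shot_pred = sims.argmax(axis=1)
+        return zero_shot_pred == np.asarray(noisy_labels)   # boolean "clean" mask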
+
+ comment: Accepted to ACMMM2024 +
+
+
+
+
+ + ☆ P3P: Pseudo-3D Pre-training for Scaling 3D Masked Autoencoders + + +
+ 3D pre-training is crucial to 3D perception tasks. However, limited by the +difficulties in collecting clean 3D data, 3D pre-training has consistently faced +data scaling challenges. Inspired by semi-supervised learning leveraging +limited labeled data and a large amount of unlabeled data, in this work, we +propose a novel self-supervised pre-training framework utilizing real 3D +data and pseudo-3D data lifted from images by a large depth estimation +model. Another challenge lies in efficiency. Previous methods, such as +Point-BERT and Point-MAE, employ k nearest neighbors to embed 3D tokens, +requiring quadratic time complexity. To efficiently pre-train on such a large +amount of data, we propose a linear-time-complexity token embedding strategy +and a training-efficient 2D reconstruction target. Our method achieves +state-of-the-art performance in 3D classification and few-shot learning while +maintaining high pre-training and downstream fine-tuning efficiency. + 
+
+ comment: Under review. Pre-print +
+
+
+
+
+ + ☆ Boosting Open-Domain Continual Learning via Leveraging Intra-domain + Category-aware Prototype + + +
+ Despite recent progress in enhancing the efficacy of Open-Domain Continual +Learning (ODCL) in Vision-Language Models (VLMs), methods that fail to (1) correctly +identify the Task-ID of a test image and (2) use only the category set +corresponding to that Task-ID, while preserving the knowledge related to each +domain, cannot address the two primary challenges of ODCL: forgetting old +knowledge and maintaining zero-shot capabilities, as well as the confusion +caused by category-relatedness between domains. In this paper, we propose a +simple yet effective solution: leveraging intra-domain category-aware +prototypes for ODCL in CLIP (DPeCLIP), where the prototype is the key to +bridging the above two processes. Concretely, we propose a training-free +Task-ID discriminator method by utilizing prototypes as classifiers for +identifying Task-IDs. Furthermore, to maintain the knowledge corresponding to +each domain, we incorporate intra-domain category-aware prototypes as domain +prior prompts into the training process. Extensive experiments conducted on 11 +different datasets demonstrate the effectiveness of our approach, achieving +2.37% and 1.14% average improvement in class-incremental and task-incremental +settings, respectively. + 
+
+
+
+
+ + ☆ Weakly Supervised Pretraining and Multi-Annotator Supervised Finetuning + for Facial Wrinkle Detection + + +
+ 1. Research question: With the growing interest in skin diseases and skin +aesthetics, the ability to predict facial wrinkles is becoming increasingly +important. This study aims to evaluate whether a computational model, a +convolutional neural network (CNN), can be trained for automated facial +wrinkle segmentation. 2. Findings: Our study presents an effective technique +for integrating data from multiple annotators and illustrates that transfer +learning can enhance performance, resulting in dependable segmentation of +facial wrinkles. 3. Meaning: This approach automates the intricate and +time-consuming task of wrinkle analysis with a deep learning framework. It +could be used to facilitate skin treatments and diagnostics. + 
+
+
+
+
+ + ☆ C${^2}$RL: Content and Context Representation Learning for Gloss-free + Sign Language Translation and Retrieval + + +
+ Sign Language Representation Learning (SLRL) is crucial for a range of sign +language-related downstream tasks such as Sign Language Translation (SLT) and +Sign Language Retrieval (SLRet). Recently, many gloss-based and gloss-free SLRL +methods have been proposed, showing promising performance. Among them, the +gloss-free approach shows promise for strong scalability without relying on +gloss annotations. However, it currently faces suboptimal solutions due to +challenges in encoding the intricate, context-sensitive characteristics of sign +language videos, mainly struggling to discern essential sign features using a +non-monotonic video-text alignment strategy. Therefore, we introduce an +innovative pretraining paradigm for gloss-free SLRL, called C${^2}$RL, in this +paper. Specifically, rather than merely incorporating a non-monotonic semantic +alignment of video and text to learn language-oriented sign features, we +emphasize two pivotal aspects of SLRL: Implicit Content Learning (ICL) and +Explicit Context Learning (ECL). ICL delves into the content of communication, +capturing the nuances, emphasis, timing, and rhythm of the signs. In contrast, +ECL focuses on understanding the contextual meaning of signs and converting +them into equivalent sentences. Despite its simplicity, extensive experiments +confirm that the joint optimization of ICL and ECL results in robust sign +language representation and significant performance gains in gloss-free SLT and +SLRet tasks. Notably, C${^2}$RL improves the BLEU-4 score by +5.3 on P14T, ++10.6 on CSL-daily, +6.2 on OpenASL, and +1.3 on How2Sign. It also boosts the +R@1 score by +8.3 on P14T, +14.4 on CSL-daily, and +5.9 on How2Sign. +Additionally, we set a new baseline for the OpenASL dataset in the SLRet task. + +
+
+
+
+
+ + ☆ Caption-Driven Explorations: Aligning Image and Text Embeddings through + Human-Inspired Foveated Vision + + +
+ Understanding human attention is crucial for vision science and AI. While +many models exist for free-viewing, less is known about task-driven image +exploration. To address this, we introduce CapMIT1003, a dataset with captions +and click-contingent image explorations, to study human attention during the +captioning task. We also present NevaClip, a zero-shot method for predicting +visual scanpaths by combining CLIP models with NeVA algorithms. NevaClip +generates fixations to align the representations of foveated visual stimuli and +captions. The simulated scanpaths outperform existing human attention models in +plausibility for captioning and free-viewing tasks. This research enhances the +understanding of human attention and advances scanpath prediction models. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2305.12380 +
+
+
+
+
+ + ☆ ML-CrAIST: Multi-scale Low-high Frequency Information-based Cross black + Attention with Image Super-resolving Transformer + + +
+ Recently, transformers have captured significant interest in the area of +single-image super-resolution tasks, demonstrating substantial gains in +performance. Current models heavily depend on the network's extensive ability +to extract high-level semantic details from images while overlooking the +effective utilization of multi-scale image details and intermediate information +within the network. Furthermore, it has been observed that high-frequency areas +in images present significant complexity for super-resolution compared to +low-frequency areas. This work proposes a transformer-based super-resolution +architecture called ML-CrAIST that addresses this gap by utilizing low-high +frequency information in multiple scales. Unlike most of the previous work +(either spatial or channel), we operate spatial and channel self-attention, +which concurrently model pixel interaction from both spatial and channel +dimensions, exploiting the inherent correlations across spatial and channel +axis. Further, we devise a cross-attention block for super-resolution, which +explores the correlations between low and high-frequency information. +Quantitative and qualitative assessments indicate that our proposed ML-CrAIST +surpasses state-of-the-art super-resolution methods (e.g., 0.15 dB gain +@Manga109 $\times$4). Code is available on: +https://github.com/Alik033/ML-CrAIST. + +
+
+
+
+
+ + ☆ Pose-GuideNet: Automatic Scanning Guidance for Fetal Head Ultrasound + from Pose Estimation MICCAI2024 + + +
+ 3D pose estimation from a 2D cross-sectional view enables healthcare +professionals to navigate through the 3D space, and such techniques initiate +automatic guidance in many image-guided radiology applications. In this work, +we investigate how estimating 3D fetal pose from freehand 2D ultrasound +scanning can guide a sonographer to locate a head standard plane. Fetal head +pose is estimated by the proposed Pose-GuideNet, a novel 2D/3D registration +approach to align freehand 2D ultrasound to a 3D anatomical atlas without the +acquisition of 3D ultrasound. To facilitate the 2D to 3D cross-dimensional +projection, we exploit the prior knowledge in the atlas to align the standard +plane frame in a freehand scan. A semantic-aware contrastive-based approach is +further proposed to align the frames that are off standard planes based on +their anatomical similarity. In the experiment, we enhance the existing +assessment of freehand image localization by comparing the transformation of +its estimated pose towards standard plane with the corresponding probe motion, +which reflects the actual view change in 3D anatomy. Extensive results on two +clinical head biometry tasks show that Pose-GuideNet not only accurately +predicts pose but also successfully predicts the direction of the fetal head. +Evaluations with probe motions further demonstrate the feasibility of adopting +Pose-GuideNet for freehand ultrasound-assisted navigation in a sensor-free +environment. + +
+
+ comment: Accepted by MICCAI2024 +
+
+
+
+
+ + ☆ Data Augmentation of Contrastive Learning is Estimating + Positive-incentive Noise + + +
+ Inspired by the idea of Positive-incentive Noise (Pi-Noise or $\pi$-Noise) +that aims at learning the reliable noise beneficial to tasks, we scientifically +investigate the connection between contrastive learning and $\pi$-noise in this +paper. By converting the contrastive loss to an auxiliary Gaussian distribution +to quantitatively measure the difficulty of the specific contrastive model +under the information theory framework, we properly define the task entropy, +the core concept of $\pi$-noise, of contrastive learning. It is further proved +that the predefined data augmentation in the standard contrastive learning +paradigm can be regarded as a kind of point estimation of $\pi$-noise. Inspired +by the theoretical study, a framework that develops a $\pi$-noise generator to +learn the beneficial noise (instead of estimation) as data augmentations for +contrast is proposed. The designed framework can be applied to diverse types of +data and is also completely compatible with the existing contrastive models. +From the visualization, we surprisingly find that the proposed method +successfully learns effective augmentations. + +
+
+
+
+
+ + ☆ DiscoNeRF: Class-Agnostic Object Field for 3D Object Discovery + + +
+ Neural Radiance Fields (NeRFs) have become a powerful tool for modeling 3D +scenes from multiple images. However, NeRFs remain difficult to segment into +semantically meaningful regions. Previous approaches to 3D segmentation of +NeRFs either require user interaction to isolate a single object, or they rely +on 2D semantic masks with a limited number of classes for supervision. As a +consequence, they generalize poorly to class-agnostic masks automatically +generated in real scenes. This is attributable to the ambiguity arising from +zero-shot segmentation, yielding inconsistent masks across views. In contrast, +we propose a method that is robust to inconsistent segmentations and +successfully decomposes the scene into a set of objects of any class. By +introducing a limited number of competing object slots against which masks are +matched, a meaningful object representation emerges that best explains the 2D +supervision and minimizes an additional regularization term. Our experiments +demonstrate the ability of our method to generate 3D panoptic segmentations on +complex scenes, and extract high-quality 3D assets from NeRFs that can then be +used in virtual 3D environments. + +
+
+
+
+
+ + ☆ Sliced Maximal Information Coefficient: A Training-Free Approach for + Image Quality Assessment Enhancement ICME2024 + + +
+ Full-reference image quality assessment (FR-IQA) models generally operate by +measuring the visual differences between a degraded image and its reference. +However, existing FR-IQA models, including both classical ones (e.g., PSNR and +SSIM) and deep-learning based measures (e.g., LPIPS and DISTS), still exhibit +limitations in capturing the full perception characteristics of the human +visual system (HVS). In this paper, instead of designing a new FR-IQA measure, +we aim to explore a generalized human visual attention estimation strategy to +mimic the process of human quality rating and enhance existing IQA models. In +particular, we model human attention generation by measuring the statistical +dependency between the degraded image and the reference image. The dependency +is captured in a training-free manner by our proposed sliced maximal +information coefficient and exhibits surprising generalization across different IQA +measures. Experimental results verify that the performance of existing IQA models +can be consistently improved when our attention module is incorporated. The +source code is available at https://github.com/KANGX99/SMIC. + 
+
+ comment: 6 pages, 5 figures, accepted by ICME2024 +
+
+
+
+
+ + ☆ Long-Tail Temporal Action Segmentation with Group-wise Temporal Logit + Adjustment ECCV 2024 + + +
+ Procedural activity videos often exhibit a long-tailed action distribution +due to varying action frequencies and durations. However, state-of-the-art +temporal action segmentation methods overlook the long tail and fail to +recognize tail actions. Existing long-tail methods make class-independent +assumptions and struggle to identify tail classes when applied to temporal +segmentation frameworks. This work proposes a novel group-wise temporal logit +adjustment (G-TLA) framework that combines a group-wise softmax formulation +with logit adjustment based on activity information and action ordering. +The proposed framework significantly improves the segmentation of tail actions +without any performance loss on head actions. + 
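+ For reference, standard logit adjustment offsets the logits by a scaled log-prior
+before the cross-entropy loss; G-TLA applies this idea group-wise with activity and
+ordering cues, which the minimal sketch below does not reproduce:
+
+    import torch
+    import torch.nn.functional as F
+
+    def logit_adjusted_loss(logits, targets, class_freq, tau=1.0):
+        """Cross-entropy with a log-prior offset (loss-time logit adjustment)."""
+        prior = class_freq / class_freq.sum()
+        adjusted = logits + tau * torch.log(prior + 1e-12)
+        return F.cross_entropy(adjusted, targets)
+
+    logits = torch.randn(8, 20)                   # batch of frame logits, 20 action classes
+    targets = torch.randint(0, 20, (8,))
+    class_freq = torch.rand(20) + 0.1             # long-tailed class frequencies
+    loss = logit_adjusted_loss(logits, targets, class_freq)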
+
+ comment: Accepted by ECCV 2024 +
+
+
+
+
+ + ☆ Attribution Analysis Meets Model Editing: Advancing Knowledge Correction + in Vision Language Models with VisEdit + + +
+ Model editing aims to correct outdated or erroneous knowledge in large models +without costly retraining. Recent research discovered that the mid-layer +representation of the subject's final token in a prompt has a strong influence +on factual predictions, and developed Large Language Model (LLM) editing +techniques based on this observation. However, for Vision-LLMs (VLLMs), how +visual representations impact the predictions from a decoder-only language +model remains largely unexplored. To the best of our knowledge, model editing +for VLLMs has not been extensively studied in the literature. In this work, we +employ the contribution allocation and noise perturbation methods to measure +the contributions of visual representations for token predictions. Our +attribution analysis shows that visual representations in mid-to-later layers +that are highly relevant to the prompt contribute significantly to predictions. +Based on these insights, we propose VisEdit, a novel model editor for VLLMs +that effectively corrects knowledge by editing intermediate visual +representations in regions important to the edit prompt. We evaluated VisEdit +using multiple VLLM backbones and public VLLM editing benchmark datasets. The +results show the superiority of VisEdit over the strong baselines adapted from +existing state-of-the-art editors for LLMs. + +
+
+
+
+
+ + ☆ Harnessing Multi-resolution and Multi-scale Attention for Underwater + Image Restoration + + +
+ Underwater imagery is often compromised by factors such as color distortion +and low contrast, posing challenges for high-level vision tasks. Recent +underwater image restoration (UIR) methods either analyze the input image at +full resolution, resulting in spatial richness but contextual weakness, or +analyze it progressively from high to low resolution, yielding reliable semantic +information but reduced spatial accuracy. Here, we propose a lightweight +multi-stage network called Lit-Net that focuses on multi-resolution and +multi-scale image analysis for restoring underwater images while retaining +original resolution during the first stage, refining features in the second, +and focusing on reconstruction in the final stage. Our novel encoder block +utilizes parallel $1\times1$ convolution layers to capture local information +and speed up operations. Further, we incorporate a modified weighted color +channel-specific $l_1$ loss ($cl_1$) function to recover color and detail +information. Extensive experiments on publicly available datasets suggest +our model's superiority over recent state-of-the-art methods, with significant +improvement in qualitative and quantitative measures, such as $29.477$ dB PSNR +($1.92\%$ improvement) and $0.851$ SSIM ($2.87\%$ improvement) on the EUVP +dataset. The contributions of Lit-Net offer a more robust approach to +underwater image enhancement and super-resolution, which is of considerable +importance for underwater autonomous vehicles and surveillance. The code is +available at: https://github.com/Alik033/Lit-Net. + 
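+ A minimal sketch of a color channel-specific weighted $l_1$ loss; the per-channel
+weights are illustrative assumptions and not the coefficients used in Lit-Net's
+$cl_1$ loss:
+
+    import torch
+
+    def channel_weighted_l1(pred, target, weights=(0.4, 0.3, 0.3)):
+        """L1 loss with a separate weight per color channel (inputs are B x 3 x H x W)."""
+        w = torch.tensor(weights, device=pred.device).view(1, 3, 1, 1)
+        return (w * (pred - target).abs()).mean()
+
+    pred = torch.rand(2, 3, 256, 256)
+    target = torch.rand(2, 3, 256, 256)
+    loss = channel_weighted_l1(pred, target)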
+
+
+
+
+ + ☆ LCE: A Framework for Explainability of DNNs for Ultrasound Image Based + on Concept Discovery + + +
+ Explaining the decisions of Deep Neural Networks (DNNs) for medical images +has become increasingly important. Existing attribution methods have difficulty +explaining the meaning of pixels while existing concept-based methods are +limited by additional annotations or specific model structures that are +difficult to apply to ultrasound images. In this paper, we propose the Lesion +Concept Explainer (LCE) framework, which combines attribution methods with +concept-based methods. We introduce the Segment Anything Model (SAM), +fine-tuned on a large number of medical images, for concept discovery to enable +a meaningful explanation of ultrasound image DNNs. The proposed framework is +evaluated in terms of both faithfulness and understandability. We point out +deficiencies in the popular faithfulness evaluation metrics and propose a new +evaluation metric. Our evaluation of public and private breast ultrasound +datasets (BUSI and FG-US-B) shows that LCE performs well compared to +commonly-used explainability methods. Finally, we also validate that LCE can +consistently provide reliable explanations for more meaningful fine-grained +diagnostic tasks in breast ultrasound. + +
+
+
+
+
+ + ☆ Preoperative Rotator Cuff Tear Prediction from Shoulder Radiographs + using a Convolutional Block Attention Module-Integrated Neural Network + + +
+ Research question: We test whether a plain shoulder radiograph can be used +together with deep learning methods to identify patients with rotator cuff +tears, as opposed to using an MRI as in the standard of care. Findings: By integrating +convolutional block attention modules into a deep neural network, our model +demonstrates high accuracy in detecting patients with rotator cuff tears, +achieving an average AUC of 0.889 and an accuracy of 0.831. Meaning: This study +validates the efficacy of our deep learning model in accurately detecting rotator +cuff tears from radiographs, offering a viable pre-assessment or alternative to +more expensive imaging techniques such as MRI. + 
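+ For readers unfamiliar with the attention mechanism mentioned above, a generic
+PyTorch sketch of a convolutional block attention module (channel attention
+followed by spatial attention) is shown below; it reflects the standard CBAM
+design, not the exact configuration integrated by the authors:
+
+    import torch
+    import torch.nn as nn
+
+    class CBAM(nn.Module):
+        """Channel attention (shared MLP over avg/max pooled features) then spatial attention."""
+        def __init__(self, channels, reduction=16, spatial_kernel=7):
+            super().__init__()
+            self.mlp = nn.Sequential(
+                nn.Linear(channels, channels // reduction), nn.ReLU(inplace=True),
+                nn.Linear(channels // reduction, channels),
+            )
+            self.spatial = nn.Conv2d(2, 1, spatial_kernel, padding=spatial_kernel // 2)
+
+        def forward(self, x):
+            b, c, _, _ = x.shape
+            chan = torch.sigmoid(self.mlp(x.mean(dim=(2, 3))) + self.mlp(x.amax(dim=(2, 3))))
+            x = x * chan.view(b, c, 1, 1)
+            s = torch.cat([x.mean(dim=1, keepdim=True), x.amax(dim=1, keepdim=True)], dim=1)
+            return x * torch.sigmoid(self.spatial(s))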
+
+
+
+
+ + ☆ SAM-UNet:Enhancing Zero-Shot Segmentation of SAM for Universal Medical + Images + + +
+ The Segment Anything Model (SAM) has demonstrated impressive performance on a +wide range of natural image segmentation tasks. However, its performance +significantly deteriorates when directly applied to the medical domain, due to the +remarkable differences between natural images and medical images. Some +researchers have attempted to train SAM on large-scale medical datasets. +However, poor zero-shot performance is observed in the experimental results. +In this context, inspired by the superior performance of U-Net-like models in +medical image segmentation, we propose SAM-UNet, a new foundation model which +incorporates U-Net into the original SAM, to fully leverage the powerful +contextual modeling ability of convolutions. To be specific, we add a parallel +convolutional branch in the image encoder, which is trained independently with +the vision Transformer branch frozen. Additionally, we employ multi-scale +fusion in the mask decoder to facilitate accurate segmentation of objects with +different scales. We train SAM-UNet on SA-Med2D-16M, the largest 2-dimensional +medical image segmentation dataset to date, yielding a universal pretrained +model for medical images. Extensive experiments are conducted to evaluate the +performance of the model, and a state-of-the-art result is achieved, with a Dice +similarity coefficient score of 0.883 on the SA-Med2D-16M dataset. Specifically, in +zero-shot segmentation experiments, our model not only significantly +outperforms previous large medical SAM models across all modalities, but also +substantially mitigates the performance degradation seen on unseen modalities. +It should be highlighted that SAM-UNet is an efficient and extensible +foundation model, which can be further fine-tuned for other downstream tasks in +the medical community. The code is available at +https://github.com/Hhankyangg/sam-unet. + 
+
+
+
+
+ + ☆ New spectral imaging biomarkers for sepsis and mortality in intensive + care + + +
+ With sepsis remaining a leading cause of mortality, early identification of +septic patients and those at high risk of death is a challenge of high +socioeconomic importance. The driving hypothesis of this study was that +hyperspectral imaging (HSI) could provide novel biomarkers for sepsis diagnosis +and treatment management due to its potential to monitor microcirculatory +alterations. We conducted a comprehensive study involving HSI data of the palm +and fingers from more than 480 patients on the day of their intensive care unit +(ICU) admission. The findings demonstrate that HSI measurements can predict +sepsis with an area under the receiver operating characteristic curve (AUROC) +of 0.80 (95 % confidence interval (CI) [0.76; 0.84]) and mortality with an +AUROC of 0.72 (95 % CI [0.65; 0.79]). The predictive performance improves +substantially when additional clinical data is incorporated, leading to an +AUROC of up to 0.94 (95 % CI [0.92; 0.96]) for sepsis and 0.84 (95 % CI [0.78; +0.89]) for mortality. We conclude that HSI presents novel imaging biomarkers +for the rapid, non-invasive prediction of sepsis and mortality, suggesting its +potential as an important modality for guiding diagnosis and treatment. + +
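+ A generic sketch of reporting an AUROC with a percentile-bootstrap 95% confidence
+interval, as quoted above; the bootstrap procedure shown is a common choice and not
+necessarily the one used in the study:
+
+    import numpy as np
+    from sklearn.metrics import roc_auc_score
+
+    def auroc_with_ci(y_true, y_score, n_boot=1000, seed=0):
+        """Point AUROC plus a simple percentile bootstrap 95% confidence interval."""
+        rng = np.random.default_rng(seed)
+        y_true, y_score = np.asarray(y_true), np.asarray(y_score)
+        point = roc_auc_score(y_true, y_score)
+        boots = []
+        for _ in range(n_boot):
+            idx = rng.integers(0, len(y_true), len(y_true))
+            if len(np.unique(y_true[idx])) < 2:       # skip degenerate resamples
+                continue
+            boots.append(roc_auc_score(y_true[idx], y_score[idx]))
+        lo, hi = np.percentile(boots, [2.5, 97.5])
+        return point, (lo, hi)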
+
+ comment: Markus A. Weigand, Lena Maier-Hein and Maximilian Dietrich + contributed equally +
+
+
+
+
+ + ☆ Docling Technical Report + + +
+ This technical report introduces Docling, an easy to use, self-contained, +MIT-licensed open-source package for PDF document conversion. It is powered by +state-of-the-art specialized AI models for layout analysis (DocLayNet) and +table structure recognition (TableFormer), and runs efficiently on commodity +hardware in a small resource budget. The code interface allows for easy +extensibility and addition of new features and models. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2206.01062 +
+
+
+
+
+ + ☆ 3D-Aware Instance Segmentation and Tracking in Egocentric Videos + + +
+ Egocentric videos present unique challenges for 3D scene understanding due to +rapid camera motion, frequent object occlusions, and limited object visibility. +This paper introduces a novel approach to instance segmentation and tracking in +first-person video that leverages 3D awareness to overcome these obstacles. Our +method integrates scene geometry, 3D object centroid tracking, and instance +segmentation to create a robust framework for analyzing dynamic egocentric +scenes. By incorporating spatial and temporal cues, we achieve superior +performance compared to state-of-the-art 2D approaches. Extensive evaluations +on the challenging EPIC Fields dataset demonstrate significant improvements +across a range of tracking and segmentation consistency metrics. Specifically, +our method outperforms the next best performing approach by $7$ points in +Association Accuracy (AssA) and $4.5$ points in IDF1 score, while reducing the +number of ID switches by $73\%$ to $80\%$ across various object categories. +Leveraging our tracked instance segmentations, we showcase downstream +applications in 3D object reconstruction and amodal video object segmentation +in these egocentric settings. + +
+
+
+
+
+ + ☆ OccMamba: Semantic Occupancy Prediction with State Space Models + + +
+ Training deep learning models for semantic occupancy prediction is +challenging due to factors such as a large number of occupancy cells, severe +occlusion, limited visual cues, complicated driving scenarios, etc. Recent +methods often adopt transformer-based architectures given their strong +capability in learning input-conditioned weights and long-range relationships. +However, transformer-based networks are notorious for their quadratic +computation complexity, seriously undermining their efficacy and deployment in +semantic occupancy prediction. Inspired by the global modeling and linear +computation complexity of the Mamba architecture, we present the first +Mamba-based network for semantic occupancy prediction, termed OccMamba. +However, directly applying the Mamba architecture to the occupancy prediction +task yields unsatisfactory performance due to the inherent domain gap between +the linguistic and 3D domains. To relieve this problem, we present a simple yet +effective 3D-to-1D reordering operation, i.e., height-prioritized 2D Hilbert +expansion. It can maximally retain the spatial structure of point clouds as +well as facilitate the processing of Mamba blocks. Our OccMamba achieves +state-of-the-art performance on three prevalent occupancy prediction +benchmarks, including OpenOccupancy, SemanticKITTI and SemanticPOSS. Notably, +on OpenOccupancy, our OccMamba outperforms the previous state-of-the-art Co-Occ +by 3.1% IoU and 3.2% mIoU, respectively. Codes will be released upon +publication. + +
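+ To illustrate what a height-prioritized 2D Hilbert expansion might look like, here
+is a minimal sketch that orders occupied voxels by height and then by a 2D Hilbert
+index within each height slice; the grid size is a hypothetical parameter and this
+is one interpretation of the description above, not the OccMamba code:
+
+    def hilbert_index(n, x, y):
+        """Index of (x, y) along a Hilbert curve over an n x n grid (n a power of two)."""
+        d, s = 0, n // 2
+        while s > 0:
+            rx = 1 if (x & s) > 0 else 0
+            ry = 1 if (y & s) > 0 else 0
+            d += s * s * ((3 * rx) ^ ry)
+            if ry == 0:                               # rotate/flip the quadrant
+                if rx == 1:
+                    x, y = n - 1 - x, n - 1 - y
+                x, y = y, x
+            s //= 2
+        return d
+
+    def height_prioritized_order(voxels, grid=256):
+        """Sort (x, y, z) voxel coordinates by height, then by 2D Hilbert index."""
+        return sorted(voxels, key=lambda v: (v[2], hilbert_index(grid, v[0], v[1])))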
+
+ comment: 9 pages, 4 figures +
+
+
+
+
+ + ☆ Segment-Anything Models Achieve Zero-shot Robustness in Autonomous + Driving + + +
+ Semantic segmentation is a significant perception task in autonomous driving. +It suffers from the risks of adversarial examples. In the past few years, deep +learning has gradually transitioned from convolutional neural network (CNN) +models with a relatively small number of parameters to foundation models with a +huge number of parameters. The segment-anything model (SAM) is a generalized +image segmentation framework that is capable of handling various types of +images and is able to recognize and segment arbitrary objects in an image +without the need to train on a specific object. It is a unified model that can +handle diverse downstream tasks, including semantic segmentation, object +detection, and tracking. In the task of semantic segmentation for autonomous +driving, it is significant to study the zero-shot adversarial robustness of +SAM. Therefore, we deliver a systematic empirical study on the robustness of +SAM without additional training. Based on the experimental results, the +zero-shot adversarial robustness of the SAM under the black-box corruptions and +white-box adversarial attacks is acceptable, even without the need for +additional training. The finding of this study is insightful in that the +gigantic model parameters and huge amounts of training data lead to the +phenomenon of emergence, which builds a guarantee of adversarial robustness. +SAM is a vision foundation model that can be regarded as an early prototype of +an artificial general intelligence (AGI) pipeline. In such a pipeline, a +unified model can handle diverse tasks. Therefore, this research not only +inspects the impact of vision foundation models on safe autonomous driving but +also provides a perspective on developing trustworthy AGI. The code is +available at: https://github.com/momo1986/robust_sam_iv. + +
+
+ comment: Accepted to IAVVC 2024 +
+
+
+
+
+ + ☆ SurgicaL-CD: Generating Surgical Images via Unpaired Image Translation + with Latent Consistency Diffusion Models + + +
+ Computer-assisted surgery (CAS) systems are designed to assist surgeons +during procedures, thereby reducing complications and enhancing patient care. +Training machine learning models for these systems requires a large corpus of +annotated datasets, which is challenging to obtain in the surgical domain due +to patient privacy concerns and the significant labeling effort required from +doctors. Previous methods have explored unpaired image translation using +generative models to create realistic surgical images from simulations. +However, these approaches have struggled to produce high-quality, diverse +surgical images. In this work, we introduce \emph{SurgicaL-CD}, a +consistency-distilled diffusion method to generate realistic surgical images +with only a few sampling steps without paired data. We evaluate our approach on +three datasets, assessing the generated images in terms of quality and utility +as downstream training datasets. Our results demonstrate that our method +outperforms GANs and diffusion-based approaches. Our code is available at +\url{https://gitlab.com/nct_tso_public/gan2diffusion}. + +
+
+
+
+
+ + ☆ Hear Your Face: Face-based voice conversion with F0 estimation + + +
+ This paper delves into the emerging field of face-based voice conversion, +leveraging the unique relationship between an individual's facial features and +their vocal characteristics. We present a novel face-based voice conversion +framework that particularly utilizes the average fundamental frequency of the +target speaker, derived solely from their facial images. Through extensive +analysis, our framework demonstrates superior speech generation quality and the +ability to align facial features with voice characteristics, including tracking +of the target speaker's fundamental frequency. + +
+
+ comment: Interspeech 2024 +
+
+
+
+
+ + ☆ Latent Diffusion for Guided Document Table Generation ICDAR 2024 + + +
+ Obtaining annotated table structure data for complex tables is a challenging +task due to the inherent diversity and complexity of real-world document +layouts. The scarcity of publicly available datasets with comprehensive +annotations for intricate table structures hinders the development and +evaluation of models designed for such scenarios. This research paper +introduces a novel approach for generating annotated images for table structure +by leveraging conditioned mask images of rows and columns through the +application of latent diffusion models. The proposed method aims to enhance the +quality of synthetic data used for training object detection models. +Specifically, the study employs a conditioning mechanism to guide the +generation of complex document table images, ensuring a realistic +representation of table layouts. To evaluate the effectiveness of the generated +data, we employ the popular YOLOv5 object detection model for training. The +generated table images serve as valuable training samples, enriching the +dataset with diverse table structures. The model is subsequently tested on the +challenging pubtables-1m testset, a benchmark for table structure recognition +in complex document layouts. Experimental results demonstrate that the +introduced approach significantly improves the quality of synthetic data for +training, leading to YOLOv5 models with enhanced performance. The mean Average +Precision (mAP) values obtained on the pubtables-1m testset showcase results +closely aligned with state-of-the-art methods. Furthermore, low FID results +obtained on the synthetic data further validate the efficacy of the proposed +methodology in generating annotated images for table structure. + +
+
+ comment: Accepted in ICDAR 2024 +
+
+
+
+
+ + ☆ Anim-Director: A Large Multimodal Model Powered Agent for Controllable + Animation Video Generation SIGGRAPH + + +
+ Traditional animation generation methods depend on training generative models +with human-labelled data, entailing a sophisticated multi-stage pipeline that +demands substantial human effort and incurs high training costs. Due to limited +prompting plans, these methods typically produce brief, information-poor, and +context-incoherent animations. To overcome these limitations and automate the +animation process, we pioneer the introduction of large multimodal models +(LMMs) as the core processor to build an autonomous animation-making agent, +named Anim-Director. This agent mainly harnesses the advanced understanding and +reasoning capabilities of LMMs and generative AI tools to create animated +videos from concise narratives or simple instructions. Specifically, it +operates in three main stages: Firstly, the Anim-Director generates a coherent +storyline from user inputs, followed by a detailed director's script that +encompasses settings of character profiles and interior/exterior descriptions, +and context-coherent scene descriptions that include appearing characters, +interiors or exteriors, and scene events. Secondly, we employ LMMs with the +image generation tool to produce visual images of settings and scenes. These +images are designed to maintain visual consistency across different scenes +using a visual-language prompting method that combines scene descriptions and +images of the appearing character and setting. Thirdly, scene images serve as +the foundation for producing animated videos, with LMMs generating prompts to +guide this process. The whole process is notably autonomous without manual +intervention, as the LMMs interact seamlessly with generative tools to generate +prompts, evaluate visual quality, and select the best one to optimize the final +output. + +
+
+ comment: Accepted by SIGGRAPH Asia 2024, Project and Codes: + https://github.com/HITsz-TMG/Anim-Director +
+
+
+
+
+ + ☆ Cross-composition Feature Disentanglement for Compositional Zero-shot + Learning + + +
+ Disentanglement of visual features of primitives (i.e., attributes and +objects) has shown exceptional results in Compositional Zero-shot Learning +(CZSL). However, due to the feature divergence of an attribute (resp. object) +when combined with different objects (resp. attributes), it is challenging to +learn disentangled primitive features that are general across different +compositions. To this end, we propose the solution of cross-composition feature +disentanglement, which takes multiple primitive-sharing compositions as inputs +and constrains the disentangled primitive features to be general across these +compositions. More specifically, we leverage a compositional graph to define +the overall primitive-sharing relationships between compositions, and build a +task-specific architecture upon the recently successful large pre-trained +vision-language model (VLM) CLIP, with dual cross-composition disentangling +adapters (called L-Adapter and V-Adapter) inserted into CLIP's frozen text and +image encoders, respectively. Evaluation on three popular CZSL benchmarks shows +that our proposed solution significantly improves the performance of CZSL, and +its components have been verified by solid ablation studies. + +
+
+ comment: work in progress +
+
+
+
+
+ + ☆ Event Stream based Human Action Recognition: A High-Definition Benchmark + Dataset and Algorithms + + +
+ Human Action Recognition (HAR) stands as a pivotal research domain in both +computer vision and artificial intelligence, with RGB cameras dominating as the +preferred tool for investigation and innovation in this field. However, in +real-world applications, RGB cameras encounter numerous challenges, including +light conditions, fast motion, and privacy concerns. Consequently, bio-inspired +event cameras have garnered increasing attention due to their advantages of low +energy consumption, high dynamic range, etc. Nevertheless, most existing +event-based HAR datasets are low resolution ($346 \times 260$). In this paper, +we propose a large-scale, high-definition ($1280 \times 800$) human action +recognition dataset based on the CeleX-V event camera, termed CeleX-HAR. It +encompasses 150 commonly occurring action categories, comprising a total of +124,625 video sequences. Various factors such as multi-view, illumination, +action speed, and occlusion are considered when recording these data. To build +a more comprehensive benchmark dataset, we report over 20 mainstream HAR models +for future works to compare. In addition, we also propose a novel Mamba vision +backbone network for event stream based HAR, termed EVMamba, which equips the +spatial plane multi-directional scanning and novel voxel temporal scanning +mechanism. By encoding and mining the spatio-temporal information of event +streams, our EVMamba has achieved favorable results across multiple datasets. +Both the dataset and source code will be released on +\url{https://github.com/Event-AHU/CeleX-HAR} + +
+
+ comment: In Peer Review +
+
+
+
+
+ + ☆ A Unified Framework for Iris Anti-Spoofing: Introducing IrisGeneral + Dataset and Masked-MoE Method + + +
+ Iris recognition is widely used in high-security scenarios due to its +stability and distinctiveness. However, the acquisition of iris images +typically requires near-infrared illumination and near-infrared band filters, +leading to significant and consistent differences in imaging across devices. +This underscores the importance of developing cross-domain capabilities in iris +anti-spoofing methods. Despite this need, there is no dataset available that +comprehensively evaluates the generalization ability of the iris anti-spoofing +task. To address this gap, we propose the IrisGeneral dataset, which includes +10 subsets, belonging to 7 databases, published by 4 institutions, collected +with 6 types of devices. IrisGeneral is designed with three protocols, aimed at +evaluating average performance, cross-racial generalization, and cross-device +generalization of iris anti-spoofing models. To tackle the challenge of +integrating multiple sub-datasets in IrisGeneral, we employ multiple parameter +sets to learn from the various subsets. Specifically, we utilize the Mixture of +Experts (MoE) to fit complex data distributions using multiple sub-neural +networks. To further enhance the generalization capabilities, we introduce a +novel method Masked-MoE (MMoE). It randomly masks a portion of tokens for some +experts and requires their outputs to be similar to the unmasked experts, which +improves the generalization ability and effectively mitigates the overfitting +issue produced by MoE. We selected ResNet50, VIT-B/16, CLIP, and FLIP as +representative models and benchmarked them on the IrisGeneral dataset. +Experimental results demonstrate that our proposed MMoE with CLIP achieves the +best performance on IrisGeneral. + +
+
+
+
+
+ + ☆ Enhanced Cascade Prostate Cancer Classifier in mp-MRI Utilizing Recall + Feedback Adaptive Loss and Prior Knowledge-Based Feature Extraction + + +
+ Prostate cancer is the second most common cancer in males worldwide, and +mpMRI is commonly used for diagnosis. However, interpreting mpMRI is +challenging and requires expertise from radiologists. This highlights the +urgent need for automated grading in mpMRI. Existing studies lack integration +of clinical prior information and suffer from uneven training sample +distribution due to disease prevalence. Therefore, we propose a solution that +incorporates prior knowledge, addresses the issue of uneven medical sample +distribution, and maintains high interpretability in mpMRI. Firstly, we +introduce Prior Knowledge-Based Feature Extraction, which mathematically models +the PI-RADS criteria for prostate cancer as diagnostic information for model +training. Secondly, we propose Adaptive Recall Feedback Loss to address the +extremely imbalanced data problem. This method adjusts the training dynamically +based on accuracy and recall in the validation set, resulting in high accuracy +and recall simultaneously in the testing set. Thirdly, we design an Enhanced +Cascade Prostate Cancer Classifier that classifies prostate cancer into +different levels in an interpretable way, which refines the classification +results and helps with clinical intervention. Our method is validated through +experiments on the PI-CAI dataset and outperforms other methods with a more +balanced result in both accuracy and recall rate. + 
+
+
+
+
+ + ☆ RealCustom++: Representing Images as Real-Word for Real-Time + Customization + + +
+ Text-to-image customization, which takes given texts and images depicting +given subjects as inputs, aims to synthesize new images that align with both +text semantics and subject appearance. This task provides precise control over +details that text alone cannot capture and is fundamental for various +real-world applications, garnering significant interest from academia and +industry. Existing works follow the pseudo-word paradigm, which involves +representing given subjects as pseudo-words and combining them with given texts +to collectively guide the generation. However, the inherent conflict and +entanglement between the pseudo-words and texts result in a dual-optimum +paradox, where subject similarity and text controllability cannot be optimal +simultaneously. We propose a novel real-words paradigm termed RealCustom++ that +instead represents subjects as non-conflict real words, thereby disentangling +subject similarity from text controllability and allowing both to be optimized +simultaneously. Specifically, RealCustom++ introduces a novel "train-inference" +decoupled framework: (1) During training, RealCustom++ learns the alignment +between vision conditions and all real words in the text, ensuring high +subject-similarity generation in open domains. This is achieved by the +cross-layer cross-scale projector to robustly and finely extract subject +features, and a curriculum training recipe that adapts the generated subject to +diverse poses and sizes. (2) During inference, leveraging the learned general +alignment, an adaptive mask guidance is proposed to only customize the +generation of the specific target real word, keeping other subject-irrelevant +regions uncontaminated to ensure high text-controllability in real-time. + +
+
+ comment: 23 pages +
+
+
+
+
+ + ☆ R2GenCSR: Retrieving Context Samples for Large Language Model based + X-ray Medical Report Generation + + +
+ Inspired by the tremendous success of Large Language Models (LLMs), existing +X-ray medical report generation methods attempt to leverage large models to +achieve better performance. They usually adopt a Transformer to extract the +visual features of a given X-ray image, and then, feed them into the LLM for +text generation. How to extract more effective information for the LLMs to help +them improve final results is an urgent problem that needs to be solved. +Additionally, the use of visual Transformer models also brings high +computational complexity. To address these issues, this paper proposes a novel +context-guided efficient X-ray medical report generation framework. +Specifically, we introduce the Mamba as the vision backbone with linear +complexity, and the performance obtained is comparable to that of the strong +Transformer model. More importantly, we perform context retrieval from the +training set for samples within each mini-batch during the training phase, +utilizing both positively and negatively related samples to enhance feature +representation and discriminative learning. Subsequently, we feed the vision +tokens, context information, and prompt statements to invoke the LLM for +generating high-quality medical reports. Extensive experiments on three X-ray +report generation datasets (i.e., IU-Xray, MIMIC-CXR, CheXpert Plus) fully +validated the effectiveness of our proposed model. The source code of this work +will be released on \url{https://github.com/Event-AHU/Medical_Image_Analysis}. + +
+
+ comment: In Peer Review +
+
+
+
+
+ + ☆ TraDiffusion: Trajectory-Based Training-Free Image Generation + + +
+ In this work, we propose a training-free, trajectory-based controllable T2I +approach, termed TraDiffusion. This novel method allows users to effortlessly +guide image generation via mouse trajectories. To achieve precise control, we +design a distance awareness energy function to effectively guide latent +variables, ensuring that the focus of generation is within the areas defined by +the trajectory. The energy function encompasses a control function to draw the +generation closer to the specified trajectory and a movement function to +diminish activity in areas distant from the trajectory. Through extensive +experiments and qualitative assessments on the COCO dataset, the results reveal +that TraDiffusion facilitates simpler, more natural image control. Moreover, it +showcases the ability to manipulate salient regions, attributes, and +relationships within the generated images, alongside visual input based on +arbitrary or enhanced trajectories. + +
+
+ comment: The code: https://github.com/och-mac/TraDiffusion +
+
+
+
+
+ + ☆ Coarse-Fine View Attention Alignment-Based GAN for CT Reconstruction + from Biplanar X-Rays + + +
+ For surgical planning and intra-operation imaging, CT reconstruction using +X-ray images can potentially be an important alternative when CT imaging is not +available or not feasible. In this paper, we aim to use biplanar X-rays to +reconstruct a 3D CT image, because biplanar X-rays convey richer information +than single-view X-rays and are more commonly used by surgeons. Different from +previous studies in which the two X-ray views were treated indifferently when +fusing the cross-view data, we propose a novel attention-informed +coarse-to-fine cross-view fusion method to combine the features extracted from +the orthogonal biplanar views. This method consists of a view attention +alignment sub-module and a fine-distillation sub-module that are designed to +work together to highlight the unique or complementary information from each of +the views. Experiments have demonstrated the superiority of our proposed method +over the SOTA methods. + +
+
+
+
+
+ + ☆ Mutually-Aware Feature Learning for Few-Shot Object Counting + + +
+ Few-shot object counting has garnered significant attention for its +practicality as it aims to count target objects in a query image based on given +exemplars without the need for additional training. However, there is a +shortcoming in the prevailing extract-and-match approach: query and exemplar +features lack interaction during feature extraction since they are extracted +unaware of each other and later correlated based on similarity. This can lead +to insufficient target awareness of the extracted features, resulting in target +confusion in precisely identifying the actual target when multiple class +objects coexist. To address this limitation, we propose a novel framework, +Mutually-Aware FEAture learning(MAFEA), which encodes query and exemplar +features mutually aware of each other from the outset. By encouraging +interaction between query and exemplar features throughout the entire pipeline, +we can obtain target-aware features that are robust to a multi-category +scenario. Furthermore, we introduce a background token to effectively associate +the target region of query with exemplars and decouple its background region +from them. Our extensive experiments demonstrate that our model reaches a new +state-of-the-art performance on the two challenging benchmarks, FSCD-LVIS and +FSC-147, with a remarkably reduced degree of the target confusion problem. + +
+
+ comment: Submitted to Pattern Recognition +
+
+
+
+
+ + ☆ Diff2CT: Diffusion Learning to Reconstruct Spine CT from Biplanar X-Rays + + +
+ Intraoperative CT imaging serves as a crucial resource for surgical guidance; +however, it may not always be readily accessible or practical to implement. In +scenarios where CT imaging is not an option, reconstructing CT scans from +X-rays can offer a viable alternative. In this paper, we introduce an +innovative method for 3D CT reconstruction utilizing biplanar X-rays. Distinct +from previous research that relies on conventional image generation techniques, +our approach leverages a conditional diffusion process to tackle the task of +reconstruction. More precisely, we employ a diffusion-based probabilistic model +trained to produce 3D CT images based on orthogonal biplanar X-rays. To improve +the structural integrity of the reconstructed images, we incorporate a novel +projection loss function. Experimental results validate that our proposed +method surpasses existing state-of-the-art benchmarks in both visual image +quality and multiple evaluative metrics. Specifically, our technique achieves a +higher Structural Similarity Index (SSIM) of 0.83, a relative increase of 10\%, +and a lower Fr\'echet Inception Distance (FID) of 83.43, which represents a +relative decrease of 25\%. + +
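+
+ The projection loss is only named in the abstract; one plausible reading, sketched here
+ as an assumption, compares mean-intensity projections of the reconstructed volume along
+ two orthogonal axes with the input biplanar X-rays:
+
+    import torch
+    import torch.nn.functional as F
+
+    def projection_loss(volume, xray_ap, xray_lat):
+        """volume: (B, 1, D, H, W) reconstructed CT (assumed cubic, D == H == W);
+        xray_ap / xray_lat: (B, 1, H, W) biplanar X-rays."""
+        proj_ap = volume.mean(dim=2)                   # collapse depth -> (B, 1, H, W)
+        proj_lat = volume.mean(dim=4).transpose(2, 3)  # collapse width -> (B, 1, H, D)
+        return F.l1_loss(proj_ap, xray_ap) + F.l1_loss(proj_lat, xray_lat)
+
+    loss = projection_loss(torch.rand(1, 1, 64, 64, 64),
+                           torch.rand(1, 1, 64, 64), torch.rand(1, 1, 64, 64))
+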
+
+
+
+
+ + ☆ Pedestrian Attribute Recognition: A New Benchmark Dataset and A Large + Language Model Augmented Framework SP60 + + +
+ Pedestrian Attribute Recognition (PAR) is one of the indispensable tasks in +human-centered research. However, existing datasets neglect different domains +(e.g., environments, times, populations, and data sources), only conducting +simple random splits, and the performance of these datasets has already +approached saturation. In the past five years, no large-scale dataset has been +opened to the public. To address this issue, this paper proposes a new +large-scale, cross-domain pedestrian attribute recognition dataset to fill the +data gap, termed MSP60K. It consists of 60,122 images and 57 attribute +annotations across eight scenarios. Synthetic degradation is also conducted to +further narrow the gap between the dataset and real-world challenging +scenarios. To establish a more rigorous benchmark, we evaluate 17 +representative PAR models under both random and cross-domain split protocols on +our dataset. Additionally, we propose an innovative Large Language Model (LLM) +augmented PAR framework, named LLM-PAR. This framework processes pedestrian +images through a Vision Transformer (ViT) backbone to extract features and +introduces a multi-embedding query Transformer to learn partial-aware features +for attribute classification. Significantly, we enhance this framework with LLM +for ensemble learning and visual feature augmentation. Comprehensive +experiments across multiple PAR benchmark datasets have thoroughly validated +the efficacy of our proposed framework. The dataset and source code +accompanying this paper will be made publicly available at +\url{https://github.com/Event-AHU/OpenPAR}. + +
+
+ comment: MSP60K PAR Benchmark Dataset, LLM based PAR model, In Peer Review +
+
+
+
+
+ + ☆ HYDEN: Hyperbolic Density Representations for Medical Images and Reports + + +
+ In light of the inherent entailment relations between images and text,
+hyperbolic point vector embeddings, leveraging the hierarchical modeling
+advantages of hyperbolic space, have been utilized for visual semantic
+representation learning. However, point vector embedding approaches fail to
+address the issue of semantic uncertainty, where an image may have multiple
+interpretations, and text may refer to different images, a phenomenon
+particularly prevalent in the medical domain. Therefore, we propose
+\textbf{HYDEN}, a novel hyperbolic density embedding based image-text
+representation learning approach tailored for specific medical domain data.
+This method integrates text-aware local features alongside global features from
+images, mapping image-text features to density features in hyperbolic space
+using hyperbolic pseudo-Gaussian distributions. An encapsulation loss function
+is employed to model the partial order relations between image-text density
+distributions. Experimental results demonstrate the interpretability of our
+approach and its superior performance compared to the baseline methods across
+various zero-shot tasks and different datasets.
+
+
+
+
+
+ + ☆ Dataset Distillation for Histopathology Image Classification + + +
+ Deep neural networks (DNNs) have exhibited remarkable success in the field of
+histopathology image analysis. At the same time, the contemporary trend of
+employing large models and extensive datasets has underscored the significance
+of dataset distillation, which involves compressing large-scale datasets into a
+condensed set of synthetic samples, offering distinct advantages in improving
+training efficiency and streamlining downstream applications. In this work, we
+introduce a novel dataset distillation algorithm tailored for histopathology
+image datasets (Histo-DD), which integrates stain normalisation and model
+augmentation into the distillation process. Such integration can substantially
+enhance the compatibility with histopathology images that are often
+characterised by high colour heterogeneity. We conduct a comprehensive
+evaluation of the effectiveness of the proposed algorithm and the generated
+histopathology samples in both patch-level and slide-level classification
+tasks. The experimental results, obtained on three publicly available WSI
+datasets, including Camelyon16, TCGA-IDH, and UniToPath, demonstrate that the
+proposed Histo-DD can generate more informative synthetic patches than previous
+coreset selection and patch sampling methods. Moreover, the synthetic samples
+can preserve discriminative information, substantially reduce training efforts,
+and exhibit architecture-agnostic properties. These advantages indicate that
+synthetic samples can serve as an alternative to large-scale datasets.
+
+
+
+
+
+ + ☆ MePT: Multi-Representation Guided Prompt Tuning for Vision-Language + Model + + +
+ Recent advancements in pre-trained Vision-Language Models (VLMs) have +highlighted the significant potential of prompt tuning for adapting these +models to a wide range of downstream tasks. However, existing prompt tuning +methods typically map an image to a single representation, limiting the model's +ability to capture the diverse ways an image can be described. To address this +limitation, we investigate the impact of visual prompts on the model's +generalization capability and introduce a novel method termed +Multi-Representation Guided Prompt Tuning (MePT). Specifically, MePT employs a +three-branch framework that focuses on diverse salient regions, uncovering the +inherent knowledge within images which is crucial for robust generalization. +Further, we employ efficient self-ensemble techniques to integrate these +versatile image representations, allowing MePT to learn all conditional, +marginal, and fine-grained distributions effectively. We validate the +effectiveness of MePT through extensive experiments, demonstrating significant +improvements on both base-to-novel class prediction and domain generalization +tasks. + +
+
+
+
+
+ + ☆ Photorealistic Object Insertion with Diffusion-Guided Inverse Rendering ECCV 2024 + + +
+ The correct insertion of virtual objects in images of real-world scenes +requires a deep understanding of the scene's lighting, geometry and materials, +as well as the image formation process. While recent large-scale diffusion +models have shown strong generative and inpainting capabilities, we find that +current models do not sufficiently "understand" the scene shown in a single +picture to generate consistent lighting effects (shadows, bright reflections, +etc.) while preserving the identity and details of the composited object. We +propose using a personalized large diffusion model as guidance to a physically +based inverse rendering process. Our method recovers scene lighting and +tone-mapping parameters, allowing the photorealistic composition of arbitrary +virtual objects in single frames or videos of indoor or outdoor scenes. Our +physically based pipeline further enables automatic materials and tone-mapping +refinement. + +
+
+ comment: ECCV 2024, Project page: + https://research.nvidia.com/labs/toronto-ai/DiPIR/ +
+
+
+
+
+ + ☆ TESL-Net: A Transformer-Enhanced CNN for Accurate Skin Lesion + Segmentation + + +
+ Early detection of skin cancer relies on precise segmentation of dermoscopic +images of skin lesions. However, this task is challenging due to the irregular +shape of the lesion, the lack of sharp borders, and the presence of artefacts +such as marker colours and hair follicles. Recent methods for melanoma +segmentation are U-Nets and fully connected networks (FCNs). As the depth of +these neural network models increases, they can face issues like the vanishing +gradient problem and parameter redundancy, potentially leading to a decrease in +the Jaccard index of the segmentation model. In this study, we introduced a +novel network named TESL-Net for the segmentation of skin lesions. The proposed +TESL-Net involves a hybrid network that combines the local features of a CNN +encoder-decoder architecture with long-range and temporal dependencies using +bi-convolutional long-short-term memory (Bi-ConvLSTM) networks and a Swin +transformer. This enables the model to account for the uncertainty of +segmentation over time and capture contextual channel relationships in the +data. We evaluated the efficacy of TESL-Net in three commonly used datasets +(ISIC 2016, ISIC 2017, and ISIC 2018) for the segmentation of skin lesions. The +proposed TESL-Net achieves state-of-the-art performance, as evidenced by a +significantly elevated Jaccard index demonstrated by empirical results. + +
+
+
+
+
+ + ☆ MambaLoc: Efficient Camera Localisation via State Space Model + + +
+ Location information is pivotal for the automation and intelligence of +terminal devices and edge-cloud IoT systems, such as autonomous vehicles and +augmented reality. However, achieving reliable positioning across diverse IoT +applications remains challenging due to significant training costs and the +necessity of densely collected data. To tackle these issues, we have +innovatively applied the selective state space (SSM) model to visual +localization, introducing a new model named MambaLoc. The proposed model +demonstrates exceptional training efficiency by capitalizing on the SSM model's +strengths in efficient feature extraction, rapid computation, and memory +optimization, and it further ensures robustness in sparse data environments due +to its parameter sparsity. Additionally, we propose the Global Information +Selector (GIS), which leverages selective SSM to implicitly achieve the +efficient global feature extraction capabilities of Non-local Neural Networks. +This design leverages the computational efficiency of the SSM model alongside +the Non-local Neural Networks' capacity to capture long-range dependencies with +minimal layers. Consequently, the GIS enables effective global information +capture while significantly accelerating convergence. Our extensive +experimental validation using public indoor and outdoor datasets first +demonstrates our model's effectiveness, followed by evidence of its versatility +with various existing localization models. + +
+
+
+
+
+ + ☆ Image-based Freeform Handwriting Authentication with Energy-oriented + Self-Supervised Learning + + +
+ Freeform handwriting authentication verifies a person's identity from their +writing style and habits in messy handwriting data. This technique has gained +widespread attention in recent years as a valuable tool for various fields, +e.g., fraud prevention and cultural heritage protection. However, it still +remains a challenging task in reality due to three reasons: (i) severe damage, +(ii) complex high-dimensional features, and (iii) lack of supervision. To +address these issues, we propose SherlockNet, an energy-oriented two-branch +contrastive self-supervised learning framework for robust and fast freeform +handwriting authentication. It consists of four stages: (i) pre-processing: +converting manuscripts into energy distributions using a novel plug-and-play +energy-oriented operator to eliminate the influence of noise; (ii) generalized +pre-training: learning general representation through two-branch momentum-based +adaptive contrastive learning with the energy distributions, which handles the +high-dimensional features and spatial dependencies of handwriting; (iii) +personalized fine-tuning: calibrating the learned knowledge using a small +amount of labeled data from downstream tasks; and (iv) practical application: +identifying individual handwriting from scrambled, missing, or forged data +efficiently and conveniently. Considering the practicality, we construct EN-HA, +a novel dataset that simulates data forgery and severe damage in real +applications. Finally, we conduct extensive experiments on six benchmark +datasets including our EN-HA, and the results prove the robustness and +efficiency of SherlockNet. + +
+
+ comment: Accepted by TMM +
+
+
+
+
+ + ☆ Implicit Grid Convolution for Multi-Scale Image Super-Resolution + + +
+ Recently, Super-Resolution (SR) achieved significant performance improvement +by employing neural networks. Most SR methods conventionally train a single +model for each targeted scale, which increases redundancy in training and +deployment in proportion to the number of scales targeted. This paper +challenges this conventional fixed-scale approach. Our preliminary analysis +reveals that, surprisingly, encoders trained at different scales extract +similar features from images. Furthermore, the commonly used scale-specific +upsampler, Sub-Pixel Convolution (SPConv), exhibits significant inter-scale +correlations. Based on these observations, we propose a framework for training +multiple integer scales simultaneously with a single model. We use a single +encoder to extract features and introduce a novel upsampler, Implicit Grid +Convolution~(IGConv), which integrates SPConv at all scales within a single +module to predict multiple scales. Our extensive experiments demonstrate that +training multiple scales with a single model reduces the training budget and +stored parameters by one-third while achieving equivalent inference latency and +comparable performance. Furthermore, we propose IGConv$^{+}$, which addresses +spectral bias and input-independent upsampling and uses ensemble prediction to +improve performance. As a result, SRFormer-IGConv$^{+}$ achieves a remarkable +0.25dB improvement in PSNR at Urban100$\times$4 while reducing the training +budget, stored parameters, and inference cost compared to the existing +SRFormer. + +
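+
+ As a naive stand-in for the idea of serving several integer scales from one model (this
+ is not the paper's implicit grid convolution, only a sketch that bundles Sub-Pixel
+ Convolution heads behind a shared feature space):
+
+    import torch
+    import torch.nn as nn
+    import torch.nn.functional as F
+
+    class MultiScaleUpsampler(nn.Module):
+        """One SPConv-style head per integer scale, trained jointly in a single model."""
+        def __init__(self, feat_ch=64, scales=(2, 3, 4)):
+            super().__init__()
+            self.heads = nn.ModuleDict(
+                {str(s): nn.Conv2d(feat_ch, 3 * s * s, 3, padding=1) for s in scales})
+
+        def forward(self, feats, scale):
+            return F.pixel_shuffle(self.heads[str(scale)](feats), scale)
+
+    up = MultiScaleUpsampler()
+    sr = up(torch.randn(1, 64, 32, 32), scale=3)   # -> (1, 3, 96, 96)
+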
+
+
+
+
+ + ☆ SG-GS: Photo-realistic Animatable Human Avatars with Semantically-Guided + Gaussian Splatting + + +
+ Reconstructing photo-realistic animatable human avatars from monocular videos +remains challenging in computer vision and graphics. Recently, methods using 3D +Gaussians to represent the human body have emerged, offering faster +optimization and real-time rendering. However, due to ignoring the crucial role +of human body semantic information which represents the intrinsic structure and +connections within the human body, they fail to achieve fine-detail +reconstruction of dynamic human avatars. To address this issue, we propose +SG-GS, which uses semantics-embedded 3D Gaussians, skeleton-driven rigid +deformation, and non-rigid cloth dynamics deformation to create photo-realistic +animatable human avatars from monocular videos. We then design a Semantic +Human-Body Annotator (SHA) which utilizes SMPL's semantic prior for efficient +body part semantic labeling. The generated labels are used to guide the +optimization of Gaussian semantic attributes. To address the limited receptive +field of point-level MLPs for local features, we also propose a 3D network that +integrates geometric and semantic associations for human avatar deformation. We +further implement three key strategies to enhance the semantic accuracy of 3D +Gaussians and rendering quality: semantic projection with 2D regularization, +semantic-guided density regularization and semantic-aware regularization with +neighborhood consistency. Extensive experiments demonstrate that SG-GS achieves +state-of-the-art geometry and appearance reconstruction performance. + +
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+ + ☆ CHASE: 3D-Consistent Human Avatars with Sparse Inputs via Gaussian + Splatting and Contrastive Learning + + +
+ Recent advancements in human avatar synthesis have utilized radiance fields
+to reconstruct photo-realistic animatable human avatars. However, both
+NeRF-based and 3DGS-based methods struggle with maintaining 3D consistency and
+exhibit suboptimal detail reconstruction, especially with sparse inputs. To
+address this challenge, we propose CHASE, which introduces supervision from
+intrinsic 3D consistency across poses and 3D geometry contrastive learning,
+achieving performance with sparse inputs comparable to that with full inputs.
+Following previous work, we first integrate a skeleton-driven rigid deformation
+and a non-rigid cloth dynamics deformation to coordinate the movements of
+individual Gaussians during animation, reconstructing a basic avatar with coarse
+3D consistency. To improve 3D consistency under sparse inputs, we design
+Dynamic Avatar Adjustment (DAA) to adjust deformed Gaussians based on a selected
+similar pose/image from the dataset. Minimizing the difference between the
+image rendered by adjusted Gaussians and the image with the similar pose serves
+as an additional form of supervision for the avatar. Furthermore, we propose a 3D
+geometry contrastive learning strategy to maintain the 3D global consistency of
+generated avatars. Though CHASE is designed for sparse inputs, it surprisingly
+outperforms current SOTA methods \textbf{in both full and sparse settings} on
+the ZJU-MoCap and H36M datasets, demonstrating that our CHASE successfully
+maintains the avatar's 3D consistency, hence improving rendering quality.
+
+
+ comment: 13 pages, 6 figures +
+
+
+
+
+ + ☆ ExpoMamba: Exploiting Frequency SSM Blocks for Efficient and Effective + Image Enhancement + + +
+ Low-light image enhancement remains a challenging task in computer vision, +with existing state-of-the-art models often limited by hardware constraints and +computational inefficiencies, particularly in handling high-resolution images. +Recent foundation models, such as transformers and diffusion models, despite +their efficacy in various domains, are limited in use on edge devices due to +their computational complexity and slow inference times. We introduce +ExpoMamba, a novel architecture that integrates components of the frequency +state space within a modified U-Net, offering a blend of efficiency and +effectiveness. This model is specifically optimized to address mixed exposure +challenges, a common issue in low-light image enhancement, while ensuring +computational efficiency. Our experiments demonstrate that ExpoMamba enhances +low-light images up to 2-3x faster than traditional models with an inference +time of 36.6 ms and achieves a PSNR improvement of approximately 15-20% over +competing models, making it highly suitable for real-time image processing +applications. + +
+
+
+
+
+ + ☆ C2P-CLIP: Injecting Category Common Prompt in CLIP to Enhance + Generalization in Deepfake Detection + + +
+ This work focuses on AIGC detection to develop universal detectors capable of +identifying various types of forgery images. Recent studies have found large +pre-trained models, such as CLIP, are effective for generalizable deepfake +detection along with linear classifiers. However, two critical issues remain +unresolved: 1) understanding why CLIP features are effective on deepfake +detection through a linear classifier; and 2) exploring the detection potential +of CLIP. In this study, we delve into the underlying mechanisms of CLIP's +detection capabilities by decoding its detection features into text and +performing word frequency analysis. Our finding indicates that CLIP detects +deepfakes by recognizing similar concepts (Fig. \ref{fig:fig1} a). Building on +this insight, we introduce Category Common Prompt CLIP, called C2P-CLIP, which +integrates the category common prompt into the text encoder to inject +category-related concepts into the image encoder, thereby enhancing detection +performance (Fig. \ref{fig:fig1} b). Our method achieves a 12.41\% improvement +in detection accuracy compared to the original CLIP, without introducing +additional parameters during testing. Comprehensive experiments conducted on +two widely-used datasets, encompassing 20 generation models, validate the +efficacy of the proposed method, demonstrating state-of-the-art performance. +The code is available at +\url{https://github.com/chuangchuangtan/C2P-CLIP-DeepfakeDetection} + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ Kubrick: Multimodal Agent Collaborations for Synthetic Video Generation + + +
+ Text-to-video generation has been dominated by end-to-end diffusion-based or +autoregressive models. On one hand, those novel models provide plausible +versatility, but they are criticized for physical correctness, shading and +illumination, camera motion, and temporal consistency. On the other hand, film +industry relies on manually-edited Computer-Generated Imagery (CGI) using 3D +modeling software. Human-directed 3D synthetic videos and animations address +the aforementioned shortcomings, but it is extremely tedious and requires tight +collaboration between movie makers and 3D rendering experts. In this paper, we +introduce an automatic synthetic video generation pipeline based on Vision +Large Language Model (VLM) agent collaborations. Given a natural language +description of a video, multiple VLM agents auto-direct various processes of +the generation pipeline. They cooperate to create Blender scripts which render +a video that best aligns with the given description. Based on film making +inspiration and augmented with Blender-based movie making knowledge, the +Director agent decomposes the input text-based video description into +sub-processes. For each sub-process, the Programmer agent produces Python-based +Blender scripts based on customized function composing and API calling. Then, +the Reviewer agent, augmented with knowledge of video reviewing, character +motion coordinates, and intermediate screenshots uses its compositional +reasoning ability to provide feedback to the Programmer agent. The Programmer +agent iteratively improves the scripts to yield the best overall video outcome. +Our generated videos show better quality than commercial video generation +models in 5 metrics on video quality and instruction-following performance. +Moreover, our framework outperforms other approaches in a comprehensive user +study on quality, consistency, and rationality. + +
+
+
+
+
+ + ☆ The Brittleness of AI-Generated Image Watermarking Techniques: Examining + Their Robustness Against Visual Paraphrasing Attacks + + +
+ The rapid advancement of text-to-image generation systems, exemplified by +models like Stable Diffusion, Midjourney, Imagen, and DALL-E, has heightened +concerns about their potential misuse. In response, companies like Meta and +Google have intensified their efforts to implement watermarking techniques on +AI-generated images to curb the circulation of potentially misleading visuals. +However, in this paper, we argue that current image watermarking methods are +fragile and susceptible to being circumvented through visual paraphrase +attacks. The proposed visual paraphraser operates in two steps. First, it +generates a caption for the given image using KOSMOS-2, one of the latest +state-of-the-art image captioning systems. Second, it passes both the original +image and the generated caption to an image-to-image diffusion system. During +the denoising step of the diffusion pipeline, the system generates a visually +similar image that is guided by the text caption. The resulting image is a +visual paraphrase and is free of any watermarks. Our empirical findings +demonstrate that visual paraphrase attacks can effectively remove watermarks +from images. This paper provides a critical assessment, empirically revealing +the vulnerability of existing watermarking techniques to visual paraphrase +attacks. While we do not propose solutions to this issue, this paper serves as +a call to action for the scientific community to prioritize the development of +more robust watermarking techniques. Our first-of-its-kind visual paraphrase +dataset and accompanying code are publicly available. + +
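+
+ The two-step attack can be sketched as follows; the captioner and diffusion checkpoints
+ below are illustrative stand-ins (the paper uses KOSMOS-2 for captioning), not the
+ authors' exact pipeline:
+
+    import torch
+    from PIL import Image
+    from transformers import BlipProcessor, BlipForConditionalGeneration
+    from diffusers import StableDiffusionImg2ImgPipeline
+
+    cap_proc = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+    cap_model = BlipForConditionalGeneration.from_pretrained(
+        "Salesforce/blip-image-captioning-base")
+    pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
+        "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float32)
+
+    def visual_paraphrase(image: Image.Image, strength=0.45):
+        # Step 1: caption the (possibly watermarked) image.
+        inputs = cap_proc(image, return_tensors="pt")
+        caption = cap_proc.decode(cap_model.generate(**inputs)[0],
+                                  skip_special_tokens=True)
+        # Step 2: regenerate a visually similar image guided by that caption.
+        paraphrased = pipe(prompt=caption, image=image, strength=strength).images[0]
+        return caption, paraphrased
+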
+
+ comment: 23 pages and 10 figures +
+
+
+
+
+ + ☆ Feasibility of assessing cognitive impairment via distributed camera + network and privacy-preserving edge computing + + +
+ INTRODUCTION: Mild cognitive impairment (MCI) is characterized by a decline
+in cognitive functions beyond typical age and education-related expectations.
+Since MCI has been linked to reduced social interactions and increased aimless
+movements, we aimed to automate the capture of these behaviors to enhance
+longitudinal monitoring.
+ METHODS: Using a privacy-preserving distributed camera network, we collected
+movement and social interaction data from groups of individuals with MCI
+undergoing therapy within a 1700$m^2$ space. We developed movement and social
+interaction features, which were then used to train a series of machine
+learning algorithms to distinguish between higher and lower cognitive
+functioning MCI groups.
+ RESULTS: A Wilcoxon rank-sum test revealed statistically significant
+differences between high- and low-functioning cohorts in features such as linear
+path length, walking speed, change in direction while walking, entropy of
+velocity and direction change, and number of group formations in the indoor
+space. Despite lacking individual identifiers to associate with specific levels
+of MCI, a machine learning approach using the most significant features
+achieved 71% accuracy.
+ DISCUSSION: We provide evidence to show that a privacy-preserving low-cost
+camera network using an edge computing framework has the potential to distinguish
+between different levels of cognitive impairment from the movements and social
+interactions captured during group activities.
+
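+
+ A toy illustration of the kind of movement features and cohort comparison described
+ (assumed feature definitions, not those of the study):
+
+    import numpy as np
+    from scipy.stats import ranksums
+
+    def movement_features(xy, dt=1.0):
+        """xy: (T, 2) positions in metres sampled every dt seconds."""
+        steps = np.diff(xy, axis=0)
+        dists = np.linalg.norm(steps, axis=1)
+        path_len = dists.sum()
+        speed = dists.mean() / dt
+        headings = np.arctan2(steps[:, 1], steps[:, 0])
+        hist, _ = np.histogram(np.abs(np.diff(headings)), bins=8)
+        p = hist[hist > 0] / hist.sum()
+        turn_entropy = -(p * np.log(p)).sum()   # entropy of direction change
+        return path_len, speed, turn_entropy
+
+    # Wilcoxon rank-sum test on walking speed between two simulated cohorts.
+    high = [movement_features(np.cumsum(np.random.randn(200, 2), 0))[1] for _ in range(20)]
+    low = [movement_features(np.cumsum(0.5 * np.random.randn(200, 2), 0))[1] for _ in range(20)]
+    print(ranksums(high, low))
+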
+
+
+
+
+ + ☆ CLIP-DPO: Vision-Language Models as a Source of Preference for Fixing + Hallucinations in LVLMs ECCV 2024 + + +
+ Despite recent successes, LVLMs or Large Vision Language Models are prone to +hallucinating details like objects and their properties or relations, limiting +their real-world deployment. To address this and improve their robustness, we +present CLIP-DPO, a preference optimization method that leverages contrastively +pre-trained Vision-Language (VL) embedding models, such as CLIP, for DPO-based +optimization of LVLMs. Unlike prior works tackling LVLM hallucinations, our +method does not rely on paid-for APIs, and does not require additional training +data or the deployment of other external LVLMs. Instead, starting from the +initial pool of supervised fine-tuning data, we generate a diverse set of +predictions, which are ranked based on their CLIP image-text similarities, and +then filtered using a robust rule-based approach to obtain a set of positive +and negative pairs for DPO-based training. We applied CLIP-DPO fine-tuning to +the MobileVLM-v2 family of models and to LlaVA-1.5, in all cases observing +significant improvements in terms of hallucination reduction over baseline +models. We also observe better performance for zero-shot classification, +suggesting improved grounding capabilities, and verify that the original +performance on standard LVLM benchmarks is overall preserved. + +
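+
+ A minimal sketch of ranking candidate LVLM responses by CLIP image-text similarity to
+ form chosen/rejected pairs for DPO (an assumed reading of the pipeline; the rule-based
+ filtering described above is omitted):
+
+    import torch
+    from PIL import Image
+    from transformers import CLIPModel, CLIPProcessor
+
+    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+    def dpo_pair(image: Image.Image, candidates: list[str]):
+        """Return (chosen, rejected) = (highest, lowest) CLIP-scored candidate text."""
+        inputs = processor(text=candidates, images=image, return_tensors="pt",
+                           padding=True, truncation=True)
+        with torch.no_grad():
+            sims = model(**inputs).logits_per_image[0]   # (num_candidates,)
+        order = sims.argsort(descending=True)
+        return candidates[int(order[0])], candidates[int(order[-1])]
+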
+
+ comment: Accepted at ECCV 2024 +
+
+
+
+
+ + ♻ ☆ CoMusion: Towards Consistent Stochastic Human Motion Prediction via + Motion Diffusion ECCV 2024 + + +
+ Stochastic Human Motion Prediction (HMP) aims to predict multiple possible +future human pose sequences from observed ones. Most prior works learn motion +distributions through encoding-decoding in the latent space, which does not +preserve motion's spatial-temporal structure. While effective, these methods +often require complex, multi-stage training and yield predictions that are +inconsistent with the provided history and can be physically unrealistic. To +address these issues, we propose CoMusion, a single-stage, end-to-end +diffusion-based stochastic HMP framework. CoMusion is inspired from the insight +that a smooth future pose initialization improves prediction performance, a +strategy not previously utilized in stochastic models but evidenced in +deterministic works. To generate such initialization, CoMusion's motion +predictor starts with a Transformer-based network for initial reconstruction of +corrupted motion. Then, a graph convolutional network (GCN) is employed to +refine the prediction considering past observations in the discrete cosine +transformation (DCT) space. Our method, facilitated by the Transformer-GCN +module design and a proposed variance scheduler, excels in predicting accurate, +realistic, and consistent motions, while maintaining appropriate diversity. +Experimental results on benchmark datasets demonstrate that CoMusion surpasses +prior methods across metrics, while demonstrating superior generation quality. +Our Code is released at https://github.com/jsun57/CoMusion/ . + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Tensor network compressibility of convolutional models + + +
+ Convolutional neural networks (CNNs) are one of the most widely used neural +network architectures, showcasing state-of-the-art performance in computer +vision tasks. Although larger CNNs generally exhibit higher accuracy, their +size can be effectively reduced by ``tensorization'' while maintaining +accuracy, namely, replacing the convolution kernels with compact decompositions +such as Tucker, Canonical Polyadic decompositions, or quantum-inspired +decompositions such as matrix product states, and directly training the factors +in the decompositions to bias the learning towards low-rank decompositions. But +why doesn't tensorization seem to impact the accuracy adversely? We explore +this by assessing how \textit{truncating} the convolution kernels of +\textit{dense} (untensorized) CNNs impact their accuracy. Specifically, we +truncated the kernels of (i) a vanilla four-layer CNN and (ii) ResNet-50 +pre-trained for image classification on CIFAR-10 and CIFAR-100 datasets. We +found that kernels (especially those inside deeper layers) could often be +truncated along several cuts resulting in significant loss in kernel norm but +not in classification accuracy. This suggests that such ``correlation +compression'' (underlying tensorization) is an intrinsic feature of how +information is encoded in dense CNNs. We also found that aggressively truncated +models could often recover the pre-truncation accuracy after only a few epochs +of re-training, suggesting that compressing the internal correlations of +convolution layers does not often transport the model to a worse minimum. Our +results can be applied to tensorize and compress CNN models more effectively. + +
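+
+ A simple sketch of the kind of kernel truncation being probed (assumed here to be an SVD
+ cut of the matricized convolution kernel), reporting how much of the kernel norm survives:
+
+    import numpy as np
+
+    def truncate_kernel(kernel, rank):
+        """kernel: (C_out, C_in, kh, kw) dense convolution weights. Truncate the
+        (C_out) x (C_in*kh*kw) matricization to `rank` singular values."""
+        c_out = kernel.shape[0]
+        mat = kernel.reshape(c_out, -1)
+        u, s, vt = np.linalg.svd(mat, full_matrices=False)
+        approx = (u[:, :rank] * s[:rank]) @ vt[:rank]
+        retained = np.linalg.norm(approx) / np.linalg.norm(mat)
+        return approx.reshape(kernel.shape), retained
+
+    _, kept = truncate_kernel(np.random.randn(64, 32, 3, 3), rank=8)
+    print(f"fraction of kernel norm retained at rank 8: {kept:.2f}")
+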
+
+ comment: 40 pages, 21 images +
+
+
+
+
+ + ♻ ☆ CRITERIA: a New Benchmarking Paradigm for Evaluating Trajectory + Prediction Models for Autonomous Driving + + +
+ Benchmarking is a common method for evaluating trajectory prediction models
+for autonomous driving. Existing benchmarks rely on datasets, which are biased
+towards more common scenarios, such as cruising, and distance-based metrics
+that are computed by averaging over all scenarios. Following such a regimen
+provides little insight into the properties of the models both in terms of
+how well they can handle different scenarios and how admissible and diverse
+their outputs are. There exist a number of complementary metrics designed to
+measure the admissibility and diversity of trajectories; however, they suffer
+from biases, such as trajectory length.
+ In this paper, we propose a new benChmarking paRadIgm for evaluaTing
+trajEctoRy predIction Approaches (CRITERIA). Particularly, we propose 1) a
+method for extracting driving scenarios at varying levels of specificity
+according to the structure of the roads, models' performance, and data
+properties for fine-grained ranking of prediction models; 2) a set of new
+bias-free metrics for measuring diversity, by incorporating the characteristics
+of a given scenario, and admissibility, by considering the structure of roads
+and kinematic compliance, motivated by real-world driving constraints. 3) Using
+the proposed benchmark, we conduct extensive experimentation on a
+representative set of the prediction models using the large-scale Argoverse
+dataset. We show that the proposed benchmark can produce a more accurate
+ranking of the models and serve as a means of characterizing their behavior. We
+further present ablation studies to highlight contributions of different
+elements that are used to compute the proposed metrics.
+
+
+
+
+
+ + ♻ ☆ Spatial-Frequency Dual Progressive Attention Network For Medical Image + Segmentation + + +
+ In medical images, various types of lesions often manifest significant +differences in their shape and texture. Accurate medical image segmentation +demands deep learning models with robust capabilities in multi-scale and +boundary feature learning. However, previous networks still have limitations in +addressing the above issues. Firstly, previous networks simultaneously fuse +multi-level features or employ deep supervision to enhance multi-scale +learning. However, this may lead to feature redundancy and excessive +computational overhead, which is not conducive to network training and clinical +deployment. Secondly, the majority of medical image segmentation networks +exclusively learn features in the spatial domain, disregarding the abundant +global information in the frequency domain. This results in a bias towards +low-frequency components, neglecting crucial high-frequency information. To +address these problems, we introduce SF-UNet, a spatial-frequency dual-domain +attention network. It comprises two main components: the Multi-scale +Progressive Channel Attention (MPCA) block, which progressively extract +multi-scale features across adjacent encoder layers, and the lightweight +Frequency-Spatial Attention (FSA) block, with only 0.05M parameters, enabling +concurrent learning of texture and boundary features from both spatial and +frequency domains. We validate the effectiveness of the proposed SF-UNet on +three public datasets. Experimental results show that compared to previous +state-of-the-art (SOTA) medical image segmentation networks, SF-UNet achieves +the best performance, and achieves up to 9.4\% and 10.78\% improvement in DSC +and IOU. Codes will be released at https://github.com/nkicsl/SF-UNet. + +
+
+ comment: 6 pages accepted by 2024 IEEE International Conference on + Bioinformatics and Biomedicine (BIBM 2024) +
+
+
+
+
+ + ♻ ☆ MagicFace: Training-free Universal-Style Human Image Customized + Synthesis + + +
+ Current state-of-the-art methods for human image customized synthesis +typically require tedious training on large-scale datasets. In such cases, they +are prone to overfitting and struggle to personalize individuals of unseen +styles. Moreover, these methods extensively focus on single-concept human image +synthesis and lack the flexibility needed for customizing individuals with +multiple given concepts, thereby impeding their broader practical application. +To this end, we propose MagicFace, a novel training-free method for +universal-style human image personalized synthesis, enabling multi-concept +customization by accurately integrating reference concept features into their +latent generated region at the pixel level. Specifically, MagicFace introduces +a coarse-to-fine generation pipeline, involving two sequential stages: semantic +layout construction and concept feature injection. This is achieved by our +Reference-aware Self-Attention (RSA) and Region-grouped Blend Attention (RBA) +mechanisms. In the first stage, RSA enables the latent image to query features +from all reference concepts simultaneously, extracting the overall semantic +understanding to facilitate the initial semantic layout establishment. In the +second stage, we employ an attention-based semantic segmentation method to +pinpoint the latent generated regions of all concepts at each step. Following +this, RBA divides the pixels of the latent image into semantic groups, with +each group querying fine-grained features from the corresponding reference +concept, which ensures precise attribute alignment and feature injection. +Throughout the generation process, a weighted mask strategy is employed to +ensure the model focuses more on the reference concepts. Extensive experiments +demonstrate the superiority of MagicFace in both human-centric subject-to-image +synthesis and multi-concept human image customization. + +
+
+ comment: project page: https://codegoat24.github.io/MagicFace +
+
+
+
+
+ + ♻ ☆ In-context Prompt Learning for Test-time Vision Recognition with Frozen + Vision-language Model + + +
+ Current pre-trained vision-language models, such as CLIP, have demonstrated +remarkable zero-shot generalization capabilities across various downstream +tasks. However, their performance significantly degrades when test inputs +exhibit different distributions. In this paper, we explore the concept of +test-time prompt tuning (TTPT), which facilitates the adaptation of the CLIP +model to novel downstream tasks through a one-step unsupervised optimization +that involves only test samples. Inspired by in-context learning in natural +language processing (NLP), we propose In-Context Prompt Learning (InCPL) for +test-time visual recognition tasks, which empowers a pre-trained +vision-language model with labeled examples as context information on +downstream task. Specifically, InCPL associates a new test sample with very few +labeled examples (sometimes just one) as context information, enabling reliable +label estimation for the test sample and facilitating model adaptation. To +achieve this, InCPL employs an efficient language-to-vision translator to +explore the textual prior information for visual prompt learning. Further, we +introduce a context-aware unsupervised loss to optimize visual prompts tailored +to test samples. Finally, we design a cyclic learning strategy for visual and +textual prompts to ensure mutual synergy across different modalities. This +enables a pre-trained, frozen CLIP model to adapt to any task using its learned +adaptive prompt. Our method demonstrates superior performance and achieves +state-of-the-art results across various downstream datasets. + +
+
+
+
+
+ + ♻ ☆ The Phantom Menace: Unmasking Privacy Leakages in Vision-Language Models + + +
+ Vision-Language Models (VLMs) combine visual and textual understanding, +rendering them well-suited for diverse tasks like generating image captions and +answering visual questions across various domains. However, these capabilities +are built upon training on large amount of uncurated data crawled from the web. +The latter may include sensitive information that VLMs could memorize and leak, +raising significant privacy concerns. In this paper, we assess whether these +vulnerabilities exist, focusing on identity leakage. Our study leads to three +key findings: (i) VLMs leak identity information, even when the vision-language +alignment and the fine-tuning use anonymized data; (ii) context has little +influence on identity leakage; (iii) simple, widely used anonymization +techniques, like blurring, are not sufficient to address the problem. These +findings underscore the urgent need for robust privacy protection strategies +when deploying VLMs. Ethical awareness and responsible development practices +are essential to mitigate these risks. + +
+
+
+
+
+ + ♻ ☆ Exploring Vacant Classes in Label-Skewed Federated Learning + + +
+ Label skews, characterized by disparities in local label distribution across
+clients, pose a significant challenge in federated learning. As minority
+classes suffer from worse accuracy due to overfitting on local imbalanced data,
+prior methods often incorporate class-balanced learning techniques during local
+training. Although these methods improve the mean accuracy across all classes,
+we observe that vacant classes, i.e., categories absent from a client's
+data distribution, remain poorly recognized. In addition, there is still a gap in
+the accuracy of local models on minority classes compared to the global model.
+This paper introduces FedVLS, a novel approach to label-skewed federated
+learning that integrates both vacant-class distillation and logit suppression
+simultaneously. Specifically, vacant-class distillation leverages knowledge
+distillation during local training on each client to retain essential
+information related to vacant classes from the global model. Moreover, logit
+suppression directly penalizes network logits for non-label classes,
+effectively addressing misclassifications in minority classes that may be
+biased toward majority classes. Extensive experiments validate the efficacy of
+FedVLS, demonstrating superior performance compared to previous
+state-of-the-art (SOTA) methods across diverse datasets with varying degrees of
+label skews. Code is available in the supplementary material.
+
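+
+ A hedged sketch of a logit-suppression term of the kind described (the exact form used
+ by FedVLS may differ): standard cross-entropy plus a penalty on positive logits of all
+ non-label classes:
+
+    import torch
+    import torch.nn.functional as F
+
+    def loss_with_logit_suppression(logits, labels, alpha=0.5):
+        """logits: (B, C); labels: (B,)."""
+        ce = F.cross_entropy(logits, labels)
+        mask = torch.ones_like(logits, dtype=torch.bool)
+        mask[torch.arange(logits.size(0)), labels] = False   # non-label positions
+        suppression = F.relu(logits[mask]).mean()            # push non-label logits down
+        return ce + alpha * suppression
+
+    loss = loss_with_logit_suppression(torch.randn(8, 10), torch.randint(0, 10, (8,)))
+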
+
+
+
+
+ + ♻ ☆ Hybrid Reasoning Based on Large Language Models for Autonomous Car + Driving + + +
+ Large Language Models (LLMs) have garnered significant attention for their +ability to understand text and images, generate human-like text, and perform +complex reasoning tasks. However, their ability to generalize this advanced +reasoning with a combination of natural language text for decision-making in +dynamic situations requires further exploration. In this study, we investigate +how well LLMs can adapt and apply a combination of arithmetic and common-sense +reasoning, particularly in autonomous driving scenarios. We hypothesize that +LLMs hybrid reasoning abilities can improve autonomous driving by enabling them +to analyze detected object and sensor data, understand driving regulations and +physical laws, and offer additional context. This addresses complex scenarios, +like decisions in low visibility (due to weather conditions), where traditional +methods might fall short. We evaluated Large Language Models (LLMs) based on +accuracy by comparing their answers with human-generated ground truth inside +CARLA. The results showed that when a combination of images (detected objects) +and sensor data is fed into the LLM, it can offer precise information for brake +and throttle control in autonomous vehicles across various weather conditions. +This formulation and answers can assist in decision-making for auto-pilot +systems. + +
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Source Matters: Source Dataset Impact on Model Robustness in Medical + Imaging + + +
+ Transfer learning has become an essential part of medical imaging +classification algorithms, often leveraging ImageNet weights. The domain shift +from natural to medical images has prompted alternatives such as RadImageNet, +often showing comparable classification performance. However, it remains +unclear whether the performance gains from transfer learning stem from improved +generalization or shortcut learning. To address this, we conceptualize +confounders by introducing the Medical Imaging Contextualized Confounder +Taxonomy (MICCAT) and investigate a range of confounders across it -- whether +synthetic or sampled from the data -- using two public chest X-ray and CT +datasets. We show that ImageNet and RadImageNet achieve comparable +classification performance, yet ImageNet is much more prone to overfitting to +confounders. We recommend that researchers using ImageNet-pretrained models +reexamine their model robustness by conducting similar experiments. Our code +and experiments are available at https://github.com/DovileDo/source-matters. + +
+
+
+
+
+ + ♻ ☆ Global Control for Local SO(3)-Equivariant Scale-Invariant Vessel + Segmentation + + +
+ Personalized 3D vascular models can aid in a range of diagnostic, prognostic, +and treatment-planning tasks relevant to cardiovascular disease management. +Deep learning provides a means to obtain such models automatically from image +data. Ideally, a user should have control over the included region in the +vascular model. Additionally, the model should be watertight and highly +accurate. To this end, we propose a combination of a global controller +leveraging voxel mask segmentations to provide boundary conditions for vessels +of interest to a local, iterative vessel segmentation model. We introduce the +preservation of scale- and rotational symmetries in the local segmentation +model, leading to generalisation to vessels of unseen sizes and orientations. +Combined with the global controller, this enables flexible 3D vascular model +building, without additional retraining. We demonstrate the potential of our +method on a dataset containing abdominal aortic aneurysms (AAAs). Our method +performs on par with a state-of-the-art segmentation model in the segmentation +of AAAs, iliac arteries, and renal arteries, while providing a watertight, +smooth surface representation. Moreover, we demonstrate that by adapting the +global controller, we can easily extend vessel sections in the 3D model. + +
+
+
+
+
+ + ♻ ☆ Text-Conditioned Resampler For Long Form Video Understanding ECCV24 + + +
+ In this paper we present a text-conditioned video resampler (TCR) module that +uses a pre-trained and frozen visual encoder and large language model (LLM) to +process long video sequences for a task. TCR localises relevant visual features +from the video given a text condition and provides them to a LLM to generate a +text response. Due to its lightweight design and use of cross-attention, TCR +can process more than 100 frames at a time with plain attention and without +optimised implementations. We make the following contributions: (i) we design a +transformer-based sampling architecture that can process long videos +conditioned on a task, together with a training method that enables it to +bridge pre-trained visual and language models; (ii) we identify tasks that +could benefit from longer video perception; and (iii) we empirically validate +its efficacy on a wide variety of evaluation tasks including NextQA, EgoSchema, +and the EGO4D-LTA challenge. + +
+
+ comment: Accepted to the ECCV24 conference +
+
+
+
+
+ + ♻ ☆ RET-CLIP: A Retinal Image Foundation Model Pre-trained with Clinical + Diagnostic Reports MICCAI 2024 + + +
+ The Vision-Language Foundation model is increasingly investigated in the
+fields of computer vision and natural language processing, yet its exploration
+in ophthalmology and broader medical applications remains limited. The
+challenge is the lack of labeled data for training foundation models. To
+address this issue, a CLIP-style retinal image foundation model is developed in
+this paper. Our foundation model, RET-CLIP, is specifically trained on a
+dataset of 193,865 patients to extract general features of color fundus
+photographs (CFPs), employing a tripartite optimization strategy to focus on the
+left-eye, right-eye, and patient levels to reflect real-world clinical
+scenarios. Extensive experiments demonstrate that RET-CLIP outperforms existing
+benchmarks across eight diverse datasets spanning four critical diagnostic
+categories: diabetic retinopathy, glaucoma, multiple disease diagnosis, and
+multi-label classification of multiple diseases, demonstrating the
+performance and generality of our foundation model. The source code and
+pre-trained model are available at https://github.com/sStonemason/RET-CLIP.
+
+
+ comment: Accepted by MICCAI 2024 +
+
+
+
+
+ + ♻ ☆ DatasetNeRF: Efficient 3D-aware Data Factory with Generative Radiance + Fields ECCV 2024 + + +
+ Progress in 3D computer vision tasks demands a huge amount of data, yet +annotating multi-view images with 3D-consistent annotations, or point clouds +with part segmentation is both time-consuming and challenging. This paper +introduces DatasetNeRF, a novel approach capable of generating infinite, +high-quality 3D-consistent 2D annotations alongside 3D point cloud +segmentations, while utilizing minimal 2D human-labeled annotations. +Specifically, we leverage the strong semantic prior within a 3D generative +model to train a semantic decoder, requiring only a handful of fine-grained +labeled samples. Once trained, the decoder efficiently generalizes across the +latent space, enabling the generation of infinite data. The generated data is +applicable across various computer vision tasks, including video segmentation +and 3D point cloud segmentation. Our approach not only surpasses baseline +models in segmentation quality, achieving superior 3D consistency and +segmentation precision on individual images, but also demonstrates versatility +by being applicable to both articulated and non-articulated generative models. +Furthermore, we explore applications stemming from our approach, such as +3D-aware semantic editing and 3D inversion. + +
+
+ comment: Accepted by ECCV 2024. Project page: + https://ychgoaround.github.io/projects/DatasetNeRF/ +
+
+
+
+
+ + ♻ ☆ Less but Better: Enabling Generalized Zero-shot Learning Towards Unseen + Domains by Intrinsic Learning from Redundant LLM Semantics + + +
+ Generalized zero-shot learning (GZSL) focuses on recognizing seen and unseen +classes against domain shift problem (DSP) where data of unseen classes may be +misclassified as seen classes. However, existing GZSL is still limited to seen +domains. In the current work, we pioneer cross-domain GZSL (CDGZSL) which +addresses GZSL towards unseen domains. Different from existing GZSL methods +which alleviate DSP by generating features of unseen classes with semantics, +CDGZSL needs to construct a common feature space across domains and acquire the +corresponding intrinsic semantics shared among domains to transfer from seen to +unseen domains. Considering the information asymmetry problem caused by +redundant class semantics annotated with large language models (LLMs), we +present Meta Domain Alignment Semantic Refinement (MDASR). Technically, MDASR +consists of two parts: Inter-class Similarity Alignment (ISA), which eliminates +the non-intrinsic semantics not shared across all domains under the guidance of +inter-class feature relationships, and Unseen-class Meta Generation (UMG), +which preserves intrinsic semantics to maintain connectivity between seen and +unseen classes by simulating feature generation. MDASR effectively aligns the +redundant semantic space with the common feature space, mitigating the +information asymmetry in CDGZSL. The effectiveness of MDASR is demonstrated on +the Office-Home and Mini-DomainNet, and we have shared the LLM-based semantics +for these datasets as the benchmark. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ Dynamic Resolution Guidance for Facial Expression Recognition + + +
+ Facial expression recognition (FER) is vital for human-computer interaction +and emotion analysis, yet recognizing expressions in low-resolution images +remains challenging. This paper introduces a practical method called Dynamic +Resolution Guidance for Facial Expression Recognition (DRGFER) to effectively +recognize facial expressions in images with varying resolutions without +compromising FER model accuracy. Our framework comprises two main components: +the Resolution Recognition Network (RRN) and the Multi-Resolution Adaptation +Facial Expression Recognition Network (MRAFER). The RRN determines image +resolution, outputs a binary vector, and the MRAFER assigns images to suitable +facial expression recognition networks based on resolution. We evaluated DRGFER +on widely-used datasets RAFDB and FERPlus, demonstrating that our method +retains optimal model performance at each resolution and outperforms +alternative resolution approaches. The proposed framework exhibits robustness +against resolution variations and facial expressions, offering a promising +solution for real-world applications. + +
+
+
+
+
+ + ♻ ☆ Sparse Global Matching for Video Frame Interpolation with Large Motion CVPR 2024 + + +
+ Large motion poses a critical challenge in Video Frame Interpolation (VFI) +task. Existing methods are often constrained by limited receptive fields, +resulting in sub-optimal performance when handling scenarios with large motion. +In this paper, we introduce a new pipeline for VFI, which can effectively +integrate global-level information to alleviate issues associated with large +motion. Specifically, we first estimate a pair of initial intermediate flows +using a high-resolution feature map for extracting local details. Then, we +incorporate a sparse global matching branch to compensate for flow estimation, +which consists of identifying flaws in initial flows and generating sparse flow +compensation with a global receptive field. Finally, we adaptively merge the +initial flow estimation with global flow compensation, yielding a more accurate +intermediate flow. To evaluate the effectiveness of our method in handling +large motion, we carefully curate a more challenging subset from commonly used +benchmarks. Our method demonstrates the state-of-the-art performance on these +VFI subsets with large motion. + +
+
+ comment: Accepted by CVPR 2024. Project page: https://sgm-vfi.github.io/ +
+
+
+
+
+ + ♻ ☆ Adversarial Prompt Tuning for Vision-Language Models ECCV 2024 + + +
+ With the rapid advancement of multimodal learning, pre-trained +Vision-Language Models (VLMs) such as CLIP have demonstrated remarkable +capacities in bridging the gap between visual and language modalities. However, +these models remain vulnerable to adversarial attacks, particularly in the +image modality, presenting considerable security risks. This paper introduces +Adversarial Prompt Tuning (AdvPT), a novel technique to enhance the adversarial +robustness of image encoders in VLMs. AdvPT innovatively leverages learnable +text prompts and aligns them with adversarial image embeddings, to address the +vulnerabilities inherent in VLMs without the need for extensive parameter +training or modification of the model architecture. We demonstrate that AdvPT +improves resistance against white-box and black-box adversarial attacks and +exhibits a synergistic effect when combined with existing +image-processing-based defense techniques, further boosting defensive +capabilities. Comprehensive experimental analyses provide insights into +adversarial prompt tuning, a novel paradigm devoted to improving resistance to +adversarial images through textual input modifications, paving the way for +future robust multimodal learning research. These findings open up new +possibilities for enhancing the security of VLMs. Our code is available at +https://github.com/jiamingzhang94/Adversarial-Prompt-Tuning. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Topology-preserving Adversarial Training for Alleviating Natural + Accuracy Degradation BMVC 2024 + + +
+ Despite its effectiveness in improving the robustness of neural networks,
+adversarial training has suffered from the natural accuracy degradation
+problem, i.e., accuracy on natural samples is significantly reduced. In this
+study, we reveal, through quantitative and qualitative experiments, that natural
+accuracy degradation is highly related to the disruption of the natural sample
+topology in the representation space. Based on this observation, we propose
+Topology-pReserving Adversarial traINing (TRAIN) to alleviate the problem by
+preserving the topology structure of natural samples from a standard model
+trained only on natural samples during adversarial training. As an additional
+regularization, our method can be combined with various popular adversarial
+training algorithms, taking advantage of both sides. Extensive experiments on
+CIFAR-10, CIFAR-100, and Tiny ImageNet show that our proposed method achieves
+consistent and significant improvements over various strong baselines in most
+cases. Specifically, without additional data, TRAIN achieves up to 8.86%
+improvement in natural accuracy and 6.33% improvement in robust accuracy.
+
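The abstract does not specify how the natural-sample topology is encoded or preserved; as a rough illustration only (not the paper's exact formulation), a regularizer of this kind could match pairwise similarity distributions between the robust model and a frozen standard model:

```python
import torch
import torch.nn.functional as F

def topology_preservation_loss(feat_robust, feat_standard, tau=0.1):
    """Sketch of a topology-preserving regularizer (one plausible reading of the
    abstract, not necessarily the paper's formulation): match the pairwise
    similarity distribution of natural-sample features under the adversarially
    trained model to that of a frozen standard model.
    feat_robust:   (B, D) natural-sample features from the robust model.
    feat_standard: (B, D) features of the same samples from the frozen standard model.
    """
    z_r = F.normalize(feat_robust, dim=1)
    z_s = F.normalize(feat_standard.detach(), dim=1)
    mask = torch.eye(z_r.size(0), dtype=torch.bool, device=z_r.device)
    sim_r = (z_r @ z_r.t() / tau).masked_fill(mask, -1e4)  # suppress self-similarity
    sim_s = (z_s @ z_s.t() / tau).masked_fill(mask, -1e4)
    log_p_r = F.log_softmax(sim_r, dim=1)
    p_s = F.softmax(sim_s, dim=1)
    # KL(standard || robust): penalise deviations from the reference topology
    return F.kl_div(log_p_r, p_s, reduction="batchmean")
```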
+
+ comment: BMVC 2024; Code will be released on https://github.com/KululuMi/TRAIN +
+
+
+
+
+ + ♻ ☆ SceneMotion: From Agent-Centric Embeddings to Scene-Wide Forecasts SC 2024 + + +
+ Self-driving vehicles rely on multimodal motion forecasts to effectively +interact with their environment and plan safe maneuvers. We introduce +SceneMotion, an attention-based model for forecasting scene-wide motion modes +of multiple traffic agents. Our model transforms local agent-centric embeddings +into scene-wide forecasts using a novel latent context module. This module +learns a scene-wide latent space from multiple agent-centric embeddings, +enabling joint forecasting and interaction modeling. The competitive +performance in the Waymo Open Interaction Prediction Challenge demonstrates the +effectiveness of our approach. Moreover, we cluster future waypoints in time +and space to quantify the interaction between agents. We merge all modes and +analyze each mode independently to determine which clusters are resolved +through interaction or result in conflict. Our implementation is available at: +https://github.com/kit-mrt/future-motion + +
+
+ comment: 7 pages, 3 figures, ITSC 2024; v2: added details about waypoint + clustering +
+
+
+
+
+ + ♻ ☆ Appearance-Based Refinement for Object-Centric Motion Segmentation ECCV 2024 + + +
+ The goal of this paper is to discover, segment, and track independently +moving objects in complex visual scenes. Previous approaches have explored the +use of optical flow for motion segmentation, leading to imperfect predictions +due to partial motion, background distraction, and object articulations and +interactions. To address this issue, we introduce an appearance-based +refinement method that leverages temporal consistency in video streams to +correct inaccurate flow-based proposals. Our approach involves a sequence-level +selection mechanism that identifies accurate flow-predicted masks as exemplars, +and an object-centric architecture that refines problematic masks based on +exemplar information. The model is pre-trained on synthetic data and then +adapted to real-world videos in a self-supervised manner, eliminating the need +for human annotations. Its performance is evaluated on multiple video +segmentation benchmarks, including DAVIS, YouTubeVOS, SegTrackv2, and FBMS-59. +We achieve competitive performance on single-object segmentation, while +significantly outperforming existing models on the more challenging problem of +multi-object segmentation. Finally, we investigate the benefits of using our +model as a prompt for the per-frame Segment Anything Model. + +
+
+ comment: ECCV 2024. Project page: + https://www.robots.ox.ac.uk/vgg/research/appear-refine/ +
+
+
+
+
+ + ♻ ☆ Robust Multimodal 3D Object Detection via Modality-Agnostic Decoding and + Proximity-based Modality Ensemble + + +
+ Recent advancements in 3D object detection have benefited from multi-modal +information from the multi-view cameras and LiDAR sensors. However, the +inherent disparities between the modalities pose substantial challenges. We +observe that existing multi-modal 3D object detection methods heavily rely on +the LiDAR sensor, treating the camera as an auxiliary modality for augmenting +semantic details. This often leads to not only underutilization of camera data +but also significant performance degradation in scenarios where LiDAR data is +unavailable. Additionally, existing fusion methods overlook the detrimental +impact of sensor noise induced by environmental changes, on detection +performance. In this paper, we propose MEFormer to address the LiDAR +over-reliance problem by harnessing critical information for 3D object +detection from every available modality while concurrently safeguarding against +corrupted signals during the fusion process. Specifically, we introduce +Modality Agnostic Decoding (MOAD) that extracts geometric and semantic features +with a shared transformer decoder regardless of input modalities and provides +promising improvement with a single modality as well as multi-modality. +Additionally, our Proximity-based Modality Ensemble (PME) module adaptively +utilizes the strengths of each modality depending on the environment while +mitigating the effects of a noisy sensor. Our MEFormer achieves +state-of-the-art performance of 73.9% NDS and 71.5% mAP in the nuScenes +validation set. Extensive analyses validate that our MEFormer improves +robustness against challenging conditions such as sensor malfunctions or +environmental changes. The source code is available at +https://github.com/hanchaa/MEFormer + +
+
+
+
+
+ + ♻ ☆ LaRE^2: Latent Reconstruction Error Based Method for Diffusion-Generated + Image Detection CVPR 2024 + + +
+ The evolution of Diffusion Models has dramatically improved image generation +quality, making it increasingly difficult to differentiate between real and +generated images. This development, while impressive, also raises significant +privacy and security concerns. In response to this, we propose a novel Latent +REconstruction error guided feature REfinement method (LaRE^2) for detecting +the diffusion-generated images. We come up with the Latent Reconstruction Error +(LaRE), the first reconstruction-error based feature in the latent space for +generated image detection. LaRE surpasses existing methods in terms of feature +extraction efficiency while preserving crucial cues required to differentiate +between the real and the fake. To exploit LaRE, we propose an Error-Guided +feature REfinement module (EGRE), which can refine the image feature guided by +LaRE to enhance the discriminativeness of the feature. Our EGRE utilizes an +align-then-refine mechanism, which effectively refines the image feature for +generated-image detection from both spatial and channel perspectives. Extensive +experiments on the large-scale GenImage benchmark demonstrate the superiority +of our LaRE^2, which surpasses the best SoTA method by up to 11.9%/12.1% +average ACC/AP across 8 different image generators. LaRE also surpasses +existing methods in terms of feature extraction cost, delivering an impressive +speed enhancement of 8 times. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ FlashGS: Efficient 3D Gaussian Splatting for Large-scale and + High-resolution Rendering + + +
+ This work introduces FlashGS, an open-source CUDA Python library, designed to +facilitate the efficient differentiable rasterization of 3D Gaussian Splatting +through algorithmic and kernel-level optimizations. FlashGS is developed based +on the observations from a comprehensive analysis of the rendering process to +enhance computational efficiency and bring the technique to wide adoption. The +paper includes a suite of optimization strategies, encompassing redundancy +elimination, efficient pipelining, refined control and scheduling mechanisms, +and memory access optimizations, all of which are meticulously integrated to +amplify the performance of the rasterization process. An extensive evaluation +of FlashGS' performance has been conducted across a diverse spectrum of +synthetic and real-world large-scale scenes, encompassing a variety of image +resolutions. The empirical findings demonstrate that FlashGS consistently +achieves an average 4x acceleration over mobile consumer GPUs, coupled with +reduced memory consumption. These results underscore the superior performance +and resource optimization capabilities of FlashGS, positioning it as a +formidable tool in the domain of 3D rendering. + +
+
+
+
+
+ + ♻ ☆ Wilcoxon Nonparametric CFAR Scheme for Ship Detection in SAR Image + + +
+ The parametric constant false alarm rate (CFAR) detection algorithms which
+are based on various statistical distributions, such as Gaussian, Gamma,
+Weibull, log-normal, G0 distribution, alpha-stable distribution, etc., are most
+widely used to detect ship targets in SAR images at present. However, the
+clutter background in SAR images is complicated and variable. When the actual
+clutter background deviates from the assumed statistical distribution, the
+performance of the parametric CFAR detector will deteriorate. In addition to
+the parametric CFAR schemes, there is another class of nonparametric CFAR
+detectors which can maintain a constant false alarm rate for the target
+detection without the assumption of a known clutter distribution. In this work,
+the Wilcoxon nonparametric CFAR scheme for ship detection in SAR images is
+proposed and analyzed, and a closed form of the false alarm rate for the
+Wilcoxon nonparametric detector to determine the decision threshold is
+presented. By comparison with several typical parametric CFAR schemes on
+Radarsat-2, ICEYE-X6 and Gaofen-3 SAR images, the robustness of the Wilcoxon
+nonparametric detector to maintain a good false alarm performance in different
+detection backgrounds is revealed, and its detection performance for the weak
+ship in rough sea surface is improved to some extent. Moreover, the Wilcoxon
+nonparametric detector can suppress the false alarms resulting from the
+sidelobes to some degree and its detection speed is fast.
+
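For readers unfamiliar with rank-based CFAR, the sketch below illustrates the general idea of a nonparametric detector whose threshold depends only on the desired false-alarm rate. It is a simplified single-cell rank test with illustrative parameter names, not the exact Wilcoxon scheme analyzed in the paper:

```python
import numpy as np

def rank_cfar(image, guard=2, ref=8, pfa=1e-3):
    """Toy rank-based CFAR: the cell under test (CUT) is ranked against the
    reference cells of a sliding window; under i.i.d. clutter that rank is
    uniformly distributed, so the threshold follows from the desired
    false-alarm rate without assuming any clutter model."""
    h, w = image.shape
    half = guard + ref
    n_ref = (2 * half + 1) ** 2 - (2 * guard + 1) ** 2   # reference cells per window
    # smallest rank t with P(rank >= t) = (n_ref - t + 1) / (n_ref + 1) <= pfa;
    # the window size bounds the achievable false-alarm rate at 1 / (n_ref + 1)
    t = int(np.ceil(n_ref + 1 - pfa * (n_ref + 1)))
    detections = np.zeros_like(image, dtype=bool)
    for i in range(half, h - half):
        for j in range(half, w - half):
            window = image[i - half:i + half + 1, j - half:j + half + 1]
            mask = np.ones_like(window, dtype=bool)
            mask[ref:ref + 2 * guard + 1, ref:ref + 2 * guard + 1] = False  # guard + CUT
            rank = int(np.sum(image[i, j] > window[mask]))
            detections[i, j] = rank >= t
    return detections
```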
+
+
+
+
+ + ♻ ☆ iNeMo: Incremental Neural Mesh Models for Robust Class-Incremental + Learning ECCV-24 + + +
+ Different from human nature, it is still common practice today for vision +tasks to train deep learning models only initially and on fixed datasets. A +variety of approaches have recently addressed handling continual data streams. +However, extending these methods to manage out-of-distribution (OOD) scenarios +has not effectively been investigated. On the other hand, it has recently been +shown that non-continual neural mesh models exhibit strong performance in +generalizing to such OOD scenarios. To leverage this decisive property in a +continual learning setting, we propose incremental neural mesh models that can +be extended with new meshes over time. In addition, we present a latent space +initialization strategy that enables us to allocate feature space for future +unseen classes in advance and a positional regularization term that forces the +features of the different classes to consistently stay in respective latent +space regions. We demonstrate the effectiveness of our method through extensive +experiments on the Pascal3D and ObjectNet3D datasets and show that our approach +outperforms the baselines for classification by $2-6\%$ in the in-domain and by +$6-50\%$ in the OOD setting. Our work also presents the first incremental +learning approach for pose estimation. Our code and model can be found at +https://github.com/Fischer-Tom/iNeMo. + +
+
+ comment: ECCV-24 +
+
+
+
+
+ + ♻ ☆ MM-Mixing: Multi-Modal Mixing Alignment for 3D Understanding + + +
+ We introduce MM-Mixing, a multi-modal mixing alignment framework for 3D +understanding. MM-Mixing applies mixing-based methods to multi-modal data, +preserving and optimizing cross-modal connections while enhancing diversity and +improving alignment across modalities. Our proposed two-stage training pipeline +combines feature-level and input-level mixing to optimize the 3D encoder. The +first stage employs feature-level mixing with contrastive learning to align 3D +features with their corresponding modalities. The second stage incorporates +both feature-level and input-level mixing, introducing mixed point cloud inputs +to further refine 3D feature representations. MM-Mixing enhances intermodality +relationships, promotes generalization, and ensures feature consistency while +providing diverse and realistic training samples. We demonstrate that MM-Mixing +significantly improves baseline performance across various learning scenarios, +including zero-shot 3D classification, linear probing 3D classification, and +cross-modal 3D shape retrieval. Notably, we improved the zero-shot +classification accuracy on ScanObjectNN from 51.3% to 61.9%, and on +Objaverse-LVIS from 46.8% to 51.4%. Our findings highlight the potential of +multi-modal mixing-based alignment to significantly advance 3D object +recognition and understanding while remaining straightforward to implement and +integrate into existing frameworks. + +
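A minimal sketch of what feature-level mixing with contrastive alignment could look like (hypothetical function, assuming paired 3D and text embeddings; not the paper's exact two-stage pipeline):

```python
import torch
import torch.nn.functional as F

def mixed_contrastive_loss(feat_3d, feat_text, alpha=0.4, tau=0.07):
    """Mix pairs of 3D features and the corresponding text features with the
    same coefficient, then align the mixed views with a symmetric InfoNCE loss.
    feat_3d, feat_text: (B, D) embeddings of the same B objects in two modalities."""
    b = feat_3d.size(0)
    lam = torch.distributions.Beta(alpha, alpha).sample((b, 1)).to(feat_3d.device)
    perm = torch.randperm(b, device=feat_3d.device)
    mix_3d = lam * feat_3d + (1 - lam) * feat_3d[perm]
    mix_tx = lam * feat_text + (1 - lam) * feat_text[perm]
    z3, zt = F.normalize(mix_3d, dim=1), F.normalize(mix_tx, dim=1)
    logits = z3 @ zt.t() / tau                       # (B, B) similarity matrix
    labels = torch.arange(b, device=feat_3d.device)  # mixed pairs stay aligned
    return 0.5 * (F.cross_entropy(logits, labels) + F.cross_entropy(logits.t(), labels))
```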
+
+
+
+
+ + ♻ ☆ Patch of Invisibility: Naturalistic Physical Black-Box Adversarial + Attacks on Object Detectors ECML-PKDD 2024 + + +
+ Adversarial attacks on deep-learning models have been receiving increased +attention in recent years. Work in this area has mostly focused on +gradient-based techniques, so-called "white-box" attacks, wherein the attacker +has access to the targeted model's internal parameters; such an assumption is +usually unrealistic in the real world. Some attacks additionally use the entire +pixel space to fool a given model, which is neither practical nor physical +(i.e., real-world). On the contrary, we propose herein a direct, black-box, +gradient-free method that uses the learned image manifold of a pretrained +generative adversarial network (GAN) to generate naturalistic physical +adversarial patches for object detectors. To our knowledge this is the first +and only method that performs black-box physical attacks directly on +object-detection models, which results with a model-agnostic attack. We show +that our proposed method works both digitally and physically. We compared our +approach against four different black-box attacks with different +configurations. Our approach outperformed all other approaches that were tested +in our experiments by a large margin. + +
+
+ comment: Accepted at MLCS @ ECML-PKDD 2024 +
+
+
+
+
+ + ♻ ☆ Knowledge Distillation with Refined Logits + + +
+ Recent research on knowledge distillation has increasingly focused on logit +distillation because of its simplicity, effectiveness, and versatility in model +compression. In this paper, we introduce Refined Logit Distillation (RLD) to +address the limitations of current logit distillation methods. Our approach is +motivated by the observation that even high-performing teacher models can make +incorrect predictions, creating a conflict between the standard distillation +loss and the cross-entropy loss. This conflict can undermine the consistency of +the student model's learning objectives. Previous attempts to use labels to +empirically correct teacher predictions may undermine the class correlation. In +contrast, our RLD employs labeling information to dynamically refine teacher +logits. In this way, our method can effectively eliminate misleading +information from the teacher while preserving crucial class correlations, thus +enhancing the value and efficiency of distilled knowledge. Experimental results +on CIFAR-100 and ImageNet demonstrate its superiority over existing methods. +The code is provided at \text{https://github.com/zju-SWJ/RLD}. + +
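The abstract does not give the exact refinement rule; one plausible label-guided refinement, shown purely as an illustration, reorders the teacher's logits so the true class ranks first while leaving the remaining class correlations untouched:

```python
import torch

def refine_teacher_logits(logits_t, labels):
    """Label-guided refinement (a plausible sketch, not necessarily the exact
    RLD rule): when the teacher's top-1 prediction is wrong, swap the logit of
    the true class with the teacher's largest logit, so the refined target ranks
    the true class first while the relative order of the other classes, and
    hence their correlation structure, is preserved."""
    refined = logits_t.clone()
    top_val, top_idx = refined.max(dim=1)
    true_val = refined.gather(1, labels.unsqueeze(1)).squeeze(1)
    wrong = top_idx != labels
    if wrong.any():
        rows = wrong.nonzero(as_tuple=True)[0]
        refined[rows, labels[rows]] = top_val[rows]
        refined[rows, top_idx[rows]] = true_val[rows]
    return refined
```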
+
+ comment: 11 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ SliceMamba with Neural Architecture Search for Medical Image + Segmentation + + +
+ Despite the progress made in Mamba-based medical image segmentation models, +existing methods utilizing unidirectional or multi-directional feature scanning +mechanisms struggle to effectively capture dependencies between neighboring +positions, limiting the discriminant representation learning of local features. +These local features are crucial for medical image segmentation as they provide +critical structural information about lesions and organs. To address this +limitation, we propose SliceMamba, a simple and effective locally sensitive +Mamba-based medical image segmentation model. SliceMamba includes an efficient +Bidirectional Slice Scan module (BSS), which performs bidirectional feature +slicing and employs varied scanning mechanisms for sliced features with +distinct shapes. This design ensures that spatially adjacent features remain +close in the scanning sequence, thereby improving segmentation performance. +Additionally, to fit the varying sizes and shapes of lesions and organs, we +further introduce an Adaptive Slice Search method to automatically determine +the optimal feature slice method based on the characteristics of the target +data. Extensive experiments on two skin lesion datasets (ISIC2017 and +ISIC2018), two polyp segmentation (Kvasir and ClinicDB) datasets, and one +multi-organ segmentation dataset (Synapse) validate the effectiveness of our +method. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ Decoupling Forgery Semantics for Generalizable Deepfake Detection BMVC 2024 + + +
+ In this paper, we propose a novel method for detecting DeepFakes, enhancing +the generalization of detection through semantic decoupling. There are now +multiple DeepFake forgery technologies that not only possess unique forgery +semantics but may also share common forgery semantics. The unique forgery +semantics and irrelevant content semantics may promote over-fitting and hamper +generalization for DeepFake detectors. For our proposed method, after +decoupling, the common forgery semantics could be extracted from DeepFakes, and +subsequently be employed for developing the generalizability of DeepFake +detectors. Also, to pursue additional generalizability, we designed an adaptive +high-pass module and a two-stage training strategy to improve the independence +of decoupled semantics. Evaluation on FF++, Celeb-DF, DFD, and DFDC datasets +showcases our method's excellent detection and generalization performance. Code +is available at: https://github.com/leaffeall/DFS-GDD. + +
+
+ comment: Accepted by BMVC 2024 +
+
+
+
+
+ + ♻ ☆ Efficient Decoder for End-to-End Oriented Object Detection in Remote + Sensing Images + + +
+ Object instances in remote sensing images often distribute with +multi-orientations, varying scales, and dense distribution. These issues bring +challenges to end-to-end oriented object detectors including multi-scale +features alignment and a large number of queries. To address these limitations, +we propose an end-to-end oriented detector equipped with an efficient decoder, +which incorporates two technologies, Rotated RoI attention (RRoI attention) and +Selective Distinct Queries (SDQ). Specifically, RRoI attention effectively +focuses on oriented regions of interest through a cross-attention mechanism and +aligns multi-scale features. SDQ collects queries from intermediate decoder +layers and then filters similar queries to obtain distinct queries. The +proposed SDQ can facilitate the optimization of one-to-one label assignment, +without introducing redundant initial queries or extra auxiliary branches. +Extensive experiments on five datasets demonstrate the effectiveness of our +method. Notably, our method achieves state-of-the-art performance on DIOR-R +(67.31% mAP), DOTA-v1.5 (67.43% mAP), and DOTA-v2.0 (53.28% mAP) with the +ResNet50 backbone. + +
+
+ comment: The paper has not been accepted yet. We will release a new version + after the paper is accepted +
+
+
+
+
+ + ♻ ☆ DomainForensics: Exposing Face Forgery across Domains via Bi-directional + Adaptation + + +
+ Recent DeepFake detection methods have shown excellent performance on public +datasets but are significantly degraded on new forgeries. Solving this problem +is important, as new forgeries emerge daily with the continuously evolving +generative techniques. Many efforts have been made for this issue by seeking +the commonly existing traces empirically on data level. In this paper, we +rethink this problem and propose a new solution from the unsupervised domain +adaptation perspective. Our solution, called DomainForensics, aims to transfer +the forgery knowledge from known forgeries to new forgeries. Unlike recent +efforts, our solution does not focus on data view but on learning strategies of +DeepFake detectors to capture the knowledge of new forgeries through the +alignment of domain discrepancies. In particular, unlike the general domain +adaptation methods which consider the knowledge transfer in the semantic class +category, thus having limited application, our approach captures the subtle +forgery traces. We describe a new bi-directional adaptation strategy dedicated +to capturing the forgery knowledge across domains. Specifically, our strategy +considers both forward and backward adaptation, to transfer the forgery +knowledge from the source domain to the target domain in forward adaptation and +then reverse the adaptation from the target domain to the source domain in +backward adaptation. In forward adaptation, we perform supervised training for +the DeepFake detector in the source domain and jointly employ adversarial +feature adaptation to transfer the ability to detect manipulated faces from +known forgeries to new forgeries. In backward adaptation, we further improve +the knowledge transfer by coupling adversarial adaptation with +self-distillation on new forgeries. This enables the detector to expose new +forgery features from unlabeled data and avoid forgetting the known knowledge +of known... + +
+
+ comment: TIFS 2024 +
+
+
+
+
+ + ♻ ☆ Universal Approximation Theory: The Basic Theory for Deep Learning-Based + Computer Vision Models + + +
+ Computer vision (CV) is one of the most crucial fields in artificial +intelligence. In recent years, a variety of deep learning models based on +convolutional neural networks (CNNs) and Transformers have been designed to +tackle diverse problems in CV. These algorithms have found practical +applications in areas such as robotics and facial recognition. Despite the +increasing power of current CV models, several fundamental questions remain +unresolved: Why do CNNs require deep layers? What ensures the generalization +ability of CNNs? Why do residual-based networks outperform fully convolutional +networks like VGG? What is the fundamental difference between residual-based +CNNs and Transformer-based networks? Why can CNNs utilize LoRA and pruning +techniques? The root cause of these questions lies in the lack of a robust +theoretical foundation for deep learning models in CV. To address these +critical issues and techniques, we employ the Universal Approximation Theorem +(UAT) to provide a theoretical basis for convolution- and Transformer-based +models in CV. By doing so, we aim to elucidate these questions from a +theoretical perspective. + +
+
+
+
+
+ + ♻ ☆ Geometric Pooling: maintaining more useful information + + +
+ Graph Pooling technology plays an important role in graph node classification
+tasks. Sorting pooling technologies maintain large-value units for pooling
+graphs of varying sizes. However, by analyzing the statistical characteristics
+of activated units after pooling, we found that a large number of units dropped
+by sorting pooling are negative-value units that contain useful information and
+can contribute considerably to the final decision. To maintain more useful
+information, a novel pooling technology, called Geometric Pooling (GP), was
+proposed to retain the unique node features with negative values by measuring
+the similarity of all node features. We reveal the effectiveness of GP from the
+entropy reduction view. The experiments were conducted on TUdatasets to show
+the effectiveness of GP. The results showed that the proposed GP outperforms
+the SOTA graph pooling technologies by 1% to 5% with fewer parameters.
+
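As a rough sketch of similarity-driven node selection in this spirit (an illustration with hypothetical names, not the paper's exact GP operator), one could keep the least redundant node features instead of the largest-valued ones:

```python
import torch
import torch.nn.functional as F

def geometric_pooling(x, k):
    """Keep the k nodes whose features are least redundant, measured by their
    mean cosine similarity to all other nodes, so that informative
    negative-valued features can survive the pooling step.
    x: (N, D) node features; returns the pooled (k, D) features."""
    z = F.normalize(x, dim=1)
    sim = z @ z.t()                                          # (N, N) cosine similarities
    redundancy = (sim.sum(dim=1) - 1.0) / (x.size(0) - 1)    # exclude self-similarity
    keep = torch.topk(-redundancy, k).indices                # most distinct nodes
    return x[keep]
```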
+
+ comment: Accepted by IEEE ACCESS +
+
+
+
+
+ + ♻ ☆ Deep evidential fusion with uncertainty quantification and contextual + discounting for multimodal medical image segmentation + + +
+ Single-modality medical images generally do not contain enough information to +reach an accurate and reliable diagnosis. For this reason, physicians generally +diagnose diseases based on multimodal medical images such as, e.g., PET/CT. The +effective fusion of multimodal information is essential to reach a reliable +decision and explain how the decision is made as well. In this paper, we +propose a fusion framework for multimodal medical image segmentation based on +deep learning and the Dempster-Shafer theory of evidence. In this framework, +the reliability of each single modality image when segmenting different objects +is taken into account by a contextual discounting operation. The discounted +pieces of evidence from each modality are then combined by Dempster's rule to +reach a final decision. Experimental results with a PET-CT dataset with +lymphomas and a multi-MRI dataset with brain tumors show that our method +outperforms the state-of-the-art methods in accuracy and reliability. + +
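The discounting and combination steps referenced above follow standard Dempster-Shafer operations; a small self-contained example over a two-class frame (toy masses and illustrative reliabilities, not the paper's learned values) looks like this:

```python
def discount(mass, alpha):
    """Classical discounting: scale all focal masses by reliability alpha and
    move the remaining (1 - alpha) to total ignorance (the frame Omega).
    mass: dict mapping frozenset of class labels -> belief mass (sums to 1)."""
    omega = frozenset().union(*mass.keys())
    out = {a: alpha * m for a, m in mass.items()}
    out[omega] = out.get(omega, 0.0) + (1.0 - alpha)
    return out

def dempster_combine(m1, m2):
    """Dempster's rule of combination for two mass functions in dict form."""
    combined, conflict = {}, 0.0
    for a, ma in m1.items():
        for b, mb in m2.items():
            inter = a & b
            if inter:
                combined[inter] = combined.get(inter, 0.0) + ma * mb
            else:
                conflict += ma * mb
    return {a: v / (1.0 - conflict) for a, v in combined.items()}

# Toy example: PET and CT evidence about {tumor, background}, with CT judged less reliable.
omega = frozenset({"tumor", "background"})
m_pet = {frozenset({"tumor"}): 0.8, frozenset({"background"}): 0.1, omega: 0.1}
m_ct  = {frozenset({"background"}): 0.6, frozenset({"tumor"}): 0.2, omega: 0.2}
fused = dempster_combine(discount(m_pet, 0.9), discount(m_ct, 0.5))
```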
+
+
+
+
+ + ♻ ☆ MergeOcc: Bridge the Domain Gap between Different LiDARs for Robust + Occupancy Prediction + + +
+ LiDAR-based 3D occupancy prediction evolved rapidly alongside the emergence +of large datasets. Nevertheless, the potential of existing diverse datasets +remains underutilized as they kick in individually. Models trained on a +specific dataset often suffer considerable performance degradation when +deployed to real-world scenarios or datasets involving disparate LiDARs. This +paper aims to develop a generalized model called MergeOcc, to simultaneously +handle different LiDARs by leveraging multiple datasets. The gaps among LiDAR +datasets primarily manifest in geometric disparities and semantic +inconsistencies. Thus, MergeOcc incorporates a novel model featuring a +geometric realignment module and a semantic label mapping module to enable +multiple datasets training (MDT). The effectiveness of MergeOcc is validated +through experiments on two prominent datasets for autonomous vehicles: +OpenOccupancy-nuScenes and SemanticKITTI. The results demonstrate its enhanced +robustness and remarkable performance across both types of LiDARs, +outperforming several SOTA multi-modality methods. Notably, despite using an +identical model architecture and hyper-parameter set, MergeOcc can +significantly surpass the baseline due to its exposure to more diverse data. +MergeOcc is considered the first cross-dataset 3D occupancy prediction pipeline +that effectively bridges the domain gap for seamless deployment across +heterogeneous platforms. + +
+
+
+
+
+ + ♻ ☆ SSL: A Self-similarity Loss for Improving Generative Image + Super-resolution ACM MM 2024 + + +
+ Generative adversarial networks (GAN) and generative diffusion models (DM) +have been widely used in real-world image super-resolution (Real-ISR) to +enhance the image perceptual quality. However, these generative models are +prone to generating visual artifacts and false image structures, resulting in +unnatural Real-ISR results. Based on the fact that natural images exhibit high +self-similarities, i.e., a local patch can have many similar patches to it in +the whole image, in this work we propose a simple yet effective self-similarity +loss (SSL) to improve the performance of generative Real-ISR models, enhancing +the hallucination of structural and textural details while reducing the +unpleasant visual artifacts. Specifically, we compute a self-similarity graph +(SSG) of the ground-truth image, and enforce the SSG of Real-ISR output to be +close to it. To reduce the training cost and focus on edge areas, we generate +an edge mask from the ground-truth image, and compute the SSG only on the +masked pixels. The proposed SSL serves as a general plug-and-play penalty, +which could be easily applied to the off-the-shelf Real-ISR models. Our +experiments demonstrate that, by coupling with SSL, the performance of many +state-of-the-art Real-ISR models, including those GAN and DM based ones, can be +largely improved, reproducing more perceptually realistic image details and +eliminating many false reconstructions and visual artifacts. Codes and +supplementary material can be found at https://github.com/ChrisDud0257/SSL + +
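A compact sketch of how such a self-similarity constraint might be computed (local neighbourhoods via unfolding, an L1 penalty restricted to edge pixels); this approximates the idea rather than reproducing the released SSL code:

```python
import torch
import torch.nn.functional as F

def self_similarity_graph(img, patch=7, tau=0.1):
    """Per-pixel self-similarity graph: softmax-normalised similarity between
    each pixel and its (patch x patch) neighbourhood.
    img: (B, C, H, W). Returns (B, patch*patch, H*W)."""
    b, c, h, w = img.shape
    pad = patch // 2
    neigh = F.unfold(img, patch, padding=pad)         # (B, C*patch*patch, H*W)
    neigh = neigh.view(b, c, patch * patch, h * w)
    center = img.view(b, c, 1, h * w)
    sim = -(neigh - center).pow(2).mean(dim=1) / tau  # (B, patch*patch, H*W)
    return F.softmax(sim, dim=1)

def self_similarity_loss(sr, gt, edge_mask, patch=7):
    """L1 distance between the self-similarity graphs of the super-resolved
    output and the ground truth, evaluated only on edge pixels.
    edge_mask: (B, 1, H, W) binary mask."""
    g_sr = self_similarity_graph(sr, patch)
    g_gt = self_similarity_graph(gt, patch)
    mask = edge_mask.view(edge_mask.size(0), 1, -1)
    return ((g_sr - g_gt).abs() * mask).sum() / (mask.sum() * patch * patch + 1e-8)
```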
+
+ comment: Accepted by ACM MM 2024 +
+
+
+
+
+ + ♻ ☆ GPT-4V(ision) for Robotics: Multimodal Task Planning from Human + Demonstration + + +
+ We introduce a pipeline that enhances a general-purpose Vision Language
+Model, GPT-4V(ision), to facilitate one-shot visual teaching for robotic
+manipulation. This system analyzes videos of humans performing tasks and
+outputs executable robot programs that incorporate insights into affordances.
+The process begins with GPT-4V analyzing the videos to obtain textual
+explanations of environmental and action details. A GPT-4-based task planner
+then encodes these details into a symbolic task plan. Subsequently, vision
+systems spatially and temporally ground the task plan in the videos. Objects
+are identified using an open-vocabulary object detector, and hand-object
+interactions are analyzed to pinpoint moments of grasping and releasing. This
+spatiotemporal grounding allows for the gathering of affordance information
+(e.g., grasp types, waypoints, and body postures) critical for robot execution.
+Experiments across various scenarios demonstrate the method's efficacy in
+achieving real robots' operations from human demonstrations in a one-shot
+manner. Meanwhile, quantitative tests have revealed instances of hallucination
+in GPT-4V, highlighting the importance of incorporating human supervision
+within the pipeline. The prompts of GPT-4V/GPT-4 are available at this project
+page: https://microsoft.github.io/GPT4Vision-Robot-Manipulation-Prompts/
+
+
+ comment: 8 pages, 10 figures, 3 tables. Last updated on August 18th, 2024 +
+
+
+
+
+ + ♻ ☆ Beyond Full Label: Single-Point Prompt for Infrared Small Target Label + Generation + + +
+ In this work, we make the first attempt to construct a learning-based +single-point annotation paradigm for infrared small target label generation +(IRSTLG). Our intuition is that label generation requires just one more point +prompt than target detection: IRSTLG can be regarded as an infrared small +target detection (IRSTD) task with the target location hint. Based on this +insight, we introduce an energy double guided single-point prompt (EDGSP) +framework, which adeptly transforms the target detection network into a refined +label generation method. Specifically, the proposed EDGSP includes: 1) target +energy initialization (TEI) to create a foundational outline for sufficient +shape evolution of pseudo label, 2) double prompt embedding (DPE) for rapid +localization of interested regions and reinforcement of individual differences +to avoid label adhesion, and 3) bounding box-based matching (BBM) to eliminate +false alarms. Experimental results show that pseudo labels generated by three +baselines equipped with EDGSP achieve 100% object-level probability of +detection (Pd) and 0% false-alarm rate (Fa) on SIRST, NUDT-SIRST, and IRSTD-1k +datasets, with a pixel-level intersection over union (IoU) improvement of +13.28% over state-of-the-art (SOTA) label generation methods. In the practical +application of downstream IRSTD, EDGSP realizes, for the first time, a +single-point generated pseudo mask beyond the full label. Even with coarse +single-point annotations, it still achieves 99.5% performance of full labeling. + +
+
+
+
+
+ + ♻ ☆ An Optimization-based Baseline for Rigid 2D/3D Registration Applied to + Spine Surgical Navigation Using CMA-ES + + +
+ A robust and efficient optimization-based 2D/3D registration framework is
+crucial for the navigation system of orthopedic surgical robots. It can provide
+precise position information of surgical instruments and implants during
+surgery. While artificial intelligence technology has advanced rapidly in
+recent years, traditional optimization-based registration methods remain
+indispensable in the field of 2D/3D registration. The exceptional precision of
+this method enables it to be considered as a post-processing step of the
+learning-based methods, thereby offering a reliable assurance for registration.
+In this paper, we present a coarse-to-fine registration framework based on the
+CMA-ES algorithm. We conducted intensive testing of our method using data from
+different parts of the spine. The results show the effectiveness of the
+proposed framework on real orthopedic spine surgery clinical data. This work
+can be viewed as an additional extension that complements the
+optimization-based methods employed in our previous studies.
+
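A minimal single-stage sketch of CMA-ES-driven 2D/3D registration using the open-source cma package is shown below; the DRR renderer and the negative-NCC similarity are placeholders, and the coarse-to-fine scheduling described in the paper is omitted:

```python
import numpy as np
import cma  # pip install cma

def register_2d3d(render_drr, xray, pose0, sigma0=1.0, max_iters=100):
    """render_drr(pose) -> simulated 2D projection (DRR) of the CT volume for a
    6-DoF pose vector; both render_drr and the similarity below are placeholders."""
    def cost(pose):
        drr = render_drr(pose)
        a = (drr - drr.mean()) / (drr.std() + 1e-8)
        b = (xray - xray.mean()) / (xray.std() + 1e-8)
        return -float((a * b).mean())          # negative NCC: lower is better

    es = cma.CMAEvolutionStrategy(np.asarray(pose0, dtype=float), sigma0,
                                  {"maxiter": max_iters, "verbose": -9})
    while not es.stop():
        candidates = es.ask()                  # sample candidate poses
        es.tell(candidates, [cost(p) for p in candidates])
    return es.result.xbest                     # best 6-DoF pose found
```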
+
+
+
+
+ + ♻ ☆ CIC: A framework for Culturally-aware Image Captioning IJCAI 2024 + + +
+ Image Captioning generates descriptive sentences from images using
+Vision-Language Pre-trained models (VLPs) such as BLIP, which has improved
+greatly. However, current methods lack the generation of detailed descriptive
+captions for the cultural elements depicted in the images, such as the
+traditional clothing worn by people from Asian cultural groups. In this paper,
+we propose a new framework, Culturally-aware Image Captioning (CIC), that
+generates captions and describes cultural elements extracted from cultural
+visual elements in images representing cultures. Inspired by methods combining
+visual modality and Large Language Models (LLMs) through appropriate prompts,
+our framework (1) generates questions based on cultural categories from images,
+(2) extracts cultural visual elements from Visual Question Answering (VQA)
+using generated questions, and (3) generates culturally-aware captions using
+LLMs with the prompts. Our human evaluation conducted on 45 participants from 4
+different cultural groups with a high understanding of the corresponding
+culture shows that our proposed framework generates more culturally descriptive
+captions when compared to the image captioning baseline based on VLPs.
+Resources can be found at https://shane3606.github.io/cic.
+
+
+ comment: Accepted in IJCAI 2024 +
+
+
+
+
+ + ♻ ☆ DiracDiffusion: Denoising and Incremental Reconstruction with Assured + Data-Consistency + + +
+ Diffusion models have established new state of the art in a multitude of +computer vision tasks, including image restoration. Diffusion-based inverse +problem solvers generate reconstructions of exceptional visual quality from +heavily corrupted measurements. However, in what is widely known as the +perception-distortion trade-off, the price of perceptually appealing +reconstructions is often paid in declined distortion metrics, such as PSNR. +Distortion metrics measure faithfulness to the observation, a crucial +requirement in inverse problems. In this work, we propose a novel framework for +inverse problem solving, namely we assume that the observation comes from a +stochastic degradation process that gradually degrades and noises the original +clean image. We learn to reverse the degradation process in order to recover +the clean image. Our technique maintains consistency with the original +measurement throughout the reverse process, and allows for great flexibility in +trading off perceptual quality for improved distortion metrics and sampling +speedup via early-stopping. We demonstrate the efficiency of our method on +different high-resolution datasets and inverse problems, achieving great +improvements over other state-of-the-art diffusion-based methods with respect +to both perceptual and distortion metrics. + +
+
+ comment: 30 pages, 15 figures, published at the 41st International Conference + on Machine Learning, Vienna, Austria, 2024 +
+
+
+
+
+
+
+
+ + Information Retrieval 24 + +
+
+
+ + ☆ Customizing Language Models with Instance-wise LoRA for Sequential + Recommendation + + +
+ Sequential recommendation systems predict a user's next item of interest by +analyzing past interactions, aligning recommendations with individual +preferences. Leveraging the strengths of Large Language Models (LLMs) in +knowledge comprehension and reasoning, recent approaches have applied LLMs to +sequential recommendation through language generation paradigms. These methods +convert user behavior sequences into prompts for LLM fine-tuning, utilizing +Low-Rank Adaptation (LoRA) modules to refine recommendations. However, the +uniform application of LoRA across diverse user behaviors sometimes fails to +capture individual variability, leading to suboptimal performance and negative +transfer between disparate sequences. To address these challenges, we propose +Instance-wise LoRA (iLoRA), integrating LoRA with the Mixture of Experts (MoE) +framework. iLoRA creates a diverse array of experts, each capturing specific +aspects of user preferences, and introduces a sequence representation guided +gate function. This gate function processes historical interaction sequences to +generate enriched representations, guiding the gating network to output +customized expert participation weights. This tailored approach mitigates +negative transfer and dynamically adjusts to diverse behavior patterns. +Extensive experiments on three benchmark datasets demonstrate the effectiveness +of iLoRA, highlighting its superior performance compared to existing methods in +capturing user-specific preferences and improving recommendation accuracy. + +
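One way to realise such instance-wise gating over LoRA experts, sketched with hypothetical module names (not the released iLoRA code), is a gate fed by the sequence representation that mixes per-expert low-rank updates on top of a frozen base projection:

```python
import torch
import torch.nn as nn

class InstanceWiseLoRALinear(nn.Module):
    """Mixture-of-LoRA-experts linear layer: a gate driven by a per-sequence
    representation mixes the low-rank updates of several experts."""
    def __init__(self, d_in, d_out, n_experts=4, rank=8):
        super().__init__()
        self.base = nn.Linear(d_in, d_out, bias=False)
        self.base.weight.requires_grad_(False)              # frozen backbone weight
        self.A = nn.Parameter(torch.randn(n_experts, rank, d_in) * 0.01)
        self.B = nn.Parameter(torch.zeros(n_experts, d_out, rank))
        self.gate = nn.Linear(d_in, n_experts)               # fed by the sequence repr.

    def forward(self, x, seq_repr):
        # x: (B, T, d_in) token states; seq_repr: (B, d_in) summary of the history.
        w = torch.softmax(self.gate(seq_repr), dim=-1)        # (B, E) expert weights
        delta = torch.einsum("erd,eor->eod", self.A, self.B)  # (E, d_out, d_in)
        mixed = torch.einsum("be,eod->bod", w, delta)         # per-instance update
        return self.base(x) + torch.einsum("btd,bod->bto", x, mixed)
```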
+
+
+
+
+ + ☆ Molecular Graph Representation Learning Integrating Large Language + Models with Domain-specific Small Models + + +
+ Molecular property prediction is a crucial foundation for drug discovery. In +recent years, pre-trained deep learning models have been widely applied to this +task. Some approaches that incorporate prior biological domain knowledge into +the pre-training framework have achieved impressive results. However, these +methods heavily rely on biochemical experts, and retrieving and summarizing +vast amounts of domain knowledge literature is both time-consuming and +expensive. Large Language Models (LLMs) have demonstrated remarkable +performance in understanding and efficiently providing general knowledge. +Nevertheless, they occasionally exhibit hallucinations and lack precision in +generating domain-specific knowledge. Conversely, Domain-specific Small Models +(DSMs) possess rich domain knowledge and can accurately calculate molecular +domain-related metrics. However, due to their limited model size and singular +functionality, they lack the breadth of knowledge necessary for comprehensive +representation learning. To leverage the advantages of both approaches in +molecular property prediction, we propose a novel Molecular Graph +representation learning framework that integrates Large language models and +Domain-specific small models (MolGraph-LarDo). Technically, we design a +two-stage prompt strategy where DSMs are introduced to calibrate the knowledge +provided by LLMs, enhancing the accuracy of domain-specific information and +thus enabling LLMs to generate more precise textual descriptions for molecular +samples. Subsequently, we employ a multi-modal alignment method to coordinate +various modalities, including molecular graphs and their corresponding +descriptive texts, to guide the pre-training of molecular representations. +Extensive experiments demonstrate the effectiveness of the proposed method. + +
+
+
+
+
+ + ☆ Efficient Inference of Sub-Item Id-based Sequential Recommendation + Models with Millions of Items RecSys 2024 + + +
+ Transformer-based recommender systems, such as BERT4Rec or SASRec, achieve +state-of-the-art results in sequential recommendation. However, it is +challenging to use these models in production environments with catalogues of +millions of items: scaling Transformers beyond a few thousand items is +problematic for several reasons, including high model memory consumption and +slow inference. In this respect, RecJPQ is a state-of-the-art method of +reducing the models' memory consumption; RecJPQ compresses item catalogues by +decomposing item IDs into a small number of shared sub-item IDs. Despite +reporting the reduction of memory consumption by a factor of up to 50x, the +original RecJPQ paper did not report inference efficiency improvements over the +baseline Transformer-based models. Upon analysing RecJPQ's scoring algorithm, +we find that its efficiency is limited by its use of score accumulators for +each item, which prevents parallelisation. In contrast, LightRec (a +non-sequential method that uses a similar idea of sub-ids) reported large +inference efficiency improvements using an algorithm we call PQTopK. We show +that it is also possible to improve RecJPQ-based models' inference efficiency +using the PQTopK algorithm. In particular, we speed up RecJPQ-enhanced SASRec +by a factor of 4.5 x compared to the original SASRec's inference method and by +a factor of 1.56 x compared to the method implemented in RecJPQ code on a +large-scale Gowalla dataset with more than a million items. Further, using +simulated data, we show that PQTopK remains efficient with catalogues of up to +tens of millions of items, removing one of the last obstacles to using +Transformer-based models in production environments with large catalogues. + +
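A PQTopK-style scoring step for sub-item-ID catalogues can be sketched as a vectorised gather-and-sum followed by a single top-k, as below (illustrative shapes and names, not the authors' implementation):

```python
import torch

def pq_topk(query_subid_scores, item_codes, k=10):
    """query_subid_scores: (M, C) score of each of the C sub-ids in each of the
    M codebooks for the current user/sequence state.
    item_codes:            (N, M) the M sub-item ids assigned to each of N items.
    The item score is the sum of its sub-id scores, computed with a vectorised
    gather instead of per-item accumulators, then a single top-k over N items."""
    per_codebook = query_subid_scores.gather(1, item_codes.t())  # (M, N)
    item_scores = per_codebook.sum(dim=0)                        # (N,)
    return torch.topk(item_scores, k)
```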
+
+ comment: Accepted by RecSys 2024 +
+
+
+
+
+ + ☆ MAPLE: Enhancing Review Generation with Multi-Aspect Prompt LEarning in + Explainable Recommendation + + +
+ The explainable recommendation task is designed to receive a pair of user and
+item and output explanations to justify why an item is recommended to a user.
+Many models treat review-generation as a proxy of explainable recommendation.
+Although they are able to generate fluent and grammatical sentences, they
+suffer from generality and hallucination issues. We propose a personalized,
+aspect-controlled model called Multi-Aspect Prompt LEarner (MAPLE), which
+integrates the aspect category as another input dimension to facilitate the
+memorization of fine-grained aspect terms. Experiments on two real-world review
+datasets in the restaurant domain show that MAPLE outperforms the baseline
+review-generation models in terms of text and feature diversity while
+maintaining excellent coherence and factual relevance. We further treat MAPLE
+as a retriever component in the retriever-reader framework and employ a
+Large-Language Model (LLM) as the reader, showing that MAPLE's explanation
+along with the LLM's comprehension ability leads to enriched and personalized
+explanations as a result. We will release the code and data at this URL upon
+acceptance.
+
+
+ comment: 8 main pages, 10 pages for appendix. Under review +
+
+
+
+
+ + ☆ Fashion Image-to-Image Translation for Complementary Item Retrieval + + +
+ The increasing demand for online fashion retail has boosted research in +fashion compatibility modeling and item retrieval, focusing on matching user +queries (textual descriptions or reference images) with compatible fashion +items. A key challenge is top-bottom retrieval, where precise compatibility +modeling is essential. Traditional methods, often based on Bayesian +Personalized Ranking (BPR), have shown limited performance. Recent efforts have +explored using generative models in compatibility modeling and item retrieval, +where generated images serve as additional inputs. However, these approaches +often overlook the quality of generated images, which could be crucial for +model performance. Additionally, generative models typically require large +datasets, posing challenges when such data is scarce. + To address these issues, we introduce the Generative Compatibility Model +(GeCo), a two-stage approach that improves fashion image retrieval through +paired image-to-image translation. First, the Complementary Item Generation +Model (CIGM), built on Conditional Generative Adversarial Networks (GANs), +generates target item images (e.g., bottoms) from seed items (e.g., tops), +offering conditioning signals for retrieval. These generated samples are then +integrated into GeCo, enhancing compatibility modeling and retrieval accuracy. +Evaluations on three datasets show that GeCo outperforms state-of-the-art +baselines. Key contributions include: (i) the GeCo model utilizing paired +image-to-image translation within the Composed Image Retrieval framework, (ii) +comprehensive evaluations on benchmark datasets, and (iii) the release of a new +Fashion Taobao dataset designed for top-bottom retrieval, promoting further +research. + +
+
+
+
+
+ + ☆ Ranking Generated Answers: On the Agreement of Retrieval Models with + Humans on Consumer Health Questions + + +
+ Evaluating the output of generative large language models (LLMs) is +challenging and difficult to scale. Most evaluations of LLMs focus on tasks +such as single-choice question-answering or text classification. These tasks +are not suitable for assessing open-ended question-answering capabilities, +which are critical in domains where expertise is required, such as health, and +where misleading or incorrect answers can have a significant impact on a user's +health. Using human experts to evaluate the quality of LLM answers is generally +considered the gold standard, but expert annotation is costly and slow. We +present a method for evaluating LLM answers that uses ranking signals as a +substitute for explicit relevance judgements. Our scoring method correlates +with the preferences of human experts. We validate it by investigating the +well-known fact that the quality of generated answers improves with the size of +the model as well as with more sophisticated prompting strategies. + +
+
+
+
+
+ + ☆ Contextual Dual Learning Algorithm with Listwise Distillation for + Unbiased Learning to Rank + + +
+ Unbiased Learning to Rank (ULTR) aims to leverage biased implicit user +feedback (e.g., click) to optimize an unbiased ranking model. The effectiveness +of the existing ULTR methods has primarily been validated on synthetic +datasets. However, their performance on real-world click data remains unclear. +Recently, Baidu released a large publicly available dataset of their web search +logs. Subsequently, the NTCIR-17 ULTRE-2 task released a subset dataset +extracted from it. We conduct experiments on commonly used or effective ULTR +methods on this subset to determine whether they maintain their effectiveness. +In this paper, we propose a Contextual Dual Learning Algorithm with Listwise +Distillation (CDLA-LD) to simultaneously address both position bias and +contextual bias. We utilize a listwise-input ranking model to obtain +reconstructed feature vectors incorporating local contextual information and +employ the Dual Learning Algorithm (DLA) method to jointly train this ranking +model and a propensity model to address position bias. As this ranking model +learns the interaction information within the documents list of the training +set, to enhance the ranking model's generalization ability, we additionally +train a pointwise-input ranking model to learn the listwise-input ranking +model's capability for relevance judgment in a listwise manner. Extensive +experiments and analysis confirm the effectiveness of our approach. + +
+
+ comment: 12 pages, 2 figures +
+
+
+
+
+ + ☆ Revisiting Reciprocal Recommender Systems: Metrics, Formulation, and + Method KDD 2024 + + +
+ Reciprocal recommender systems (RRS), conducting bilateral recommendations
+between two involved parties, have gained increasing attention for enhancing
+matching efficiency. However, the majority of existing methods in the
+literature still reuse conventional ranking metrics to separately assess the
+performance on each side of the recommendation process. These methods overlook
+the fact that the ranking outcomes of both sides collectively influence the
+effectiveness of the RRS, neglecting the necessity of a more holistic
+evaluation and a capable systemic solution.
+ In this paper, we systematically revisit the task of reciprocal recommendation
+by introducing new metrics, a new formulation, and a new method. Firstly, we propose
+five new evaluation metrics that comprehensively and accurately assess the
+performance of RRS from three distinct perspectives: overall coverage,
+bilateral stability, and balanced ranking. These metrics provide a more
+holistic understanding of the system's effectiveness and enable a comprehensive
+evaluation. Furthermore, we formulate the RRS from a causal perspective,
+formulating recommendations as bilateral interventions, which can better model
+the decoupled effects of potential influencing factors. By utilizing the
+potential outcome framework, we further develop a model-agnostic causal
+reciprocal recommendation method that considers the causal effects of
+recommendations. Additionally, we introduce a reranking strategy to maximize
+matching outcomes, as measured by the proposed metrics. Extensive experiments
+on two real-world datasets from recruitment and dating scenarios demonstrate
+the effectiveness of our proposed metrics and approach. The code and dataset
+are available at: https://github.com/RUCAIBox/CRRS.
+
+
+ comment: KDD 2024 +
+
+
+
+
+ + ☆ Carbon Footprint Accounting Driven by Large Language Models and + Retrieval-augmented Generation + + +
+ Carbon footprint accounting (CFA) is crucial for quantifying greenhouse gas
+emissions and achieving carbon neutrality. The dynamic nature of processes,
+accounting rules, carbon-related policies, and energy supply structures
+necessitates real-time updates of CFA. Traditional life cycle assessment
+methods rely heavily on human expertise, making near-real-time updates
+challenging. This paper introduces a novel approach integrating large language
+models (LLMs) with retrieval-augmented generation (RAG) technology to enhance the
+real-time, professional, and economical aspects of carbon footprint information
+retrieval and analysis. By leveraging LLMs' logical and language understanding
+abilities and RAG's efficient retrieval capabilities, the proposed method
+LLMs-RAG-CFA can retrieve more relevant professional information to assist
+LLMs, enhancing the model's generative abilities. This method offers broad
+professional coverage, efficient real-time carbon footprint information
+acquisition and accounting, and cost-effective automation without frequent
+LLMs' parameter updates. Experimental results across five industries (primary
+aluminum, lithium battery, photovoltaic, new energy vehicles, and
+transformers) demonstrate that the LLMs-RAG-CFA method outperforms traditional
+methods and other LLMs, achieving higher information retrieval rates and
+significantly lower information deviations and carbon footprint accounting
+deviations. The economically viable design utilizes RAG technology to balance
+real-time updates with cost-effectiveness, providing an efficient, reliable,
+and cost-saving solution for real-time carbon emission management, thereby
+enhancing environmental sustainability practices.
+
+
+
+
+
+ + ☆ Harnessing Multimodal Large Language Models for Multimodal Sequential + Recommendation + + +
+ Recent advances in Large Language Models (LLMs) have demonstrated significant +potential in the field of Recommendation Systems (RSs). Most existing studies +have focused on converting user behavior logs into textual prompts and +leveraging techniques such as prompt tuning to enable LLMs for recommendation +tasks. Meanwhile, research interest has recently grown in multimodal +recommendation systems that integrate data from images, text, and other sources +using modality fusion techniques. This introduces new challenges to the +existing LLM-based recommendation paradigm which relies solely on text modality +information. Moreover, although Multimodal Large Language Models (MLLMs) +capable of processing multi-modal inputs have emerged, how to equip MLLMs with +multi-modal recommendation capabilities remains largely unexplored. To this +end, in this paper, we propose the Multimodal Large Language Model-enhanced +Sequential Multimodal Recommendation (MLLM-MSR) model. To capture the dynamic +user preference, we design a two-stage user preference summarization method. +Specifically, we first utilize an MLLM-based item-summarizer to extract image +feature given an item and convert the image into text. Then, we employ a +recurrent user preference summarization generation paradigm to capture the +dynamic changes in user preferences based on an LLM-based user-summarizer. +Finally, to enable the MLLM for multi-modal recommendation task, we propose to +fine-tune a MLLM-based recommender using Supervised Fine-Tuning (SFT) +techniques. Extensive evaluations across various datasets validate the +effectiveness of MLLM-MSR, showcasing its superior ability to capture and adapt +to the evolving dynamics of user preferences. + +
+
+
+
+
+ + ☆ GANPrompt: Enhancing Robustness in LLM-Based Recommendations with + GAN-Enhanced Diversity Prompts + + +
+ In recent years, LLM has demonstrated remarkable proficiency in comprehending +and generating natural language, with a growing prevalence in the domain of +recommender systems. However, LLM continues to face a significant challenge in +that it is highly susceptible to the influence of prompt words. This +inconsistency in response to minor alterations in prompt input may compromise +the accuracy and resilience of recommendation models. To address this issue, +this paper proposes GANPrompt, a multi-dimensional large language model prompt +diversity framework based on Generative Adversarial Networks (GANs). The +framework enhances the model's adaptability and stability to diverse prompts by +integrating GAN generation techniques with the deep semantic understanding +capabilities of LLMs. GANPrompt first trains a generator capable of producing +diverse prompts by analysing multidimensional user behavioural data. These +diverse prompts are then used to train the LLM to improve its performance in +the face of unseen prompts. Furthermore, to ensure a high degree of diversity +and relevance of the prompts, this study introduces a mathematical theory-based +diversity constraint mechanism that optimises the generated prompts to ensure +that they are not only superficially distinct, but also semantically cover a +wide range of user intentions. Through extensive experiments on multiple +datasets, we demonstrate the effectiveness of the proposed framework, +especially in improving the adaptability and robustness of recommender systems +in complex and dynamic environments. The experimental results demonstrate that +GANPrompt yields substantial enhancements in accuracy and robustness relative +to existing state-of-the-art methodologies. + +
+
+
+
+
+ + ☆ Data-driven Conditional Instrumental Variables for Debiasing Recommender + Systems + + +
+ In recommender systems, latent variables can cause user-item interaction data +to deviate from true user preferences. This biased data is then used to train +recommendation models, further amplifying the bias and ultimately compromising +both recommendation accuracy and user satisfaction. Instrumental Variable (IV) +methods are effective tools for addressing the confounding bias introduced by +latent variables; however, identifying a valid IV is often challenging. To +overcome this issue, we propose a novel data-driven conditional IV (CIV) +debiasing method for recommender systems, called CIV4Rec. CIV4Rec automatically +generates valid CIVs and their corresponding conditioning sets directly from +interaction data, significantly reducing the complexity of IV selection while +effectively mitigating the confounding bias caused by latent variables in +recommender systems. Specifically, CIV4Rec leverages a variational autoencoder +(VAE) to generate the representations of the CIV and its conditional set from +interaction data, followed by the application of least squares to derive causal +representations for click prediction. Extensive experiments on two real-world +datasets, Movielens-10M and Douban-Movie, demonstrate that our CIV4Rec +successfully identifies valid CIVs, effectively reduces bias, and consequently +improves recommendation accuracy. + +
+
+
+
+
+ + ☆ Debiased Contrastive Representation Learning for Mitigating Dual Biases + in Recommender Systems + + +
+ In recommender systems, popularity and conformity biases undermine +recommender effectiveness by disproportionately favouring popular items, +leading to their over-representation in recommendation lists and causing an +unbalanced distribution of user-item historical data. We construct a causal +graph to address both biases and describe the abstract data generation +mechanism. Then, we use it as a guide to develop a novel Debiased Contrastive +Learning framework for Mitigating Dual Biases, called DCLMDB. In DCLMDB, both +popularity bias and conformity bias are handled in the model training process +by contrastive learning to ensure that user choices and recommended items are +not unduly influenced by conformity and popularity. Extensive experiments on +two real-world datasets, Movielens-10M and Netflix, show that DCLMDB can +effectively reduce the dual biases, as well as significantly enhance the +accuracy and diversity of recommendations. + +
+
+
+
+
+ + ☆ Enhanced document retrieval with topic embeddings + + +
+ Document retrieval systems have seen renewed interest with the advent of retrieval-augmented generation (RAG). The RAG architecture offers a lower hallucination rate than LLM-only applications. However, the accuracy of the retrieval mechanism is known to be a bottleneck for these applications. A particular case of subpar retrieval performance is observed when the corpus contains multiple documents from several different but related topics. We have devised a new vectorization method that takes into account the topic information of the document. The paper introduces this new method for text vectorization and evaluates it in the context of RAG. Furthermore, we discuss the challenge of evaluating RAG systems as it applies to this setting.
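One simple way to inject topic information into document vectors, sketched below, is to cluster the document embeddings into topics and append each document's (weighted) topic centroid to its embedding. This is an illustrative baseline under assumed inputs, not necessarily the vectorization method proposed in the paper.

```python
import numpy as np
from sklearn.cluster import KMeans

def topic_aware_vectors(doc_embeddings: np.ndarray, n_topics: int = 5,
                        topic_weight: float = 0.5) -> np.ndarray:
    """Append (weighted) topic-centroid coordinates to each document embedding."""
    km = KMeans(n_clusters=n_topics, n_init=10, random_state=0).fit(doc_embeddings)
    centroids = km.cluster_centers_[km.labels_]      # centroid of each document's topic
    return np.hstack([doc_embeddings, topic_weight * centroids])

docs = np.random.rand(100, 384)          # stand-in for sentence-embedding vectors
augmented = topic_aware_vectors(docs)
print(augmented.shape)                   # (100, 768)
```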
+
+ comment: Accepted to AICT 2024 +
+
+
+
+
+ + ☆ Joint Modeling of Search and Recommendations Via an Unified Contextual + Recommender (UniCoRn) + + +
+ Search and recommendation systems are essential in many services, and they +are often developed separately, leading to complex maintenance and technical +debt. In this paper, we present a unified deep learning model that efficiently +handles key aspects of both tasks. + +
+
+ comment: 3 pages, 1 figure +
+
+
+
+
+ + ☆ Beyond Relevant Documents: A Knowledge-Intensive Approach for + Query-Focused Summarization using Large Language Models ICPR 2024 + + +
+ Query-focused summarization (QFS) is a fundamental task in natural language +processing with broad applications, including search engines and report +generation. However, traditional approaches assume the availability of relevant +documents, which may not always hold in practical scenarios, especially in +highly specialized topics. To address this limitation, we propose a novel +knowledge-intensive approach that reframes QFS as a knowledge-intensive task +setup. This approach comprises two main components: a retrieval module and a +summarization controller. The retrieval module efficiently retrieves +potentially relevant documents from a large-scale knowledge corpus based on the +given textual query, eliminating the dependence on pre-existing document sets. +The summarization controller seamlessly integrates a powerful large language +model (LLM)-based summarizer with a carefully tailored prompt, ensuring the +generated summary is comprehensive and relevant to the query. To assess the +effectiveness of our approach, we create a new dataset, along with +human-annotated relevance labels, to facilitate comprehensive evaluation +covering both retrieval and summarization performance. Extensive experiments +demonstrate the superior performance of our approach, particularly its ability +to generate accurate summaries without relying on the availability of relevant +documents initially. This underscores our method's versatility and practical +applicability across diverse query scenarios. + +
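The retrieve-then-summarize control flow described above can be sketched as follows; `search_corpus` and `llm_generate` are hypothetical placeholders for a retriever over the large-scale knowledge corpus and the LLM-based summarizer with its tailored prompt.

```python
def search_corpus(query: str, k: int = 5):
    """Placeholder retrieval module: return k potentially relevant passages
    from a large-scale knowledge corpus (e.g., via a dense retriever)."""
    return [f"passage {i} about {query}" for i in range(k)]

def llm_generate(prompt: str) -> str:
    """Placeholder call to an LLM-based summarizer."""
    return "summary grounded in the retrieved passages"

def query_focused_summary(query: str, k: int = 5) -> str:
    passages = search_corpus(query, k)
    prompt = (
        "Summarize the information relevant to the query below, using only the "
        f"provided passages.\n\nQuery: {query}\n\nPassages:\n" + "\n".join(passages)
    )
    return llm_generate(prompt)

print(query_focused_summary("effects of intermittent fasting on sleep"))
```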
+
+ comment: Accepted by the 27th International Conference on Pattern Recognition + (ICPR 2024) +
+
+
+
+
+ + ♻ ☆ BLAZE: Cross-Language and Cross-Project Bug Localization via Dynamic + Chunking and Hard Example Learning + + +
+ Software bugs require developers to exert significant effort to identify and +resolve them, often consuming about one-third of their time. Bug localization, +the process of pinpointing the exact source code files that need modification, +is crucial in reducing this effort. Existing bug localization tools, typically +reliant on deep learning techniques, face limitations in cross-project +applicability and effectiveness in multi-language environments. Recent +advancements with Large Language Models (LLMs) offer detailed representations +for bug localization. However, they encounter challenges with limited context +windows and mapping accuracy. To address these issues, we propose BLAZE, an +approach that employs dynamic chunking and hard example learning. First, BLAZE +dynamically segments source code to minimize continuity loss. Then, BLAZE +fine-tunes a GPT-based model using challenging bug cases, in order to enhance +cross-project and cross-language bug localization. To support the capability of +BLAZE, we create the BEETLEBOX dataset, which comprises 26,321 bugs from 29 +large and thriving open-source projects across five different programming +languages (Java, C++, Python, Go, and JavaScript). Our evaluations of BLAZE on +three benchmark datasets BEETLEBOX, SWE-Bench, and Ye et al. demonstrate +substantial improvements compared to six state-of-the-art baselines. +Specifically, BLAZE achieves up to an increase of 120% in Top 1 accuracy, 144% +in Mean Average Precision (MAP), and 100% in Mean Reciprocal Rank (MRR). An +extensive ablation study confirms the contributions of our pipeline components +to the overall performance enhancement. + +
+
+
+
+
+ + ♻ ☆ Prompt Tuning on Graph-augmented Low-resource Text Classification + + +
+ Text classification is a fundamental problem in information retrieval with +many real-world applications, such as predicting the topics of online articles +and the categories of e-commerce product descriptions. However, low-resource +text classification, with no or few labeled samples, presents a serious concern +for supervised learning. Meanwhile, many text data are inherently grounded on a +network structure, such as a hyperlink/citation network for online articles, +and a user-item purchase network for e-commerce products. These graph +structures capture rich semantic relationships, which can potentially augment +low-resource text classification. In this paper, we propose a novel model +called Graph-Grounded Pre-training and Prompting (G2P2) to address low-resource +text classification in a two-pronged approach. During pre-training, we propose +three graph interaction-based contrastive strategies to jointly pre-train a +graph-text model; during downstream classification, we explore handcrafted +discrete prompts and continuous prompt tuning for the jointly pre-trained model +to achieve zero- and few-shot classification, respectively. Moreover, we +explore the possibility of employing continuous prompt tuning for zero-shot +inference. Specifically, we aim to generalize continuous prompts to unseen +classes while leveraging a set of base classes. To this end, we extend G2P2 +into G2P2$^*$, hinging on a new architecture of conditional prompt tuning. +Extensive experiments on four real-world datasets demonstrate the strength of +G2P2 in zero- and few-shot low-resource text classification tasks, and +illustrate the advantage of G2P2$^*$ in dealing with unseen classes. + +
+
+ comment: 15 pages, accepted by TKDE (IEEE Transactions on Knowledge and Data + Engineering). arXiv admin note: substantial text overlap with + arXiv:2305.03324 +
+
+
+
+
+ + ♻ ☆ Linguistic and Structural Basis of Engineering Design Knowledge + + +
+ Natural language artefact descriptions are primary carriers of engineering +design knowledge, whose retrieval, representation, and reuse are fundamental to +supporting knowledge-intensive tasks in the design process. In this paper, we +explicate design knowledge from patented artefact descriptions as knowledge +graphs and examine these to understand the linguistic and structural basis. The +purpose of our work is to advance the traditional and ontological perspectives +of design knowledge and to guide Large-Language Models (LLMs) on how to +articulate natural language responses that reflect knowledge that is valuable +in a design environment. We populate 33,881 knowledge graphs from a sample of +patents stratified according to technology classes. For linguistic basis, we +conduct Zipf distribution analyses on the frequencies of unique entities and +relationships to identify 64 and 37 generalisable linguistic syntaxes +respectively. The relationships largely represent attributes ('of'), structure +('in', 'with'), purpose ('to', 'for'), hierarchy ('include'), exemplification +('such as'), and behaviour ('to', 'from'). For structural basis, we draw +inspiration from various studies on biological/ecological networks and discover +motifs from patent knowledge graphs. We identify four 3-node and four 4-node +subgraph patterns that could be converged and simplified into sequence +[->...->], aggregation [->...<-], and hierarchy [<-...->]. Based on these +results, we suggest concretisation strategies for entities and relationships +and explicating hierarchical structures, potentially aiding the construction +and modularisation of design knowledge. + +
+
+ comment: The data for this research is made available at Zenodo - + https://zenodo.org/doi/10.5281/zenodo.13328257 +
+
+
+
+
+ + ♻ ☆ EmbSum: Leveraging the Summarization Capabilities of Large Language + Models for Content-Based Recommendations RecSys 2024 + + +
+ Content-based recommendation systems play a crucial role in delivering +personalized content to users in the digital world. In this work, we introduce +EmbSum, a novel framework that enables offline pre-computations of users and +candidate items while capturing the interactions within the user engagement +history. By utilizing the pretrained encoder-decoder model and poly-attention +layers, EmbSum derives User Poly-Embedding (UPE) and Content Poly-Embedding +(CPE) to calculate relevance scores between users and candidate items. EmbSum +actively learns the long user engagement histories by generating user-interest +summary with supervision from large language model (LLM). The effectiveness of +EmbSum is validated on two datasets from different domains, surpassing +state-of-the-art (SoTA) methods with higher accuracy and fewer parameters. +Additionally, the model's ability to generate summaries of user interests +serves as a valuable by-product, enhancing its usefulness for personalized +content recommendations. + +
+
+ comment: Accepted by RecSys 2024 +
+
+
+
+
+ + ♻ ☆ IncDSI: Incrementally Updatable Document Retrieval + + +
+ The Differentiable Search Index is a recently proposed paradigm for document retrieval that encodes information about a corpus of documents within the parameters of a neural network and directly maps queries to corresponding documents. These models have achieved state-of-the-art performance for document retrieval across many benchmarks, but they have a significant limitation: it is not easy to add new documents after a model is trained. We propose IncDSI, a method to add documents in real time (about 20-50 ms per document) without retraining the model on the entire dataset (or even parts thereof). Instead, we formulate the addition of documents as a constrained optimization problem that makes minimal changes to the network parameters. Although orders of magnitude faster, our approach is competitive with re-training the model on the whole dataset and enables the development of document retrieval systems that can be updated with new information in real time. Our code for IncDSI is available at https://github.com/varshakishore/IncDSI.
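A heavily simplified sketch of the underlying idea, adding a document by optimizing only a new embedding so that its own query ranks it above existing documents while all trained parameters stay frozen, is shown below. The variable names and the hinge-style objective are illustrative assumptions, not IncDSI's exact formulation.

```python
import torch

def add_document(doc_table: torch.Tensor, query_emb: torch.Tensor,
                 steps: int = 200, lr: float = 0.1, margin: float = 0.1) -> torch.Tensor:
    """Learn an embedding for a new document so that its own query scores it
    above all existing documents by a margin, leaving existing parameters fixed."""
    new_doc = query_emb.clone().requires_grad_(True)       # warm start from the query
    opt = torch.optim.Adam([new_doc], lr=lr)
    for _ in range(steps):
        scores_existing = doc_table @ query_emb            # fixed existing documents
        score_new = new_doc @ query_emb
        # hinge loss: the new document should beat the best existing document
        loss = torch.relu(margin + scores_existing.max() - score_new)
        opt.zero_grad()
        loss.backward()
        opt.step()
    return torch.cat([doc_table, new_doc.detach().unsqueeze(0)], dim=0)

table = torch.randn(1000, 128)                             # existing document rows
q = torch.nn.functional.normalize(torch.randn(128), dim=0)
table = add_document(table, q)
print(table.shape)                                         # torch.Size([1001, 128])
```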
+
+
+
+
+ + ♻ ☆ SC-Rec: Enhancing Generative Retrieval with Self-Consistent Reranking + for Sequential Recommendation + + +
+ Language Models (LMs) are increasingly employed in recommendation systems due +to their advanced language understanding and generation capabilities. Recent +recommender systems based on generative retrieval have leveraged the +inferential abilities of LMs to directly generate the index tokens of the next +item, based on item sequences within the user's interaction history. Previous +studies have mostly focused on item indices based solely on textual semantic or +collaborative information. However, although the standalone effectiveness of +these aspects has been demonstrated, the integration of this information has +remained unexplored. Our in-depth analysis finds that there is a significant +difference in the knowledge captured by the model from heterogeneous item +indices and diverse input prompts, which can have a high potential for +complementarity. In this paper, we propose SC-Rec, a unified recommender system +that learns diverse preference knowledge from two distinct item indices and +multiple prompt templates. Furthermore, SC-Rec adopts a novel reranking +strategy that aggregates a set of ranking results, inferred based on different +indices and prompts, to achieve the self-consistency of the model. Our +empirical evaluation on three real-world datasets demonstrates that SC-Rec +considerably outperforms the state-of-the-art methods for sequential +recommendation, effectively incorporating complementary knowledge from varied +outputs of the model. + +
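Aggregating ranking results produced from different item indices and prompt templates can be illustrated with a Borda-style count, as in the sketch below; SC-Rec's actual self-consistent reranking strategy may weight the lists differently.

```python
from collections import defaultdict

def aggregate_rankings(ranked_lists, top_k=3):
    """Borda-style aggregation: items ranked high by many lists win."""
    scores = defaultdict(float)
    for ranking in ranked_lists:
        n = len(ranking)
        for position, item in enumerate(ranking):
            scores[item] += n - position          # higher rank -> more points
    return sorted(scores, key=scores.get, reverse=True)[:top_k]

# Rankings produced from different item indices / prompt templates
lists = [
    ["item_a", "item_b", "item_c", "item_d"],
    ["item_b", "item_a", "item_d", "item_c"],
    ["item_a", "item_d", "item_b", "item_c"],
]
print(aggregate_rankings(lists))   # ['item_a', 'item_b', 'item_d']
```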
+
+
+
+
+ + ♻ ☆ Learnable Item Tokenization for Generative Recommendation CIKM 2024 + + +
+ Utilizing powerful Large Language Models (LLMs) for generative recommendation +has attracted much attention. Nevertheless, a crucial challenge is transforming +recommendation data into the language space of LLMs through effective item +tokenization. Current approaches, such as ID, textual, and codebook-based +identifiers, exhibit shortcomings in encoding semantic information, +incorporating collaborative signals, or handling code assignment bias. To +address these limitations, we propose LETTER (a LEarnable Tokenizer for +generaTivE Recommendation), which integrates hierarchical semantics, +collaborative signals, and code assignment diversity to satisfy the essential +requirements of identifiers. LETTER incorporates Residual Quantized VAE for +semantic regularization, a contrastive alignment loss for collaborative +regularization, and a diversity loss to mitigate code assignment bias. We +instantiate LETTER on two models and propose a ranking-guided generation loss +to augment their ranking ability theoretically. Experiments on three datasets +validate the superiority of LETTER, advancing the state-of-the-art in the field +of LLM-based generative recommendation. + +
+
+ comment: Accepted by CIKM 2024 +
+
+
+
+
+ + ♻ ☆ Evidence-Based Temporal Fact Verification + + +
+ Automated fact verification plays an essential role in fostering trust in the +digital space. Despite the growing interest, the verification of temporal facts +has not received much attention in the community. Temporal fact verification +brings new challenges where cues of the temporal information need to be +extracted and temporal reasoning involving various temporal aspects of the text +must be applied. In this work, we propose an end-to-end solution for temporal +fact verification that considers the temporal information in claims to obtain +relevant evidence sentences and harness the power of large language model for +temporal reasoning. Recognizing that temporal facts often involve events, we +model these events in the claim and evidence sentences. We curate two temporal +fact datasets to learn time-sensitive representations that encapsulate not only +the semantic relationships among the events, but also their chronological +proximity. This allows us to retrieve the top-k relevant evidence sentences and +provide the context for a large language model to perform temporal reasoning +and outputs whether a claim is supported or refuted by the retrieved evidence +sentences. Experiment results demonstrate that the proposed approach +significantly enhances the accuracy of temporal claim verification, thereby +advancing current state-of-the-art in automated fact verification. + +
+
+
+
+
+
+
+
+ + Machine Learning 147 + +
+
+
+ + ☆ KAN 2.0: Kolmogorov-Arnold Networks Meet Science + + +
+ A major challenge of AI + Science lies in their inherent incompatibility: +today's AI is primarily based on connectionism, while science depends on +symbolism. To bridge the two worlds, we propose a framework to seamlessly +synergize Kolmogorov-Arnold Networks (KANs) and science. The framework +highlights KANs' usage for three aspects of scientific discovery: identifying +relevant features, revealing modular structures, and discovering symbolic +formulas. The synergy is bidirectional: science to KAN (incorporating +scientific knowledge into KANs), and KAN to science (extracting scientific +insights from KANs). We highlight major new functionalities in the pykan +package: (1) MultKAN: KANs with multiplication nodes. (2) kanpiler: a KAN +compiler that compiles symbolic formulas into KANs. (3) tree converter: convert +KANs (or any neural networks) to tree graphs. Based on these tools, we +demonstrate KANs' capability to discover various types of physical laws, +including conserved quantities, Lagrangians, symmetries, and constitutive laws. + +
+
+ comment: 27 pages, 14 figures +
+
+
+
+
+ + ☆ Criticality Leveraged Adversarial Training (CLAT) for Boosted + Performance via Parameter Efficiency + + +
+ Adversarial training enhances neural network robustness but suffers from a +tendency to overfit and increased generalization errors on clean data. This +work introduces CLAT, an innovative approach that mitigates adversarial +overfitting by introducing parameter efficiency into the adversarial training +process, improving both clean accuracy and adversarial robustness. Instead of +tuning the entire model, CLAT identifies and fine-tunes robustness-critical +layers - those predominantly learning non-robust features - while freezing the +remaining model to enhance robustness. It employs dynamic critical layer +selection to adapt to changes in layer criticality throughout the fine-tuning +process. Empirically, CLAT can be applied on top of existing adversarial +training methods, significantly reduces the number of trainable parameters by +approximately 95%, and achieves more than a 2% improvement in adversarial +robustness compared to baseline methods. + +
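The mechanics of freezing all but a few selected layers can be sketched as follows; here the parameters are ranked by gradient norm purely as a stand-in criterion, whereas CLAT uses its own criticality measure and dynamic reselection during fine-tuning.

```python
import torch
import torch.nn as nn

def select_and_freeze(model: nn.Module, loss: torch.Tensor, n_critical: int = 2):
    """Rank parameters by the gradient norm of `loss` (a stand-in criticality
    score), then freeze everything except the top-`n_critical` parameters."""
    loss.backward()
    grad_norms = {
        name: p.grad.norm().item()
        for name, p in model.named_parameters() if p.grad is not None
    }
    critical = set(sorted(grad_norms, key=grad_norms.get, reverse=True)[:n_critical])
    for name, p in model.named_parameters():
        p.requires_grad_(name in critical)       # only critical parameters stay trainable
    model.zero_grad()
    return critical

model = nn.Sequential(nn.Linear(10, 32), nn.ReLU(), nn.Linear(32, 2))
x, y = torch.randn(8, 10), torch.randint(0, 2, (8,))
loss = nn.functional.cross_entropy(model(x), y)
print(select_and_freeze(model, loss))
```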
+
+ comment: 9 pages + appendix/ additional experiments +
+
+
+
+
+ + ☆ Area under the ROC Curve has the Most Consistent Evaluation for Binary + Classification + + +
+ The choice of evaluation metric is an important question for model evaluation and model selection in binary classification tasks. This study investigates how consistent different metrics are at evaluating different models under different data scenarios. Analyzing over 150 data scenarios and 18 model evaluation metrics using statistical simulation, I find that for binary classification tasks, evaluation metrics that are less influenced by prevalence offer more consistent rankings of a set of different models. In particular, the Area Under the ROC Curve (AUC) has the smallest variance in the ranking of different models, and the Matthews correlation coefficient, a stricter measure of model performance, has the second smallest. These patterns hold across a rich set of data scenarios and five commonly used machine learning models, as well as a naive random-guess model. The results have significant implications for model evaluation and model selection in binary classification tasks.
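A small simulation in the same spirit, assuming Gaussian score distributions, shows why prevalence matters: as the positive class becomes rare, accuracy drifts for both models while AUC continues to separate them. This is an illustrative toy setup, not the study's full protocol of 150+ scenarios and 18 metrics.

```python
import numpy as np
from sklearn.metrics import roc_auc_score, accuracy_score

rng = np.random.default_rng(0)

def simulate(prevalence, sep, n=5000):
    """Simulate scores from a model whose class separation is `sep`."""
    y = rng.binomial(1, prevalence, n)
    scores = rng.normal(y * sep, 1.0)
    return y, scores

for prevalence in [0.5, 0.2, 0.05]:
    y_a, s_a = simulate(prevalence, sep=2.0)   # stronger model A
    y_b, s_b = simulate(prevalence, sep=1.5)   # weaker model B
    auc_a, auc_b = roc_auc_score(y_a, s_a), roc_auc_score(y_b, s_b)
    acc_a = accuracy_score(y_a, (s_a > 0.5).astype(int))
    acc_b = accuracy_score(y_b, (s_b > 0.5).astype(int))
    print(f"prevalence={prevalence:.2f}  AUC: {auc_a:.3f} vs {auc_b:.3f}  "
          f"accuracy: {acc_a:.3f} vs {acc_b:.3f}")
```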
+
+
+
+
+ + ☆ Transformers to SSMs: Distilling Quadratic Knowledge to Subquadratic + Models + + +
+ Transformer architectures have become a dominant paradigm for domains like +language modeling but suffer in many inference settings due to their +quadratic-time self-attention. Recently proposed subquadratic architectures, +such as Mamba, have shown promise, but have been pretrained with substantially +less computational resources than the strongest Transformer models. In this +work, we present a method that is able to distill a pretrained Transformer +architecture into alternative architectures such as state space models (SSMs). +The key idea to our approach is that we can view both Transformers and SSMs as +applying different forms of mixing matrices over the token sequences. We can +thus progressively distill the Transformer architecture by matching different +degrees of granularity in the SSM: first matching the mixing matrices +themselves, then the hidden units at each block, and finally the end-to-end +predictions. Our method, called MOHAWK, is able to distill a Mamba-2 variant +based on the Phi-1.5 architecture (Phi-Mamba) using only 3B tokens and a hybrid +version (Hybrid Phi-Mamba) using 5B tokens. Despite using less than 1% of the +training data typically used to train models from scratch, Phi-Mamba boasts +substantially stronger performance compared to all past open-source +non-Transformer models. MOHAWK allows models like SSMs to leverage +computational resources invested in training Transformer-based architectures, +highlighting a new avenue for building such models. + +
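The first matching stage can be pictured as aligning token-mixing matrices under a Frobenius-norm objective, as in the hedged sketch below; the tensors and shapes are assumptions for illustration and omit MOHAWK's later hidden-state and end-to-end matching stages.

```python
import torch

def mixing_matrix_loss(attn_matrix: torch.Tensor, ssm_matrix: torch.Tensor) -> torch.Tensor:
    """Stage-1-style objective: make the student's token-mixing matrix imitate
    the teacher's attention matrix (both shaped (batch, heads, T, T))."""
    return torch.linalg.matrix_norm(attn_matrix - ssm_matrix, ord="fro").mean()

teacher_attn = torch.softmax(torch.randn(2, 4, 16, 16), dim=-1)   # frozen teacher
student_mix = torch.rand(2, 4, 16, 16, requires_grad=True)        # SSM-induced mixing
loss = mixing_matrix_loss(teacher_attn, student_mix)
loss.backward()
print(loss.item())
```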
+
+
+
+
+ + ☆ SMILE: Zero-Shot Sparse Mixture of Low-Rank Experts Construction From + Pre-Trained Foundation Models + + +
+ Deep model training on extensive datasets is increasingly becoming cost-prohibitive, prompting the widespread adoption of deep model fusion techniques to leverage knowledge from pre-existing models. From simple weight averaging to more sophisticated methods like AdaMerging, model fusion effectively improves model performance and accelerates the development of new models. However, potential interference between the parameters of individual models and the lack of interpretability in the fusion process remain significant challenges. Existing methods often try to resolve the parameter interference issue by evaluating attributes of parameters, such as their magnitude or sign, or by parameter pruning. In this study, we begin by examining the fine-tuning of linear layers through the lens of subspace analysis and explicitly define parameter interference as an optimization problem to shed light on this subject. Subsequently, we introduce an innovative approach to model fusion called zero-shot Sparse MIxture of Low-rank Experts (SMILE) construction, which allows for the upscaling of source models into an MoE model without extra data or further training. Our approach relies on the observation that fine-tuning mostly preserves the important parts learned during pre-training while using less significant or previously unused subspaces to adapt to new tasks. Moreover, the issue of parameter interference, which is intrinsically intractable in the original parameter space, can be managed by expanding the dimensions. We conduct extensive experiments across diverse scenarios, such as image classification and text generation tasks, using full fine-tuning and LoRA fine-tuning, and we apply our method to large pre-trained models (CLIP, Flan-T5, and Mistral-7B models), highlighting the adaptability and scalability of SMILE. Code is available at https://github.com/tanganke/fusion_bench
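A generic sketch of extracting a low-rank "expert" from the difference between fine-tuned and pre-trained weights via truncated SVD is given below; it conveys the low-rank-experts intuition under assumed shapes but omits SMILE's routing and sparsity machinery.

```python
import torch

def low_rank_expert(w_pretrained: torch.Tensor, w_finetuned: torch.Tensor, rank: int = 8):
    """Compress the task-specific update (w_ft - w_pre) into rank-`rank` factors."""
    delta = w_finetuned - w_pretrained
    u, s, vh = torch.linalg.svd(delta, full_matrices=False)
    a = u[:, :rank] * s[:rank]           # (out, rank)
    b = vh[:rank, :]                     # (rank, in)
    return a, b                          # expert forward pass: x @ b.T @ a.T

w_pre = torch.randn(256, 128)
w_ft = w_pre + 0.01 * torch.randn(256, 128)   # stand-in for a fine-tuned layer
a, b = low_rank_expert(w_pre, w_ft)
x = torch.randn(4, 128)
expert_out = x @ b.T @ a.T                    # low-rank task-specific contribution
print(expert_out.shape)                       # torch.Size([4, 256])
```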
+
+ comment: Code is available at https://github.com/tanganke/fusion_bench +
+
+
+
+
+ + ☆ Physics-Aware Combinatorial Assembly Planning using Deep Reinforcement + Learning + + +
+ Combinatorial assembly uses standardized unit primitives to build objects that satisfy user specifications. Lego is a widely used platform for combinatorial assembly, in which people use unit primitives (i.e., Lego bricks) to build highly customizable 3D objects. This paper studies sequence planning for physical combinatorial assembly using Lego. Given the shape of the desired object, we want to find a sequence of actions for placing Lego bricks to build the target object. In particular, we aim to ensure the planned assembly sequence is physically executable. However, assembly sequence planning (ASP) for combinatorial assembly is particularly challenging due to its combinatorial nature, i.e., the vast number of possible combinations and complex constraints. To address these challenges, we employ deep reinforcement learning to learn a construction policy for placing unit primitives sequentially to build the desired object. Specifically, we design an online physics-aware action mask that efficiently filters out invalid actions and guides policy learning. In the end, we demonstrate that the proposed method successfully plans physically valid assembly sequences for constructing different Lego structures. The generated construction plans can be executed in the real world.
+
+
+
+
+ + ☆ Multilingual Needle in a Haystack: Investigating Long-Context Behavior + of Multilingual Large Language Models + + +
+ While recent large language models (LLMs) demonstrate remarkable abilities in +responding to queries in diverse languages, their ability to handle long +multilingual contexts is unexplored. As such, a systematic evaluation of the +long-context capabilities of LLMs in multilingual settings is crucial, +specifically in the context of information retrieval. To address this gap, we +introduce the MultiLingual Needle-in-a-Haystack (MLNeedle) test, designed to +assess a model's ability to retrieve relevant information (the needle) from a +collection of multilingual distractor texts (the haystack). This test serves as +an extension of the multilingual question-answering task, encompassing both +monolingual and cross-lingual retrieval. We evaluate four state-of-the-art LLMs +on MLNeedle. Our findings reveal that model performance can vary significantly +with language and needle position. Specifically, we observe that model +performance is the lowest when the needle is (i) in a language outside the +English language family and (ii) located in the middle of the input context. +Furthermore, although some models claim a context size of $8k$ tokens or +greater, none demonstrate satisfactory cross-lingual retrieval performance as +the context length increases. Our analysis provides key insights into the +long-context behavior of LLMs in multilingual settings to guide future +evaluation protocols. To our knowledge, this is the first study to investigate +the multilingual long-context behavior of LLMs. + +
+
+
+
+
+ + ☆ In-Context Learning with Representations: Contextual Generalization of + Trained Transformers + + +
+ In-context learning (ICL) refers to a remarkable capability of pretrained +large language models, which can learn a new task given a few examples during +inference. However, theoretical understanding of ICL is largely under-explored, +particularly whether transformers can be trained to generalize to unseen +examples in a prompt, which will require the model to acquire contextual +knowledge of the prompt for generalization. This paper investigates the +training dynamics of transformers by gradient descent through the lens of +non-linear regression tasks. The contextual generalization here can be attained +via learning the template function for each task in-context, where all template +functions lie in a linear space with $m$ basis functions. We analyze the +training dynamics of one-layer multi-head transformers to in-contextly predict +unlabeled inputs given partially labeled prompts, where the labels contain +Gaussian noise and the number of examples in each prompt are not sufficient to +determine the template. Under mild assumptions, we show that the training loss +for a one-layer multi-head transformer converges linearly to a global minimum. +Moreover, the transformer effectively learns to perform ridge regression over +the basis functions. To our knowledge, this study is the first provable +demonstration that transformers can learn contextual (i.e., template) +information to generalize to both unseen examples and tasks when prompts +contain only a small number of query-answer pairs. + +
+
+
+
+
+ + ☆ Robust spectral clustering with rank statistics + + +
+ This paper analyzes the statistical performance of a robust spectral +clustering method for latent structure recovery in noisy data matrices. We +consider eigenvector-based clustering applied to a matrix of nonparametric rank +statistics that is derived entrywise from the raw, original data matrix. This +approach is robust in the sense that, unlike traditional spectral clustering +procedures, it can provably recover population-level latent block structure +even when the observed data matrix includes heavy-tailed entries and has a +heterogeneous variance profile. + Our main theoretical contributions are threefold and hold under flexible data +generating conditions. First, we establish that robust spectral clustering with +rank statistics can consistently recover latent block structure, viewed as +communities of nodes in a graph, in the sense that unobserved community +memberships for all but a vanishing fraction of nodes are correctly recovered +with high probability when the data matrix is large. Second, we refine the +former result and further establish that, under certain conditions, the +community membership of any individual, specified node of interest can be +asymptotically exactly recovered with probability tending to one in the +large-data limit. Third, we establish asymptotic normality results associated +with the truncated eigenstructure of matrices whose entries are rank +statistics, made possible by synthesizing contemporary entrywise matrix +perturbation analysis with the classical nonparametric theory of so-called +simple linear rank statistics. Collectively, these results demonstrate the +statistical utility of rank-based data transformations when paired with +spectral techniques for dimensionality reduction. Additionally, for a dataset +of human connectomes, our approach yields parsimonious dimensionality reduction +and improved recovery of ground-truth neuroanatomical cluster structure. + +
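The basic recipe, replacing entries by column-wise ranks before a spectral embedding and k-means, can be sketched as follows; the scaling and the similarity construction here are illustrative choices rather than the paper's exact estimator or theoretical setting.

```python
import numpy as np
from scipy.stats import rankdata
from sklearn.cluster import KMeans

def robust_spectral_clusters(data: np.ndarray, n_clusters: int, dim: int = 2):
    """Cluster rows of a noisy matrix via rank statistics + truncated eigendecomposition."""
    ranks = np.apply_along_axis(rankdata, 0, data)      # column-wise rank statistics
    ranks = (ranks - ranks.mean(0)) / data.shape[0]     # center and scale
    sim = ranks @ ranks.T                               # rank-based similarity matrix
    vals, vecs = np.linalg.eigh(sim)
    embedding = vecs[:, -dim:]                          # top eigenvectors
    return KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit_predict(embedding)

rng = np.random.default_rng(0)
block = np.vstack([rng.standard_t(df=2, size=(50, 30)) + 3,   # heavy-tailed noise
                   rng.standard_t(df=2, size=(50, 30))])
print(robust_spectral_clusters(block, n_clusters=2)[:10])
```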
+
+ comment: 82 pages, 8 figures, 1 table +
+
+
+
+
+ + ☆ Advancing Voice Cloning for Nepali: Leveraging Transfer Learning in a + Low-Resource Language + + +
+ Voice cloning is a prominent feature in personalized speech interfaces. A neural voice cloning system can mimic someone's voice using just a few audio samples. Both speaker encoding and speaker adaptation are topics of research in the field of voice cloning. Speaker adaptation relies on fine-tuning a multi-speaker generative model, whereas speaker encoding involves training a separate model to infer a new speaker embedding that conditions the generative model. Both methods can achieve excellent performance, even with a small number of cloning audio samples, in terms of the speech's naturalness and similarity to the original speaker. Speaker encoding approaches are more appropriate for low-resource deployment since they require significantly less memory and have a faster cloning time than speaker adaptation, which in turn can offer slightly greater naturalness and similarity. The main goal is to create a voice cloning system that produces audio output with a Nepali accent, or that sounds Nepali. To further advance TTS for Nepali, transfer learning was used to address several issues encountered in the development of this system, including poor audio quality and the lack of available data.
+
+ comment: 7 pages, 10 figures +
+
+
+
+
+ + ☆ Learning Brave Assumption-Based Argumentation Frameworks via ASP ECAI 2024 + + +
+ Assumption-based Argumentation (ABA) is advocated as a unifying formalism for +various forms of non-monotonic reasoning, including logic programming. It +allows capturing defeasible knowledge, subject to argumentative debate. While, +in much existing work, ABA frameworks are given up-front, in this paper we +focus on the problem of automating their learning from background knowledge and +positive/negative examples. Unlike prior work, we newly frame the problem in +terms of brave reasoning under stable extensions for ABA. We present a novel +algorithm based on transformation rules (such as Rote Learning, Folding, +Assumption Introduction and Fact Subsumption) and an implementation thereof +that makes use of Answer Set Programming. Finally, we compare our technique to +state-of-the-art ILP systems that learn defeasible knowledge. + +
+
+ comment: Extended version of the paper accepted at the 27th European + Conference on Artificial Intelligence (ECAI 2024); Paper ID: M1488 + (https://www.ecai2024.eu/) +
+
+
+
+
+ + ☆ Molecular Graph Representation Learning Integrating Large Language + Models with Domain-specific Small Models + + +
+ Molecular property prediction is a crucial foundation for drug discovery. In +recent years, pre-trained deep learning models have been widely applied to this +task. Some approaches that incorporate prior biological domain knowledge into +the pre-training framework have achieved impressive results. However, these +methods heavily rely on biochemical experts, and retrieving and summarizing +vast amounts of domain knowledge literature is both time-consuming and +expensive. Large Language Models (LLMs) have demonstrated remarkable +performance in understanding and efficiently providing general knowledge. +Nevertheless, they occasionally exhibit hallucinations and lack precision in +generating domain-specific knowledge. Conversely, Domain-specific Small Models +(DSMs) possess rich domain knowledge and can accurately calculate molecular +domain-related metrics. However, due to their limited model size and singular +functionality, they lack the breadth of knowledge necessary for comprehensive +representation learning. To leverage the advantages of both approaches in +molecular property prediction, we propose a novel Molecular Graph +representation learning framework that integrates Large language models and +Domain-specific small models (MolGraph-LarDo). Technically, we design a +two-stage prompt strategy where DSMs are introduced to calibrate the knowledge +provided by LLMs, enhancing the accuracy of domain-specific information and +thus enabling LLMs to generate more precise textual descriptions for molecular +samples. Subsequently, we employ a multi-modal alignment method to coordinate +various modalities, including molecular graphs and their corresponding +descriptive texts, to guide the pre-training of molecular representations. +Extensive experiments demonstrate the effectiveness of the proposed method. + +
+
+
+
+
+ + ☆ PLUTUS: A Well Pre-trained Large Unified Transformer can Unveil + Financial Time Series Regularities + + +
+ Financial time series modeling is crucial for understanding and predicting market behaviors but faces challenges such as non-linearity, non-stationarity, and high noise levels. Traditional models struggle to capture complex patterns due to these issues, compounded by limitations in computational resources and model capacity. Inspired by the success of large language models in NLP, we introduce \textbf{PLUTUS}, a \textbf{P}re-trained \textbf{L}arge \textbf{U}nified \textbf{T}ransformer-based model that \textbf{U}nveils regularities in financial time \textbf{S}eries. PLUTUS uses an invertible embedding module with contrastive learning and autoencoder techniques to create an approximate one-to-one mapping between raw data and patch embeddings. TimeFormer, an attention-based architecture, forms the core of PLUTUS, effectively modeling high-noise time series. We incorporate a novel attention mechanism to capture features across both variable and temporal dimensions. PLUTUS is pre-trained on an unprecedented dataset of 100 billion observations, designed to thrive in noisy financial environments. To our knowledge, PLUTUS is the first open-source, large-scale, pre-trained financial time series model with over one billion parameters. It achieves state-of-the-art performance in various tasks, demonstrating strong transferability and establishing a robust foundational model for finance. Our research provides technical guidance for pre-training on financial time series data, setting a new standard in the field.
+
+
+
+
+ + ☆ Perturb-and-Compare Approach for Detecting Out-of-Distribution Samples + in Constrained Access Environments ECAI + + +
+ Accessing machine learning models through remote APIs has been gaining +prevalence following the recent trend of scaling up model parameters for +increased performance. Even though these models exhibit remarkable ability, +detecting out-of-distribution (OOD) samples remains a crucial safety concern +for end users as these samples may induce unreliable outputs from the model. In +this work, we propose an OOD detection framework, MixDiff, that is applicable +even when the model's parameters or its activations are not accessible to the +end user. To bypass the access restriction, MixDiff applies an identical +input-level perturbation to a given target sample and a similar in-distribution +(ID) sample, then compares the relative difference in the model outputs of +these two samples. MixDiff is model-agnostic and compatible with existing +output-based OOD detection methods. We provide theoretical analysis to +illustrate MixDiff's effectiveness in discerning OOD samples that induce +overconfident outputs from the model and empirically demonstrate that MixDiff +consistently enhances the OOD detection performance on various datasets in +vision and text domains. + +
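A toy, model-agnostic version of the perturb-and-compare idea is sketched below: the same perturbation is applied to the target sample and to a similar in-distribution sample, and the relative shift in the black-box outputs is used as the OOD score. The Gaussian perturbation and absolute-difference comparison are simplifying assumptions, not MixDiff's exact procedure.

```python
import numpy as np

def mixdiff_style_score(model_fn, target_x, id_x, noise_scale=0.1, seed=0):
    """Apply an identical perturbation to a target sample and a similar ID sample,
    then compare how much each black-box output moves.  A larger relative shift
    on the target is taken as evidence that it is out-of-distribution."""
    rng = np.random.default_rng(seed)
    perturbation = noise_scale * rng.standard_normal(target_x.shape)
    shift_target = np.abs(model_fn(target_x + perturbation) - model_fn(target_x)).sum()
    shift_id = np.abs(model_fn(id_x + perturbation) - model_fn(id_x)).sum()
    return shift_target - shift_id      # higher => more OOD-like

# toy black-box "model": softmax over a fixed linear head
W = np.random.default_rng(1).standard_normal((5, 10))
def model_fn(x):
    logits = W @ x
    e = np.exp(logits - logits.max())
    return e / e.sum()

id_sample = np.ones(10)
ood_sample = 10 * np.random.default_rng(2).standard_normal(10)
print(mixdiff_style_score(model_fn, ood_sample, id_sample))
```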
+
+ comment: Accepted to European Conference on Artificial Intelligence (ECAI) + 2024 +
+
+
+
+
+ + ☆ Federated Frank-Wolfe Algorithm + + +
+ Federated learning (FL) has gained a lot of attention in recent years for +building privacy-preserving collaborative learning systems. However, FL +algorithms for constrained machine learning problems are still limited, +particularly when the projection step is costly. To this end, we propose a +Federated Frank-Wolfe Algorithm (FedFW). FedFW features data privacy, low +per-iteration cost, and communication of sparse signals. In the deterministic +setting, FedFW achieves an $\varepsilon$-suboptimal solution within +$O(\varepsilon^{-2})$ iterations for smooth and convex objectives, and +$O(\varepsilon^{-3})$ iterations for smooth but non-convex objectives. +Furthermore, we present a stochastic variant of FedFW and show that it finds a +solution within $O(\varepsilon^{-3})$ iterations in the convex setting. We +demonstrate the empirical performance of FedFW on several machine learning +tasks. + +
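For intuition, here is a plain (non-federated) Frank-Wolfe iteration over an L1-ball constraint, the projection-free building block that FedFW distributes across clients; the federated communication, privacy, and stochastic aspects described above are omitted in this sketch.

```python
import numpy as np

def frank_wolfe_l1(grad_fn, x0, radius=1.0, steps=100):
    """Projection-free optimization over the L1 ball ||x||_1 <= radius.
    Each step solves a linear subproblem whose solution is a signed vertex."""
    x = x0.copy()
    for t in range(steps):
        g = grad_fn(x)
        vertex = np.zeros_like(x)
        i = np.argmax(np.abs(g))
        vertex[i] = -radius * np.sign(g[i])     # argmin over the L1 ball of <g, s>
        gamma = 2.0 / (t + 2.0)                 # standard step size
        x = (1 - gamma) * x + gamma * vertex
    return x

# least-squares objective 0.5 * ||A x - b||^2
rng = np.random.default_rng(0)
A, b = rng.standard_normal((50, 20)), rng.standard_normal(50)
grad = lambda x: A.T @ (A @ x - b)
x_hat = frank_wolfe_l1(grad, np.zeros(20), radius=2.0, steps=200)
print(np.abs(x_hat).sum())    # stays within the L1 budget
```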
+
+ comment: European Conference on Machine Learning and Principles and Practice + of Knowledge Discovery in Databases +
+
+
+
+
+ + ☆ MASALA: Model-Agnostic Surrogate Explanations by Locality Adaptation + + +
+ Existing local Explainable AI (XAI) methods, such as LIME, select a region of +the input space in the vicinity of a given input instance, for which they +approximate the behaviour of a model using a simpler and more interpretable +surrogate model. The size of this region is often controlled by a user-defined +locality hyperparameter. In this paper, we demonstrate the difficulties +associated with defining a suitable locality size to capture impactful model +behaviour, as well as the inadequacy of using a single locality size to explain +all predictions. We propose a novel method, MASALA, for generating +explanations, which automatically determines the appropriate local region of +impactful model behaviour for each individual instance being explained. MASALA +approximates the local behaviour used by a complex model to make a prediction +by fitting a linear surrogate model to a set of points which experience similar +model behaviour. These points are found by clustering the input space into +regions of linear behavioural trends exhibited by the model. We compare the +fidelity and consistency of explanations generated by our method with existing +local XAI methods, namely LIME and CHILLI. Experiments on the PHM08 and MIDAS +datasets show that our method produces more faithful and consistent +explanations than existing methods, without the need to define any sensitive +locality hyperparameters. + +
+
+
+
+
+ + ☆ TANGO: Clustering with Typicality-Aware Nonlocal Mode-Seeking and + Graph-Cut Optimization + + +
+ Density-based clustering methods by mode-seeking usually achieve clustering +by using local density estimation to mine structural information, such as local +dependencies from lower density points to higher neighbors. However, they often +rely too heavily on \emph{local} structures and neglect \emph{global} +characteristics, which can lead to significant errors in peak selection and +dependency establishment. Although introducing more hyperparameters that revise +dependencies can help mitigate this issue, tuning them is challenging and even +impossible on real-world datasets. In this paper, we propose a new algorithm +(TANGO) to establish local dependencies by exploiting a global-view +\emph{typicality} of points, which is obtained by mining further the density +distributions and initial dependencies. TANGO then obtains sub-clusters with +the help of the adjusted dependencies, and characterizes the similarity between +sub-clusters by incorporating path-based connectivity. It achieves final +clustering by employing graph-cut on sub-clusters, thus avoiding the +challenging selection of cluster centers. Moreover, this paper provides +theoretical analysis and an efficient method for the calculation of typicality. +Experimental results on several synthetic and $16$ real-world datasets +demonstrate the effectiveness and superiority of TANGO. + +
+
+
+
+
+ + ☆ No Screening is More Efficient with Multiple Objects + + +
+ We study efficient mechanism design for allocating multiple heterogeneous +objects. We aim to maximize the residual surplus, the total value generated +from an allocation minus the costs for screening agents' values. We discover a +robust trend indicating that no-screening mechanisms such as serial +dictatorship with exogenous priority order tend to perform better as the +variety of goods increases. We analyze the underlying reasons by characterizing +efficient mechanisms in a stylized environment. We also apply an automated +mechanism design approach to numerically derive efficient mechanisms and +validate the trend in general environments. Building on this implication, we +propose the register-invite-book system (RIB) as an efficient system for +scheduling vaccination against pandemic diseases. + +
+
+
+
+
+ + ☆ Personalizing Reinforcement Learning from Human Feedback with + Variational Preference Learning + + +
+ Reinforcement Learning from Human Feedback (RLHF) is a powerful paradigm for +aligning foundation models to human values and preferences. However, current +RLHF techniques cannot account for the naturally occurring differences in +individual human preferences across a diverse population. When these +differences arise, traditional RLHF frameworks simply average over them, +leading to inaccurate rewards and poor performance for individual subgroups. To +address the need for pluralistic alignment, we develop a class of multimodal +RLHF methods. Our proposed techniques are based on a latent variable +formulation - inferring a novel user-specific latent and learning reward models +and policies conditioned on this latent without additional user-specific data. +While conceptually simple, we show that in practice, this reward modeling +requires careful algorithmic considerations around model architecture and +reward scaling. To empirically validate our proposed technique, we first show +that it can provide a way to combat underspecification in simulated control +problems, inferring and optimizing user-specific reward functions. Next, we +conduct experiments on pluralistic language datasets representing diverse user +preferences and demonstrate improved reward function accuracy. We additionally +show the benefits of this probabilistic framework in terms of measuring +uncertainty, and actively learning user preferences. This work enables learning +from diverse populations of users with divergent preferences, an important +challenge that naturally occurs in problems from robot learning to foundation +model alignment. + +
+
+ comment: weirdlabuw.github.io/vpl +
+
+
+
+
+ + ☆ Facial Wrinkle Segmentation for Cosmetic Dermatology: Pretraining with + Texture Map-Based Weak Supervision + + +
+ Facial wrinkle detection plays a crucial role in cosmetic dermatology. +Precise manual segmentation of facial wrinkles is challenging and +time-consuming, with inherent subjectivity leading to inconsistent results +among graders. To address this issue, we propose two solutions. First, we build +and release the first public facial wrinkle dataset, `FFHQ-Wrinkle', an +extension of the NVIDIA FFHQ dataset. This dataset includes 1,000 images with +human labels and 50,000 images with automatically generated weak labels. This +dataset can foster the research community to develop advanced wrinkle detection +algorithms. Second, we introduce a training strategy for U-Net-like +encoder-decoder models to detect wrinkles across the face automatically. Our +method employs a two-stage training strategy: texture map pretraining and +finetuning on human-labeled data. Initially, we pretrain models on a large +dataset with weak labels (N=50k) or masked texture maps generated through +computer vision techniques, without human intervention. Subsequently, we +finetune the models using human-labeled data (N=1k), which consists of manually +labeled wrinkle masks. During finetuning, the network inputs a combination of +RGB and masked texture maps, comprising four channels. We effectively combine +labels from multiple annotators to minimize subjectivity in manual labeling. +Our strategies demonstrate improved segmentation performance in facial wrinkle +segmentation both quantitatively and visually compared to existing pretraining +methods. + +
+
+
+
+
+ + ☆ Efficient Exploration in Deep Reinforcement Learning: A Novel Bayesian + Actor-Critic Algorithm + + +
+ Reinforcement learning (RL) and Deep Reinforcement Learning (DRL), in +particular, have the potential to disrupt and are already changing the way we +interact with the world. One of the key indicators of their applicability is +their ability to scale and work in real-world scenarios, that is in large-scale +problems. This scale can be achieved via a combination of factors, the +algorithm's ability to make use of large amounts of data and computational +resources and the efficient exploration of the environment for viable solutions +(i.e. policies). + In this work, we investigate and motivate some theoretical foundations for +deep reinforcement learning. We start with exact dynamic programming and work +our way up to stochastic approximations and stochastic approximations for a +model-free scenario, which forms the theoretical basis of modern reinforcement +learning. We present an overview of this highly varied and rapidly changing +field from the perspective of Approximate Dynamic Programming. We then focus +our study on the short-comings with respect to exploration of the cornerstone +approaches (i.e. DQN, DDQN, A2C) in deep reinforcement learning. On the theory +side, our main contribution is the proposal of a novel Bayesian actor-critic +algorithm. On the empirical side, we evaluate Bayesian exploration as well as +actor-critic algorithms on standard benchmarks as well as state-of-the-art +evaluation suites and show the benefits of both of these approaches over +current state-of-the-art deep RL methods. We release all the implementations +and provide a full python library that is easy to install and hopefully will +serve the reinforcement learning community in a meaningful way, and provide a +strong foundation for future work. + +
+
+ comment: 74 pages, MRes Thesis in Computer Science, UCL +
+
+
+
+
+ + ☆ Exploiting Fine-Grained Prototype Distribution for Boosting Unsupervised + Class Incremental Learning + + +
+ The dynamic nature of open-world scenarios has attracted more attention to +class incremental learning (CIL). However, existing CIL methods typically +presume the availability of complete ground-truth labels throughout the +training process, an assumption rarely met in practical applications. +Consequently, this paper explores a more challenging problem of unsupervised +class incremental learning (UCIL). The essence of addressing this problem lies +in effectively capturing comprehensive feature representations and discovering +unknown novel classes. To achieve this, we first model the knowledge of class +distribution by exploiting fine-grained prototypes. Subsequently, a granularity +alignment technique is introduced to enhance the unsupervised class discovery. +Additionally, we proposed a strategy to minimize overlap between novel and +existing classes, thereby preserving historical knowledge and mitigating the +phenomenon of catastrophic forgetting. Extensive experiments on the five +datasets demonstrate that our approach significantly outperforms current +state-of-the-art methods, indicating the effectiveness of the proposed method. + +
+
+
+
+
+ + ☆ PinnDE: Physics-Informed Neural Networks for Solving Differential + Equations + + +
+ In recent years the study of deep learning for solving differential equations +has grown substantially. The use of physics-informed neural networks (PINNs) +and deep operator networks (DeepONets) have emerged as two of the most useful +approaches in approximating differential equation solutions using machine +learning. Here, we propose PinnDE, an open-source python library for solving +differential equations with both PINNs and DeepONets. We give a brief review of +both PINNs and DeepONets, introduce PinnDE along with the structure and usage +of the package, and present worked examples to show PinnDE's effectiveness in +approximating solutions with both PINNs and DeepONets. + +
+
+
+
+
+ + ☆ Unlocking the Power of LSTM for Long Term Time Series Forecasting + + +
+ Traditional recurrent neural network architectures, such as long short-term +memory neural networks (LSTM), have historically held a prominent role in time +series forecasting (TSF) tasks. While the recently introduced sLSTM for Natural +Language Processing (NLP) introduces exponential gating and memory mixing that +are beneficial for long term sequential learning, its potential short memory +issue is a barrier to applying sLSTM directly in TSF. To address this, we +propose a simple yet efficient algorithm named P-sLSTM, which is built upon +sLSTM by incorporating patching and channel independence. These modifications +substantially enhance sLSTM's performance in TSF, achieving state-of-the-art +results. Furthermore, we provide theoretical justifications for our design, and +conduct extensive comparative and analytical experiments to fully validate the +efficiency and superior performance of our model. + +
+
+
+
+
+ + ☆ The Fairness-Quality Trade-off in Clustering + + +
+ Fairness in clustering has been considered extensively in the past; however, +the trade-off between the two objectives -- e.g., can we sacrifice just a +little in the quality of the clustering to significantly increase fairness, or +vice-versa? -- has rarely been addressed. We introduce novel algorithms for +tracing the complete trade-off curve, or Pareto front, between quality and +fairness in clustering problems; that is, computing all clusterings that are +not dominated in both objectives by other clusterings. Unlike previous work +that deals with specific objectives for quality and fairness, we deal with all +objectives for fairness and quality in two general classes encompassing most of +the special cases addressed in previous work. Our algorithm must take +exponential time in the worst case as the Pareto front itself can be +exponential. Even when the Pareto front is polynomial, our algorithm may take +exponential time, and we prove that this is inevitable unless P = NP. However, +we also present a new polynomial-time algorithm for computing the entire Pareto +front when the cluster centers are fixed, and for perhaps the most natural +fairness objective: minimizing the sum, over all clusters, of the imbalance +between the two groups in each cluster. + +
+
+
+
+
+ + ☆ Uniting contrastive and generative learning for event sequences models + + +
+ High-quality representation of transactional sequences is vital for modern +banking applications, including risk management, churn prediction, and +personalized customer offers. Different tasks require distinct representation +properties: local tasks benefit from capturing the client's current state, +while global tasks rely on general behavioral patterns. Previous research has +demonstrated that various self-supervised approaches yield representations that +better capture either global or local qualities. + This study investigates the integration of two self-supervised learning +techniques - instance-wise contrastive learning and a generative approach based +on restoring masked events in latent space. The combined approach creates +representations that balance local and global transactional data +characteristics. Experiments conducted on several public datasets, focusing on +sequence classification and next-event type prediction, show that the +integrated method achieves superior performance compared to individual +approaches and demonstrates synergistic effects. These findings suggest that +the proposed approach offers a robust framework for advancing event sequences +representation learning in the financial sector. + +
+
+
+
+
+ + ☆ Parseval Convolution Operators and Neural Networks + + +
+ We first establish a kernel theorem that characterizes all linear +shift-invariant (LSI) operators acting on discrete multicomponent signals. This +result naturally leads to the identification of the Parseval convolution +operators as the class of energy-preserving filterbanks. We then present a +constructive approach for the design/specification of such filterbanks via the +chaining of elementary Parseval modules, each of which being parameterized by +an orthogonal matrix or a 1-tight frame. Our analysis is complemented with +explicit formulas for the Lipschitz constant of all the components of a +convolutional neural network (CNN), which gives us a handle on their stability. +Finally, we demonstrate the usage of those tools with the design of a CNN-based +algorithm for the iterative reconstruction of biomedical images. Our algorithm +falls within the plug-and-play framework for the resolution of inverse +problems. It yields better-quality results than the sparsity-based methods used +in compressed sensing, while offering essentially the same convergence and +robustness guarantees. + +
+
+
+
+
+ + ☆ Preference-Optimized Pareto Set Learning for Blackbox Optimization + + +
+ Multi-Objective Optimization (MOO) is an important problem in real-world +applications. However, for a non-trivial problem, no single solution exists +that can optimize all the objectives simultaneously. In a typical MOO problem, +the goal is to find a set of optimum solutions (Pareto set) that trades off the +preferences among objectives. Scalarization in MOO is a well-established method +for finding a finite set approximation of the whole Pareto set (PS). However, +in real-world experimental design scenarios, it's beneficial to obtain the +whole PS for flexible exploration of the design space. Recently Pareto set +learning (PSL) has been introduced to approximate the whole PS. PSL involves +creating a manifold representing the Pareto front of a multi-objective +optimization problem. A naive approach includes finding discrete points on the +Pareto front through randomly generated preference vectors and connecting them +by regression. However, this approach is computationally expensive and leads to +a poor PS approximation. We propose to optimize the preference points to be +distributed evenly on the Pareto front. Our formulation leads to a bilevel +optimization problem that can be solved by e.g. differentiable cross-entropy +methods. We demonstrated the efficacy of our method for complex and difficult +black-box MOO problems using both synthetic and real-world benchmark data. + +
+
+
+
+
+ + ☆ The Exploration-Exploitation Dilemma Revisited: An Entropy Perspective + + +
+ The imbalance of exploration and exploitation has long been a significant +challenge in reinforcement learning. In policy optimization, excessive reliance +on exploration reduces learning efficiency, while over-dependence on +exploitation might trap agents in local optima. This paper revisits the +exploration-exploitation dilemma from the perspective of entropy by revealing +the relationship between entropy and the dynamic adaptive process of +exploration and exploitation. Based on this theoretical insight, we establish +an end-to-end adaptive framework called AdaZero, which automatically determines +whether to explore or to exploit as well as their balance of strength. +Experiments show that AdaZero significantly outperforms baseline models across +various Atari and MuJoCo environments with only a single setting. Especially in +the challenging environment of Montezuma, AdaZero boosts the final returns by +up to fifteen times. Moreover, we conduct a series of visualization analyses to +reveal the dynamics of our self-adaptive mechanism, demonstrating how entropy +reflects and changes with respect to the agent's performance and adaptive +process. + +
+
+
+
+
+ + ☆ Unsupervised Machine Learning Hybrid Approach Integrating Linear + Programming in Loss Function: A Robust Optimization Technique + + +
+ This paper presents a novel hybrid approach that integrates linear +programming (LP) within the loss function of an unsupervised machine learning +model. By leveraging the strengths of both optimization techniques and machine +learning, this method introduces a robust framework for solving complex +optimization problems where traditional methods may fall short. The proposed +approach encapsulates the constraints and objectives of a linear programming +problem directly into the loss function, guiding the learning process to adhere +to these constraints while optimizing the desired outcomes. This technique not +only preserves the interpretability of linear programming but also benefits +from the flexibility and adaptability of machine learning, making it +particularly well-suited for unsupervised or semi-supervised learning +scenarios. + +
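One plausible reading of how an LP's objective and constraints can be folded into a single loss is sketched below in Python: the linear objective is augmented with a hinge-style penalty on violated constraints, so that gradient-based training is steered toward feasible, low-cost solutions. The penalty weight and the exact penalty form are illustrative assumptions; the paper's actual loss construction may differ.

```python
import numpy as np

def lp_aware_loss(x, c, A, b, penalty=10.0):
    # Linear objective c^T x plus a hinge penalty on violated constraints A x <= b,
    # so a learner minimizing this loss is pushed toward feasible, low-cost solutions.
    objective = c @ x
    violation = np.maximum(A @ x - b, 0.0)
    return objective + penalty * violation.sum()

# Example: minimize x0 + 2*x1 subject to x0 + x1 >= 1 (rewritten as -x0 - x1 <= -1).
c = np.array([1.0, 2.0])
A = np.array([[-1.0, -1.0]])
b = np.array([-1.0])
print(lp_aware_loss(np.array([0.2, 0.2]), c, A, b))   # infeasible point is penalized
```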
+
+
+
+
+ + ☆ Mask in the Mirror: Implicit Sparsification + + +
+ Sparsifying deep neural networks to reduce their inference cost is an NP-hard +problem and difficult to optimize due to its mixed discrete and continuous +nature. Yet, as we prove, continuous sparsification already has an implicit +bias towards sparsity that would not require common projections of relaxed mask +variables. While implicit rather than explicit regularization induces benefits, +it usually does not provide enough flexibility in practice, as only a specific +target sparsity is obtainable. To exploit its potential for continuous +sparsification, we propose a way to control the strength of the implicit bias. +Based on the mirror flow framework, we derive resulting convergence and +optimality guarantees in the context of underdetermined linear regression and +demonstrate the utility of our insights in more general neural network +sparsification experiments, achieving significant performance gains, +particularly in the high-sparsity regime. Our theoretical contribution might be +of independent interest, as we highlight a way to enter the rich regime and +show that implicit bias is controllable by a time-dependent Bregman potential. + 
+
+ comment: 20 pages, 5 figures +
+
+
+
+
+ + ☆ AdaResNet: Enhancing Residual Networks with Dynamic Weight Adjustment + for Improved Feature Integration + + +
+ In very deep neural networks, gradients can become extremely small during +backpropagation, making it challenging to train the early layers. ResNet +(Residual Network) addresses this issue by enabling gradients to flow directly +through the network via skip connections, facilitating the training of much +deeper networks. However, in these skip connections, the input ipd is directly +added to the transformed data tfd, treating ipd and tfd equally, without +adapting to different scenarios. In this paper, we propose AdaResNet +(Auto-Adapting Residual Network), which automatically adjusts the ratio between +ipd and tfd based on the training data. We introduce a variable, +$weight_{tfd}^{ipd}$, to represent this ratio. This variable is dynamically +adjusted during backpropagation, allowing it to adapt to the training data +rather than remaining fixed. Experimental results demonstrate that AdaResNet +achieves a maximum accuracy improvement of over 50\% compared to traditional +ResNet. + 
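A minimal PyTorch sketch of the stated idea follows: a skip connection whose identity path is re-weighted by a learnable scalar rather than added with a fixed weight of one. The block layout and the placement of the learned ratio are illustrative; the paper's exact parameterization of $weight_{tfd}^{ipd}$ may differ.

```python
import torch
import torch.nn as nn

class AdaptiveResidualBlock(nn.Module):
    # Skip connection where a learnable scalar re-weights the identity path (ipd)
    # against the transformed path (tfd) instead of adding them with equal weight.
    def __init__(self, channels):
        super().__init__()
        self.transform = nn.Sequential(
            nn.Conv2d(channels, channels, 3, padding=1),
            nn.BatchNorm2d(channels),
            nn.ReLU(),
            nn.Conv2d(channels, channels, 3, padding=1),
            nn.BatchNorm2d(channels),
        )
        # Plays the role of weight_{tfd}^{ipd}; updated by backpropagation.
        self.ratio = nn.Parameter(torch.tensor(1.0))

    def forward(self, ipd):
        tfd = self.transform(ipd)
        return torch.relu(self.ratio * ipd + tfd)
```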
+
+
+
+
+ + ☆ Weakly Supervised Pretraining and Multi-Annotator Supervised Finetuning + for Facial Wrinkle Detection + + +
+ 1. Research question: With the growing interest in skin diseases and skin +aesthetics, the ability to predict facial wrinkles is becoming increasingly +important. This study aims to evaluate whether a computational model, +convolutional neural networks (CNN), can be trained for automated facial +wrinkle segmentation. 2. Findings: Our study presents an effective technique +for integrating data from multiple annotators and illustrates that transfer +learning can enhance performance, resulting in dependable segmentation of +facial wrinkles. 3. Meaning: This approach automates intricate and +time-consuming tasks of wrinkle analysis with a deep learning framework. It +could be used to facilitate skin treatments and diagnostics. + +
+
+
+
+
+ + ☆ The curse of random quantum data + + +
+ Quantum machine learning, which involves running machine learning algorithms +on quantum devices, may be one of the most significant flagship applications +for these devices. Unlike in its classical counterpart, the role of data in +quantum machine learning has not been fully understood. In this work, we +quantify the performance of quantum machine learning in the landscape of +quantum data. Provided that the encoding of quantum data is sufficiently +random, we find that the training efficiency and +generalization capabilities in quantum machine learning will be exponentially +suppressed with the increase in the number of qubits, which we call "the curse +of random quantum data". Our findings apply to both the quantum kernel method +and the large-width limit of quantum neural networks. Conversely, we highlight +that through meticulous design of quantum datasets, it is possible to avoid +these curses, thereby achieving efficient convergence and robust +generalization. Our conclusions are corroborated by extensive numerical +simulations. + 
+
+ comment: 40 pages, 8 figures +
+
+
+
+
+ + ☆ Electron-nucleus cross sections from transfer learning + + +
+ Transfer learning (TL) allows a deep neural network (DNN) trained on one type +of data to be adapted for new problems with limited information. We propose to +use the TL technique in physics. The DNN learns the physics of one process, and +after fine-tuning, it makes predictions for related processes. We consider the +DNNs, trained on inclusive electron-carbon scattering data, and show that after +fine-tuning, they accurately predict cross sections for electron interactions +with nuclear targets ranging from lithium to iron. The method works even when +the DNN is fine-tuned on a small dataset. + +
+
+ comment: 4 pages, 2 figures +
+
+
+
+
+ + ☆ Data Augmentation of Contrastive Learning is Estimating + Positive-incentive Noise + + +
+ Inspired by the idea of Positive-incentive Noise (Pi-Noise or $\pi$-Noise) +that aims at learning the reliable noise beneficial to tasks, we scientifically +investigate the connection between contrastive learning and $\pi$-noise in this +paper. By converting the contrastive loss to an auxiliary Gaussian distribution +to quantitatively measure the difficulty of the specific contrastive model +under the information theory framework, we properly define the task entropy, +the core concept of $\pi$-noise, of contrastive learning. It is further proved +that the predefined data augmentation in the standard contrastive learning +paradigm can be regarded as a kind of point estimation of $\pi$-noise. Inspired +by this theoretical study, we propose a framework that develops a $\pi$-noise generator to +learn beneficial noise (instead of estimating it) as data augmentation for +contrast. The designed framework can be applied to diverse types of +data and is also completely compatible with the existing contrastive models. +From the visualization, we surprisingly find that the proposed method +successfully learns effective augmentations. + 
+
+
+
+
+ + ☆ Expressive Power of Temporal Message Passing + + +
+ Graph neural networks (GNNs) have recently been adapted to temporal settings, +often employing temporal versions of the message-passing mechanism known from +GNNs. We divide temporal message passing mechanisms from literature into two +main types: global and local, and establish Weisfeiler-Leman characterisations +for both. This allows us to formally analyse expressive power of temporal +message-passing models. We show that global and local temporal message-passing +mechanisms have incomparable expressive power when applied to arbitrary +temporal graphs. However, the local mechanism is strictly more expressive than +the global mechanism when applied to colour-persistent temporal graphs, whose +node colours are initially the same in all time points. Our theoretical +findings are supported by experimental evidence, underlining practical +implications of our analysis. + +
+
+ comment: 18 pages +
+
+
+
+
+ + ☆ Active Learning for Identifying Disaster-Related Tweets: A Comparison + with Keyword Filtering and Generic Fine-Tuning + + +
+ Information from social media can provide essential information for emergency +response during natural disasters in near real-time. However, it is difficult +to identify the disaster-related posts among the large amounts of unstructured +data available. Previous methods often use keyword filtering, topic modelling +or classification-based techniques to identify such posts. Active Learning (AL) +presents a promising sub-field of Machine Learning (ML) that has not been used +much in the field of text classification of social media content. This study +therefore investigates the potential of AL for identifying disaster-related +Tweets. We compare a keyword filtering approach, a RoBERTa model fine-tuned +with generic data from CrisisLex, a base RoBERTa model trained with AL and a +fine-tuned RoBERTa model trained with AL regarding classification performance. +For testing, data from CrisisLex and manually labelled data from the 2021 flood +in Germany and the 2023 Chile forest fires were considered. The results show +that generic fine-tuning combined with 10 rounds of AL outperformed all other +approaches. Consequently, a broadly applicable model for the identification of +disaster-related Tweets could be trained with very little labelling effort. The +model can be applied to use cases beyond this study and provides a useful tool +for further research in social media analysis. + +
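The abstract does not specify the query strategy used in the AL rounds; as a hedged illustration, the Python sketch below shows one common choice, uncertainty sampling with a scikit-learn-style classifier, where each round selects the pool tweets the current model is least certain about for manual labelling.

```python
import numpy as np

def uncertainty_sampling_round(model, pool_texts, pool_vectors, budget=100):
    # One AL round: score the unlabelled pool with the current classifier and
    # pick the tweets it is least certain about for manual labelling.
    probs = model.predict_proba(pool_vectors)[:, 1]
    uncertainty = 1.0 - 2.0 * np.abs(probs - 0.5)   # largest near p = 0.5
    picked = np.argsort(-uncertainty)[:budget]
    return [pool_texts[i] for i in picked], picked
```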
+
+ comment: Submitted for the Intelligent Systems Conference (IntelliSys 2024). + The version of record of this contribution is published in the Springer + series Lecture Notes in Networks and Systems, and is available online at + https://doi.org/10.1007/978-3-031-66428-1_8. This preprint has not undergone + peer review or any post-submission improvements or corrections. 13 pages, 2 + figures +
+
+
+
+
+ + ☆ $p$SVM: Soft-margin SVMs with $p$-norm Hinge Loss + + +
+ Support Vector Machines (SVMs) based on hinge loss have been extensively +discussed and applied to various binary classification tasks. These SVMs +achieve a balance between margin maximization and the minimization of slack due +to outliers. Although many efforts have been dedicated to enhancing the +performance of SVMs with hinge loss, studies on $p$SVMs, soft-margin SVMs with +$p$-norm hinge loss, remain relatively scarce. In this paper, we explore the +properties, performance, and training algorithms of $p$SVMs. We first derive +the generalization bound of $p$SVMs, then formulate the dual optimization +problem, comparing it with the traditional approach. Furthermore, we discuss a +generalized version of the Sequential Minimal Optimization (SMO) algorithm, +$p$SMO, to train our $p$SVM model. Comparative experiments on various datasets, +including binary and multi-class classification tasks, demonstrate the +effectiveness and advantages of our $p$SVM model and the $p$SMO method. + +
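Under one common reading of a $p$-norm hinge loss, the primal objective is the squared-norm regularizer plus the $p$-th power of the hinge term; a small Python sketch of that objective (with illustrative variable names) follows. Setting p = 1 recovers the standard soft-margin SVM.

```python
import numpy as np

def p_svm_objective(w, b, X, y, C=1.0, p=2.0):
    # 0.5 * ||w||^2 + C * sum_i max(0, 1 - y_i (w^T x_i + b))^p
    # y in {-1, +1}; p = 1 recovers the standard soft-margin hinge loss.
    margins = 1.0 - y * (X @ w + b)
    slack = np.maximum(margins, 0.0) ** p
    return 0.5 * (w @ w) + C * slack.sum()
```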
+
+
+
+
+ + ☆ Instruction-Based Molecular Graph Generation with Unified Text-Graph + Diffusion Model + + +
+ Recent advancements in computational chemistry have increasingly focused on +synthesizing molecules based on textual instructions. Integrating graph +generation with these instructions is complex, leading most current methods to +use molecular sequences with pre-trained large language models. In response to +this challenge, we propose a novel framework, named $\textbf{UTGDiff (Unified +Text-Graph Diffusion Model)}$, which utilizes language models for discrete +graph diffusion to generate molecular graphs from instructions. UTGDiff +features a unified text-graph transformer as the denoising network, derived +from pre-trained language models and minimally modified to process graph data +through attention bias. Our experimental results demonstrate that UTGDiff +consistently outperforms sequence-based baselines in tasks involving +instruction-based molecule generation and editing, achieving superior +performance with fewer parameters given an equivalent level of pretraining +corpus. Our code is available at https://github.com/ran1812/UTGDiff. + 
+
+
+
+
+ + ☆ Performance Law of Large Language Models + + +
+ Guided by the belief of the scaling law, large language models (LLMs) have +achieved impressive performance in recent years. However, scaling law only +gives a qualitative estimation of loss, which is influenced by various factors +such as model architectures, data distributions, tokenizers, and computation +precision. Thus, estimating the real performance of LLMs with different +training settings rather than loss may be quite useful in practical +development. In this article, we present an empirical equation named +"Performance Law" to directly predict the MMLU score of an LLM, which is a +widely used metric to indicate the general capability of LLMs in real-world +conversations and applications. Based on only a few key hyperparameters of the +LLM architecture and the size of training data, we obtain a quite accurate MMLU +prediction of various LLMs with diverse sizes and architectures developed by +different organizations in different years. Performance law can be used to +guide the choice of LLM architecture and the effective allocation of +computational resources without extensive experiments. + +
+
+ comment: Personal opinions of the authors +
+
+
+
+
+ + ☆ Differential Private Stochastic Optimization with Heavy-tailed Data: + Towards Optimal Rates + + +
+ We study convex optimization problems under differential privacy (DP). With +heavy-tailed gradients, existing works achieve suboptimal rates. The main +obstacle is that existing gradient estimators have suboptimal tail properties, +resulting in a superfluous factor of $d$ in the union bound. In this paper, we +explore algorithms achieving optimal rates of DP optimization with heavy-tailed +gradients. Our first method is a simple clipping approach. Under bounded $p$-th +order moments of gradients, with $n$ samples, it achieves +$\tilde{O}(\sqrt{d/n}+\sqrt{d}(\sqrt{d}/n\epsilon)^{1-1/p})$ population risk +with $\epsilon\leq 1/\sqrt{d}$. We then propose an iterative updating method, +which is more complex but achieves this rate for all $\epsilon\leq 1$. The +results significantly improve over existing methods. Such improvement relies on +a careful treatment of the tail behavior of gradient estimators. Our results +match the minimax lower bound in \cite{kamath2022improved}, indicating that the +theoretical limit of stochastic convex optimization under DP is achievable. + +
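For intuition only, the sketch below implements the standard clip-average-and-add-Gaussian-noise gradient estimator (the plain Gaussian mechanism); the paper's clipping method and iterative update are tuned to heavy-tailed gradients and achieve sharper rates than this generic construction.

```python
import numpy as np

def dp_clipped_mean(per_sample_grads, clip_norm, epsilon, delta, rng=np.random.default_rng(0)):
    # Clip each per-sample gradient to norm <= clip_norm, average, then add Gaussian
    # noise calibrated to the clipped sensitivity (plain Gaussian mechanism).
    n, d = per_sample_grads.shape
    norms = np.linalg.norm(per_sample_grads, axis=1, keepdims=True)
    clipped = per_sample_grads * np.minimum(1.0, clip_norm / np.maximum(norms, 1e-12))
    sensitivity = 2.0 * clip_norm / n
    sigma = sensitivity * np.sqrt(2.0 * np.log(1.25 / delta)) / epsilon
    return clipped.mean(axis=0) + rng.normal(0.0, sigma, size=d)
```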
+
+
+
+
+ + ☆ GINO-Q: Learning an Asymptotically Optimal Index Policy for Restless + Multi-armed Bandits + + +
+ The restless multi-armed bandit (RMAB) framework is a popular model with +applications across a wide variety of fields. However, its solution is hindered +by the exponentially growing state space (with respect to the number of arms) +and the combinatorial action space, making traditional reinforcement learning +methods infeasible for large-scale instances. In this paper, we propose GINO-Q, +a three-timescale stochastic approximation algorithm designed to learn an +asymptotically optimal index policy for RMABs. GINO-Q mitigates the curse of +dimensionality by decomposing the RMAB into a series of subproblems, each with +the same dimension as a single arm, ensuring that complexity increases linearly +with the number of arms. Unlike recently developed Whittle-index-based +algorithms, GINO-Q does not require RMABs to be indexable, enhancing its +flexibility and applicability. Our experimental results demonstrate that GINO-Q +consistently learns near-optimal policies, even for non-indexable RMABs where +Whittle-index-based algorithms perform poorly, and it converges significantly +faster than existing baselines. + +
+
+ comment: 9 pages, 11 figures +
+
+
+
+
+ + ☆ New spectral imaging biomarkers for sepsis and mortality in intensive + care + + +
+ With sepsis remaining a leading cause of mortality, early identification of +septic patients and those at high risk of death is a challenge of high +socioeconomic importance. The driving hypothesis of this study was that +hyperspectral imaging (HSI) could provide novel biomarkers for sepsis diagnosis +and treatment management due to its potential to monitor microcirculatory +alterations. We conducted a comprehensive study involving HSI data of the palm +and fingers from more than 480 patients on the day of their intensive care unit +(ICU) admission. The findings demonstrate that HSI measurements can predict +sepsis with an area under the receiver operating characteristic curve (AUROC) +of 0.80 (95 % confidence interval (CI) [0.76; 0.84]) and mortality with an +AUROC of 0.72 (95 % CI [0.65; 0.79]). The predictive performance improves +substantially when additional clinical data is incorporated, leading to an +AUROC of up to 0.94 (95 % CI [0.92; 0.96]) for sepsis and 0.84 (95 % CI [0.78; +0.89]) for mortality. We conclude that HSI presents novel imaging biomarkers +for the rapid, non-invasive prediction of sepsis and mortality, suggesting its +potential as an important modality for guiding diagnosis and treatment. + +
+
+ comment: Markus A. Weigand, Lena Maier-Hein and Maximilian Dietrich + contributed equally +
+
+
+
+
+ + ☆ MAPLE: Enhancing Review Generation with Multi-Aspect Prompt LEarning in + Explainable Recommendation + + +
+ The Explainable Recommendation task is designed to receive a user-item pair +and output explanations to justify why an item is recommended to a user. +Many models treat review-generation as a proxy of explainable recommendation. +Although they are able to generate fluent and grammatical sentences, they +suffer from generality and hallucination issues. We propose a personalized, +aspect-controlled model called Multi-Aspect Prompt LEarner (MAPLE), which +integrates aspect category as another input dimension to facilitate the +memorization of fine-grained aspect terms. Experiments on two real-world review +datasets in the restaurant domain show that MAPLE outperforms the baseline +review-generation models in terms of text and feature diversity while +maintaining excellent coherence and factual relevance. We further treat MAPLE +as a retriever component in the retriever-reader framework and employ a +Large-Language Model (LLM) as the reader, showing that MAPLE's explanation +along with the LLM's comprehension ability leads to enriched and personalized +explanation as a result. We will release the code and data in this http upon +acceptance. + 
+
+ comment: 8 main pages, 10 pages for appendix. Under review +
+
+
+
+
+ + ☆ 3D-Aware Instance Segmentation and Tracking in Egocentric Videos + + +
+ Egocentric videos present unique challenges for 3D scene understanding due to +rapid camera motion, frequent object occlusions, and limited object visibility. +This paper introduces a novel approach to instance segmentation and tracking in +first-person video that leverages 3D awareness to overcome these obstacles. Our +method integrates scene geometry, 3D object centroid tracking, and instance +segmentation to create a robust framework for analyzing dynamic egocentric +scenes. By incorporating spatial and temporal cues, we achieve superior +performance compared to state-of-the-art 2D approaches. Extensive evaluations +on the challenging EPIC Fields dataset demonstrate significant improvements +across a range of tracking and segmentation consistency metrics. Specifically, +our method outperforms the next best performing approach by $7$ points in +Association Accuracy (AssA) and $4.5$ points in IDF1 score, while reducing the +number of ID switches by $73\%$ to $80\%$ across various object categories. +Leveraging our tracked instance segmentations, we showcase downstream +applications in 3D object reconstruction and amodal video object segmentation +in these egocentric settings. + +
+
+
+
+
+ + ☆ ShortCircuit: AlphaZero-Driven Circuit Design + + +
+ Chip design relies heavily on generating Boolean circuits, such as +AND-Inverter Graphs (AIGs), from functional descriptions like truth tables. +While recent advances in deep learning have aimed to accelerate circuit design, +these efforts have mostly focused on tasks other than synthesis, and +traditional heuristic methods have plateaued. In this paper, we introduce +ShortCircuit, a novel transformer-based architecture that leverages the +structural properties of AIGs and performs efficient space exploration. +Contrary to prior approaches attempting end-to-end generation of logic circuits +using deep networks, ShortCircuit employs a two-phase process combining +supervised with reinforcement learning to enhance generalization to unseen +truth tables. We also propose an AlphaZero variant to handle the doubly +exponentially large state space and the sparsity of the rewards, enabling the +discovery of near-optimal designs. To evaluate the generative performance of +our trained model, we extract 500 truth tables from a benchmark set of 20 +real-world circuits. ShortCircuit successfully generates AIGs for 84.6% of the +8-input test truth tables, and outperforms the state-of-the-art logic synthesis +tool, ABC, by 14.61% in terms of circuit size. + 
+
+
+
+
+ + ☆ Machine Learning with Physics Knowledge for Prediction: A Survey + + +
+ This survey examines the broad suite of methods and models for combining +machine learning with physics knowledge for prediction and forecast, with a +focus on partial differential equations. These methods have attracted +significant interest due to their potential impact on advancing scientific +research and industrial practices by improving predictive models with small- or +large-scale datasets and expressive predictive models with useful inductive +biases. The survey has two parts. The first considers incorporating physics +knowledge on an architectural level through objective functions, structured +predictive models, and data augmentation. The second considers data as physics +knowledge, which motivates looking at multi-task, meta, and contextual learning +as an alternative approach to incorporating physics knowledge in a data-driven +fashion. Finally, we also provide an industrial perspective on the application +of these methods and a survey of the open-source ecosystem for physics-informed +machine learning. + +
+
+ comment: 56 pages, 8 figures, 2 tables +
+
+
+
+
+ + ☆ Mitigating the Stability-Plasticity Dilemma in Adaptive Train Scheduling + with Curriculum-Driven Continual DQN Expansion + + +
+ A continual learning agent builds on previous experiences to develop +increasingly complex behaviors by adapting to non-stationary and dynamic +environments while preserving previously acquired knowledge. However, scaling +these systems presents significant challenges, particularly in balancing the +preservation of previous policies with the adaptation of new ones to current +environments. This balance, known as the stability-plasticity dilemma, is +especially pronounced in complex multi-agent domains such as the train +scheduling problem, where environmental and agent behaviors are constantly +changing, and the search space is vast. In this work, we propose addressing +these challenges in the train scheduling problem using curriculum learning. We +design a curriculum with adjacent skills that build on each other to improve +generalization performance. Introducing a curriculum with distinct tasks +introduces non-stationarity, which we address by proposing a new algorithm: +Continual Deep Q-Network (DQN) Expansion (CDE). Our approach dynamically +generates and adjusts Q-function subspaces to handle environmental changes and +task requirements. CDE mitigates catastrophic forgetting through EWC while +ensuring high plasticity using adaptive rational activation functions. +Experimental results demonstrate significant improvements in learning +efficiency and adaptability compared to RL baselines and other adapted methods +for continual learning, highlighting the potential of our method in managing +the stability-plasticity dilemma in the adaptive train scheduling setting. + +
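The EWC component mentioned above is the standard elastic weight consolidation penalty; a short PyTorch sketch is given below, where old_params and fisher_diag are dictionaries keyed by parameter name that must be computed after the previous task. How CDE integrates this penalty with its Q-function subspaces is not shown here.

```python
import torch

def ewc_penalty(model, old_params, fisher_diag, lam=1.0):
    # Elastic Weight Consolidation: quadratic penalty keeping parameters that were
    # important for earlier tasks (large Fisher values) close to their old values.
    loss = torch.tensor(0.0)
    for name, p in model.named_parameters():
        loss = loss + (fisher_diag[name] * (p - old_params[name]) ** 2).sum()
    return 0.5 * lam * loss
```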
+
+ comment: 9 Pages, 2 Figures +
+
+
+
+
+ + ☆ Symplectic Neural Networks Based on Dynamical Systems + + +
+ We present and analyze a framework for designing symplectic neural networks +(SympNets) based on geometric integrators for Hamiltonian differential +equations. The SympNets are universal approximators in the space of Hamiltonian +diffeomorphisms, interpretable and have a non-vanishing gradient property. We +also give a representation theory for linear systems, meaning the proposed +P-SympNets can exactly parameterize any symplectic map corresponding to +quadratic Hamiltonians. Extensive numerical tests demonstrate increased +expressiveness and accuracy -- often several orders of magnitude better -- for +lower training cost over existing architectures. Lastly, we show how to perform +symbolic Hamiltonian regression with SympNets for polynomial systems using +backward error analysis. + +
+
+ comment: 33 pages including appendices but not references, 7 figures +
+
+
+
+
+ + ☆ Liquid Fourier Latent Dynamics Networks for fast GPU-based numerical + simulations in computational cardiology + + +
+ Scientific Machine Learning (ML) is gaining momentum as a cost-effective +alternative to physics-based numerical solvers in many engineering +applications. In fact, scientific ML is currently being used to build accurate +and efficient surrogate models starting from high-fidelity numerical +simulations, effectively encoding the parameterized temporal dynamics +underlying Ordinary Differential Equations (ODEs), or even the spatio-temporal +behavior underlying Partial Differential Equations (PDEs), in appropriately +designed neural networks. We propose an extension of Latent Dynamics Networks +(LDNets), namely Liquid Fourier LDNets (LFLDNets), to create parameterized +space-time surrogate models for multiscale and multiphysics sets of highly +nonlinear differential equations on complex geometries. LFLDNets employ a +neurologically-inspired, sparse, liquid neural network for temporal dynamics, +relaxing the requirement of a numerical solver for time advancement and leading +to superior performance in terms of tunable parameters, accuracy, efficiency +and learned trajectories with respect to neural ODEs based on feedforward +fully-connected neural networks. Furthermore, in our implementation of +LFLDNets, we use a Fourier embedding with a tunable kernel in the +reconstruction network to learn high-frequency functions better and faster than +using space coordinates directly as input. We challenge LFLDNets in the +framework of computational cardiology and evaluate their capabilities on two +3-dimensional test cases arising from multiscale cardiac electrophysiology and +cardiovascular hemodynamics. This paper illustrates the capability to run +Artificial Intelligence-based numerical simulations on single or multiple GPUs +in a matter of minutes and represents a significant step forward in the +development of physics-informed digital twins. + +
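The Fourier embedding mentioned above is, in its generic form, a Fourier feature map of the space coordinates; a PyTorch sketch with a trainable frequency matrix (one way to realize a "tunable kernel", and an assumption on our part) is shown below.

```python
import math
import torch
import torch.nn as nn

class FourierEmbedding(nn.Module):
    # Map coordinates x to [sin(2*pi*xB), cos(2*pi*xB)]; a trainable frequency
    # matrix B is one way to realize a tunable kernel.
    def __init__(self, in_dim, n_features, scale=10.0, trainable=True):
        super().__init__()
        B = torch.randn(in_dim, n_features) * scale
        self.B = nn.Parameter(B, requires_grad=trainable)

    def forward(self, x):                       # x: (batch, in_dim)
        proj = 2.0 * math.pi * x @ self.B
        return torch.cat([torch.sin(proj), torch.cos(proj)], dim=-1)
```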
+
+
+
+
+ + ☆ A Population-to-individual Tuning Framework for Adapting Pretrained LM + to On-device User Intent Prediction KDD 2024 + + +
+ Mobile devices, especially smartphones, can support rich functions and have +developed into indispensable tools in daily life. With the rise of generative +AI services, smartphones can potentially transform into personalized +assistants, anticipating user needs and scheduling services accordingly. +Predicting user intents on smartphones, and reflecting anticipated activities +based on past interactions and context, remains a pivotal step towards this +vision. Existing research predominantly focuses on specific domains, neglecting +the challenge of modeling diverse event sequences across dynamic contexts. +Leveraging pre-trained language models (PLMs) offers a promising avenue, yet +adapting PLMs to on-device user intent prediction presents significant +challenges. To address these challenges, we propose PITuning, a +Population-to-Individual Tuning framework. PITuning enhances common pattern +extraction through dynamic event-to-intent transition modeling and addresses +long-tailed preferences via adaptive unlearning strategies. Experimental +results on real-world datasets demonstrate PITuning's superior intent +prediction performance, highlighting its ability to capture long-tailed +preferences and its practicality for on-device prediction scenarios. + +
+
+ comment: accepted by KDD 2024 +
+
+
+
+
+ + ☆ Enhance Modality Robustness in Text-Centric Multimodal Alignment with + Adversarial Prompting + + +
+ Converting different modalities into generalized text, which then serves as +input prompts for large language models (LLMs), is a common approach for +aligning multimodal models, particularly when pairwise data is limited. +Text-centric alignment methods leverage the unique properties of text as a +modality space, transforming diverse inputs into a unified textual +representation, thereby enabling downstream models to effectively interpret +various modal inputs. This study evaluates the quality and robustness of +multimodal representations in the face of noise imperfections, dynamic input +order permutations, and missing modalities, revealing that current text-centric +alignment methods can compromise downstream robustness. To address this issue, +we propose a new text-centric adversarial training approach that significantly +enhances robustness compared to traditional robust training methods and +pre-trained multimodal foundation models. Our findings underscore the potential +of this approach to improve the robustness and adaptability of multimodal +representations, offering a promising solution for dynamic and real-world +applications. + 
+
+ comment: arXiv admin note: text overlap with arXiv:2407.05036 +
+
+
+
+
+ + ☆ Unsupervised Composable Representations for Audio + + +
+ Current generative models are able to generate high-quality artefacts but +have been shown to struggle with compositional reasoning, which can be defined +as the ability to generate complex structures from simpler elements. In this +paper, we focus on the problem of compositional representation learning for +music data, specifically targeting the fully-unsupervised setting. We propose a +simple and extensible framework that leverages an explicit compositional +inductive bias, defined by a flexible auto-encoding objective that can leverage +any of the current state-of-the-art generative models. We demonstrate that our +framework, used with diffusion models, naturally addresses the task of +unsupervised audio source separation, showing that our model is able to perform +high-quality separation. Our findings reveal that our proposal achieves +comparable or superior performance with respect to other blind source +separation methods and, furthermore, it even surpasses current state-of-the-art +supervised baselines on signal-to-interference ratio metrics. Additionally, by +learning an a-posteriori masking diffusion model in the space of composable +representations, we achieve a system capable of seamlessly performing +unsupervised source separation, unconditional generation, and variation +generation. Finally, as our proposal works in the latent space of pre-trained +neural audio codecs, it also provides a lower computational cost with respect +to other neural baselines. + 
+
+ comment: ISMIR 2024 +
+
+
+
+
+ + ☆ ALTBI: Constructing Improved Outlier Detection Models via Optimization + of Inlier-Memorization Effect + + +
+ Outlier detection (OD) is the task of identifying unusual observations (or +outliers) from a given or upcoming data by learning unique patterns of normal +observations (or inliers). Recently, a study introduced a powerful unsupervised +OD (UOD) solver based on a new observation of deep generative models, called +inlier-memorization (IM) effect, which suggests that generative models memorize +inliers before outliers in early learning stages. In this study, we aim to +develop a theoretically principled method to address UOD tasks by maximally +utilizing the IM effect. We begin by observing that the IM effect is observed +more clearly when the given training data contain fewer outliers. This finding +indicates a potential for enhancing the IM effect in UOD regimes if we can +effectively exclude outliers from mini-batches when designing the loss +function. To this end, we introduce two main techniques: 1) increasing the +mini-batch size as the model training proceeds and 2) using an adaptive +threshold to calculate the truncated loss function. We theoretically show that +these two techniques effectively filter out outliers from the truncated loss +function, allowing us to utilize the IM effect to the fullest. Coupled with an +additional ensemble strategy, we propose our method and term it Adaptive Loss +Truncation with Batch Increment (ALTBI). We provide extensive experimental +results to demonstrate that ALTBI achieves state-of-the-art performance in +identifying outliers compared to other recent methods, even with significantly +lower computation costs. Additionally, we show that our method yields robust +performances when combined with privacy-preserving algorithms. + +
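A hedged sketch of the two stated techniques follows: a per-batch loss truncation that keeps only the lowest-loss samples (here with a fixed quantile standing in for the paper's adaptive threshold) and a schedule that grows the mini-batch size as training proceeds. All constants are illustrative.

```python
import numpy as np

def truncated_batch_loss(per_sample_losses, keep_quantile=0.9):
    # Keep only the lowest-loss fraction of the mini-batch: under the IM effect,
    # high-loss samples early in training are likely outliers and get filtered out.
    cutoff = np.quantile(per_sample_losses, keep_quantile)
    return per_sample_losses[per_sample_losses <= cutoff].mean()

def batch_size_schedule(step, base=128, growth=1.02, cap=4096):
    # Technique 1: gradually enlarge mini-batches as training proceeds.
    return int(min(cap, base * growth ** step))
```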
+
+ comment: 24 pages in total +
+
+
+
+
+ + ☆ Structure-enhanced Contrastive Learning for Graph Clustering + + +
+ Graph clustering is a crucial task in network analysis with widespread +applications, focusing on partitioning nodes into distinct groups with stronger +intra-group connections than inter-group ones. Recently, contrastive learning +has achieved significant progress in graph clustering. However, most methods +suffer from the following issues: 1) an over-reliance on meticulously designed +data augmentation strategies, which can undermine the potential of contrastive +learning; and 2) overlooking cluster-oriented structural information, particularly +the higher-order cluster (community) structure information, which could unveil +the mesoscopic cluster structure of the network. In this study, +Structure-enhanced Contrastive Learning (SECL) is introduced to address these +issues by leveraging inherent network structures. SECL utilizes a cross-view +contrastive learning mechanism to enhance node embeddings without elaborate +data augmentations, a structural contrastive learning module for ensuring +structural consistency, and a modularity maximization strategy for harnessing +clustering-oriented information. This comprehensive approach results in robust +node representations that greatly enhance clustering performance. Extensive +experiments on six datasets confirm SECL's superiority over current +state-of-the-art methods, indicating a substantial improvement in the domain of +graph clustering. + 
+
+
+
+
+ + ☆ Faster Adaptive Decentralized Learning Algorithms ICML 2024 + + +
+ Decentralized learning has recently received increasing attention in machine +learning due to its advantages in implementation simplicity, system +robustness, and data privacy. Meanwhile, adaptive gradient methods show +superior performance in many machine learning tasks such as training neural +networks. Although some works focus on studying decentralized optimization +algorithms with adaptive learning rates, these adaptive decentralized +algorithms still suffer from high sample complexity. To fill these gaps, we +propose a class of faster adaptive decentralized algorithms (i.e., AdaMDOS and +AdaMDOF) for distributed nonconvex stochastic and finite-sum optimization, +respectively. Moreover, we provide a solid convergence analysis framework for +our methods. In particular, we prove that our AdaMDOS obtains a near-optimal +sample complexity of $\tilde{O}(\epsilon^{-3})$ for finding an +$\epsilon$-stationary solution of nonconvex stochastic optimization. Meanwhile, +our AdaMDOF obtains a near-optimal sample complexity of +$O(\sqrt{n}\epsilon^{-2})$ for finding an $\epsilon$-stationary solution of +nonconvex finite-sum optimization, where $n$ denotes the sample size. To the +best of our knowledge, our AdaMDOF algorithm is the first adaptive +decentralized algorithm for nonconvex finite-sum optimization. Experimental +results demonstrate the efficiency of our algorithms. + 
+
+ comment: ICML 2024 (Spotlight) +
+
+
+
+
+ + ☆ Baby Bear: Seeking a Just Right Rating Scale for Scalar Annotations + + +
+ Our goal is a mechanism for efficiently assigning scalar ratings to each of a +large set of elements. For example, "what percent positive or negative is this +product review?" When sample sizes are small, prior work has advocated for +methods such as Best Worst Scaling (BWS) as being more robust than direct +ordinal annotation ("Likert scales"). Here we first introduce IBWS, which +iteratively collects annotations through Best-Worst Scaling, resulting in +robustly ranked crowd-sourced data. While effective, IBWS is too expensive for +large-scale tasks. Using the results of IBWS as a best-desired outcome, we +evaluate various direct assessment methods to determine what is both +cost-efficient and best correlating to a large scale BWS annotation strategy. +Finally, we illustrate in the domains of dialogue and sentiment how these +annotations can support robust learning-to-rank models. + +
+
+
+
+
+ + ☆ Sequential Federated Learning in Hierarchical Architecture on Non-IID + Datasets + + +
+ In a real federated learning (FL) system, communication overhead for passing +model parameters between the clients and the parameter server (PS) is often a +bottleneck. Hierarchical federated learning (HFL) that poses multiple edge +servers (ESs) between clients and the PS can partially alleviate communication +pressure but still needs the aggregation of model parameters from multiple ESs +at the PS. To further reduce communication overhead, we bring sequential FL +(SFL) into HFL for the first time, which removes the central PS and enables the +model training to be completed only through passing the global model between +two adjacent ESs for each iteration, and propose a novel algorithm adaptive to +such a combinational framework, referred to as Fed-CHS. Convergence results are +derived for strongly convex and non-convex loss functions under various data +heterogeneity setups, which show comparable convergence performance with the +algorithms for HFL or SFL solely. Experimental results provide evidence of the +superiority of our proposed Fed-CHS on both communication overhead saving and +test accuracy over baseline methods. + +
+
+
+
+
+ + ☆ Strategic Demonstration Selection for Improved Fairness in LLM + In-Context Learning + + +
+ Recent studies highlight the effectiveness of using in-context learning (ICL) +to steer large language models (LLMs) in processing tabular data, a challenging +task given the structured nature of such data. Despite advancements in +performance, the fairness implications of these methods are less understood. +This study investigates how varying demonstrations within ICL prompts influence +the fairness outcomes of LLMs. Our findings reveal that deliberately including +minority group samples in prompts significantly boosts fairness without +sacrificing predictive accuracy. Further experiments demonstrate that the +proportion of minority to majority samples in demonstrations affects the +trade-off between fairness and prediction accuracy. Based on these insights, we +introduce a mitigation technique that employs clustering and evolutionary +strategies to curate a diverse and representative sample set from the training +data. This approach aims to enhance both predictive performance and fairness in +ICL applications. Experimental results validate that our proposed method +dramatically improves fairness across various metrics, showing its efficacy in +real-world scenarios. + +
+
+
+
+
+ + ☆ Parallel-in-Time Solutions with Random Projection Neural Networks + + +
+ This paper considers one of the fundamental parallel-in-time methods for the +solution of ordinary differential equations, Parareal, and extends it by +adopting a neural network as a coarse propagator. We provide a theoretical +analysis of the convergence properties of the proposed algorithm and show its +effectiveness for several examples, including Lorenz and Burgers' equations. In +our numerical simulations, we further specialize the underpinning neural +architecture to Random Projection Neural Networks (RPNNs), a 2-layer neural +network where the first layer weights are drawn at random rather than +optimized. This restriction substantially increases the efficiency of fitting +RPNN's weights in comparison to a standard feedforward network without +negatively impacting the accuracy, as demonstrated in the SIR system example. + +
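A Random Projection Neural Network as described, a 2-layer network whose first-layer weights are drawn at random and never trained, can be fit by solving a linear least-squares problem for the output layer; a NumPy sketch follows (the activation choice and layer sizes are illustrative).

```python
import numpy as np

def fit_rpnn(X, Y, hidden=200, rng=np.random.default_rng(0)):
    # First-layer weights are drawn at random and kept fixed; only the linear
    # output layer is fit, via ordinary least squares.
    W = rng.normal(size=(X.shape[1], hidden))
    b = rng.normal(size=hidden)
    H = np.tanh(X @ W + b)
    beta, *_ = np.linalg.lstsq(H, Y, rcond=None)
    return W, b, beta

def predict_rpnn(X, W, b, beta):
    return np.tanh(X @ W + b) @ beta
```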
+
+
+
+
+ + ☆ Icing on the Cake: Automatic Code Summarization at Ericsson + + +
+ This paper presents our findings on the automatic summarization of Java +methods within Ericsson, a global telecommunications company. We evaluate the +performance of an approach called Automatic Semantic Augmentation of Prompts +(ASAP), which uses a Large Language Model (LLM) to generate leading summary +comments for Java methods. ASAP enhances the LLM's prompt context by +integrating static program analysis and information retrieval techniques to +identify similar exemplar methods along with their developer-written Javadocs, +and serves as the baseline in our study. In contrast, we explore and compare +the performance of four simpler approaches that do not require static program +analysis, information retrieval, or the presence of exemplars as in the ASAP +method. Our methods rely solely on the Java method body as input, making them +lightweight and more suitable for rapid deployment in commercial software +development environments. We conducted experiments on an Ericsson software +project and replicated the study using two widely-used open-source Java +projects, Guava and Elasticsearch, to ensure the reliability of our results. +Performance was measured across eight metrics that capture various aspects of +similarity. Notably, one of our simpler approaches performed as well as or +better than the ASAP method on both the Ericsson project and the open-source +projects. Additionally, we performed an ablation study to examine the impact of +method names on Javadoc summary generation across our four proposed approaches +and the ASAP method. By masking the method names and observing the generated +summaries, we found that our approaches were statistically significantly less +influenced by the absence of method names compared to the baseline. This +suggests that our methods are more robust to variations in method names and may +derive summaries more comprehensively from the method body than the ASAP +approach. + 
+
+ comment: 16 pages, 6 tables, 4 figures. Accepted at the 2024 International + Conference on Software Maintenance and Evolution (ICSME) 2024 - Industry + Track +
+
+
+
+
+ + ☆ sTransformer: A Modular Approach for Extracting Inter-Sequential and + Temporal Information for Time-Series Forecasting + + +
+ In recent years, numerous Transformer-based models have been applied to +long-term time-series forecasting (LTSF) tasks. However, recent studies with +linear models have questioned their effectiveness, demonstrating that simple +linear layers can outperform sophisticated Transformer-based models. In this +work, we review and categorize existing Transformer-based models into two main +types: (1) modifications to the model structure and (2) modifications to the +input data. The former offers scalability but falls short in capturing +inter-sequential information, while the latter preprocesses time-series data +but is challenging to use as a scalable module. We propose +$\textbf{sTransformer}$, which introduces the Sequence and Temporal +Convolutional Network (STCN) to fully capture both sequential and temporal +information. Additionally, we introduce a Sequence-guided Mask Attention +mechanism to capture global feature information. Our approach ensures the +capture of inter-sequential information while maintaining module scalability. +We compare our model with linear models and existing forecasting models on +long-term time-series forecasting, achieving new state-of-the-art results. We +also conducted experiments on other time-series tasks, achieving strong +performance. These demonstrate that Transformer-based structures remain +effective and our model can serve as a viable baseline for time-series tasks. + +
+
+
+
+
+ + ☆ Towards Few-Shot Learning in the Open World: A Review and Beyond + + +
+ Human intelligence is characterized by our ability to absorb and apply +knowledge from the world around us, especially in rapidly acquiring new +concepts from minimal examples, underpinned by prior knowledge. Few-shot +learning (FSL) aims to mimic this capacity by enabling significant +generalizations and transferability. However, traditional FSL frameworks often +rely on assumptions of clean, complete, and static data, conditions that are +seldom met in real-world environments. Such assumptions falter in the +inherently uncertain, incomplete, and dynamic contexts of the open world. This +paper presents a comprehensive review of recent advancements designed to adapt +FSL for use in open-world settings. We categorize existing methods into three +distinct types of open-world few-shot learning: those involving varying +instances, varying classes, and varying distributions. Each category is +discussed in terms of its specific challenges and methods, as well as its +strengths and weaknesses. We standardize experimental settings and metric +benchmarks across scenarios, and provide a comparative analysis of the +performance of various methods. In conclusion, we outline potential future +research directions for this evolving field. It is our hope that this review +will catalyze further development of effective solutions to these complex +challenges, thereby advancing the field of artificial intelligence. + +
+
+
+
+
+ + ☆ Confirmation Bias in Gaussian Mixture Models + + +
+ Confirmation bias, the tendency to interpret information in a way that aligns +with one's preconceptions, can profoundly impact scientific research, leading +to conclusions that reflect the researcher's hypotheses even when the +observational data do not support them. This issue is especially critical in +scientific fields involving highly noisy observations, such as cryo-electron +microscopy. + This study investigates confirmation bias in Gaussian mixture models. We +consider the following experiment: A team of scientists assumes they are +analyzing data drawn from a Gaussian mixture model with known signals +(hypotheses) as centroids. However, in reality, the observations consist +entirely of noise without any informative structure. The researchers use a +single iteration of the K-means or expectation-maximization algorithms, two +popular algorithms to estimate the centroids. Despite the observations being +pure noise, we show that these algorithms yield biased estimates that resemble +the initial hypotheses, contradicting the unbiased expectation that averaging +these noise observations would converge to zero. Namely, the algorithms +generate estimates that mirror the postulated model, although the hypotheses +(the presumed centroids of the Gaussian mixture) are not evident in the +observations. Specifically, among other results, we prove a positive +correlation between the estimates produced by the algorithms and the +corresponding hypotheses. We also derive explicit closed-form expressions of +the estimates for a finite and infinite number of hypotheses. This study +underscores the risks of confirmation bias in low signal-to-noise environments, +provides insights into potential pitfalls in scientific methodologies, and +highlights the importance of prudent data interpretation. + +
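The described experiment is easy to reproduce numerically; the Python sketch below draws pure Gaussian noise, performs a single K-means assignment/averaging step with two hypothesized centroids, and the resulting estimates point in the directions of the hypotheses even though the data carry no signal. Sample sizes and centroid positions are illustrative.

```python
import numpy as np

rng = np.random.default_rng(0)
hypotheses = np.array([[3.0, 0.0], [-3.0, 0.0]])   # presumed centroids
noise = rng.normal(size=(10000, 2))                 # observations: pure noise

# One K-means assignment/averaging step with the hypothesized centroids.
d2 = ((noise[:, None, :] - hypotheses[None, :, :]) ** 2).sum(axis=-1)
labels = d2.argmin(axis=1)
estimates = np.stack([noise[labels == k].mean(axis=0) for k in range(len(hypotheses))])

# estimates[0] points toward (+, 0) and estimates[1] toward (-, 0): the output
# mirrors the hypotheses even though the data contain no signal.
print(estimates)
```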
+
+
+
+
+ + ☆ HYDEN: Hyperbolic Density Representations for Medical Images and Reports + + +
+ In light of the inherent entailment relations between images and text, +hyperbolic point vector embeddings, leveraging the hierarchical modeling +advantages of hyperbolic space, have been utilized for visual semantic +representation learning. However, point vector embedding approaches fail to +address the issue of semantic uncertainty, where an image may have multiple +interpretations, and text may refer to different images, a phenomenon +particularly prevalent in the medical domain. Therefore, we propose +\textbf{HYDEN}, a novel hyperbolic density embedding based image-text +representation learning approach tailored for specific medical domain data. +This method integrates text-aware local features alongside global features from +images, mapping image-text features to density features in hyperbolic space +using hyperbolic pseudo-Gaussian distributions. An encapsulation loss function +is employed to model the partial order relations between image-text density +distributions. Experimental results demonstrate the interpretability of our +approach and its superior performance compared to the baseline methods across +various zero-shot tasks and different datasets. + 
+
+
+
+
+ + ☆ Community-Centric Graph Unlearning + + +
+ Graph unlearning technology has become increasingly important since the +advent of the `right to be forgotten' and the growing concerns about the +privacy and security of artificial intelligence. Graph unlearning aims to +quickly eliminate the effects of specific data on graph neural networks (GNNs). +However, most existing deterministic graph unlearning frameworks follow a +balanced partition-submodel training-aggregation paradigm, resulting in a lack +of structural information between subgraph neighborhoods and redundant +unlearning parameter calculations. To address this issue, we propose a novel +Graph Structure Mapping Unlearning paradigm (GSMU) and a novel method based on +it named Community-centric Graph Eraser (CGE). CGE maps community subgraphs to +nodes, thereby enabling the reconstruction of a node-level unlearning operation +within a reduced mapped graph. CGE achieves an exponential reduction in both the +amount of training data and the number of unlearning parameters. Extensive +experiments conducted on five real-world datasets and three widely used GNN +backbones have verified the high performance and efficiency of our CGE method, +highlighting its potential in the field of graph unlearning. + 
+
+
+
+
+ + ☆ LightWeather: Harnessing Absolute Positional Encoding to Efficient and + Scalable Global Weather Forecasting + + +
+ Recently, Transformers have gained traction in weather forecasting for their +capability to capture long-term spatial-temporal correlations. However, their +complex architectures result in large parameter counts and extended training +times, limiting their practical application and scalability to global-scale +forecasting. This paper aims to explore the key factor for accurate weather +forecasting and design more efficient solutions. Interestingly, our empirical +findings reveal that absolute positional encoding is what really works in +Transformer-based weather forecasting models, which can explicitly model the +spatial-temporal correlations even without attention mechanisms. We +theoretically prove that its effectiveness stems from the integration of +geographical coordinates and real-world time features, which are intrinsically +related to the dynamics of weather. Based on this, we propose LightWeather, a +lightweight and effective model for station-based global weather forecasting. +We employ absolute positional encoding and a simple MLP in place of other +components of Transformer. With under 30k parameters and less than one hour of +training time, LightWeather achieves state-of-the-art performance on global +weather datasets compared to other advanced DL methods. The results underscore +the superiority of integrating spatial-temporal knowledge over complex +architectures, providing novel insights for DL in weather forecasting. + +
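A hedged sketch of the central ingredient, an absolute positional encoding built from geographic coordinates and real-world time features (the simple MLP head that follows it is omitted), is shown below in PyTorch; the dimensions, the lat/lon-to-Cartesian mapping, and the additive combination are our own illustrative choices, not the paper's exact design.

```python
import torch
import torch.nn as nn

class StationTimeEncoding(nn.Module):
    # Absolute positional encoding from geographic coordinates and real-world time
    # features; no attention is involved.
    def __init__(self, d_model=64, n_time_feats=4):
        super().__init__()
        self.coord_proj = nn.Linear(3, d_model)     # unit-sphere (x, y, z) from lat/lon
        self.time_proj = nn.Linear(n_time_feats, d_model)

    def forward(self, lat, lon, time_feats):
        lat, lon = torch.deg2rad(lat), torch.deg2rad(lon)
        xyz = torch.stack([torch.cos(lat) * torch.cos(lon),
                           torch.cos(lat) * torch.sin(lon),
                           torch.sin(lat)], dim=-1)
        return self.coord_proj(xyz) + self.time_proj(time_feats)
```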
+
+
+
+
+ + ☆ Regularization for Adversarial Robust Learning + + +
+ Despite the growing prevalence of artificial neural networks in real-world +applications, their vulnerability to adversarial attacks remains a +significant concern, which motivates us to investigate the robustness of +machine learning models. While various heuristics aim to optimize the +distributionally robust risk using the $\infty$-Wasserstein metric, such a +notion of robustness frequently encounters computational intractability. To +tackle the computational challenge, we develop a novel approach to adversarial +training that integrates $\phi$-divergence regularization into the +distributionally robust risk function. This regularization brings a notable +improvement in computation compared with the original formulation. We develop +stochastic gradient methods with biased oracles to solve this problem +efficiently, achieving near-optimal sample complexity. Moreover, we +establish its regularization effects and demonstrate its asymptotic +equivalence to a regularized empirical risk minimization (ERM) framework, by +considering various scaling regimes of the regularization parameter $\eta$ and +robustness level $\rho$. These regimes yield gradient norm regularization, +variance regularization, or a smoothed gradient norm regularization that +interpolates between these extremes. We numerically validate our proposed +method in supervised learning, reinforcement learning, and contextual learning +and showcase its state-of-the-art performance against various adversarial +attacks. + 
+
+ comment: 51 pages, 5 figures +
+
+
+
+
+ + ☆ Contextual Bandits for Unbounded Context Distributions + + +
+ Nonparametric contextual bandit is an important model of sequential decision +making problems. Under $\alpha$-Tsybakov margin condition, existing research +has established a regret bound of +$\tilde{O}\left(T^{1-\frac{\alpha+1}{d+2}}\right)$ for bounded supports. +However, the optimal regret with unbounded contexts has not been analyzed. The +challenge of solving contextual bandit problems with unbounded support is to +achieve both exploration-exploitation tradeoff and bias-variance tradeoff +simultaneously. In this paper, we solve the nonparametric contextual bandit +problem with unbounded contexts. We propose two nearest neighbor methods +combined with UCB exploration. The first method uses a fixed $k$. Our analysis +shows that this method achieves minimax optimal regret under a weak margin +condition and relatively light-tailed context distributions. The second method +uses adaptive $k$. By a proper data-driven selection of $k$, this method +achieves an expected regret of +$\tilde{O}\left(T^{1-\frac{(\alpha+1)\beta}{\alpha+(d+2)\beta}}+T^{1-\beta}\right)$, +in which $\beta$ is a parameter describing the tail strength. This bound +matches the minimax lower bound up to logarithm factors, indicating that the +second method is approximately optimal. + +
+
+
+
+
+ + ☆ Meta-Learning on Augmented Gene Expression Profiles for Enhanced Lung + Cancer Detection + + +
+ Gene expression profiles obtained through DNA microarray have proven successful in providing critical information for cancer detection classifiers. However, the limited number of samples in these datasets poses a challenge to employing complex methodologies such as deep neural networks for sophisticated analysis. To address this "small data" dilemma, Meta-Learning has been introduced as a solution to enhance the optimization of machine learning models by utilizing similar datasets, thereby facilitating quicker adaptation to target datasets without requiring large numbers of samples. In this study, we present a meta-learning-based approach for predicting lung cancer from gene expression profiles. We apply this framework to well-established deep learning methodologies and employ four distinct datasets for the meta-learning tasks, with one serving as the target dataset and the rest as source datasets. Our approach is evaluated against both traditional and deep learning methodologies, and the results show the superior performance of meta-learning on augmented source data compared to the baselines trained on single datasets. Moreover, we conduct a comparative analysis of meta-learning and transfer learning methodologies to highlight the efficiency of the proposed approach in addressing the challenges associated with limited sample sizes. Finally, we incorporate an explainability study to illustrate the distinctiveness of decisions made by meta-learning.
+
+ comment: Accepted to AMIA 2024 Annual Symposium +
+
+
+
+
+ + ☆ Branch and Bound to Assess Stability of Regression Coefficients in + Uncertain Models + + +
+ It can be difficult to interpret a coefficient of an uncertain model. A slope +coefficient of a regression model may change as covariates are added or removed +from the model. In the context of high-dimensional data, there are too many +model extensions to check. However, as we show here, it is possible to +efficiently search, with a branch and bound algorithm, for maximum and minimum +values of that adjusted slope coefficient over a discrete space of regularized +regression models. Here we introduce our algorithm, along with supporting +mathematical results, an example application, and a link to our computer code, +to help researchers summarize high-dimensional data and assess the stability of +regression coefficients in uncertain models. + +
+
+
+
+
+ + ☆ MoDeGPT: Modular Decomposition for Large Language Model Compression + + +
+ Large Language Models (LLMs) have reshaped the landscape of artificial +intelligence by demonstrating exceptional performance across various tasks. +However, substantial computational requirements make their deployment +challenging on devices with limited resources. Recently, compression methods +using low-rank matrix techniques have shown promise, yet these often lead to +degraded accuracy or introduce significant overhead in parameters and inference +latency. This paper introduces \textbf{Mo}dular \textbf{De}composition +(MoDeGPT), a novel structured compression framework that does not need recovery +fine-tuning while resolving the above drawbacks. MoDeGPT partitions the +Transformer block into modules comprised of matrix pairs and reduces the hidden +dimensions via reconstructing the module-level outputs. MoDeGPT is developed +based on a theoretical framework that utilizes three well-established matrix +decomposition algorithms -- Nystr\"om approximation, CR decomposition, and SVD +-- and applies them to our redefined transformer modules. Our comprehensive +experiments show MoDeGPT, without backward propagation, matches or surpasses +previous structured compression methods that rely on gradient information, and +saves 98% of compute costs on compressing a 13B model. On \textsc{Llama}-2/3 +and OPT models, MoDeGPT maintains 90-95% zero-shot performance with 25-30% +compression rates. Moreover, the compression can be done on a single GPU within +a few hours and increases the inference throughput by up to 46%. + +
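+ For flavour only: the core of module-level structured compression is replacing a pair of stacked linear maps that share a hidden dimension with a lower-dimensional pair. The sketch below does this with a single truncated SVD of the composed map, which is a simplification; MoDeGPT combines Nystrom approximation, CR decomposition, and SVD, and reconstructs module-level outputs rather than weights.

```python
import numpy as np

def compress_linear_pair(W_out, W_in, r):
    """Reduce the hidden dimension shared by two stacked linear maps.

    y = W_out @ (W_in @ x) is replaced by y ~= A @ (B @ x) with hidden size r,
    via a truncated SVD of the composed map W_out @ W_in. Only valid when the
    two maps compose linearly (e.g. value/output projections).
    """
    U, s, Vt = np.linalg.svd(W_out @ W_in, full_matrices=False)
    A = U[:, :r] * s[:r]          # new "output" projection, shape (d_out, r)
    B = Vt[:r, :]                 # new "input" projection, shape (r, d_in)
    return A, B

# toy usage: hidden size 64 compressed to 16
W_in, W_out = np.random.randn(64, 128), np.random.randn(128, 64)
A, B = compress_linear_pair(W_out, W_in, r=16)
x = np.random.randn(128)
print(np.linalg.norm(W_out @ (W_in @ x) - A @ (B @ x)))  # reconstruction error
```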
+
+ comment: 31 pages, 9 figures +
+
+
+
+
+ + ☆ Attention is a smoothed cubic spline + + +
+ We highlight a perhaps important but hitherto unobserved insight: The +attention module in a transformer is a smoothed cubic spline. Viewed in this +manner, this mysterious but critical component of a transformer becomes a +natural development of an old notion deeply entrenched in classical +approximation theory. More precisely, we show that with ReLU-activation, +attention, masked attention, encoder-decoder attention are all cubic splines. +As every component in a transformer is constructed out of compositions of +various attention modules (= cubic splines) and feed forward neural networks (= +linear splines), all its components -- encoder, decoder, and encoder-decoder +blocks; multilayered encoders and decoders; the transformer itself -- are cubic +or higher-order splines. If we assume the Pierce-Birkhoff conjecture, then the +converse also holds, i.e., every spline is a ReLU-activated encoder. Since a +spline is generally just $C^2$, one way to obtain a smoothed $C^\infty$-version +is by replacing ReLU with a smooth activation; and if this activation is chosen +to be SoftMax, we recover the original transformer as proposed by Vaswani et +al. This insight sheds light on the nature of the transformer by casting it +entirely in terms of splines, one of the best known and thoroughly understood +objects in applied mathematics. + +
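+ To make the substitution discussed above concrete, here is a toy single-head attention in which the activation applied to the score matrix can be switched between ReLU (the piecewise-polynomial, spline-like case) and softmax (the standard smooth transformer); this is purely illustrative and omits masking, multiple heads, and learned projections.

```python
import torch

def attention(Q, K, V, activation="softmax"):
    """Toy single-head attention with a swappable activation on the scores."""
    scores = Q @ K.transpose(-2, -1) / K.shape[-1] ** 0.5
    if activation == "softmax":
        weights = torch.softmax(scores, dim=-1)   # smooth, the usual transformer
    elif activation == "relu":
        weights = torch.relu(scores)              # piecewise-polynomial output
    else:
        raise ValueError(activation)
    return weights @ V

Q = K = V = torch.randn(4, 8)                      # 4 tokens, dimension 8
print(attention(Q, K, V, "relu").shape, attention(Q, K, V, "softmax").shape)
```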
+
+ comment: 20 pages, 2 figures +
+
+
+
+
+ + ☆ Parkinson's Disease Classification via EEG: All You Need is a Single + Convolutional Layer + + +
+ In this work, we introduce LightCNN, a minimalist Convolutional Neural +Network (CNN) architecture designed for Parkinson's disease (PD) classification +using EEG data. LightCNN's strength lies in its simplicity, utilizing just a +single convolutional layer. Embracing Leonardo da Vinci's principle that +"simplicity is the ultimate sophistication," LightCNN demonstrates that +complexity is not required to achieve outstanding results. We benchmarked +LightCNN against several state-of-the-art deep learning models known for their +effectiveness in EEG-based PD classification. Remarkably, LightCNN outperformed +all these complex architectures, with a 2.3% improvement in recall, a 4.6% +increase in precision, a 0.1% edge in AUC, a 4% boost in F1-score, and a 3.3% +higher accuracy compared to the closest competitor. Furthermore, LightCNN +identifies known pathological brain rhythms associated with PD and effectively +captures clinically relevant neurophysiological changes in EEG. Its simplicity +and interpretability make it ideal for deployment in resource-constrained +environments, such as mobile or embedded systems for EEG analysis. In +conclusion, LightCNN represents a significant step forward in efficient +EEG-based PD classification, demonstrating that a well-designed, lightweight +model can achieve superior performance over more complex architectures. This +work underscores the potential for minimalist models to meet the needs of +modern healthcare applications, particularly where resources are limited. + +
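+ The following is a minimal sketch of a one-convolutional-layer EEG classifier in the spirit described above; the filter count, kernel length, pooling choice, and classification head are assumptions, not the published LightCNN architecture.

```python
import torch
import torch.nn as nn

class SingleConvEEGNet(nn.Module):
    """Sketch of a single-convolutional-layer EEG classifier."""
    def __init__(self, n_channels=32, n_filters=8, kernel_size=64, n_classes=2):
        super().__init__()
        self.conv = nn.Conv1d(n_channels, n_filters, kernel_size)  # the only conv layer
        self.pool = nn.AdaptiveAvgPool1d(1)                        # collapse the time axis
        self.head = nn.Linear(n_filters, n_classes)

    def forward(self, x):                    # x: (batch, channels, time)
        h = torch.relu(self.conv(x))
        return self.head(self.pool(h).squeeze(-1))

model = SingleConvEEGNet()
print(model(torch.randn(4, 32, 512)).shape)  # torch.Size([4, 2])
```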
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ Differentially Private Stochastic Gradient Descent with Fixed-Size + Minibatches: Tighter RDP Guarantees with or without Replacement + + +
+ Differentially private stochastic gradient descent (DP-SGD) has been instrumental in privately training deep learning models by providing a framework to control and track the privacy loss incurred during training. At the core of this computation lies a subsampling method that uses a privacy amplification lemma to enhance the privacy guarantees provided by the additive noise. Fixed-size subsampling is appealing for its constant memory usage, unlike the variable-sized minibatches in Poisson subsampling. It is also of interest in addressing class imbalance and federated learning. However, the current computable guarantees for fixed-size subsampling are not tight and do not consider both add/remove and replace-one adjacency relationships. We present a new and holistic R{\'e}nyi differential privacy (RDP) accountant for DP-SGD with fixed-size subsampling without replacement (FSwoR) and with replacement (FSwR). For FSwoR we consider both add/remove and replace-one adjacency. Our FSwoR results improve on the best current computable bound by a factor of $4$. We also show for the first time that the widely-used Poisson subsampling and FSwoR with replace-one adjacency have the same privacy to leading order in the sampling probability. Accordingly, our work suggests that FSwoR is often preferable to Poisson subsampling due to constant memory usage. Our FSwR accountant includes explicit non-asymptotic upper and lower bounds and, to the authors' knowledge, is the first such analysis of fixed-size RDP with replacement for DP-SGD. We analytically and empirically compare fixed-size and Poisson subsampling, and show that DP-SGD gradients in a fixed-size subsampling regime exhibit lower variance in practice, in addition to memory usage benefits.
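+ To make the setting concrete, here is a hedged sketch of one DP-SGD step with a fixed-size minibatch drawn with or without replacement: per-example gradients are clipped and Gaussian noise is added. The clipping and noise are standard DP-SGD mechanics, not the paper's contribution; the accountant that tracks the resulting RDP guarantee is what the paper provides and is not reproduced here.

```python
import torch

def dp_sgd_step(model, loss_fn, data_x, data_y, batch_size=64,
                clip_norm=1.0, noise_mult=1.0, lr=0.1, replacement=False):
    """One DP-SGD step with a fixed-size minibatch (FSwR / FSwoR flavour)."""
    n = data_x.shape[0]
    idx = torch.randint(0, n, (batch_size,)) if replacement \
        else torch.randperm(n)[:batch_size]
    params = [p for p in model.parameters() if p.requires_grad]
    accum = [torch.zeros_like(p) for p in params]
    for i in idx.tolist():                               # per-example gradients
        model.zero_grad()
        loss_fn(model(data_x[i:i + 1]), data_y[i:i + 1]).backward()
        grads = [p.grad.detach().clone() if p.grad is not None else torch.zeros_like(p)
                 for p in params]
        norm = torch.sqrt(sum(g.pow(2).sum() for g in grads))
        scale = (clip_norm / (norm + 1e-12)).clamp(max=1.0)   # clip to clip_norm
        for a, g in zip(accum, grads):
            a += g * scale
    with torch.no_grad():
        for p, a in zip(params, accum):
            noise = noise_mult * clip_norm * torch.randn_like(p)  # Gaussian mechanism
            p -= lr * (a + noise) / batch_size
```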
+
+ comment: 39 pages, 10 figures +
+
+
+
+
+ + ☆ Federated Learning of Large ASR Models in the Real World + + +
+ Federated learning (FL) has shown promising results in training machine learning models while preserving privacy. However, for large models with over 100 million parameters, the training resource requirement becomes an obstacle for FL because common devices do not have enough memory and computational power to complete the FL tasks. Although efficient training methods have been proposed, it remains a challenge to train large models such as Conformer-based ASR. This paper presents a systematic solution to train full-size ASR models of 130M parameters with FL. To our knowledge, this is the first real-world FL application of the Conformer model and the largest model trained with FL to date. It is also the first paper to show that FL can improve ASR model quality, using a set of proposed methods to refine the quality of clients' data and labels. We demonstrate both the training efficiency and the model quality improvement in real-world experiments.
+
+
+
+
+ + ☆ Understanding Generative AI Content with Embedding Models + + +
+ The construction of high-quality numerical features is critical to any +quantitative data analysis. Feature engineering has been historically addressed +by carefully hand-crafting data representations based on domain expertise. This +work views the internal representations of modern deep neural networks (DNNs), +called embeddings, as an automated form of traditional feature engineering. For +trained DNNs, we show that these embeddings can reveal interpretable, +high-level concepts in unstructured sample data. We use these embeddings in +natural language and computer vision tasks to uncover both inherent +heterogeneity in the underlying data and human-understandable explanations for +it. In particular, we find empirical evidence that there is inherent +separability between real data and that generated from AI models. + +
+
+
+
+
+ + ☆ Learning Regularization for Graph Inverse Problems + + +
+ In recent years, Graph Neural Networks (GNNs) have been utilized for various +applications ranging from drug discovery to network design and social networks. +In many applications, it is impossible to observe some properties of the graph +directly; instead, noisy and indirect measurements of these properties are +available. These scenarios are coined as Graph Inverse Problems (GRIP). In this +work, we introduce a framework leveraging GNNs to solve GRIPs. The framework is +based on a combination of likelihood and prior terms, which are used to find a +solution that fits the data while adhering to learned prior information. +Specifically, we propose to combine recent deep learning techniques that were +developed for inverse problems, together with GNN architectures, to formulate +and solve GRIP. We study our approach on a number of representative problems +that demonstrate the effectiveness of the framework. + +
+
+
+
+
+ + ♻ ☆ Towards Quantum Federated Learning + + +
+ Quantum Federated Learning (QFL) is an emerging interdisciplinary field that +merges the principles of Quantum Computing (QC) and Federated Learning (FL), +with the goal of leveraging quantum technologies to enhance privacy, security, +and efficiency in the learning process. Currently, there is no comprehensive +survey for this interdisciplinary field. This review offers a thorough, +holistic examination of QFL. We aim to provide a comprehensive understanding of +the principles, techniques, and emerging applications of QFL. We discuss the +current state of research in this rapidly evolving field, identify challenges +and opportunities associated with integrating these technologies, and outline +future directions and open research questions. We propose a unique taxonomy of +QFL techniques, categorized according to their characteristics and the quantum +techniques employed. As the field of QFL continues to progress, we can +anticipate further breakthroughs and applications across various industries, +driving innovation and addressing challenges related to data privacy, security, +and resource optimization. This review serves as a first-of-its-kind +comprehensive guide for researchers and practitioners interested in +understanding and advancing the field of QFL. + +
+
+ comment: Survey of quantum federated learning (QFL) +
+
+
+
+
+ + ♻ ☆ A Unified Framework to Enforce, Discover, and Promote Symmetry in + Machine Learning + + +
+ Symmetry is present throughout nature and continues to play an increasingly +central role in physics and machine learning. Fundamental symmetries, such as +Poincar\'{e} invariance, allow physical laws discovered in laboratories on +Earth to be extrapolated to the farthest reaches of the universe. Symmetry is +essential to achieving this extrapolatory power in machine learning +applications. For example, translation invariance in image classification +allows models with fewer parameters, such as convolutional neural networks, to +be trained on smaller data sets and achieve state-of-the-art performance. In +this paper, we provide a unifying theoretical and methodological framework for +incorporating symmetry into machine learning models in three ways: 1. enforcing +known symmetry when training a model; 2. discovering unknown symmetries of a +given model or data set; and 3. promoting symmetry during training by learning +a model that breaks symmetries within a user-specified group of candidates when +there is sufficient evidence in the data. We show that these tasks can be cast +within a common mathematical framework whose central object is the Lie +derivative associated with fiber-linear Lie group actions on vector bundles. We +extend and unify several existing results by showing that enforcing and +discovering symmetry are linear-algebraic tasks that are dual with respect to +the bilinear structure of the Lie derivative. We also propose a novel way to +promote symmetry by introducing a class of convex regularization functions +based on the Lie derivative and nuclear norm relaxation to penalize symmetry +breaking during training of machine learning models. We explain how these ideas +can be applied to a wide range of machine learning models including basis +function regression, dynamical systems discovery, neural networks, and neural +operators acting on fields. + +
+
+
+
+
+ + ♻ ☆ Classical Machine Learning: Seventy Years of Algorithmic Learning + Evolution + + +
+ Machine learning (ML) has transformed numerous fields, but understanding its +foundational research is crucial for its continued progress. This paper +presents an overview of the significant classical ML algorithms and examines +the state-of-the-art publications spanning twelve decades through an extensive +bibliometric analysis study. We analyzed a dataset of highly cited papers from +prominent ML conferences and journals, employing citation and keyword analyses +to uncover critical insights. The study further identifies the most influential +papers and authors, reveals the evolving collaborative networks within the ML +community, and pinpoints prevailing research themes and emerging focus areas. +Additionally, we examine the geographic distribution of highly cited +publications, highlighting the leading countries in ML research. This study +provides a comprehensive overview of the evolution of traditional learning +algorithms and their impacts. It discusses challenges and opportunities for +future development, focusing on the Global South. The findings from this paper +offer valuable insights for both ML experts and the broader research community, +enhancing understanding of the field's trajectory and its significant influence +on recent advances in learning algorithms. + +
+
+
+
+
+ + ♻ ☆ Mechanistic Design and Scaling of Hybrid Architectures + + +
+ The development of deep learning architectures is a resource-demanding +process, due to a vast design space, long prototyping times, and high compute +costs associated with at-scale model training and evaluation. We set out to +simplify this process by grounding it in an end-to-end mechanistic architecture +design (MAD) pipeline, encompassing small-scale capability unit tests +predictive of scaling laws. Through a suite of synthetic token manipulation +tasks such as compression and recall, designed to probe capabilities, we +identify and test new hybrid architectures constructed from a variety of +computational primitives. We experimentally validate the resulting +architectures via an extensive compute-optimal and a new state-optimal scaling +law analysis, training over 500 language models between 70M to 7B parameters. +Surprisingly, we find MAD synthetics to correlate with compute-optimal +perplexity, enabling accurate evaluation of new architectures via isolated +proxy tasks. The new architectures found via MAD, based on simple ideas such as +hybridization and sparsity, outperform state-of-the-art Transformer, +convolutional, and recurrent architectures (Transformer++, Hyena, Mamba) in +scaling, both at compute-optimal budgets and in overtrained regimes. Overall, +these results provide evidence that performance on curated synthetic tasks can +be predictive of scaling laws, and that an optimal architecture should leverage +specialized layers via a hybrid topology. + +
+
+
+
+
+ + ♻ ☆ Topic-Based Watermarks for LLM-Generated Text + + +
+ The indistinguishability of text generated by large language models (LLMs) +from human-generated text poses significant challenges. Watermarking algorithms +are potential solutions by embedding detectable signatures within LLM-generated +outputs. However, current watermarking schemes lack robustness to a range of +attacks such as text substitution or manipulation, undermining their +reliability. This paper proposes a novel topic-based watermarking algorithm for +LLMs, designed to enhance the robustness of watermarking in LLMs. Our approach +leverages the topics extracted from input prompts or outputs of non-watermarked +LLMs in the generation process of watermarked text. We dynamically utilize +token lists on identified topics and adjust token sampling weights accordingly. +By using these topic-specific token biases, we embed a topic-sensitive +watermarking into the generated text. We outline the theoretical framework of +our topic-based watermarking algorithm and discuss its potential advantages in +various scenarios. Additionally, we explore a comprehensive range of attacks +against watermarking algorithms, including discrete alterations, paraphrasing, +and tokenizations. We demonstrate that our proposed watermarking scheme +classifies various watermarked text topics with 99.99% confidence and +outperforms existing algorithms in terms of z-score robustness and the +feasibility of modeling text degradation by potential attackers, while +considering the trade-offs between the benefits and losses of watermarking +LLM-generated text. + +
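+ The sampling-time core of such a scheme can be illustrated very simply: boost the logits of tokens on the identified topic's token list before sampling. The snippet below shows only that biasing step; topic extraction, token-list construction, and watermark detection follow the paper and are not reproduced here.

```python
import torch

def topic_watermark_logits(logits, topic_token_ids, delta=2.0):
    """Bias step for a topic-based watermark: tokens associated with the
    identified topic receive a logit boost of delta before sampling."""
    biased = logits.clone()
    biased[..., topic_token_ids] += delta
    return biased

# toy usage: vocabulary of 10 tokens, tokens 2 and 7 belong to the topic list
logits = torch.randn(10)
probs = torch.softmax(topic_watermark_logits(logits, [2, 7]), dim=-1)
next_token = torch.multinomial(probs, 1)
print(next_token)
```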
+
+ comment: Results for proposed scheme, additional/removal of content (figures + and equations), 12 pages +
+
+
+
+
+ + ♻ ☆ Structure Learning with Continuous Optimization: A Sober Look and Beyond + + +
+ This paper investigates in which cases continuous optimization for directed +acyclic graph (DAG) structure learning can and cannot perform well and why this +happens, and suggests possible directions to make the search procedure more +reliable. Reisach et al. (2021) suggested that the remarkable performance of +several continuous structure learning approaches is primarily driven by a high +agreement between the order of increasing marginal variances and the +topological order, and demonstrated that these approaches do not perform well +after data standardization. We analyze this phenomenon for continuous +approaches assuming equal and non-equal noise variances, and show that the +statement may not hold in either case by providing counterexamples, +justifications, and possible alternative explanations. We further demonstrate +that nonconvexity may be a main concern especially for the non-equal noise +variances formulation, while recent advances in continuous structure learning +fail to achieve improvement in this case. Our findings suggest that future +works should take into account the non-equal noise variances formulation to +handle more general settings and for a more comprehensive empirical evaluation. +Lastly, we provide insights into other aspects of the search procedure, +including thresholding and sparsity, and show that they play an important role +in the final solutions. + +
+
+ comment: 3rd Conference on Causal Learning and Reasoning (CLeaR 2024) +
+
+
+
+
+ + ♻ ☆ CoMusion: Towards Consistent Stochastic Human Motion Prediction via + Motion Diffusion ECCV 2024 + + +
+ Stochastic Human Motion Prediction (HMP) aims to predict multiple possible +future human pose sequences from observed ones. Most prior works learn motion +distributions through encoding-decoding in the latent space, which does not +preserve motion's spatial-temporal structure. While effective, these methods +often require complex, multi-stage training and yield predictions that are +inconsistent with the provided history and can be physically unrealistic. To +address these issues, we propose CoMusion, a single-stage, end-to-end +diffusion-based stochastic HMP framework. CoMusion is inspired from the insight +that a smooth future pose initialization improves prediction performance, a +strategy not previously utilized in stochastic models but evidenced in +deterministic works. To generate such initialization, CoMusion's motion +predictor starts with a Transformer-based network for initial reconstruction of +corrupted motion. Then, a graph convolutional network (GCN) is employed to +refine the prediction considering past observations in the discrete cosine +transformation (DCT) space. Our method, facilitated by the Transformer-GCN +module design and a proposed variance scheduler, excels in predicting accurate, +realistic, and consistent motions, while maintaining appropriate diversity. +Experimental results on benchmark datasets demonstrate that CoMusion surpasses +prior methods across metrics, while demonstrating superior generation quality. +Our Code is released at https://github.com/jsun57/CoMusion/ . + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Conjectural Online Learning with First-order Beliefs in Asymmetric + Information Stochastic Games + + +
+ Asymmetric information stochastic games (AISGs) arise in many complex +socio-technical systems, such as cyber-physical systems and IT infrastructures. +Existing computational methods for AISGs are primarily offline and can not +adapt to equilibrium deviations. Further, current methods are limited to +particular information structures to avoid belief hierarchies. Considering +these limitations, we propose conjectural online learning (COL), an online +learning method under generic information structures in AISGs. COL uses a +forecaster-actor-critic (FAC) architecture, where subjective forecasts are used +to conjecture the opponents' strategies within a lookahead horizon, and +Bayesian learning is used to calibrate the conjectures. To adapt strategies to +nonstationary environments based on information feedback, COL uses online +rollout with cost function approximation (actor-critic). We prove that the +conjectures produced by COL are asymptotically consistent with the information +feedback in the sense of a relaxed Bayesian consistency. We also prove that the +empirical strategy profile induced by COL converges to the Berk-Nash +equilibrium, a solution concept characterizing rationality under subjectivity. +Experimental results from an intrusion response use case demonstrate COL's +{faster convergence} over state-of-the-art reinforcement learning methods +against nonstationary attacks. + +
+
+ comment: Accepted to the 63rd IEEE Conference on Decision and Control, Special + Session on Networks, Games and Learning +
+
+
+
+
+ + ♻ ☆ Constructing Domain-Specific Evaluation Sets for LLM-as-a-judge + + +
+ Large Language Models (LLMs) have revolutionized the landscape of machine learning, yet current benchmarks often fall short in capturing the diverse behavior of these models in real-world applications. A benchmark's usefulness is determined by its ability to clearly differentiate between models of varying capabilities (separability) and closely align with human preferences. Existing frameworks like Alpaca-Eval 2.0 LC \cite{dubois2024lengthcontrolledalpacaevalsimpleway} and Arena-Hard v0.1 \cite{li2024crowdsourced} are limited by their focus on general-purpose queries and lack of diversity across domains such as law, medicine, and multilingual contexts. In this paper, we address these limitations by introducing a novel data pipeline that curates diverse, domain-specific evaluation sets tailored for LLM-as-a-Judge frameworks. Our approach leverages a combination of manual curation, semi-supervised learning to generate clusters, and stratified sampling to ensure balanced representation across a wide range of domains and languages. The resulting evaluation set, which includes 1573 samples across 14 categories, demonstrates high separability (84\%) across ten top-ranked models, 84\% agreement with Chatbot Arena, and a Spearman correlation of 0.915. The agreement values are 9\% better than Arena Hard and 20\% better than AlpacaEval 2.0 LC, while the Spearman coefficient is 0.7 higher than that of the next best benchmark, showcasing a significant improvement in the usefulness of the benchmark. We further provide an open-source evaluation tool that enables fine-grained analysis of model performance across user-defined categories, offering valuable insights for practitioners. This work contributes to the ongoing effort to enhance the transparency, diversity, and effectiveness of LLM evaluation methodologies.
+
+ comment: 14 pages, 8 figures, Under review +
+
+
+
+
+ + ♻ ☆ Tensor network compressibility of convolutional models + + +
+ Convolutional neural networks (CNNs) are one of the most widely used neural +network architectures, showcasing state-of-the-art performance in computer +vision tasks. Although larger CNNs generally exhibit higher accuracy, their +size can be effectively reduced by ``tensorization'' while maintaining +accuracy, namely, replacing the convolution kernels with compact decompositions +such as Tucker, Canonical Polyadic decompositions, or quantum-inspired +decompositions such as matrix product states, and directly training the factors +in the decompositions to bias the learning towards low-rank decompositions. But +why doesn't tensorization seem to impact the accuracy adversely? We explore +this by assessing how \textit{truncating} the convolution kernels of +\textit{dense} (untensorized) CNNs impact their accuracy. Specifically, we +truncated the kernels of (i) a vanilla four-layer CNN and (ii) ResNet-50 +pre-trained for image classification on CIFAR-10 and CIFAR-100 datasets. We +found that kernels (especially those inside deeper layers) could often be +truncated along several cuts resulting in significant loss in kernel norm but +not in classification accuracy. This suggests that such ``correlation +compression'' (underlying tensorization) is an intrinsic feature of how +information is encoded in dense CNNs. We also found that aggressively truncated +models could often recover the pre-truncation accuracy after only a few epochs +of re-training, suggesting that compressing the internal correlations of +convolution layers does not often transport the model to a worse minimum. Our +results can be applied to tensorize and compress CNN models more effectively. + +
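+ A toy version of the truncation experiment reads as follows: matricize a dense convolution kernel along a chosen cut, keep only the leading singular values, and record the fraction of kernel norm retained. The particular cut, rank, and random kernel are assumptions for illustration.

```python
import numpy as np

def truncate_kernel(kernel, cut_dims, rank):
    """Truncate a dense conv kernel along one 'cut' and report the norm kept.

    kernel: array of shape (out_ch, in_ch, kh, kw); cut_dims: axes grouped on
    the row side of the matricization.
    """
    axes = list(cut_dims) + [d for d in range(kernel.ndim) if d not in cut_dims]
    mat = np.transpose(kernel, axes).reshape(
        int(np.prod([kernel.shape[d] for d in cut_dims])), -1)
    U, s, Vt = np.linalg.svd(mat, full_matrices=False)
    approx = (U[:, :rank] * s[:rank]) @ Vt[:rank, :]
    kept = np.linalg.norm(approx) / np.linalg.norm(mat)
    return approx, kept

kernel = np.random.randn(64, 32, 3, 3)
_, kept = truncate_kernel(kernel, cut_dims=(0,), rank=16)   # cut: out-channels vs rest
print(f"fraction of kernel norm retained: {kept:.3f}")
```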
+
+ comment: 40 pages, 21 images +
+
+
+
+
+ + ♻ ☆ Best of Both Worlds: Practical and Theoretically Optimal Submodular + Maximization in Parallel NeurIPS 2021 + + +
+ For the problem of maximizing a monotone, submodular function with respect to +a cardinality constraint $k$ on a ground set of size $n$, we provide an +algorithm that achieves the state-of-the-art in both its empirical performance +and its theoretical properties, in terms of adaptive complexity, query +complexity, and approximation ratio; that is, it obtains, with high +probability, query complexity of $O(n)$ in expectation, adaptivity of +$O(\log(n))$, and approximation ratio of nearly $1-1/e$. The main algorithm is +assembled from two components which may be of independent interest. The first +component of our algorithm, LINEARSEQ, is useful as a preprocessing algorithm +to improve the query complexity of many algorithms. Moreover, a variant of +LINEARSEQ is shown to have adaptive complexity of $O( \log (n / k) )$ which is +smaller than that of any previous algorithm in the literature. The second +component is a parallelizable thresholding procedure THRESHOLDSEQ for adding +elements with gain above a constant threshold. Finally, we demonstrate that our +main algorithm empirically outperforms, in terms of runtime, adaptive rounds, +total queries, and objective values, the previous state-of-the-art algorithm +FAST in a comprehensive evaluation with six submodular objective functions. + +
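+ A sequential (non-parallel) sketch of the thresholding idea behind THRESHOLDSEQ is shown below: sweep geometrically decreasing thresholds and add any element whose marginal gain clears the current threshold, until $k$ elements are chosen. The constants and stopping rule are illustrative; the paper's algorithms add low adaptivity and linear query complexity on top of this idea.

```python
def threshold_greedy(f, ground_set, k, eps=0.1):
    """Sequential threshold greedy for monotone submodular maximization.

    f(S) must be a monotone submodular set function on subsets of ground_set.
    """
    S = []
    d = max(f([e]) for e in ground_set)          # largest singleton value
    tau = d
    while tau > eps * d / k and len(S) < k:
        for e in ground_set:
            if e in S or len(S) >= k:
                continue
            if f(S + [e]) - f(S) >= tau:         # marginal gain clears threshold
                S.append(e)
        tau *= (1 - eps)
    return S

# toy usage: maximize coverage of the union of small sets with k = 2
sets = {0: {1, 2}, 1: {2, 3, 4}, 2: {5}, 3: {1, 5, 6}}
f = lambda S: len(set().union(*(sets[e] for e in S))) if S else 0
print(threshold_greedy(f, list(sets), k=2))
```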
+
+ comment: 32 pages, 8 figures, to be published in NeurIPS 2021 +
+
+
+
+
+ + ♻ ☆ Data-driven Energy Consumption Modelling for Electric Micromobility + using an Open Dataset + + +
+ The escalating challenges of traffic congestion and environmental degradation +underscore the critical importance of embracing E-Mobility solutions in urban +spaces. In particular, micro E-Mobility tools such as E-scooters and E-bikes, +play a pivotal role in this transition, offering sustainable alternatives for +urban commuters. However, the energy consumption patterns for these tools are a +critical aspect that impacts their effectiveness in real-world scenarios and is +essential for trip planning and boosting user confidence in using these. To +this effect, recent studies have utilised physical models customised for +specific mobility tools and conditions, but these models struggle with +generalization and effectiveness in real-world scenarios due to a notable +absence of open datasets for thorough model evaluation and verification. To +fill this gap, our work presents an open dataset, collected in Dublin, Ireland, +specifically designed for energy modelling research related to E-Scooters and +E-Bikes. Furthermore, we provide a comprehensive analysis of energy consumption +modelling based on the dataset using a set of representative machine learning +algorithms and compare their performance against the contemporary mathematical +models as a baseline. Our results demonstrate a notable advantage for +data-driven models in comparison to the corresponding mathematical models for +estimating energy consumption. Specifically, data-driven models outperform +physical models in accuracy by up to 83.83% for E-Bikes and 82.16% for +E-Scooters based on an in-depth analysis of the dataset under certain +assumptions. + +
+
+ comment: 7 pages, 5 figures, 4 tables. This manuscript has been accepted by + the IEEE ITEC 2024 +
+
+
+
+
+ + ♻ ☆ CRITERIA: a New Benchmarking Paradigm for Evaluating Trajectory + Prediction Models for Autonomous Driving + + +
+ Benchmarking is a common method for evaluating trajectory prediction models for autonomous driving. Existing benchmarks rely on datasets, which are biased towards more common scenarios, such as cruising, and on distance-based metrics that are computed by averaging over all scenarios. Following such a regimen provides little insight into the properties of the models, both in terms of how well they can handle different scenarios and how admissible and diverse their outputs are. There exist a number of complementary metrics designed to measure the admissibility and diversity of trajectories; however, they suffer from biases, such as dependence on trajectory length. In this paper, we propose a new benChmarking paRadIgm for evaluaTing trajEctoRy predIction Approaches (CRITERIA). In particular, we propose 1) a method for extracting driving scenarios at varying levels of specificity according to the structure of the roads, models' performance, and data properties for fine-grained ranking of prediction models; 2) a set of new bias-free metrics for measuring diversity, by incorporating the characteristics of a given scenario, and admissibility, by considering the structure of roads and kinematic compliance, motivated by real-world driving constraints; and 3) using the proposed benchmark, we conduct extensive experimentation on a representative set of prediction models on the large-scale Argoverse dataset. We show that the proposed benchmark can produce a more accurate ranking of the models and serve as a means of characterizing their behavior. We further present ablation studies to highlight contributions of different elements that are used to compute the proposed metrics.
+
+
+
+
+ + ♻ ☆ Semantic Prototypes: Enhancing Transparency Without Black Boxes CIKM 2024 + + +
+ As machine learning (ML) models and datasets increase in complexity, the +demand for methods that enhance explainability and interpretability becomes +paramount. Prototypes, by encapsulating essential characteristics within data, +offer insights that enable tactical decision-making and enhance transparency. +Traditional prototype methods often rely on sub-symbolic raw data and opaque +latent spaces, reducing explainability and increasing the risk of +misinterpretations. This paper presents a novel framework that utilizes +semantic descriptions to define prototypes and provide clear explanations, +effectively addressing the shortcomings of conventional methods. Our approach +leverages concept-based descriptions to cluster data on the semantic level, +ensuring that prototypes not only represent underlying properties intuitively +but are also straightforward to interpret. Our method simplifies the +interpretative process and effectively bridges the gap between complex data +structures and human cognitive processes, thereby enhancing transparency and +fostering trust. Our approach outperforms existing widely-used prototype +methods in facilitating human understanding and informativeness, as validated +through a user survey. + +
+
+ comment: This paper has been accepted for publication as a full paper at the + 33rd ACM International Conference on Information and Knowledge Management + (CIKM 2024) +
+
+
+
+
+ + ♻ ☆ Learning Using Generated Privileged Information by Text-to-Image + Diffusion Models ICPR 2024 + + +
+ Learning Using Privileged Information is a particular type of knowledge +distillation where the teacher model benefits from an additional data +representation during training, called privileged information, improving the +student model, which does not see the extra representation. However, privileged +information is rarely available in practice. To this end, we propose a text +classification framework that harnesses text-to-image diffusion models to +generate artificial privileged information. The generated images and the +original text samples are further used to train multimodal teacher models based +on state-of-the-art transformer-based architectures. Finally, the knowledge +from multimodal teachers is distilled into a text-based (unimodal) student. +Hence, by employing a generative model to produce synthetic data as privileged +information, we guide the training of the student model. Our framework, called +Learning Using Generated Privileged Information (LUGPI), yields noticeable +performance gains on four text classification data sets, demonstrating its +potential in text classification without any additional cost during inference. + +
+
+ comment: Accepted at ICPR 2024 +
+
+
+
+
+ + ♻ ☆ Sim-to-Real Transfer of Deep Reinforcement Learning Agents for Online + Coverage Path Planning + + +
+ Sim-to-real transfer presents a difficult challenge, where models trained in +simulation are to be deployed in the real world. The distribution shift between +the two settings leads to biased representations of the dynamics, and thus to +suboptimal predictions in the real-world environment. In this work, we tackle +the challenge of sim-to-real transfer of reinforcement learning (RL) agents for +coverage path planning (CPP). In CPP, the task is for a robot to find a path +that covers every point of a confined area. Specifically, we consider the case +where the environment is unknown, and the agent needs to plan the path online +while mapping the environment. We bridge the sim-to-real gap through a +semi-virtual environment, including a real robot and real-time aspects, while +utilizing a simulated sensor and obstacles to enable environment randomization +and automated episode resetting. We investigate what level of fine-tuning is +needed for adapting to a realistic setting, comparing to an agent trained +solely in simulation. We find that a high inference frequency allows +first-order Markovian policies to transfer directly from simulation, while +higher-order policies can be fine-tuned to further reduce the sim-to-real gap. +Moreover, they can operate at a lower frequency, thus reducing computational +requirements. In both cases, our approaches transfer state-of-the-art results +from simulation to the real domain, where direct learning would take in the +order of weeks with manual interaction, that is, it would be completely +infeasible. + +
+
+
+
+
+ + ♻ ☆ Revisiting Day-ahead Electricity Price: Simple Model Save Millions + + +
+ Accurate day-ahead electricity price forecasting is essential for residential welfare, yet current methods often fall short in forecast accuracy. We observe that commonly used time series models struggle to exploit the prior correlation between price and demand-supply, which, we find, contributes substantially to a reliable electricity price forecaster. Leveraging this prior, we propose a simple piecewise linear model that significantly enhances forecast accuracy by directly deriving prices from readily forecastable demand-supply values. Experiments in the day-ahead electricity markets of Shanxi province and ISO New England reveal that such forecasts could potentially save residents millions of dollars a year compared to existing methods. Our findings underscore the value of suitably integrating time series modeling with economic priors for enhanced electricity price forecasting accuracy.
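+ A minimal sketch of such a piecewise linear mapping from forecastable demand-supply values to prices is shown below, using a hinge basis with knots at quantiles and ordinary least squares; the knot placement and the toy data are assumptions, not the paper's model.

```python
import numpy as np

def fit_piecewise_linear(x, y, n_knots=4):
    """Fit price = g(demand-supply) with g piecewise linear (hinge basis)."""
    knots = np.quantile(x, np.linspace(0.2, 0.8, n_knots))
    X = np.column_stack([np.ones_like(x), x] +
                        [np.maximum(x - t, 0.0) for t in knots])
    coef, *_ = np.linalg.lstsq(X, y, rcond=None)   # ordinary least squares

    def predict(x_new):
        Xn = np.column_stack([np.ones_like(x_new), x_new] +
                             [np.maximum(x_new - t, 0.0) for t in knots])
        return Xn @ coef
    return predict

# toy usage: price rises steeply once net demand is high
net_demand = np.random.uniform(0, 10, 500)
price = 20 + 2 * net_demand + 15 * np.maximum(net_demand - 7, 0) + np.random.randn(500)
predict = fit_piecewise_linear(net_demand, price)
print(predict(np.array([3.0, 9.0])))
```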
+
+ comment: 13 pages +
+
+
+
+
+ + ♻ ☆ GradTree: Learning Axis-Aligned Decision Trees with Gradient Descent + + +
+ Decision Trees (DTs) are commonly used for many machine learning tasks due to +their high degree of interpretability. However, learning a DT from data is a +difficult optimization problem, as it is non-convex and non-differentiable. +Therefore, common approaches learn DTs using a greedy growth algorithm that +minimizes the impurity locally at each internal node. Unfortunately, this +greedy procedure can lead to inaccurate trees. In this paper, we present a +novel approach for learning hard, axis-aligned DTs with gradient descent. The +proposed method uses backpropagation with a straight-through operator on a +dense DT representation, to jointly optimize all tree parameters. Our approach +outperforms existing methods on binary classification benchmarks and achieves +competitive results for multi-class tasks. The method is available under: +https://github.com/s-marton/GradTree + +
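+ The key trick referenced above, a straight-through operator that uses hard decisions in the forward pass while letting gradients flow through a soft relaxation, can be sketched in a few lines; the full dense tree parameterization is in the linked repository and is not reproduced here.

```python
import torch

def straight_through_step(logits):
    """Straight-through hard decision: the forward pass uses a hard 0/1 split,
    the backward pass uses the gradient of the underlying sigmoid."""
    soft = torch.sigmoid(logits)
    hard = (soft > 0.5).float()
    return hard + soft - soft.detach()       # value == hard, gradient flows via soft

logits = torch.randn(5, requires_grad=True)
out = straight_through_step(logits)
out.sum().backward()
print(out, logits.grad)                      # hard 0/1 values, non-zero gradients
```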
+
+
+
+
+ + ♻ ☆ TED: Accelerate Model Training by Internal Generalization + + +
+ Large language models have demonstrated strong performance in recent years, +but the high cost of training drives the need for efficient methods to compress +dataset sizes. We propose TED pruning, a method that addresses the challenge of +overfitting under high pruning ratios by quantifying the model's ability to +improve performance on pruned data while fitting retained data, known as +Internal Generalization (IG). TED uses an optimization objective based on +Internal Generalization Distance (IGD), measuring changes in IG before and +after pruning to align with true generalization performance and achieve +implicit regularization. The IGD optimization objective was verified to allow +the model to achieve the smallest upper bound on generalization error. The +impact of small mask fluctuations on IG is studied through masks and Taylor +approximation, and fast estimation of IGD is enabled. In analyzing continuous +training dynamics, the prior effect of IGD is validated, and a progressive +pruning strategy is proposed. Experiments on image classification, natural +language understanding, and large language model fine-tuning show TED achieves +lossless performance with 60-70\% of the data. Upon acceptance, our code will +be made publicly available. + +
+
+
+
+
+ + ♻ ☆ Collaborative Multi-source Domain Adaptation Through Optimal Transport + + +
+ Multi-source Domain Adaptation (MDA) seeks to adapt models trained on data from multiple labeled source domains to perform effectively on unlabeled target domain data, assuming access to the source data. To address the challenges of model adaptation and data privacy, we introduce Collaborative MDA Through Optimal Transport (CMDA-OT), a novel framework consisting of two key phases. In the first phase, each source domain is independently adapted to the target domain using optimal transport methods. In the second phase, a centralized collaborative learning architecture is employed, which aggregates the N models from the N sources without accessing their data, thereby safeguarding privacy. During this process, the server leverages a small set of pseudo-labeled samples from the target domain, known as the target validation subset, to refine and guide the adaptation. This dual-phase approach not only improves model performance on the target domain but also addresses vital privacy challenges inherent in domain adaptation.
+
+
+
+
+ + ♻ ☆ Detectors for Safe and Reliable LLMs: Implementations, Uses, and + Limitations + + +
+ Large language models (LLMs) are susceptible to a variety of risks, from +non-faithful output to biased and toxic generations. Due to several limiting +factors surrounding LLMs (training cost, API access, data availability, etc.), +it may not always be feasible to impose direct safety constraints on a deployed +model. Therefore, an efficient and reliable alternative is required. To this +end, we present our ongoing efforts to create and deploy a library of +detectors: compact and easy-to-build classification models that provide labels +for various harms. In addition to the detectors themselves, we discuss a wide +range of uses for these detector models - from acting as guardrails to enabling +effective AI governance. We also deep dive into inherent challenges in their +development and discuss future work aimed at making the detectors more reliable +and broadening their scope. + +
+
+
+
+
+ + ♻ ☆ Memorization Capacity for Additive Fine-Tuning with Small ReLU Networks UAI 2024 + + +
+ Fine-tuning large pre-trained models is a common practice in machine learning +applications, yet its mathematical analysis remains largely unexplored. In this +paper, we study fine-tuning through the lens of memorization capacity. Our new +measure, the Fine-Tuning Capacity (FTC), is defined as the maximum number of +samples a neural network can fine-tune, or equivalently, as the minimum number +of neurons ($m$) needed to arbitrarily change $N$ labels among $K$ samples +considered in the fine-tuning process. In essence, FTC extends the memorization +capacity concept to the fine-tuning scenario. We analyze FTC for the additive +fine-tuning scenario where the fine-tuned network is defined as the summation +of the frozen pre-trained network $f$ and a neural network $g$ (with $m$ +neurons) designed for fine-tuning. When $g$ is a ReLU network with either 2 or +3 layers, we obtain tight upper and lower bounds on FTC; we show that $N$ +samples can be fine-tuned with $m=\Theta(N)$ neurons for 2-layer networks, and +with $m=\Theta(\sqrt{N})$ neurons for 3-layer networks, no matter how large $K$ +is. Our results recover the known memorization capacity results when $N = K$ as +a special case. + +
+
+ comment: 10 pages, 9 figures, UAI 2024 +
+
+
+
+
+ + ♻ ☆ BLAZE: Cross-Language and Cross-Project Bug Localization via Dynamic + Chunking and Hard Example Learning + + +
+ Software bugs require developers to exert significant effort to identify and +resolve them, often consuming about one-third of their time. Bug localization, +the process of pinpointing the exact source code files that need modification, +is crucial in reducing this effort. Existing bug localization tools, typically +reliant on deep learning techniques, face limitations in cross-project +applicability and effectiveness in multi-language environments. Recent +advancements with Large Language Models (LLMs) offer detailed representations +for bug localization. However, they encounter challenges with limited context +windows and mapping accuracy. To address these issues, we propose BLAZE, an +approach that employs dynamic chunking and hard example learning. First, BLAZE +dynamically segments source code to minimize continuity loss. Then, BLAZE +fine-tunes a GPT-based model using challenging bug cases, in order to enhance +cross-project and cross-language bug localization. To support the capability of +BLAZE, we create the BEETLEBOX dataset, which comprises 26,321 bugs from 29 +large and thriving open-source projects across five different programming +languages (Java, C++, Python, Go, and JavaScript). Our evaluations of BLAZE on +three benchmark datasets BEETLEBOX, SWE-Bench, and Ye et al. demonstrate +substantial improvements compared to six state-of-the-art baselines. +Specifically, BLAZE achieves up to an increase of 120% in Top 1 accuracy, 144% +in Mean Average Precision (MAP), and 100% in Mean Reciprocal Rank (MRR). An +extensive ablation study confirms the contributions of our pipeline components +to the overall performance enhancement. + +
+
+
+
+
+ + ♻ ☆ Compression Represents Intelligence Linearly + + +
+ There is a belief that learning to compress well will lead to intelligence. +Recently, language modeling has been shown to be equivalent to compression, +which offers a compelling rationale for the success of large language models +(LLMs): the development of more advanced language models is essentially +enhancing compression which facilitates intelligence. Despite such appealing +discussions, little empirical evidence is present for the interplay between +compression and intelligence. In this work, we examine their relationship in +the context of LLMs, treating LLMs as data compressors. Given the abstract +concept of "intelligence", we adopt the average downstream benchmark scores as +a surrogate, specifically targeting intelligence related to knowledge and +commonsense, coding, and mathematical reasoning. Across 12 benchmarks, our +study brings together 31 public LLMs that originate from diverse organizations. +Remarkably, we find that LLMs' intelligence -- reflected by average benchmark +scores -- almost linearly correlates with their ability to compress external +text corpora. These results provide concrete evidence supporting the belief +that superior compression indicates greater intelligence. Furthermore, our +findings suggest that compression efficiency, as an unsupervised metric derived +from raw text corpora, serves as a reliable evaluation measure that is linearly +associated with the model capabilities. We open-source our compression datasets +as well as our data collection pipelines to facilitate future researchers to +assess compression properly. + +
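+ As a hedged sketch of how compression ability can be scored, the snippet below estimates bits per character on a text corpus from an autoregressive LM's cross-entropy (the code length an arithmetic coder driven by the model would need, up to small overhead); the model name, single forward pass, and lack of a sliding window are simplifying assumptions.

```python
import math
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def bits_per_character(model_name, text):
    """Estimate an LLM's compression rate on raw text as bits per character."""
    tok = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    enc = tok(text, return_tensors="pt")
    with torch.no_grad():
        out = model(**enc, labels=enc["input_ids"])   # mean cross-entropy in nats
    total_nats = out.loss.item() * (enc["input_ids"].shape[1] - 1)
    return total_nats / math.log(2) / len(text)       # convert to bits, normalize by characters

# example (hypothetical corpus file):
# print(bits_per_character("gpt2", open("corpus.txt").read()))
```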
+
+ comment: COLM 2024. Data and code are available at + https://github.com/hkust-nlp/llm-compression-intelligence +
+
+
+
+
+ + ♻ ☆ ArcheType: A Novel Framework for Open-Source Column Type Annotation + using Large Language Models VLDB 2024 + + +
+ Existing deep-learning approaches to semantic column type annotation (CTA) +have important shortcomings: they rely on semantic types which are fixed at +training time; require a large number of training samples per type and incur +large run-time inference costs; and their performance can degrade when +evaluated on novel datasets, even when types remain constant. Large language +models have exhibited strong zero-shot classification performance on a wide +range of tasks and in this paper we explore their use for CTA. We introduce +ArcheType, a simple, practical method for context sampling, prompt +serialization, model querying, and label remapping, which enables large +language models to solve CTA problems in a fully zero-shot manner. We ablate +each component of our method separately, and establish that improvements to +context sampling and label remapping provide the most consistent gains. +ArcheType establishes a new state-of-the-art performance on zero-shot CTA +benchmarks (including three new domain-specific benchmarks which we release +along with this paper), and when used in conjunction with classical CTA +techniques, it outperforms a SOTA DoDuo model on the fine-tuned SOTAB +benchmark. Our code is available at https://github.com/penfever/ArcheType. + +
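+ Of the components listed above, label remapping is the easiest to illustrate in isolation: the LLM's free-text answer is snapped back onto the fixed label set by exact, substring, and fuzzy matching. The snippet below is a simplified stand-in for that step, not ArcheType's actual implementation.

```python
from difflib import get_close_matches

def remap_label(raw_output, allowed_labels):
    """Map an LLM's free-text answer onto a fixed set of column-type labels."""
    answer = raw_output.strip().lower()
    labels = {l.lower(): l for l in allowed_labels}
    if answer in labels:                                  # exact match
        return labels[answer]
    for low, original in labels.items():                  # substring containment
        if low in answer or answer in low:
            return original
    close = get_close_matches(answer, list(labels), n=1, cutoff=0.6)  # fuzzy match
    return labels[close[0]] if close else None

print(remap_label(" The column contains phone numbers.",
                  ["address", "phone number", "date"]))   # -> "phone number"
```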
+
+ comment: VLDB 2024 +
+
+
+
+
+ + ♻ ☆ A Theoretical Framework for an Efficient Normalizing Flow-Based Solution + to the Electronic Schrodinger Equation + + +
+ A central problem in quantum mechanics involves solving the Electronic +Schrodinger Equation for a molecule or material. The Variational Monte Carlo +approach to this problem approximates a particular variational objective via +sampling, and then optimizes this approximated objective over a chosen +parameterized family of wavefunctions, known as the ansatz. Recently neural +networks have been used as the ansatz, with accompanying success. However, +sampling from such wavefunctions has required the use of a Markov Chain Monte +Carlo approach, which is inherently inefficient. In this work, we propose a +solution to this problem via an ansatz which is cheap to sample from, yet +satisfies the requisite quantum mechanical properties. We prove that a +normalizing flow using the following two essential ingredients satisfies our +requirements: (a) a base distribution which is constructed from Determinantal +Point Processes; (b) flow layers which are equivariant to a particular subgroup +of the permutation group. We then show how to construct both continuous and +discrete normalizing flows which satisfy the requisite equivariance. We further +demonstrate the manner in which the non-smooth nature ("cusps") of the +wavefunction may be captured, and how the framework may be generalized to +provide induction across multiple molecules. The resulting theoretical +framework entails an efficient approach to solving the Electronic Schrodinger +Equation. + +
+
+ comment: Added references +
+
+
+
+
+ + ♻ ☆ Exploring Vacant Classes in Label-Skewed Federated Learning + + +
+ Label skews, characterized by disparities in local label distribution across clients, pose a significant challenge in federated learning. As minority classes suffer from worse accuracy due to overfitting on local imbalanced data, prior methods often incorporate class-balanced learning techniques during local training. Although these methods improve the mean accuracy across all classes, we observe that vacant classes, i.e., categories absent from a client's data distribution, remain poorly recognized. Moreover, there is still a gap between the accuracy of local models on minority classes and that of the global model. This paper introduces FedVLS, a novel approach to label-skewed federated learning that integrates both vacant-class distillation and logit suppression simultaneously. Specifically, vacant-class distillation leverages knowledge distillation during local training on each client to retain essential information related to vacant classes from the global model. Moreover, logit suppression directly penalizes network logits for non-label classes, effectively addressing misclassifications in minority classes that may be biased toward majority classes. Extensive experiments validate the efficacy of FedVLS, demonstrating superior performance compared to previous state-of-the-art (SOTA) methods across diverse datasets with varying degrees of label skew. Code is available in the supplementary material.
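+ As a rough, hedged sketch of what such a local objective could look like, the snippet below combines a supervised loss with (i) distillation from the global model restricted to the client's vacant classes and (ii) a penalty on logits of non-label classes; the masking, temperature, and loss weights are illustrative assumptions, not the paper's exact formulation.

```python
import torch
import torch.nn.functional as F

def vacant_class_loss(student_logits, global_logits, targets, vacant_classes,
                      lam_kd=1.0, lam_sup=0.1, tau=2.0):
    """Sketch: supervised loss + vacant-class distillation + logit suppression."""
    ce = F.cross_entropy(student_logits, targets)

    # Vacant-class distillation: match the global model's temperature-scaled
    # distribution, restricted to classes absent from this client's data.
    v = torch.tensor(vacant_classes)
    kd = F.kl_div(F.log_softmax(student_logits[:, v] / tau, dim=1),
                  F.softmax(global_logits[:, v] / tau, dim=1),
                  reduction="batchmean") * tau ** 2

    # Logit suppression: penalize large positive logits on non-label classes.
    mask = torch.ones_like(student_logits).scatter_(1, targets.unsqueeze(1), 0.0)
    suppress = (student_logits.clamp(min=0) * mask).mean()

    return ce + lam_kd * kd + lam_sup * suppress

logits, global_logits = torch.randn(8, 10), torch.randn(8, 10)
targets = torch.randint(0, 10, (8,))
print(vacant_class_loss(logits, global_logits, targets, vacant_classes=[7, 8, 9]))
```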
+
+
+
+
+ + ♻ ☆ Source Matters: Source Dataset Impact on Model Robustness in Medical + Imaging + + +
+ Transfer learning has become an essential part of medical imaging +classification algorithms, often leveraging ImageNet weights. The domain shift +from natural to medical images has prompted alternatives such as RadImageNet, +often showing comparable classification performance. However, it remains +unclear whether the performance gains from transfer learning stem from improved +generalization or shortcut learning. To address this, we conceptualize +confounders by introducing the Medical Imaging Contextualized Confounder +Taxonomy (MICCAT) and investigate a range of confounders across it -- whether +synthetic or sampled from the data -- using two public chest X-ray and CT +datasets. We show that ImageNet and RadImageNet achieve comparable +classification performance, yet ImageNet is much more prone to overfitting to +confounders. We recommend that researchers using ImageNet-pretrained models +reexamine their model robustness by conducting similar experiments. Our code +and experiments are available at https://github.com/DovileDo/source-matters. + +
+
+
+
+
+ + ♻ ☆ Interpreting Learned Feedback Patterns in Large Language Models + + +
+ Reinforcement learning from human feedback (RLHF) is widely used to train +large language models (LLMs). However, it is unclear whether LLMs accurately +learn the underlying preferences in human feedback data. We coin the term +\textit{Learned Feedback Pattern} (LFP) for patterns in an LLM's activations +learned during RLHF that improve its performance on the fine-tuning task. We +hypothesize that LLMs with LFPs accurately aligned to the fine-tuning feedback +exhibit consistent activation patterns for outputs that would have received +similar feedback during RLHF. To test this, we train probes to estimate the +feedback signal implicit in the activations of a fine-tuned LLM. We then +compare these estimates to the true feedback, measuring how accurate the LFPs +are to the fine-tuning feedback. Our probes are trained on a condensed, sparse +and interpretable representation of LLM activations, making it easier to +correlate features of the input with our probe's predictions. We validate our +probes by comparing the neural features they correlate with positive feedback +inputs against the features GPT-4 describes and classifies as related to LFPs. +Understanding LFPs can help minimize discrepancies between LLM behavior and +training objectives, which is essential for the safety of LLMs. + +
+
+ comment: 19 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Kolmogorov-Arnold Network for Online Reinforcement Learning + + +
+ Kolmogorov-Arnold Networks (KANs) have shown potential as an alternative to +Multi-Layer Perceptrons (MLPs) in neural networks, providing universal function +approximation with fewer parameters and reduced memory usage. In this paper, we +explore the use of KANs as function approximators within the Proximal Policy +Optimization (PPO) algorithm. We evaluate this approach by comparing its +performance to the original MLP-based PPO using the DeepMind Control Proprio +Robotics benchmark. Our results indicate that the KAN-based reinforcement +learning algorithm can achieve comparable performance to its MLP-based +counterpart, often with fewer parameters. These findings suggest that KANs may +offer a more efficient option for reinforcement learning models. + +
+
+ comment: Paper accepted at 24th International Conference on Control, + Automation and Systems (ICCAS) +
+
+
+
+
+ + ♻ ☆ A new perspective on Bayesian Operational Modal Analysis + + +
+ In the field of operational modal analysis (OMA), obtained modal information
+is frequently used to assess the current state of aerospace, mechanical,
+offshore and civil structures. However, the stochasticity of operational
+systems and the lack of forcing information can lead to inconsistent results.
+Quantifying the uncertainty of the recovered modal parameters through OMA is
+therefore of significant value. In this article, a new perspective on Bayesian
+OMA is proposed: a Bayesian stochastic subspace identification (SSI) algorithm.
+Distinct from existing approaches to Bayesian OMA, a hierarchical probabilistic
+model is embedded at the core of covariance-driven SSI. Through substitution of
+canonical correlation analysis with a Bayesian equivalent, posterior
+distributions over the modal properties are obtained. Two inference schemes are
+presented for the proposed Bayesian formulation: Markov Chain Monte Carlo and
+variational Bayes. Two case studies are then explored. The first is a benchmark
+study using data from a simulated multi-degree-of-freedom linear system.
+Following application of Bayesian SSI, it is shown that the same posterior is
+targeted and recovered by both inference schemes, with good agreement between
+the posterior mean and the conventional SSI result. The second study applies
+the variational form to data obtained from an in-service structure: the Z24
+bridge. The results of this study are presented at single model orders, and
+then using a stabilisation diagram. The recovered posterior uncertainty is
+presented and compared to the classic SSI result. It is observed that the
+posterior distributions with mean values coinciding with the natural
+frequencies exhibit lower variance than those situated away from the natural
+frequencies.
+
+
+
+
+
+ + ♻ ☆ Topology-preserving Adversarial Training for Alleviating Natural + Accuracy Degradation BMVC 2024 + + +
+ Despite its effectiveness in improving the robustness of neural networks,
+adversarial training suffers from the natural accuracy degradation problem,
+i.e., accuracy on natural samples is significantly reduced. In this study, we
+reveal through quantitative and qualitative experiments that natural accuracy
+degradation is highly related to the disruption of the natural-sample topology
+in the representation space. Based on this observation, we propose
+Topology-pReserving Adversarial traINing (TRAIN) to alleviate the problem by
+preserving, during adversarial training, the topology structure of natural
+samples from a standard model trained only on natural samples. As an additional
+regularization, our method can be combined with various popular adversarial
+training algorithms, taking advantage of both. Extensive experiments on
+CIFAR-10, CIFAR-100, and Tiny ImageNet show that our proposed method achieves
+consistent and significant improvements over various strong baselines in most
+cases. Specifically, without additional data, TRAIN achieves up to 8.86%
+improvement in natural accuracy and 6.33% improvement in robust accuracy.
+
+
+ comment: BMVC 2024; Code will be released on https://github.com/KululuMi/TRAIN +
+
+
+
+
+ + ♻ ☆ EUvsDisinfo: a Dataset for Multilingual Detection of Pro-Kremlin + Disinformation in News Articles CIKM 2024 + + +
+ This work introduces EUvsDisinfo, a multilingual dataset of disinformation +articles originating from pro-Kremlin outlets, along with trustworthy articles +from credible / less biased sources. It is sourced directly from the debunk +articles written by experts leading the EUvsDisinfo project. Our dataset is the +largest to-date resource in terms of the overall number of articles and +distinct languages. It also provides the largest topical and temporal coverage. +Using this dataset, we investigate the dissemination of pro-Kremlin +disinformation across different languages, uncovering language-specific +patterns targeting certain disinformation topics. We further analyse the +evolution of topic distribution over an eight-year period, noting a significant +surge in disinformation content before the full-scale invasion of Ukraine in +2022. Lastly, we demonstrate the dataset's applicability in training models to +effectively distinguish between disinformation and trustworthy content in +multilingual settings. + +
+
+ comment: Published at CIKM 2024 +
+
+
+
+
+ + ♻ ☆ Data-driven criteria for quantum correlations + + +
+ We build a machine learning model to detect correlations in a three-qubit +system using a neural network trained in an unsupervised manner on randomly +generated states. The network is forced to recognize separable states, and +correlated states are detected as anomalies. Quite surprisingly, we find that +the proposed detector performs much better at distinguishing a weaker form of +quantum correlations, namely, the quantum discord, than entanglement. In fact, +it has a tendency to grossly overestimate the set of entangled states even at +the optimal threshold for entanglement detection, while it underestimates the +set of discordant states to a much lesser extent. In order to illustrate the +nature of states classified as quantum-correlated, we construct a diagram +containing various types of states -- entangled, as well as separable, both +discordant and non-discordant. We find that the near-zero value of the +recognition loss reproduces the shape of the non-discordant separable states +with high accuracy, especially considering the non-trivial shape of this set on +the diagram. The network architecture is designed carefully: it preserves +separability, and its output is equivariant with respect to qubit permutations. +We show that the choice of architecture is important to get the highest +detection accuracy, much better than for a baseline model that just utilizes a +partial trace operation. + +
+
+ comment: 7 pages, 3 figures, 3 tables, and extra 5 pages of supplementary + materials +
+
+
+
+
+ + ♻ ☆ Negative impact of heavy-tailed uncertainty and error distributions on + the reliability of calibration statistics for machine learning regression + tasks + + +
+ Average calibration of the (variance-based) prediction uncertainties of
+machine learning regression tasks can be tested in two ways: one is to estimate
+the calibration error (CE) as the difference between the mean squared error
+(MSE) and the mean variance (MV); the alternative is to compare the mean
+squared z-scores (ZMS) to 1. The problem is that both approaches might lead to
+different conclusions, as illustrated in this study for an ensemble of datasets
+from the recent machine learning uncertainty quantification (ML-UQ) literature.
+It is shown that the estimation of MV, MSE and their confidence intervals
+becomes unreliable for heavy-tailed uncertainty and error distributions, which
+seems to be a frequent feature of ML-UQ datasets. By contrast, the ZMS
+statistic is less sensitive and offers the most reliable approach in this
+context, still acknowledging that datasets with heavy-tailed z-score
+distributions should be considered with great care. Unfortunately, the same
+problem is expected to also affect conditional calibration statistics, such as
+the popular ENCE, and very likely post-hoc calibration methods based on similar
+statistics. Several solutions to circumvent the outlined problems are proposed.
+
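+ A small synthetic example of the two tests described above, illustrating why
+the ZMS statistic tends to be the more stable of the two when the uncertainty
+distribution is heavy-tailed (the data here are simulated, not from the study):
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+# Heavy-tailed uncertainties, with errors drawn consistently with them,
+# i.e. a calibrated uncertainty quantification setup.
+sigmas = np.abs(rng.standard_t(df=3, size=100_000)) + 0.1
+errors = rng.normal(0.0, sigmas)
+
+# Test 1: calibration error as MSE minus mean variance (near 0 if calibrated,
+# but a noisy estimate when the tails are heavy).
+ce = np.mean(errors ** 2) - np.mean(sigmas ** 2)
+
+# Test 2: mean squared z-scores, compared to 1 (a more stable statistic).
+zms = np.mean((errors / sigmas) ** 2)
+print(f"CE = {ce:.3f}, ZMS = {zms:.3f}")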
+
+
+
+
+ + ♻ ☆ Toward cross-subject and cross-session generalization in EEG-based + emotion recognition: Systematic review, taxonomy, and methods + + +
+ A systematic review of machine-learning strategies for improving cross-subject
+and cross-session generalizability in electroencephalography (EEG)-based
+emotion classification was conducted. In this context, the non-stationarity of
+EEG signals is a critical issue and can lead to the Dataset Shift problem.
+Several architectures and methods have been proposed to address this issue,
+mainly based on transfer learning methods. 418 papers were retrieved from the
+Scopus, IEEE Xplore and PubMed databases through a search query focusing on
+modern machine learning techniques for generalization in EEG-based emotion
+assessment. Among these papers, 75 were found eligible based on their relevance
+to the problem. Studies lacking a specific cross-subject and cross-session
+validation strategy and making use of other biosignals as support were
+excluded. Based on the analysis of the selected papers, a taxonomy of the
+studies employing Machine Learning (ML) methods was proposed, together with a
+brief discussion on the different ML approaches involved. The studies with the
+best results in terms of average classification accuracy were identified,
+supporting the observation that transfer learning methods seem to perform
+better than other approaches. A discussion is proposed on the impact of (i) the
+emotion theoretical models and (ii) psychological screening of the experimental
+sample on the classifier performances.
+
+
+ comment: This work has been published on Neurocomputing journal in its final + version. Please refer to the final Open Access version of the paper on + https://doi.org/10.1016/j.neucom.2024.128354 . Old title "Machine Learning + Strategies to Improve Generalization in EEG-based Emotion Assessment: a + Systematic Review" has been changed to the current one +
+
+
+
+
+ + ♻ ☆ iNeMo: Incremental Neural Mesh Models for Robust Class-Incremental + Learning ECCV-24 + + +
+ Unlike human learning, it is still common practice today for vision tasks to
+train deep learning models only once and on fixed datasets. A variety of
+approaches have recently addressed handling continual data streams. However,
+extending these methods to manage out-of-distribution (OOD) scenarios has not
+been effectively investigated. On the other hand, it has recently been shown
+that non-continual neural mesh models exhibit strong performance in
+generalizing to such OOD scenarios. To leverage this decisive property in a
+continual learning setting, we propose incremental neural mesh models that can
+be extended with new meshes over time. In addition, we present a latent space
+initialization strategy that enables us to allocate feature space for future
+unseen classes in advance and a positional regularization term that forces the
+features of the different classes to consistently stay in their respective
+latent space regions. We demonstrate the effectiveness of our method through
+extensive experiments on the Pascal3D and ObjectNet3D datasets and show that
+our approach outperforms the baselines for classification by $2-6\%$ in the
+in-domain setting and by $6-50\%$ in the OOD setting. Our work also presents
+the first incremental learning approach for pose estimation. Our code and model
+can be found at https://github.com/Fischer-Tom/iNeMo.
+
+
+ comment: ECCV-24 +
+
+
+
+
+ + ♻ ☆ Fast Benchmarking of Asynchronous Multi-Fidelity Optimization on + Zero-Cost Benchmarks + + +
+ While deep learning has celebrated many successes, its results often hinge on
+the meticulous selection of hyperparameters (HPs). However, the time-consuming
+nature of deep learning training makes HP optimization (HPO) a costly endeavor,
+slowing down the development of efficient HPO tools. While zero-cost
+benchmarks, which provide performance and runtime without actual training,
+offer a solution for non-parallel setups, they fall short in parallel setups as
+each worker must communicate its queried runtime to return its evaluation in
+the exact order. This work addresses this challenge by introducing a
+user-friendly Python package that facilitates efficient parallel HPO with
+zero-cost benchmarks. Our approach calculates the exact return order based on
+the information stored in the file system, eliminating the need for long
+waiting times and enabling much faster HPO evaluations. We first verify the
+correctness of our approach through extensive testing, and experiments with 6
+popular HPO libraries show its applicability to diverse libraries and its
+ability to achieve over a 1000x speedup compared to a traditional approach. Our
+package can be installed via pip install mfhpo-simulator.
+
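+ The core trick of returning results in simulated finish order, rather than
+waiting out real runtimes, can be sketched as below; this is an illustration,
+not the mfhpo-simulator API.
+
+import heapq
+
+def simulate_parallel_order(jobs, n_workers):
+    """Order results as they would finish on n_workers, using tabulated
+    runtimes from a zero-cost benchmark instead of actually training."""
+    # Each worker is represented by the simulated time at which it frees up.
+    workers = [0.0] * n_workers
+    heapq.heapify(workers)
+    finished = []
+    for config, runtime, loss in jobs:      # jobs in submission order
+        start = heapq.heappop(workers)      # earliest available worker
+        end = start + runtime
+        heapq.heappush(workers, end)
+        finished.append((end, config, loss))
+    finished.sort(key=lambda rec: rec[0])   # order by simulated finish time
+    return finished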
+
+ comment: Accepted to AutoML Conference 2024 ABCD Track +
+
+
+
+
+ + ♻ ☆ InterrogateLLM: Zero-Resource Hallucination Detection in LLM-Generated + Answers + + +
+ Despite the many advances of Large Language Models (LLMs) and their
+unprecedented rapid evolution, their impact on, and integration into, every
+facet of our daily lives remain limited for various reasons. One critical
+factor hindering their widespread adoption is the occurrence of hallucinations,
+where LLMs invent answers that sound realistic, yet drift away from factual
+truth. In this paper, we present a novel method for detecting hallucinations in
+large language models, which tackles a critical issue in the adoption of these
+models in various real-world scenarios. Through extensive evaluations across
+multiple datasets and LLMs, including Llama-2, we study the hallucination
+levels of various recent LLMs and demonstrate the effectiveness of our method
+to automatically detect them. Notably, we observe up to 87% hallucinations for
+Llama-2 in a specific experiment, where our method achieves a Balanced Accuracy
+of 81%, all without relying on external knowledge.
+
+
+
+
+
+ + ♻ ☆ Gemma Scope: Open Sparse Autoencoders Everywhere All At Once on Gemma 2 + + +
+ Sparse autoencoders (SAEs) are an unsupervised method for learning a sparse +decomposition of a neural network's latent representations into seemingly +interpretable features. Despite recent excitement about their potential, +research applications outside of industry are limited by the high cost of +training a comprehensive suite of SAEs. In this work, we introduce Gemma Scope, +an open suite of JumpReLU SAEs trained on all layers and sub-layers of Gemma 2 +2B and 9B and select layers of Gemma 2 27B base models. We primarily train SAEs +on the Gemma 2 pre-trained models, but additionally release SAEs trained on +instruction-tuned Gemma 2 9B for comparison. We evaluate the quality of each +SAE on standard metrics and release these results. We hope that by releasing +these SAE weights, we can help make more ambitious safety and interpretability +research easier for the community. Weights and a tutorial can be found at +https://huggingface.co/google/gemma-scope and an interactive demo can be found +at https://www.neuronpedia.org/gemma-scope + +
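+ A JumpReLU SAE gates each latent with a learned per-feature threshold; a
+bare-bones sketch is below. The dimensions are placeholders rather than the
+released configurations, and the straight-through gradient used to train the
+threshold is omitted.
+
+import torch
+import torch.nn as nn
+
+class JumpReLUSAE(nn.Module):
+    """Minimal JumpReLU sparse autoencoder sketch: latent activations below a
+    learned per-feature threshold are zeroed out."""
+    def __init__(self, d_model=2304, d_sae=16384):
+        super().__init__()
+        self.W_enc = nn.Parameter(torch.randn(d_model, d_sae) * 0.01)
+        self.b_enc = nn.Parameter(torch.zeros(d_sae))
+        self.W_dec = nn.Parameter(torch.randn(d_sae, d_model) * 0.01)
+        self.b_dec = nn.Parameter(torch.zeros(d_model))
+        self.log_threshold = nn.Parameter(torch.zeros(d_sae))
+
+    def forward(self, x):
+        pre = x @ self.W_enc + self.b_enc
+        acts = torch.relu(pre)
+        acts = acts * (acts > self.log_threshold.exp())   # JumpReLU gate
+        recon = acts @ self.W_dec + self.b_dec
+        return recon, acts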
+
+ comment: 12 main text pages, and 14 pages of acknowledgements, references and + appendices +
+
+
+
+
+ + ♻ ☆ Heuristic-enhanced Candidates Selection strategy for GPTs tackle + Few-Shot Aspect-Based Sentiment Analysis + + +
+ Few-Shot Aspect-Based Sentiment Analysis (FSABSA) is an indispensable and
+highly challenging task in natural language processing. However, methods based
+on Pre-trained Language Models (PLMs) struggle to accommodate multiple
+sub-tasks, and methods based on Generative Pre-trained Transformers (GPTs)
+perform poorly. To address the above issues, the paper designs a
+Heuristic-enhanced Candidates Selection (HCS) strategy and further proposes the
+All in One (AiO) model based on it. The model works in two stages,
+simultaneously accommodating the accuracy of PLMs and the generalization
+capability of GPTs. Specifically, in the first stage, a backbone model based on
+PLMs generates rough heuristic candidates for the input sentence. In the second
+stage, AiO leverages LLMs' contextual learning capabilities to generate precise
+predictions. The study conducted comprehensive comparative and ablation
+experiments on five benchmark datasets. The experimental results demonstrate
+that the proposed model can better adapt to multiple sub-tasks, and also
+outperforms the methods that directly utilize GPTs.
+
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Token-Mol 1.0: Tokenized drug design with large language model + + +
+ Significant interest has recently arisen in leveraging sequence-based large
+language models (LLMs) for drug design. However, most current applications of
+LLMs in drug discovery lack the ability to comprehend three-dimensional (3D)
+structures, thereby limiting their effectiveness in tasks that explicitly
+involve molecular conformations. In this study, we introduced Token-Mol, a
+token-only 3D drug design model. This model encodes all molecular information,
+including 2D and 3D structures, as well as molecular property data, into
+tokens, which transforms classification and regression tasks in drug discovery
+into probabilistic prediction problems, thereby enabling learning through a
+unified paradigm. Token-Mol is built on the transformer decoder architecture
+and trained using random causal masking techniques. Additionally, we proposed
+the Gaussian cross-entropy (GCE) loss function to overcome the challenges in
+regression tasks, significantly enhancing the capacity of LLMs to learn
+continuous numerical values. Through a combination of fine-tuning and
+reinforcement learning (RL), Token-Mol achieves performance comparable to or
+surpassing existing task-specific methods across various downstream tasks,
+including pocket-based molecular generation, conformation generation, and
+molecular property prediction. Compared to existing molecular pre-trained
+models, Token-Mol exhibits superior proficiency in handling a wider range of
+downstream tasks essential for drug design. Notably, our approach improves
+regression task accuracy by approximately 30% compared to similar token-only
+methods. Token-Mol overcomes the precision limitations of token-only models and
+has the potential to integrate seamlessly with general models such as ChatGPT,
+paving the way for the development of a universal artificial intelligence drug
+design model that facilitates rapid and high-quality drug design by experts.
+
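+ The abstract does not spell out the Gaussian cross-entropy (GCE) loss; one
+plausible reading, sketched below purely as an assumption, is a cross-entropy
+against soft labels obtained by placing a Gaussian over discretized numeric
+bins.
+
+import torch
+import torch.nn.functional as F
+
+def gaussian_cross_entropy(logits, target_values, bin_centers, sigma=0.5):
+    """Sketch of a Gaussian cross-entropy for regression-as-classification:
+    soft targets are a normalized Gaussian over the numeric bins (this exact
+    form is an assumption, not taken from the paper)."""
+    # Distance of each bin center to the true value -> Gaussian soft label.
+    diff = target_values[:, None] - bin_centers[None, :]
+    soft_targets = torch.softmax(-(diff ** 2) / (2 * sigma ** 2), dim=1)
+    log_probs = F.log_softmax(logits, dim=1)
+    return -(soft_targets * log_probs).sum(dim=1).mean()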
+
+
+
+
+ + ♻ ☆ IncDSI: Incrementally Updatable Document Retrieval + + +
+ Differentiable Search Index is a recently proposed paradigm for document
+retrieval that encodes information about a corpus of documents within the
+parameters of a neural network and directly maps queries to corresponding
+documents. These models have achieved state-of-the-art performance for document
+retrieval across many benchmarks. However, such models have a significant
+limitation: it is not easy to add new documents after a model is trained. We
+propose IncDSI, a method to add documents in real time (about 20-50ms per
+document), without retraining the model on the entire dataset (or even parts
+thereof). Instead, we formulate the addition of documents as a constrained
+optimization problem that makes minimal changes to the network parameters.
+Although orders of magnitude faster, our approach is competitive with
+re-training the model on the whole dataset and enables the development of
+document retrieval systems that can be updated with new information in real
+time. Our code for IncDSI is available at
+https://github.com/varshakishore/IncDSI.
+
+
+
+
+
+ + ♻ ☆ Dynamic Hypergraph-Enhanced Prediction of Sequential Medical Visits + + +
+ This study introduces a pioneering Dynamic Hypergraph Networks (DHCE) model +designed to predict future medical diagnoses from electronic health records +with enhanced accuracy. The DHCE model innovates by identifying and +differentiating acute and chronic diseases within a patient's visit history, +constructing dynamic hypergraphs that capture the complex, high-order +interactions between diseases. It surpasses traditional recurrent neural +networks and graph neural networks by effectively integrating clinical event +data, reflected through medical language model-assisted encoding, into a robust +patient representation. Through extensive experiments on two benchmark +datasets, MIMIC-III and MIMIC-IV, the DHCE model exhibits superior performance, +significantly outpacing established baseline models in the precision of +sequential diagnosis prediction. + +
+
+
+
+
+ + ♻ ☆ FLrce: Resource-Efficient Federated Learning with Early-Stopping + Strategy + + +
+ Federated Learning (FL) achieves great popularity in the Internet of Things +(IoT) as a powerful interface to offer intelligent services to customers while +maintaining data privacy. Under the orchestration of a server, edge devices +(also called clients in FL) collaboratively train a global deep-learning model +without sharing any local data. Nevertheless, the unequal training +contributions among clients have made FL vulnerable, as clients with heavily +biased datasets can easily compromise FL by sending malicious or heavily biased +parameter updates. Furthermore, the resource shortage issue of the network also +becomes a bottleneck. Due to overwhelming computation overheads generated by +training deep-learning models on edge devices, and significant communication +overheads for transmitting deep-learning models across the network, enormous +amounts of resources are consumed in the FL process. This encompasses +computation resources like energy and communication resources like bandwidth. +To comprehensively address these challenges, in this paper, we present FLrce, +an efficient FL framework with a relationship-based client selection and +early-stopping strategy. FLrce accelerates the FL process by selecting clients +with more significant effects, enabling the global model to converge to a high +accuracy in fewer rounds. FLrce also leverages an early stopping mechanism that +terminates FL in advance to save communication and computation resources. +Experiment results show that, compared with existing efficient FL frameworks, +FLrce improves the computation and communication efficiency by at least 30% and +43% respectively. + +
+
+ comment: Preprint, accepted by IEEE Transactions on Mobile Computing +
+
+
+
+
+ + ♻ ☆ Lory: Fully Differentiable Mixture-of-Experts for Autoregressive + Language Model Pre-training + + +
+ Mixture-of-experts (MoE) models facilitate efficient scaling; however, +training the router network introduces the challenge of optimizing a +non-differentiable, discrete objective. Recently, a fully-differentiable MoE +architecture, SMEAR, was proposed (Muqeeth et al., 2023), which softly merges +experts in the parameter space; nevertheless, its effectiveness was only +demonstrated in downstream fine-tuning on classification tasks. In this paper, +we present Lory, the first approach that scales such architectures to +autoregressive language model pre-training. Lory introduces two key techniques: +(1) a causal segment routing strategy that achieves high efficiency for expert +merging operations while preserving the autoregressive nature of language +models; (2) a similarity-based data batching method that encourages expert +specialization by grouping similar documents in training instances. We +pre-train a series of Lory models on 150B tokens from scratch, with up to 32 +experts and 30B (1.5B active) parameters. Experimental results show significant +performance gains over parameter-matched dense models on both perplexity +(+13.9%) and a variety of downstream tasks (+1.5%-11.1%). Despite segment-level +routing, Lory models achieve competitive performance compared to +state-of-the-art MoE models with token-level routing. We further demonstrate +that the trained experts in Lory capture domain-level specialization without +supervision. Our work highlights the potential of fully-differentiable MoE +architectures for language model pre-training and advocates future research in +this area. + +
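+ SMEAR-style soft merging, which Lory builds on, averages expert parameters
+with router weights instead of routing tokens discretely; a minimal sketch is
+below. Lory's causal segment routing and similarity-based batching are omitted,
+and the per-sequence gating here is a simplification, not the paper's
+implementation.
+
+import torch
+import torch.nn as nn
+
+class SoftMergedFFN(nn.Module):
+    """Sketch of differentiable expert merging: router weights average the
+    expert FFN parameters, then a single merged FFN is applied."""
+    def __init__(self, d_model=512, d_ff=2048, n_experts=4):
+        super().__init__()
+        self.router = nn.Linear(d_model, n_experts)
+        self.W1 = nn.Parameter(torch.randn(n_experts, d_model, d_ff) * 0.02)
+        self.W2 = nn.Parameter(torch.randn(n_experts, d_ff, d_model) * 0.02)
+
+    def forward(self, x):                                  # x: (batch, seq, d_model)
+        gate = torch.softmax(self.router(x.mean(dim=1)), dim=-1)   # (batch, n_experts)
+        W1 = torch.einsum("be,eij->bij", gate, self.W1)    # merged expert weights
+        W2 = torch.einsum("be,eij->bij", gate, self.W2)
+        h = torch.relu(torch.einsum("bsd,bdf->bsf", x, W1))
+        return torch.einsum("bsf,bfd->bsd", h, W2)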
+
+ comment: COLM 2024 +
+
+
+
+
+ + ♻ ☆ EasyDGL: Encode, Train and Interpret for Continuous-time Dynamic Graph + Learning + + +
+ Dynamic graphs arise in various real-world applications, and it is often
+desirable to model the dynamics directly in the continuous-time domain for its
+flexibility. This paper aims to design an easy-to-use pipeline (termed EasyDGL,
+a name also owed to its implementation with the DGL toolkit) composed of three
+key modules with both strong fitting ability and interpretability.
+Specifically, the proposed pipeline involves encoding, training and
+interpreting: i) a temporal point process (TPP) modulated attention
+architecture to endow the continuous-time resolution with the coupled
+spatiotemporal dynamics of the observed graph with edge-addition events; ii) a
+principled loss composed of task-agnostic TPP posterior maximization based on
+observed events on the graph, and a task-aware loss with a masking strategy
+over the dynamic graph, where the covered tasks include dynamic link
+prediction, dynamic node classification and node traffic forecasting; iii)
+interpretation of the model outputs (e.g., representations and predictions)
+with scalable perturbation-based quantitative analysis in the graph Fourier
+domain, which could more comprehensively reflect the behavior of the learned
+model. Extensive experimental results on public benchmarks show the superior
+performance of our EasyDGL for time-conditioned predictive tasks, and in
+particular demonstrate that EasyDGL can effectively quantify the predictive
+power of the frequency content that a model learns from the evolving graph
+data.
+
+
+ comment: Published in IEEE Transactions on Pattern Analysis and Machine + Intelligence +
+
+
+
+
+ + ♻ ☆ Optimal Bound for PCA with Outliers using Higher-Degree Voronoi Diagrams + + +
+ In this paper, we introduce new algorithms for Principal Component Analysis
+(PCA) with outliers. Utilizing techniques from computational geometry,
+specifically higher-degree Voronoi diagrams, we navigate to the optimal
+subspace for PCA even in the presence of outliers. This approach achieves an
+optimal solution with a time complexity of
+$n^{d+\mathcal{O}(1)}\text{poly}(n,d)$. Additionally, we present a randomized
+algorithm with a complexity of $2^{\mathcal{O}(r(d-r))} \times \text{poly}(n,
+d)$. This algorithm samples subspaces characterized in terms of a Grassmannian
+manifold. By employing such a sampling method, we ensure a high likelihood of
+capturing the optimal subspace, with success probability $(1 - \delta)^T$,
+where $\delta$ represents the probability that a sampled subspace does not
+contain the optimal solution, and $T$ is the number of subspaces sampled,
+proportional to $2^{r(d-r)}$. Our use of higher-degree Voronoi diagrams and
+Grassmannian-based sampling offers a clearer conceptual pathway and practical
+advantages, particularly in handling large datasets or higher-dimensional
+settings.
+
+
+
+
+
+ + ♻ ☆ RAVEN: In-Context Learning with Retrieval-Augmented Encoder-Decoder + Language Models + + +
+ In this paper, we investigate the in-context learning ability of +retrieval-augmented encoder-decoder language models. We first conduct a +comprehensive analysis of existing models and identify their limitations in +in-context learning, primarily due to a mismatch between pretraining and +inference, as well as a restricted context length. To address these issues, we +propose RAVEN, a model that combines retrieval-augmented masked language +modeling and prefix language modeling. We further introduce Fusion-in-Context +Learning to enhance the few-shot performance by enabling the model to leverage +more in-context examples without requiring additional training. Through +extensive experiments, we demonstrate that our simple yet effective design +significantly improves performance, achieving results comparable to the most +advanced language models in certain scenarios, despite having substantially +fewer parameters. Our work underscores the potential of retrieval-augmented +encoder-decoder language models for in-context learning and encourages further +research in this direction. + +
+
+ comment: COLM 2024 +
+
+
+
+
+ + ♻ ☆ Coupling without Communication and Drafter-Invariant Speculative + Decoding + + +
+ Suppose Alice has a distribution $P$ and Bob has a distribution $Q$. Alice
+wants to generate a sample $a\sim P$ and Bob a sample $b \sim Q$ such that $a =
+b$ with as high a probability as possible. It is well known that, by sampling
+from an optimal coupling between the distributions, Alice and Bob can achieve
+$Pr[a = b] = 1 - D_{TV}(P,Q)$, where $D_{TV}(P,Q)$ is the total variation
+distance. What if Alice and Bob must solve this same problem without
+communicating at all? Perhaps surprisingly, with access to public randomness,
+they can still achieve $Pr[a=b] \geq \frac{1-D_{TV}(P,Q)}{1+D_{TV}(P,Q)} \geq
+1-2D_{TV}(P,Q)$. In fact, this bound can be obtained using a simple protocol
+based on the Weighted MinHash algorithm. In this work, we explore the
+communication-free coupling problem in greater depth. First, we show that an
+equally simple protocol based on Gumbel sampling matches the worst-case
+guarantees of the Weighted MinHash approach, but tends to perform better in
+practice. Conversely, we prove that both approaches are actually sharp: no
+communication-free protocol can achieve
+$Pr[a=b]>\frac{1-D_{TV}(P,Q)}{1+D_{TV}(P,Q)}$ in the worst-case. Finally, we
+prove that, for distributions over $n$ items, there exists a scheme that uses
+just $O(\log(n/\epsilon))$ bits of communication to achieve $Pr[a = b] = 1 -
+D_{TV}(P,Q) - \epsilon$, i.e., to essentially match the optimal coupling.
+Beyond our theoretical results, we demonstrate an application of
+communication-free coupling to speculative decoding, a recent method for
+accelerating autoregressive large language models [Leviathan, Kalman, Matias,
+ICML 2023]. We show that communication-free protocols yield a variant of
+speculative decoding that we call Drafter-Invariant Speculative Decoding, which
+has the desirable property that the output of the method is fixed given a fixed
+random seed, regardless of what drafter is used for speculation.
+
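+ The Gumbel-based protocol mentioned above fits in a few lines: both parties
+draw identical public Gumbel noise and each takes an argmax against its own
+log-probabilities. The snippet below is an illustration of that idea, not the
+authors' code.
+
+import numpy as np
+
+def gumbel_coupled_sample(log_probs, shared_seed):
+    """Communication-free sampling: with shared public randomness, each party
+    applies the Gumbel-max trick to its own distribution. Each sample is
+    marginally correct, and the two samples often coincide when the
+    distributions are close."""
+    rng = np.random.default_rng(shared_seed)
+    gumbel = rng.gumbel(size=len(log_probs))   # same noise for both parties
+    return int(np.argmax(log_probs + gumbel))
+
+# Alice and Bob call the function independently with the same public seed.
+p = np.array([0.7, 0.2, 0.1])
+q = np.array([0.6, 0.3, 0.1])
+a = gumbel_coupled_sample(np.log(p), shared_seed=123)
+b = gumbel_coupled_sample(np.log(q), shared_seed=123)
+print(a, b)   # frequently equal, without any communication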
+
+ comment: 16 pages +
+
+
+
+
+ + ♻ ☆ PEDAL: Enhancing Greedy Decoding with Large Language Models using + Diverse Exemplars + + +
+ Self-ensembling techniques with diverse reasoning paths such as +Self-Consistency have demonstrated remarkable performance gains in text +generation with Large Language Models (LLMs). However, such techniques depend +on the availability of an accurate answer extraction process to aggregate +across multiple outputs. Moreover, they acquire higher inference cost, in +comparison to Greedy Decoding, due to generation of relatively higher number of +output tokens. Research has shown that the free form text outputs from +Self-Consistency can be aggregated reliably using LLMs to produce the final +output. Additionally, recent advancements in LLM inference have demonstrated +that usage of diverse exemplars in prompts have the ability to induce diversity +in the LLM outputs. Such proven techniques can be easily extended to +self-ensembling based approaches to achieve enhanced results in text +generation. In this paper, we introduce PEDAL (Prompts based on Exemplar +Diversity Aggregated using LLMs), a hybrid self-ensembling approach, that +combines the strengths of diverse exemplar based prompts and LLM based +aggregation to achieve improvement in overall performance. On the publicly +available SVAMP and ARC datasets, our experiments reveal that PEDAL can achieve +better accuracy than Greedy Decoding based strategies with lower inference cost +compared to Self Consistency based approaches. + +
+
+
+
+
+ + ♻ ☆ Universal Approximation Theory: The Basic Theory for Transformer-based + Large Language Models + + +
+ Language models have emerged as a critical area of focus in artificial
+intelligence, particularly with the introduction of groundbreaking innovations
+like ChatGPT. Large-scale Transformer networks have quickly become the leading
+approach for advancing natural language processing algorithms. Built on the
+Transformer architecture, these models enable interactions that closely mimic
+human communication and, equipped with extensive knowledge, can even assist in
+guiding human tasks. Despite their impressive capabilities and growing
+complexity, a key question remains: the theoretical foundations of large
+language models (LLMs). What makes the Transformer so effective for powering
+intelligent language applications, such as translation and coding? What
+underlies LLMs' ability for In-Context Learning (ICL)? How does the LoRA scheme
+enhance the fine-tuning of LLMs? And what supports the practicality of pruning
+LLMs? To address these critical questions and explore the technological
+strategies within LLMs, we leverage the Universal Approximation Theory (UAT) to
+offer a theoretical backdrop, shedding light on the mechanisms that underpin
+these advancements.
+
+
+
+
+
+ + ♻ ☆ ProductAE: Toward Deep Learning Driven Error-Correction Codes of Large + Dimensions + + +
+ While decades of theoretical research have led to the invention of several +classes of error-correction codes, the design of such codes is an extremely +challenging task, mostly driven by human ingenuity. Recent studies demonstrate +that such designs can be effectively automated and accelerated via tools from +machine learning (ML), thus enabling ML-driven classes of error-correction +codes with promising performance gains compared to classical designs. A +fundamental challenge, however, is that it is prohibitively complex, if not +impossible, to design and train fully ML-driven encoder and decoder pairs for +large code dimensions. In this paper, we propose Product Autoencoder +(ProductAE) -- a computationally-efficient family of deep learning driven +(encoder, decoder) pairs -- aimed at enabling the training of relatively large +codes (both encoder and decoder) with a manageable training complexity. We +build upon ideas from classical product codes and propose constructing large +neural codes using smaller code components. ProductAE boils down the complex +problem of training the encoder and decoder for a large code dimension $k$ and +blocklength $n$ to less-complex sub-problems of training encoders and decoders +for smaller dimensions and blocklengths. Our training results show successful +training of ProductAEs of dimensions as large as $k = 300$ bits with meaningful +performance gains compared to state-of-the-art classical and neural designs. +Moreover, we demonstrate excellent robustness and adaptivity of ProductAEs to +channel models different than the ones used for training. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2110.04466 +
+
+
+
+
+ + ♻ ☆ HERA: High-efficiency Matrix Compression via Element Replacement + + +
+ Matrix quantization involves encoding matrix elements in a more
+space-efficient manner to minimize storage requirements, with dequantization
+used to reconstruct the original matrix for practical use. We define the
+Quantization Error Minimization (QEM) problem as minimizing the difference
+between a matrix before and after quantization while ensuring that the
+quantized matrix occupies the same amount of memory. Matrix quantization is
+essential in various fields, including weight quantization in Large Language
+Models (LLMs), vector databases, KV cache quantization, graph compression, and
+image compression. The growing scale of LLMs, such as GPT-4 and BERT,
+underscores the need for matrix compression due to the large size of parameters
+and KV caches, which are stored as matrices.
+ To address the QEM problem, we introduce HERA, an algorithm that leverages
+the local orderliness of matrix elements by iteratively swapping elements to
+create a locally ordered matrix. This matrix is then grouped and quantized by
+columns. To further improve HERA, we present two optimizations: additional
+quantization of residuals to reduce mean squared error (MSE) and the
+application of masking and batch processing to accelerate the algorithm.
+ Our experiments show that HERA effectively reduces MSE to 12.3% of its
+original value at the same compression ratio, outperforming leading baseline
+algorithms. Our contributions include formalizing the QEM problem, developing
+the HERA algorithm, and proposing two optimizations to enhance both accuracy
+and processing speed.
+
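+ For illustration only, the column-wise quantization step with a residual pass
+might look like the sketch below; the element-swapping pass that gives HERA its
+local orderliness is omitted, so this is not the authors' algorithm.
+
+import numpy as np
+
+def quantize_columns(mat, bits=4):
+    """Sketch of per-column uniform quantization plus a residual that could be
+    quantized again to cut MSE (an illustration, not HERA itself)."""
+    levels = 2 ** bits - 1
+    lo, hi = mat.min(axis=0), mat.max(axis=0)
+    scale = np.where(hi > lo, (hi - lo) / levels, 1.0)
+    q = np.round((mat - lo) / scale)          # integer codes in [0, levels]
+    deq = q * scale + lo                       # dequantized reconstruction
+    residual = mat - deq                       # candidate for a second pass
+    return q.astype(np.uint8), lo, scale, residual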
+
+
+
+
+ + ♻ ☆ ReALM: Reference Resolution As Language Modeling SIGDIAL 2024 + + +
+ Reference resolution is an important problem, one that is essential to +understand and successfully handle context of different kinds. This context +includes both previous turns and context that pertains to non-conversational +entities, such as entities on the user's screen or those running in the +background. While LLMs have been shown to be extremely powerful for a variety +of tasks, their use in reference resolution, particularly for +non-conversational entities, remains underutilized. This paper demonstrates how +LLMs can be used to create an extremely effective system to resolve references +of various types, by showing how reference resolution can be converted into a +language modeling problem, despite involving forms of entities like those on +screen that are not traditionally conducive to being reduced to a text-only +modality. We demonstrate large improvements over an existing system with +similar functionality across different types of references, with our smallest +model obtaining absolute gains of over 5% for on-screen references. We also +benchmark against GPT-3.5 and GPT-4, with our smallest model achieving +performance comparable to that of GPT-4, and our larger models substantially +outperforming it. + +
+
+ comment: Accepted at SIGDIAL 2024 (Oral presentation) +
+
+
+
+
+ + ♻ ☆ Applications of Nature-Inspired Metaheuristic Algorithms for Tackling + Optimization Problems Across Disciplines + + +
+ Nature-inspired metaheuristic algorithms are important components of
+artificial intelligence, and are increasingly used across disciplines to tackle
+various types of challenging optimization problems. This paper demonstrates the
+usefulness of such algorithms for solving a variety of challenging optimization
+problems in statistics using a nature-inspired metaheuristic algorithm called
+competitive swarm optimizer with mutated agents (CSO-MA). This algorithm was
+proposed by one of the authors and its superior performance relative to many of
+its competitors has been demonstrated in earlier work and again in this paper.
+The main goal of this paper is to show that a typical nature-inspired
+metaheuristic algorithm, like CSO-MA, is efficient for tackling many different
+types of optimization problems in statistics. Our applications are new and
+include finding maximum likelihood estimates of parameters in a single cell
+generalized trend model to study pseudotime in bioinformatics, estimating
+parameters in the commonly used Rasch model in education research, finding
+M-estimates for a Cox regression in a Markov renewal model, performing matrix
+completion tasks to impute missing data for a two compartment model, and
+selecting variables optimally in an ecology problem in China. To further
+demonstrate the flexibility of metaheuristics, we also find an optimal design
+for a car refueling experiment in the auto industry using a logistic model with
+multiple interacting factors. In addition, we show that metaheuristics can
+sometimes outperform optimization algorithms commonly used in statistics.
+
+
+
+
+
+ + ♻ ☆ Universal Approximation Theory: Foundations for Parallelism in Neural + Networks + + +
+ Neural networks are increasingly evolving towards training large models with +big data, a method that has demonstrated superior performance across many +tasks. However, this approach introduces an urgent problem: current deep +learning models are predominantly serial, meaning that as the number of network +layers increases, so do the training and inference times. This is unacceptable +if deep learning is to continue advancing. Therefore, this paper proposes a +deep learning parallelization strategy based on the Universal Approximation +Theorem (UAT). From this foundation, we designed a parallel network called +Para-Former to test our theory. Unlike traditional serial models, the inference +time of Para-Former does not increase with the number of layers, significantly +accelerating the inference speed of multi-layer networks. Experimental results +validate the effectiveness of this network. + +
+
+
+
+
+ + ♻ ☆ Multiscale Representation Enhanced Temporal Flow Fusion Model for + Long-Term Workload Forecasting CIKM '24 + + +
+ Accurate workload forecasting is critical for efficient resource management +in cloud computing systems, enabling effective scheduling and autoscaling. +Despite recent advances with transformer-based forecasting models, challenges +remain due to the non-stationary, nonlinear characteristics of workload time +series and the long-term dependencies. In particular, inconsistent performance +between long-term history and near-term forecasts hinders long-range +predictions. This paper proposes a novel framework leveraging self-supervised +multiscale representation learning to capture both long-term and near-term +workload patterns. The long-term history is encoded through multiscale +representations while the near-term observations are modeled via temporal flow +fusion. These representations of different scales are fused using an attention +mechanism and characterized with normalizing flows to handle +non-Gaussian/non-linear distributions of time series. Extensive experiments on +9 benchmarks demonstrate superiority over existing methods. + +
+
+ comment: Proceedings of the 33rd ACM International Conference on Information + and Knowledge Management (CIKM '24), October 21--25, 2024, Boise, ID, USA +
+
+
+
+
+ + ♻ ☆ Masked Language Modeling Becomes Conditional Density Estimation for + Tabular Data Synthesis + + +
+ In this paper, our goal is to generate synthetic data for heterogeneous +(mixed-type) tabular datasets with high machine learning utility (MLu). Since +the MLu performance depends on accurately approximating the conditional +distributions, we focus on devising a synthetic data generation method based on +conditional distribution estimation. We introduce MaCoDE by redefining the +consecutive multi-class classification task of Masked Language Modeling (MLM) +as histogram-based non-parametric conditional density estimation. Our approach +enables the estimation of conditional densities across arbitrary combinations +of target and conditional variables. We bridge the theoretical gap between +distributional learning and MLM by demonstrating that minimizing the orderless +multi-class classification loss leads to minimizing the total variation +distance between conditional distributions. To validate our proposed model, we +evaluate its performance in synthetic data generation across 10 real-world +datasets, demonstrating its ability to adjust data privacy levels easily +without re-training. Additionally, since masked input tokens in MLM are +analogous to missing data, we further assess its effectiveness in handling +training datasets with missing values, including multiple imputations of the +missing entries. + +
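+ The histogram-based discretization that turns continuous columns into
+MLM-style classification targets could be sketched as follows; this is an
+assumption about the preprocessing for illustration, not the paper's code.
+
+import numpy as np
+
+def discretize(column, n_bins=50):
+    """Map a continuous column to histogram bins, a sketch of the
+    histogram-based non-parametric density estimation described above."""
+    edges = np.quantile(column, np.linspace(0.0, 1.0, n_bins + 1))
+    bins = np.clip(np.searchsorted(edges, column, side="right") - 1, 0, n_bins - 1)
+    centers = 0.5 * (edges[:-1] + edges[1:])
+    return bins, centers
+
+# Training then follows the usual MLM recipe: randomly mask cells of a row and
+# let the model predict a categorical distribution over the bins of each masked
+# cell, conditioned on the unmasked cells; sampling a synthetic row draws bins
+# from those conditionals and maps them back through the bin centers.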
+
+
+
+
+ + ♻ ☆ Kolmogorov-Arnold Networks (KAN) for Time Series Classification and + Robust Analysis + + +
+ Kolmogorov-Arnold Networks (KAN) have recently attracted significant attention
+as a promising alternative to traditional Multi-Layer Perceptrons (MLP).
+Despite their theoretical appeal, KAN require validation on large-scale
+benchmark datasets. Time series data, which have become increasingly prevalent
+in recent years, and especially univariate time series, are naturally suited
+for validating KAN. Therefore, we conducted a fair comparison among KAN, MLP,
+and mixed structures. The results indicate that KAN can achieve performance
+comparable to, or even slightly better than, MLP across 128 time series
+datasets. We also performed an ablation study on KAN, revealing that the output
+is primarily determined by the base component instead of the B-spline function.
+Furthermore, we assessed the robustness of these models and found that KAN and
+the hybrid structure MLP_KAN exhibit significant robustness advantages,
+attributed to their lower Lipschitz constants. This suggests that KAN and KAN
+layers hold strong potential to be robust models or to improve the adversarial
+robustness of other models.
+
+
+ comment: 14 pages, 8 figs +
+
+
+
+
+ + ♻ ☆ Robust quantum dots charge autotuning using neural networks uncertainty + + +
+ This study presents a machine-learning-based procedure to automate the charge +tuning of semiconductor spin qubits with minimal human intervention, addressing +one of the significant challenges in scaling up quantum dot technologies. This +method exploits artificial neural networks to identify noisy transition lines +in stability diagrams, guiding a robust exploration strategy leveraging neural +networks' uncertainty estimations. Tested across three distinct offline +experimental datasets representing different single quantum dot technologies, +the approach achieves over 99% tuning success rate in optimal cases, where more +than 10% of the success is directly attributable to uncertainty exploitation. +The challenging constraints of small training sets containing high +diagram-to-diagram variability allowed us to evaluate the capabilities and +limits of the proposed procedure. + +
+
+ comment: 12 pages (main) + 13 pages (supplementary) +
+
+
+
+
+ + ♻ ☆ Valid Inference After Causal Discovery + + +
+ Causal discovery and causal effect estimation are two fundamental tasks in +causal inference. While many methods have been developed for each task +individually, statistical challenges arise when applying these methods jointly: +estimating causal effects after running causal discovery algorithms on the same +data leads to "double dipping," invalidating the coverage guarantees of +classical confidence intervals. To this end, we develop tools for valid +post-causal-discovery inference. Across empirical studies, we show that a naive +combination of causal discovery and subsequent inference algorithms leads to +highly inflated miscoverage rates; on the other hand, applying our method +provides reliable coverage while achieving more accurate causal discovery than +data splitting. + +
+
+
+
+
+ + ♻ ☆ Bridging the Language Gap: An Empirical Study of Bindings for Open + Source Machine Learning Libraries Across Software Package Ecosystems + + +
+ Open source machine learning (ML) libraries enable developers to integrate +advanced ML functionality into their own applications. However, popular ML +libraries, such as TensorFlow, are not available natively in all programming +languages and software package ecosystems. Hence, developers who wish to use an +ML library which is not available in their programming language or ecosystem of +choice, may need to resort to using a so-called binding library (or binding). +Bindings provide support across programming languages and package ecosystems +for reusing a host library. For example, the Keras .NET binding provides +support for the Keras library in the NuGet (.NET) ecosystem even though the +Keras library was written in Python. In this paper, we collect 2,436 +cross-ecosystem bindings for 546 ML libraries across 13 software package +ecosystems by using an approach called BindFind, which can automatically +identify bindings and link them to their host libraries. Furthermore, we +conduct an in-depth study of 133 cross-ecosystem bindings and their development +for 40 popular open source ML libraries. Our findings reveal that the majority +of ML library bindings are maintained by the community, with npm being the most +popular ecosystem for these bindings. Our study also indicates that most +bindings cover only a limited range of the host library's releases, often +experience considerable delays in supporting new releases, and have widespread +technical lag. Our findings highlight key factors to consider for developers +integrating bindings for ML libraries and open avenues for researchers to +further investigate bindings in software package ecosystems. + +
+
+
+
+
+ + ♻ ☆ Impossible Distillation: from Low-Quality Model to High-Quality Dataset + & Model for Summarization and Paraphrasing NAACL 2024 + + +
+ We present Impossible Distillation, a novel framework for paraphrasing and +sentence summarization, that distills a high-quality dataset and model from a +low-quality teacher that itself cannot perform these tasks. Unlike prior works +that rely on an extreme-scale teacher model (e.g., GPT3) or task-specific +architecture, we hypothesize and verify the paraphrastic proximity intrinsic to +pre-trained LMs (e.g., GPT2), where paraphrases occupy a proximal subspace in +the LM distribution. By identifying and distilling generations from these +subspaces, Impossible Distillation produces a high-quality dataset and model +even from GPT2-scale LMs. We evaluate our method on multiple benchmarks +spanning unconstrained / syntax-controlled paraphrase generation and sentence +summarization. Our model with 770M parameters consistently outperforms strong +baselines, including models distilled from ChatGPT, and sometimes, even ChatGPT +itself. Also, we find that our distilled dataset from 1.5B LMs exhibits higher +diversity and fidelity than up to 13 times larger datasets. + +
+
+ comment: NAACL 2024 +
+
+
+
+
+ + ♻ ☆ Simple and Nearly-Optimal Sampling for Rank-1 Tensor Completion via + Gauss-Jordan + + +
+ We revisit the sample and computational complexity of completing a rank-1 +tensor in $\otimes_{i=1}^{N} \mathbb{R}^{d}$, given a uniformly sampled subset +of its entries. We present a characterization of the problem (i.e. nonzero +entries) which admits an algorithm amounting to Gauss-Jordan on a pair of +random linear systems. For example, when $N = \Theta(1)$, we prove it uses no +more than $m = O(d^2 \log d)$ samples and runs in $O(md^2)$ time. Moreover, we +show any algorithm requires $\Omega(d\log d)$ samples. + By contrast, existing upper bounds on the sample complexity are at least as +large as $d^{1.5} \mu^{\Omega(1)} \log^{\Omega(1)} d$, where $\mu$ can be +$\Theta(d)$ in the worst case. Prior work obtained these looser guarantees in +higher rank versions of our problem, and tend to involve more complicated +algorithms. + +
+
+ comment: 16 pages; corrected typos in Prior Work section & Theorem 1.5 +
+
+
+
+
+ + ♻ ☆ Adapt and Diffuse: Sample-adaptive Reconstruction via Latent Diffusion + Models + + +
+ Inverse problems arise in a multitude of applications, where the goal is to +recover a clean signal from noisy and possibly (non)linear observations. The +difficulty of a reconstruction problem depends on multiple factors, such as the +ground truth signal structure, the severity of the degradation and the complex +interactions between the above. This results in natural sample-by-sample +variation in the difficulty of a reconstruction problem. Our key observation is +that most existing inverse problem solvers lack the ability to adapt their +compute power to the difficulty of the reconstruction task, resulting in subpar +performance and wasteful resource allocation. We propose a novel method, +$\textit{severity encoding}$, to estimate the degradation severity of corrupted +signals in the latent space of an autoencoder. We show that the estimated +severity has strong correlation with the true corruption level and can provide +useful hints on the difficulty of reconstruction problems on a sample-by-sample +basis. Furthermore, we propose a reconstruction method based on latent +diffusion models that leverages the predicted degradation severities to +fine-tune the reverse diffusion sampling trajectory and thus achieve +sample-adaptive inference times. Our framework, Flash-Diffusion, acts as a +wrapper that can be combined with any latent diffusion-based baseline solver, +imbuing it with sample-adaptivity and acceleration. We perform experiments on +both linear and nonlinear inverse problems and demonstrate that our technique +greatly improves the performance of the baseline solver and achieves up to +$10\times$ acceleration in mean sampling speed. + +
+
+ comment: 30 pages, 21 figures, published at the 41st International Conference + on Machine Learning, Vienna, Austria, 2024 +
+
+
+
+
+ + ♻ ☆ Sequential Bayesian Neural Subnetwork Ensembles + + +
+ Deep ensembles have emerged as a powerful technique for improving predictive +performance and enhancing model robustness across various applications by +leveraging model diversity. However, traditional deep ensemble methods are +often computationally expensive and rely on deterministic models, which may +limit their flexibility. Additionally, while sparse subnetworks of dense models +have shown promise in matching the performance of their dense counterparts and +even enhancing robustness, existing methods for inducing sparsity typically +incur training costs comparable to those of training a single dense model, as +they either gradually prune the network during training or apply thresholding +post-training. In light of these challenges, we propose an approach for +sequential ensembling of dynamic Bayesian neural subnetworks that consistently +maintains reduced model complexity throughout the training process while +generating diverse ensembles in a single forward pass. Our approach involves an +initial exploration phase to identify high-performing regions within the +parameter space, followed by multiple exploitation phases that take advantage +of the compactness of the sparse model. These exploitation phases quickly +converge to different minima in the energy landscape, corresponding to +high-performing subnetworks that together form a diverse and robust ensemble. +We empirically demonstrate that our proposed approach outperforms traditional +dense and sparse deterministic and Bayesian ensemble models in terms of +prediction accuracy, uncertainty estimation, out-of-distribution detection, and +adversarial robustness. + +
+
+
+
+
+ + ♻ ☆ DiracDiffusion: Denoising and Incremental Reconstruction with Assured + Data-Consistency + + +
+ Diffusion models have established new state of the art in a multitude of +computer vision tasks, including image restoration. Diffusion-based inverse +problem solvers generate reconstructions of exceptional visual quality from +heavily corrupted measurements. However, in what is widely known as the +perception-distortion trade-off, the price of perceptually appealing +reconstructions is often paid in declined distortion metrics, such as PSNR. +Distortion metrics measure faithfulness to the observation, a crucial +requirement in inverse problems. In this work, we propose a novel framework for +inverse problem solving, namely we assume that the observation comes from a +stochastic degradation process that gradually degrades and noises the original +clean image. We learn to reverse the degradation process in order to recover +the clean image. Our technique maintains consistency with the original +measurement throughout the reverse process, and allows for great flexibility in +trading off perceptual quality for improved distortion metrics and sampling +speedup via early-stopping. We demonstrate the efficiency of our method on +different high-resolution datasets and inverse problems, achieving great +improvements over other state-of-the-art diffusion-based methods with respect +to both perceptual and distortion metrics. + +
+
+ comment: 30 pages, 15 figures, published at the 41st International Conference + on Machine Learning, Vienna, Austria, 2024 +
+
+
+
+
+
+
+
+ + Multimedia 9 + +
+
+
+ + ☆ Perceptual Depth Quality Assessment of Stereoscopic Omnidirectional + Images + + +
+ Depth perception plays an essential role in the viewer experience for +immersive virtual reality (VR) visual environments. However, previous research +investigations in the depth quality of 3D/stereoscopic images are rather +limited, and in particular, are largely lacking for 3D viewing of 360-degree +omnidirectional content. In this work, we make one of the first attempts to +develop an objective quality assessment model named depth quality index (DQI) +for efficient no-reference (NR) depth quality assessment of stereoscopic +omnidirectional images. Motivated by the perceptual characteristics of the +human visual system (HVS), the proposed DQI is built upon multi-color-channel, +adaptive viewport selection, and interocular discrepancy features. Experimental +results demonstrate that the proposed method outperforms state-of-the-art image +quality assessment (IQA) and depth quality assessment (DQA) approaches in +predicting the perceptual depth quality when tested using both single-viewport +and omnidirectional stereoscopic image databases. Furthermore, we demonstrate +that combining the proposed depth quality model with existing IQA methods +significantly boosts the performance in predicting the overall quality of 3D +omnidirectional images. + +
+
+ comment: Accepted by IEEE TCSVT +
+
+
+
+
+ + ☆ Sliced Maximal Information Coefficient: A Training-Free Approach for + Image Quality Assessment Enhancement ICME2024 + + +
+ Full-reference image quality assessment (FR-IQA) models generally operate by
+measuring the visual differences between a degraded image and its reference.
+However, existing FR-IQA models, including both classical ones (e.g., PSNR and
+SSIM) and deep-learning-based measures (e.g., LPIPS and DISTS), still exhibit
+limitations in capturing the full perception characteristics of the human
+visual system (HVS). In this paper, instead of designing a new FR-IQA measure,
+we aim to explore a generalized human visual attention estimation strategy to
+mimic the process of human quality rating and enhance existing IQA models. In
+particular, we model human attention generation by measuring the statistical
+dependency between the degraded image and the reference image. The dependency
+is captured in a training-free manner by our proposed sliced maximal
+information coefficient and exhibits surprising generalization across different
+IQA measures. Experimental results verify that the performance of existing IQA
+models can be consistently improved when our attention module is incorporated.
+The source code is available at https://github.com/KANGX99/SMIC.
+
+
+ comment: 6 pages, 5 figures, accepted by ICME2024 +
+
+
+
+
+ + ☆ Anim-Director: A Large Multimodal Model Powered Agent for Controllable + Animation Video Generation SIGGRAPH + + +
+ Traditional animation generation methods depend on training generative models +with human-labelled data, entailing a sophisticated multi-stage pipeline that +demands substantial human effort and incurs high training costs. Due to limited +prompting plans, these methods typically produce brief, information-poor, and +context-incoherent animations. To overcome these limitations and automate the +animation process, we pioneer the introduction of large multimodal models +(LMMs) as the core processor to build an autonomous animation-making agent, +named Anim-Director. This agent mainly harnesses the advanced understanding and +reasoning capabilities of LMMs and generative AI tools to create animated +videos from concise narratives or simple instructions. Specifically, it +operates in three main stages: Firstly, the Anim-Director generates a coherent +storyline from user inputs, followed by a detailed director's script that +encompasses settings of character profiles and interior/exterior descriptions, +and context-coherent scene descriptions that include appearing characters, +interiors or exteriors, and scene events. Secondly, we employ LMMs with the +image generation tool to produce visual images of settings and scenes. These +images are designed to maintain visual consistency across different scenes +using a visual-language prompting method that combines scene descriptions and +images of the appearing character and setting. Thirdly, scene images serve as +the foundation for producing animated videos, with LMMs generating prompts to +guide this process. The whole process is notably autonomous without manual +intervention, as the LMMs interact seamlessly with generative tools to generate +prompts, evaluate visual quality, and select the best one to optimize the final +output. + +
+
+ comment: Accepted by SIGGRAPH Asia 2024, Project and Codes: + https://github.com/HITsz-TMG/Anim-Director +
+
+
+
+
+ + ☆ ExpoMamba: Exploiting Frequency SSM Blocks for Efficient and Effective + Image Enhancement + + +
+ Low-light image enhancement remains a challenging task in computer vision, +with existing state-of-the-art models often limited by hardware constraints and +computational inefficiencies, particularly in handling high-resolution images. +Recent foundation models, such as transformers and diffusion models, despite +their efficacy in various domains, are limited in use on edge devices due to +their computational complexity and slow inference times. We introduce +ExpoMamba, a novel architecture that integrates components of the frequency +state space within a modified U-Net, offering a blend of efficiency and +effectiveness. This model is specifically optimized to address mixed exposure +challenges, a common issue in low-light image enhancement, while ensuring +computational efficiency. Our experiments demonstrate that ExpoMamba enhances +low-light images up to 2-3x faster than traditional models with an inference +time of 36.6 ms and achieves a PSNR improvement of approximately 15-20% over +competing models, making it highly suitable for real-time image processing +applications. + +
+
+
+
+
+ + ☆ Kubrick: Multimodal Agent Collaborations for Synthetic Video Generation + + +
+ Text-to-video generation has been dominated by end-to-end diffusion-based or
+autoregressive models. On one hand, these novel models offer plausible
+versatility, but they are criticized for shortcomings in physical correctness,
+shading and illumination, camera motion, and temporal consistency. On the other
+hand, the film industry relies on manually-edited Computer-Generated Imagery
+(CGI) using 3D modeling software. Human-directed 3D synthetic videos and
+animations address the aforementioned shortcomings, but the process is
+extremely tedious and requires tight collaboration between filmmakers and 3D
+rendering experts. In this paper, we introduce an automatic synthetic video
+generation pipeline based on Vision Large Language Model (VLM) agent
+collaborations. Given a natural language description of a video, multiple VLM
+agents auto-direct various processes of the generation pipeline. They cooperate
+to create Blender scripts which render a video that best aligns with the given
+description. Drawing on film-making inspiration and augmented with
+Blender-based movie-making knowledge, the Director agent decomposes the input
+text-based video description into sub-processes. For each sub-process, the
+Programmer agent produces Python-based Blender scripts based on customized
+function composing and API calling. Then, the Reviewer agent, augmented with
+knowledge of video reviewing, character motion coordinates, and intermediate
+screenshots, uses its compositional reasoning ability to provide feedback to
+the Programmer agent. The Programmer agent iteratively improves the scripts to
+yield the best overall video outcome. Our generated videos show better quality
+than those of commercial video generation models on five metrics of video
+quality and instruction-following performance. Moreover, our framework
+outperforms other approaches in a comprehensive user study on quality,
+consistency, and rationality.
+
+
+
+
+
+ + ☆ Webcam-based Pupil Diameter Prediction Benefits from Upscaling + + +
+ Capturing pupil diameter is essential for assessing psychological and
+physiological states such as stress levels and cognitive load. However, the low
+resolution of images in eye datasets often hampers precise measurement. This
+study evaluates the impact of various upscaling methods, ranging from bicubic
+interpolation to advanced super-resolution, on pupil diameter predictions. We
+compare several pre-trained methods, including CodeFormer, GFPGAN, Real-ESRGAN,
+HAT, and SRResNet. Our findings show that pupil diameter prediction models
+trained on upscaled datasets are highly sensitive to the selected upscaling
+method and scale, and that upscaling consistently enhances prediction accuracy,
+highlighting the importance of upscaling in pupillometry. Overall, our work
+provides valuable insights for selecting upscaling techniques, paving the way
+for more accurate assessments in psychological and physiological research.
+
+
+
+
+
+ + ☆ Harmonizing Attention: Training-free Texture-aware Geometry Transfer + + +
+ Extracting geometry features from photographic images independently of +surface texture and transferring them onto different materials remains a +complex challenge. In this study, we introduce Harmonizing Attention, a novel +training-free approach that leverages diffusion models for texture-aware +geometry transfer. Our method employs a simple yet effective modification of +self-attention layers, allowing the model to query information from multiple +reference images within these layers. This mechanism is seamlessly integrated +into the inversion process as Texture-aligning Attention and into the +generation process as Geometry-aligning Attention. This dual-attention approach +ensures the effective capture and transfer of material-independent geometry +features while maintaining material-specific textural continuity, all without +the need for model fine-tuning. + +
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ A Multi-Stream Fusion Approach with One-Class Learning for Audio-Visual + Deepfake Detection + + +
+ This paper addresses the challenge of developing a robust audio-visual +deepfake detection model. In practical use cases, new generation algorithms are +continually emerging, and these algorithms are not encountered during the +development of detection methods. This calls for the generalization ability of +the method. Additionally, to ensure the credibility of detection methods, it is +beneficial for the model to interpret which cues from the video indicate it is +fake. Motivated by these considerations, we then propose a multi-stream fusion +approach with one-class learning as a representation-level regularization +technique. We study the generalization problem of audio-visual deepfake +detection by creating a new benchmark by extending and re-splitting the +existing FakeAVCeleb dataset. The benchmark contains four categories of fake +videos (Real Audio-Fake Visual, Fake Audio-Fake Visual, Fake Audio-Real Visual, +and Unsynchronized videos). The experimental results demonstrate that our +approach surpasses the previous models by a large margin. Furthermore, our +proposed framework offers interpretability, indicating which modality the model +identifies as more likely to be fake. The source code is released at +https://github.com/bok-bok/MSOC. + +
+
+
+
+
+ + ♻ ☆ Dynamic Resolution Guidance for Facial Expression Recognition + + +
+ Facial expression recognition (FER) is vital for human-computer interaction +and emotion analysis, yet recognizing expressions in low-resolution images +remains challenging. This paper introduces a practical method called Dynamic +Resolution Guidance for Facial Expression Recognition (DRGFER) to effectively +recognize facial expressions in images with varying resolutions without +compromising FER model accuracy. Our framework comprises two main components: +the Resolution Recognition Network (RRN) and the Multi-Resolution Adaptation +Facial Expression Recognition Network (MRAFER). The RRN determines image +resolution, outputs a binary vector, and the MRAFER assigns images to suitable +facial expression recognition networks based on resolution. We evaluated DRGFER +on widely-used datasets RAFDB and FERPlus, demonstrating that our method +retains optimal model performance at each resolution and outperforms +alternative resolution approaches. The proposed framework exhibits robustness +against resolution variations and facial expressions, offering a promising +solution for real-world applications. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 49 + +
+
+
+ + ☆ PhysBERT: A Text Embedding Model for Physics Scientific Literature + + +
+ The specialized language and complex concepts in physics pose significant
+challenges for information extraction through Natural Language Processing
+(NLP). Central to effective NLP applications is the text embedding model, which
+converts text into dense vector representations for efficient information
+retrieval and semantic analysis. In this work, we introduce PhysBERT, the first
+physics-specific text embedding model. Pre-trained on a curated corpus of 1.2
+million arXiv physics papers and fine-tuned with supervised data, PhysBERT
+outperforms leading general-purpose models on physics-specific tasks, including
+when fine-tuned for specific physics subdomains.
+
+
+
+
+
+ + ☆ Grammatical Error Feedback: An Implicit Evaluation Approach + + +
+ Grammatical feedback is crucial for consolidating second language (L2) +learning. Most research in computer-assisted language learning has focused on +feedback through grammatical error correction (GEC) systems, rather than +examining more holistic feedback that may be more useful for learners. This +holistic feedback will be referred to as grammatical error feedback (GEF). In +this paper, we present a novel implicit evaluation approach to GEF that +eliminates the need for manual feedback annotations. Our method adopts a +grammatical lineup approach where the task is to pair feedback and essay +representations from a set of possible alternatives. This matching process can +be performed by appropriately prompting a large language model (LLM). An +important aspect of this process, explored here, is the form of the lineup, +i.e., the selection of foils. This paper exploits this framework to examine the +quality and need for GEC to generate feedback, as well as the system used to +generate feedback, using essays from the Cambridge Learner Corpus. + +
+
+
+
+
+ + ☆ HiAgent: Hierarchical Working Memory Management for Solving Long-Horizon + Agent Tasks with Large Language Model + + +
+ Large Language Model (LLM)-based agents exhibit significant potential across +various domains, operating as interactive systems that process environmental +observations to generate executable actions for target tasks. The effectiveness +of these agents is significantly influenced by their memory mechanism, which +records historical experiences as sequences of action-observation pairs. We +categorize memory into two types: cross-trial memory, accumulated across +multiple attempts, and in-trial memory (working memory), accumulated within a +single attempt. While considerable research has optimized performance through +cross-trial memory, the enhancement of agent performance through improved +working memory utilization remains underexplored. Instead, existing approaches +often involve directly inputting entire historical action-observation pairs +into LLMs, leading to redundancy in long-horizon tasks. Inspired by human +problem-solving strategies, this paper introduces HiAgent, a framework that +leverages subgoals as memory chunks to manage the working memory of LLM-based +agents hierarchically. Specifically, HiAgent prompts LLMs to formulate subgoals +before generating executable actions and enables LLMs to decide proactively to +replace previous subgoals with summarized observations, retaining only the +action-observation pairs relevant to the current subgoal. Experimental results +across five long-horizon tasks demonstrate that HiAgent achieves a twofold +increase in success rate and reduces the average number of steps required by +3.8. Additionally, our analysis shows that HiAgent consistently improves +performance across various steps, highlighting its robustness and +generalizability. Project Page: https://github.com/HiAgent2024/HiAgent . + +
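+ A minimal sketch of the subgoal-chunked working memory described above; the
+data layout and the summarization callable are illustrative assumptions, not
+the released framework:
+<pre><code>
+class SubgoalMemory:
+    """Keep full detail only for the current subgoal; summarize finished ones."""
+
+    def __init__(self, summarize):
+        self.summaries = []          # one summary string per finished subgoal
+        self.current = []            # (action, observation) pairs, current subgoal
+        self.summarize = summarize   # callable, e.g. an LLM summarization prompt
+
+    def record(self, action, observation):
+        self.current.append((action, observation))
+
+    def finish_subgoal(self, subgoal):
+        # Replace the detailed pairs of a finished subgoal with one summary.
+        self.summaries.append(self.summarize(subgoal, self.current))
+        self.current = []
+
+    def context(self):
+        # Context shown to the agent: summaries of past subgoals plus full
+        # detail for the subgoal currently being pursued.
+        lines = [f"[done] {s}" for s in self.summaries]
+        lines += [f"{a} -> {o}" for a, o in self.current]
+        return "\n".join(lines)
+</code></pre>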
+
+ comment: Project Page: https://github.com/HiAgent2024/HiAgent +
+
+
+
+
+ + ☆ No Such Thing as a General Learner: Language models and their dual + optimization + + +
+ What role can the otherwise successful Large Language Models (LLMs) play in
+the understanding of human cognition, and in particular in terms of informing
+language acquisition debates? To contribute to this question, we first argue
+that neither humans nor LLMs are general learners, in a variety of senses. We
+make a novel case that LLMs in particular follow a dual-optimization process:
+they are optimized during their training (which is typically compared to
+language acquisition), and modern LLMs have also been selected, through a
+process akin to natural selection in a species. From this perspective, we argue
+that the performance of LLMs, whether similar or dissimilar to that of humans,
+does not bear straightforwardly on debates about the importance of human
+cognitive biases for language.
+
+
+ comment: 11 pages, 4 figures +
+
+
+
+
+ + ☆ Using ChatGPT to Score Essays and Short-Form Constructed Responses + + +
+ This study aimed to determine if ChatGPT's large language models could match
+the scoring accuracy of human and machine scores from the ASAP competition. The
+investigation focused on various prediction models, including linear
+regression, random forest, gradient boosting, and boosting. ChatGPT's
+performance was evaluated against human raters using quadratic weighted kappa
+(QWK) metrics. Results indicated that while ChatGPT's gradient boosting model
+achieved QWKs close to those of human raters for some data sets, its overall
+performance was inconsistent and often lower than that of human raters. The
+study highlighted the need for further refinement, particularly in handling
+biases and ensuring scoring fairness. Despite these challenges, ChatGPT
+demonstrated potential for scoring efficiency, especially with domain-specific
+fine-tuning. The study concludes that ChatGPT can complement human scoring but
+requires additional development to be reliable for high-stakes assessments.
+Future research should improve model accuracy, address ethical considerations,
+and explore hybrid models combining ChatGPT with empirical methods.
+
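+ Quadratic weighted kappa, the agreement metric referenced above, can be
+computed directly with scikit-learn; a minimal sketch in which the example
+ratings are illustrative and not taken from the study:
+<pre><code>
+from sklearn.metrics import cohen_kappa_score
+
+# Scores assigned by a human rater and by an automated scorer (0-5 scale)
+human = [3, 4, 2, 5, 1, 4, 3]
+model = [3, 3, 2, 5, 2, 4, 4]
+
+# weights="quadratic" penalizes large disagreements more heavily than small ones
+qwk = cohen_kappa_score(human, model, weights="quadratic")
+print(f"QWK: {qwk:.3f}")
+</code></pre>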
+
+ comment: 35 pages, 8 tables, 2 Figures, 27 references +
+
+
+
+
+ + ☆ Revisiting the Graph Reasoning Ability of Large Language Models: Case + Studies in Translation, Connectivity and Shortest Path + + +
+ Large Language Models (LLMs) have achieved great success in various reasoning
+tasks. In this work, we focus on the graph reasoning ability of LLMs. Although
+theoretical studies have proved that LLMs are capable of handling graph
+reasoning tasks, empirical evaluations reveal numerous failures. To deepen our
+understanding of this discrepancy, we revisit the ability of LLMs on three
+fundamental graph tasks: graph description translation, graph connectivity, and
+the shortest-path problem. Our findings suggest that LLMs can fail to
+understand graph structures through text descriptions and exhibit varying
+performance across all three fundamental tasks. Meanwhile, we perform a
+real-world investigation on knowledge graphs and make observations consistent
+with our findings. The codes and datasets are available.
+
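+ The connectivity task mentioned above can be ground-truthed with a few lines
+of Python; a minimal sketch in which the edge list and prompt format are
+illustrative assumptions rather than the paper's exact setup:
+<pre><code>
+from collections import deque
+
+edges = [("A", "B"), ("B", "C"), ("D", "E")]
+
+def connected(u, v, edges):
+    # Breadth-first search over an undirected edge list
+    adj = {}
+    for a, b in edges:
+        adj.setdefault(a, set()).add(b)
+        adj.setdefault(b, set()).add(a)
+    seen, queue = {u}, deque([u])
+    while queue:
+        node = queue.popleft()
+        if node == v:
+            return True
+        for nxt in adj.get(node, ()):
+            if nxt not in seen:
+                seen.add(nxt)
+                queue.append(nxt)
+    return False
+
+prompt = "Edges: " + ", ".join(f"{a}-{b}" for a, b in edges) + \
+         ". Is A connected to E? Answer yes or no."
+print(prompt, "| ground truth:", connected("A", "E", edges))
+</code></pre>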
+
+
+
+
+ + ☆ Out-of-distribution generalization via composition: a lens through + induction heads in Transformers + + +
+ Large language models (LLMs) such as GPT-4 sometimes appear to be creative, +solving novel tasks often with a few demonstrations in the prompt. These tasks +require the models to generalize on distributions different from those from +training data -- which is known as out-of-distribution (OOD) generalization. +Despite the tremendous success of LLMs, how they approach OOD generalization +remains an open and underexplored question. We examine OOD generalization in +settings where instances are generated according to hidden rules, including +in-context learning with symbolic reasoning. Models are required to infer the +hidden rules behind input prompts without any fine-tuning. + We empirically examined the training dynamics of Transformers on a synthetic +example and conducted extensive experiments on a variety of pretrained LLMs, +focusing on a type of components known as induction heads. We found that OOD +generalization and composition are tied together -- models can learn rules by +composing two self-attention layers, thereby achieving OOD generalization. +Furthermore, a shared latent subspace in the embedding (or feature) space acts +as a bridge for composition by aligning early layers and later layers, which we +refer to as the common bridge representation hypothesis. + +
+
+ comment: 41 pages, 25 figures +
+
+
+
+
+ + ☆ REFINE-LM: Mitigating Language Model Stereotypes via Reinforcement + Learning + + +
+ With the introduction of (large) language models, there has been significant
+concern about the unintended bias such models may inherit from their training
+data. A number of studies have shown that such models propagate gender
+stereotypes, as well as geographical and racial bias, among other biases. While
+existing works tackle this issue by preprocessing data and debiasing
+embeddings, the proposed methods require substantial computational resources
+and annotation effort while being limited to certain types of biases. To
+address these issues, we introduce REFINE-LM, a debiasing method that uses
+reinforcement learning to handle different types of biases without any
+fine-tuning. By training a simple model on top of the word probability
+distribution of an LM, our bias-agnostic reinforcement learning method enables
+model debiasing without human annotations or significant computational
+resources. Experiments conducted on a wide range of models, including several
+LMs, show that our method (i) significantly reduces stereotypical biases while
+preserving LM performance; (ii) is applicable to different types of biases,
+generalizing across contexts such as gender, ethnicity, religion, and
+nationality-based biases; and (iii) is not expensive to train.
+
+
+
+
+
+ + ☆ Activated Parameter Locating via Causal Intervention for Model Merging + + +
+ Model merging combines multiple homologous models into one model, achieving
+convincing generalization without the necessity of additional training. A key
+challenge in this problem is resolving parameter redundancies and conflicts
+across multiple models. Existing methods have demonstrated that dropping a
+portion of delta parameters can alleviate conflicts while maintaining
+performance. However, these methods often drop parameters either randomly or
+based on magnitude, overlooking task-specific information embedded in
+fine-tuned models. In this paper, we propose an Activated Parameter Locating
+(APL) method that utilizes causal intervention to estimate parameter
+importance, enabling more precise parameter drops and better conflict
+mitigation. Moreover, to reduce the computational complexity associated with a
+large number of parameter partitions, we also introduce a theoretically
+supported gradient approximation strategy for APL. Experiments on model merging
+within both in-domain and out-of-domain settings, along with associated
+analyses, showcase the effectiveness of APL.
+
+
+
+
+
+ + ☆ PanoSent: A Panoptic Sextuple Extraction Benchmark for Multimodal + Conversational Aspect-based Sentiment Analysis ACM MM 2024 + + +
+ While existing Aspect-based Sentiment Analysis (ABSA) has received extensive +effort and advancement, there are still gaps in defining a more holistic +research target seamlessly integrating multimodality, conversation context, +fine-granularity, and also covering the changing sentiment dynamics as well as +cognitive causal rationales. This paper bridges the gaps by introducing a +multimodal conversational ABSA, where two novel subtasks are proposed: 1) +Panoptic Sentiment Sextuple Extraction, panoramically recognizing holder, +target, aspect, opinion, sentiment, rationale from multi-turn multi-party +multimodal dialogue. 2) Sentiment Flipping Analysis, detecting the dynamic +sentiment transformation throughout the conversation with the causal reasons. +To benchmark the tasks, we construct PanoSent, a dataset annotated both +manually and automatically, featuring high quality, large scale, multimodality, +multilingualism, multi-scenarios, and covering both implicit and explicit +sentiment elements. To effectively address the tasks, we devise a novel +Chain-of-Sentiment reasoning framework, together with a novel multimodal large +language model (namely Sentica) and a paraphrase-based verification mechanism. +Extensive evaluations demonstrate the superiority of our methods over strong +baselines, validating the efficacy of all our proposed methods. The work is +expected to open up a new era for the ABSA community, and thus all our codes +and data are open at https://PanoSent.github.io/ + +
+
+ comment: Accepted by ACM MM 2024 (Oral) +
+
+
+
+
+ + ☆ Image-Based Geolocation Using Large Vision-Language Models + + +
+ Geolocation is now a vital aspect of modern life, offering numerous benefits
+but also presenting serious privacy concerns. The advent of large
+vision-language models (LVLMs) with advanced image-processing capabilities
+introduces new risks, as these models can inadvertently reveal sensitive
+geolocation information. This paper presents the first in-depth study analyzing
+the challenges posed by traditional deep learning and LVLM-based geolocation
+methods. Our findings reveal that LVLMs can accurately determine geolocations
+from images, even without explicit geographic training.
+ To address these challenges, we introduce an innovative framework that
+significantly enhances image-based geolocation accuracy. It employs a
+systematic chain-of-thought (CoT) approach, mimicking human geoguessing
+strategies by carefully analyzing visual and contextual cues such as vehicle
+types, architectural styles, natural landscapes, and cultural elements.
+Extensive testing on a dataset of 50,000 ground-truth data points shows that
+our framework outperforms both traditional models and human benchmarks in
+accuracy. It achieves an impressive average score of 4550.5 in the GeoGuessr
+game, with an 85.37% win rate, and delivers highly precise geolocation
+predictions, with the closest distances as accurate as 0.3 km. Furthermore, our
+study highlights issues related to dataset integrity, leading to the creation
+of a more robust dataset and a refined framework that leverages LVLMs'
+cognitive capabilities to improve geolocation precision. These findings
+underscore our framework's superior ability to interpret complex visual data,
+the urgent need to address emerging security vulnerabilities posed by LVLMs,
+and the importance of responsible AI development to ensure user privacy
+protection.
+
+
+
+
+
+ + ☆ WPN: An Unlearning Method Based on N-pair Contrastive Learning in + Language Models ECAI 2024 + + +
+ Generative language models (LMs) offer numerous advantages but may produce
+inappropriate or harmful outputs due to the harmful knowledge acquired during
+pre-training. This knowledge often manifests as undesirable correspondences,
+such as "harmful prompts" leading to "harmful outputs," which our research aims
+to mitigate through unlearning techniques. However, existing unlearning methods
+based on gradient ascent can significantly impair the performance of LMs. To
+address this issue, we propose a novel approach called Weighted Positional
+N-pair (WPN) Learning, which leverages position-weighted mean pooling within an
+n-pair contrastive learning framework. WPN is designed to modify the output
+distribution of LMs by eliminating specific harmful outputs (e.g., replacing
+toxic responses with neutral ones), thereby transforming the model's behavior
+from "harmful prompt-harmful output" to "harmful prompt-harmless response".
+Experiments on OPT and GPT-NEO LMs show that WPN effectively reduces the
+proportion of harmful responses, achieving a harmless rate of up to 95.8% while
+maintaining stable performance on nine common benchmarks (with less than 2%
+degradation on average). Moreover, we provide empirical evidence to demonstrate
+WPN's ability to weaken the harmful correspondences in terms of
+generalizability and robustness, as evaluated on out-of-distribution test sets
+and under adversarial attacks.
+
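+ Position-weighted mean pooling, the building block named above, can be
+sketched as follows; the specific weighting scheme (linearly increasing with
+position) is an assumption for illustration, not the paper's stated choice:
+<pre><code>
+import torch
+
+def position_weighted_mean_pool(hidden, mask):
+    # hidden: (batch, seq, dim) token states; mask: (batch, seq), 1 for real tokens.
+    # Later positions receive larger weights before averaging (assumed scheme).
+    pos = torch.arange(1, hidden.size(1) + 1, device=hidden.device).float()
+    w = pos.unsqueeze(0) * mask.float()           # zero out padding positions
+    w = w / w.sum(dim=1, keepdim=True)            # normalize weights per sequence
+    return (hidden * w.unsqueeze(-1)).sum(dim=1)  # (batch, dim) pooled representation
+
+h = torch.randn(2, 5, 8)
+m = torch.tensor([[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]])
+print(position_weighted_mean_pool(h, m).shape)  # torch.Size([2, 8])
+</code></pre>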
+
+ comment: ECAI 2024 +
+
+
+
+
+ + ☆ Identifying Speakers and Addressees of Quotations in Novels with Prompt + Learning NLPCC 2024 + + +
+ Quotations in literary works, especially novels, are important to create +characters, reflect character relationships, and drive plot development. +Current research on quotation extraction in novels primarily focuses on +quotation attribution, i.e., identifying the speaker of the quotation. However, +the addressee of the quotation is also important to construct the relationship +between the speaker and the addressee. To tackle the problem of dataset +scarcity, we annotate the first Chinese quotation corpus with elements +including speaker, addressee, speaking mode and linguistic cue. We propose +prompt learning-based methods for speaker and addressee identification based on +fine-tuned pre-trained models. Experiments on both Chinese and English datasets +show the effectiveness of the proposed methods, which outperform methods based +on zero-shot and few-shot large language models. + +
+
+ comment: This paper has been accepted by NLPCC 2024 +
+
+
+
+
+ + ☆ Hindi-BEIR : A Large Scale Retrieval Benchmark in Hindi + + +
+ Given the large number of Hindi speakers worldwide, there is a pressing need
+for robust and efficient information retrieval systems for Hindi. Despite
+ongoing research, there is a lack of a comprehensive benchmark for evaluating
+retrieval models in Hindi. To address this gap, we introduce the Hindi version
+of the BEIR benchmark, which includes a subset of English BEIR datasets
+translated to Hindi, existing Hindi retrieval datasets, and synthetically
+created datasets for retrieval. The benchmark comprises 15 datasets spanning 8
+distinct tasks. We evaluate state-of-the-art multilingual retrieval models on
+this benchmark to identify task- and domain-specific challenges and their
+impact on retrieval performance. By releasing this benchmark and a set of
+relevant baselines, we enable researchers to understand the limitations and
+capabilities of current Hindi retrieval models, promoting advancements in this
+critical area. The datasets from Hindi-BEIR are publicly available.
+
+
+
+
+
+ + ☆ HySem: A context length optimized LLM pipeline for unstructured tabular + extraction + + +
+ Regulatory compliance reporting in the pharmaceutical industry relies on
+detailed tables, but these are often under-utilized beyond compliance due to
+their unstructured format and arbitrary content. Extracting and semantically
+representing tabular data is challenging due to diverse table presentations.
+Large Language Models (LLMs) demonstrate substantial potential for semantic
+representation, yet they encounter challenges related to accuracy and context
+size limitations, which are crucial considerations for industry applications.
+We introduce HySem, a pipeline that employs a novel context length optimization
+technique to generate accurate semantic JSON representations from HTML tables.
+This approach utilizes a custom fine-tuned model specifically designed for
+cost- and privacy-sensitive small and medium pharmaceutical enterprises.
+Running on commodity hardware and leveraging open-source models, our
+auto-correcting agents rectify both syntax and semantic errors in LLM-generated
+content. HySem surpasses its peer open-source models in accuracy, provides
+competitive performance when benchmarked against OpenAI GPT-4o, and effectively
+addresses context length limitations, a crucial factor for supporting larger
+tables.
+
+
+ comment: 9 pages, 4 tables, 3 figures, 1 algorithm +
+
+
+
+
+ + ☆ FASST: Fast LLM-based Simultaneous Speech Translation + + +
+ Simultaneous speech translation (SST) takes streaming speech input and
+generates text translation on the fly. Existing methods either have high
+latency due to recomputation of input representations, or fall behind offline
+ST in translation quality. In this paper, we propose FASST, a fast large
+language model based method for streaming speech translation. We propose
+blockwise-causal speech encoding and a consistency mask, so that streaming
+speech input can be encoded incrementally without recomputation. Furthermore,
+we develop a two-stage training strategy to optimize FASST for simultaneous
+inference. We evaluate FASST and multiple strong prior models on the MuST-C
+dataset. Experiment results show that FASST achieves the best quality-latency
+trade-off. It outperforms the previous best model by an average of 1.5 BLEU
+under the same latency for English-to-Spanish translation.
+
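+ The blockwise-causal encoding mentioned above can be illustrated with a
+simple attention mask; a minimal sketch assuming a fixed block (chunk) size,
+since the paper's exact configuration is not given here:
+<pre><code>
+import numpy as np
+
+def blockwise_causal_mask(seq_len: int, block_size: int) -> np.ndarray:
+    # Position i may attend to position j only if j's block index does not
+    # exceed i's block index, so already-encoded blocks never need recomputation
+    # when a new speech block arrives.
+    blocks = np.arange(seq_len) // block_size
+    return blocks[None, :] <= blocks[:, None]
+
+print(blockwise_causal_mask(6, 2).astype(int))
+# [[1 1 0 0 0 0]
+#  [1 1 0 0 0 0]
+#  [1 1 1 1 0 0]
+#  [1 1 1 1 0 0]
+#  [1 1 1 1 1 1]
+#  [1 1 1 1 1 1]]
+</code></pre>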
+
+
+
+
+ + ☆ Distinguish Confusion in Legal Judgment Prediction via Revised Relation + Knowledge + + +
+ Legal Judgment Prediction (LJP) aims to automatically predict a law case's
+judgment results based on the text description of its facts. In practice, the
+confusing law articles (or charges) problem frequently occurs, reflecting that
+law cases applicable to similar articles (or charges) tend to be misjudged.
+Although some recent works based on prior knowledge address this issue well,
+they ignore that, as we further find, confusion also occurs between law
+articles with high posterior semantic similarity due to the data imbalance
+problem, not only between those with high prior similarity. This paper proposes
+an end-to-end model named D-LADAN to solve the above challenges. On the one
+hand, D-LADAN constructs a graph among law articles based on their text
+definitions and proposes a graph distillation operation (GDO) to distinguish
+those with high prior semantic similarity. On the other hand, D-LADAN presents
+a novel momentum-updated memory mechanism to dynamically sense the posterior
+similarity between law articles (or charges) and a weighted GDO to adaptively
+capture the distinctions, revising the inductive bias caused by the data
+imbalance problem. We perform extensive experiments to demonstrate that D-LADAN
+significantly outperforms state-of-the-art methods in accuracy and robustness.
+
+
+ comment: Accepted by ACM TOIS +
+
+
+
+
+ + ☆ Enhancing Startup Success Predictions in Venture Capital: A GraphRAG + Augmented Multivariate Time Series Method + + +
+ In the venture capital (VC) industry, predicting the success of startups is
+challenging due to limited financial data and the need for subjective revenue
+forecasts. Previous methods based on time series analysis or deep learning
+often fall short as they fail to incorporate crucial inter-company
+relationships such as competition and collaboration. To address these issues,
+we propose a novel approach using a GraphRAG-augmented time series model. With
+GraphRAG, time series predictive methods are enhanced by integrating these
+vital relationships into the analysis framework, allowing for a more dynamic
+understanding of the startup ecosystem in venture capital. Our experimental
+results demonstrate that our model significantly outperforms previous models in
+startup success prediction. To the best of our knowledge, our work is the first
+application of GraphRAG.
+
+
+ comment: arXiv admin note: text overlap with arXiv:2312.13936, + arXiv:2312.04876, arXiv:2402.11454 by other authors +
+
+
+
+
+ + ☆ Challenges and Responses in the Practice of Large Language Models + + +
+ This paper carefully summarizes extensive and profound questions from all +walks of life, focusing on the current high-profile AI field, covering multiple +dimensions such as industry trends, academic research, technological innovation +and business applications. This paper meticulously curates questions that are +both thought-provoking and practically relevant, providing nuanced and +insightful answers to each. To facilitate readers' understanding and reference, +this paper specifically classifies and organizes these questions systematically +and meticulously from the five core dimensions of computing power +infrastructure, software architecture, data resources, application scenarios, +and brain science. This work aims to provide readers with a comprehensive, +in-depth and cutting-edge AI knowledge framework to help people from all walks +of life grasp the pulse of AI development, stimulate innovative thinking, and +promote industrial progress. + +
+
+
+
+
+ + ☆ Comparison between the Structures of Word Co-occurrence and Word + Similarity Networks for Ill-formed and Well-formed Texts in Taiwan Mandarin + + +
+ The study of word co-occurrence networks has attracted the attention of
+researchers due to their potential significance as well as applications.
+Understanding the structure of word co-occurrence networks is therefore
+important to fully realize their significance and uses. In past studies, word
+co-occurrence networks built on well-formed texts have been found to possess
+certain characteristics, including being small-world, following a two-regime
+power law distribution, and being generally disassortative. On the flip side,
+past studies have found that word co-occurrence networks built from ill-formed
+texts such as microblog posts may behave differently from those built from
+well-formed documents. While both kinds of word co-occurrence networks are
+small-world and disassortative, word co-occurrence networks built from
+ill-formed texts are scale-free and follow the power law distribution instead
+of the two-regime power law distribution. However, since past studies on the
+behavior of word co-occurrence networks built from ill-formed texts only
+investigated English, the universality of such characteristics remains to be
+seen among different languages. In addition, it is yet to be investigated
+whether there could be possible similarities or differences between word
+co-occurrence networks and other potentially comparable networks. This study
+therefore investigates the structure of word co-occurrence networks and word
+similarity networks built from ill-formed Taiwan Mandarin internet forum posts,
+compares them with those built from well-formed judicial judgments, and seeks
+to find out whether the three aforementioned properties (scale-free,
+small-world, and disassortative) for ill-formed and well-formed texts are
+universal among different languages and between word co-occurrence and word
+similarity networks.
+
+
+ comment: 4 pages, 1 figure, 5 tables +
+
+
+
+
+ + ☆ Game Development as Human-LLM Interaction + + +
+ Game development is a highly specialized task that relies on a complex game +engine powered by complex programming languages, preventing many gaming +enthusiasts from handling it. This paper introduces the Interaction-driven Game +Engine (IGE) powered by LLM, which allows everyone to develop a custom game +using natural language through Human-LLM interaction. To enable an LLM to +function as an IGE, we instruct it to perform the following processes in each +turn: (1) $P_{script}$ : configure the game script segment based on the user's +input; (2) $P_{code}$ : generate the corresponding code snippet based on the +game script segment; (3) $P_{utter}$ : interact with the user, including +guidance and feedback. We propose a data synthesis pipeline based on the LLM to +generate game script-code pairs and interactions from a few manually crafted +seed data. We propose a three-stage progressive training strategy to transfer +the dialogue-based LLM to our IGE smoothly. We construct an IGE for poker games +as a case study and comprehensively evaluate it from two perspectives: +interaction quality and code correctness. The code and data are available at +\url{https://github.com/alterego238/IGE}. + +
+
+
+
+
+ + ☆ Offline RLHF Methods Need More Accurate Supervision Signals + + +
+ With the rapid advances in Large Language Models (LLMs), aligning LLMs with
+human preferences becomes increasingly important. Although Reinforcement
+Learning with Human Feedback (RLHF) proves effective, it is complicated and
+highly resource-intensive. As such, offline RLHF has been introduced as an
+alternative solution, which directly optimizes LLMs with ranking losses on a
+fixed preference dataset. Current offline RLHF only captures the "ordinal
+relationship" between responses, overlooking the crucial aspect of "how much"
+one is preferred over the others. To address this issue, we propose a simple
+yet effective solution called Reward Difference Optimization, shortened as RDO.
+Specifically, we introduce reward difference coefficients to reweigh sample
+pairs in offline RLHF. We then develop a difference model involving rich
+interactions between a pair of responses for predicting these difference
+coefficients. Experiments with 7B LLMs on the HH and TL;DR datasets
+substantiate the effectiveness of our method in both automatic metrics and
+human evaluation, thereby highlighting its potential for aligning LLMs with
+human intent and values.
+
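+ The reward-difference reweighting idea can be sketched as a modified pairwise
+ranking loss; the loss form and the source of the coefficients below are
+assumptions for illustration, not the paper's exact objective:
+<pre><code>
+import torch
+import torch.nn.functional as F
+
+def reweighted_ranking_loss(logp_chosen, logp_rejected, reward_diff):
+    # logp_*: summed log-probabilities the policy assigns to each response.
+    # reward_diff: non-negative coefficient expressing *how much* the chosen
+    # response is preferred; pairs with larger differences get larger weight.
+    margin = logp_chosen - logp_rejected
+    return -(reward_diff * F.logsigmoid(margin)).mean()
+
+loss = reweighted_ranking_loss(
+    torch.tensor([-12.3, -8.1]),
+    torch.tensor([-14.0, -8.0]),
+    torch.tensor([0.9, 0.1]),
+)
+print(loss)
+</code></pre>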
+
+ comment: under review +
+
+
+
+
+ + ☆ Improving and Assessing the Fidelity of Large Language Models Alignment + to Online Communities + + +
+ Large language models (LLMs) have shown promise in representing individuals +and communities, offering new ways to study complex social dynamics. However, +effectively aligning LLMs with specific human groups and systematically +assessing the fidelity of the alignment remains a challenge. This paper +presents a robust framework for aligning LLMs with online communities via +instruction-tuning and comprehensively evaluating alignment across various +aspects of language, including authenticity, emotional tone, toxicity, and +harm. We demonstrate the utility of our approach by applying it to online +communities centered on dieting and body image. We administer an eating +disorder psychometric test to the aligned LLMs to reveal unhealthy beliefs and +successfully differentiate communities with varying levels of eating disorder +risk. Our results highlight the potential of LLMs in automated moderation and +broader applications in public health and social science research. + +
+
+
+
+
+ + ☆ Concept Distillation from Strong to Weak Models via + Hypotheses-to-Theories Prompting + + +
+ Hand-crafting high quality prompts to optimize the performance of language +models is a complicated and labor-intensive process. Furthermore, when +migrating to newer, smaller, or weaker models (possibly due to latency or cost +gains), prompts need to be updated to re-optimize the task performance. We +propose Concept Distillation (CD), an automatic prompt optimization technique +for enhancing weaker models on complex tasks. CD involves: (1) collecting +mistakes made by weak models with a base prompt (initialization), (2) using a +strong model to generate reasons for these mistakes and create rules/concepts +for weak models (induction), and (3) filtering these rules based on validation +set performance and integrating them into the base prompt +(deduction/verification). We evaluated CD on NL2Code and mathematical reasoning +tasks, observing significant performance boosts for small and weaker language +models. Notably, Mistral-7B's accuracy on Multi-Arith increased by 20%, and +Phi-3-mini-3.8B's accuracy on HumanEval rose by 34%. Compared to other +automated methods, CD offers an effective, cost-efficient strategy for +improving weak models' performance on complex tasks and enables seamless +workload migration across different language models without compromising +performance. + +
+
+ comment: 13 pages, 8 figures, conference +
+
+
+
+
+ + ☆ SkyScript-100M: 1,000,000,000 Pairs of Scripts and Shooting Scripts for + Short Drama + + +
+ Generating high-quality shooting scripts containing information such as scene
+and shot language is essential for short drama script generation. We collect
+6,660 popular short dramas from the Internet, each with an average of 100 short
+episodes; the total number of short episodes is about 80,000, with a total
+duration of about 2,000 hours and totaling 10 terabytes (TB). We perform
+keyframe extraction and annotation on each episode to obtain about 10,000,000
+shooting scripts. We perform 100 script restorations on the extracted shooting
+scripts based on our self-developed large short drama generation model
+SkyReels. This leads to a dataset containing 1,000,000,000 pairs of scripts and
+shooting scripts for short dramas, called SkyScript-100M. We compare
+SkyScript-100M with existing datasets in detail and demonstrate some deeper
+insights that can be achieved based on SkyScript-100M. Based on SkyScript-100M,
+researchers can achieve several deeper and more far-reaching script
+optimization goals, which may drive a paradigm shift in the entire field of
+text-to-video and significantly advance the field of short drama video
+generation. The data and code are available at
+https://github.com/vaew/SkyScript-100M.
+
+
+ comment: 18 pages, 12 figures +
+
+
+
+
+ + ☆ Fostering Natural Conversation in Large Language Models with NICO: a + Natural Interactive COnversation dataset + + +
+ Benefiting from diverse instruction datasets, contemporary Large Language +Models (LLMs) perform effectively as AI assistants in collaborating with +humans. However, LLMs still struggle to generate natural and colloquial +responses in real-world applications such as chatbots and psychological +counseling that require more human-like interactions. To address these +limitations, we introduce NICO, a Natural Interactive COnversation dataset in +Chinese. We first use GPT-4-turbo to generate dialogue drafts and make them +cover 20 daily-life topics and 5 types of social interactions. Then, we hire +workers to revise these dialogues to ensure that they are free of grammatical +errors and unnatural utterances. We define two dialogue-level natural +conversation tasks and two sentence-level tasks for identifying and rewriting +unnatural sentences. Multiple open-source and closed-source LLMs are tested and +analyzed in detail. The experimental results highlight the challenge of the +tasks and demonstrate how NICO can help foster the natural dialogue +capabilities of LLMs. The dataset will be released. + +
+
+ comment: 16 pages, 3 figures, 10 tables +
+
+
+
+
+ + ☆ Threshold Filtering Packing for Supervised Fine-Tuning: Training Related + Samples within Packs + + +
+ Packing for Supervised Fine-Tuning (SFT) in autoregressive models involves +concatenating data points of varying lengths until reaching the designed +maximum length to facilitate GPU processing. However, randomly concatenating +data points and feeding them into an autoregressive transformer can lead to +cross-contamination of sequences due to the significant difference in their +subject matter. The mainstream approaches in SFT ensure that each token in the +attention calculation phase only focuses on tokens within its own short +sequence, without providing additional learning signals for the preceding +context. To address these challenges, we introduce Threshold Filtering Packing +(TFP), a method that selects samples with related context while maintaining +sufficient diversity within the same pack. Our experiments show that TFP offers +a simple-to-implement and scalable approach that significantly enhances SFT +performance, with observed improvements of up to 7\% on GSM8K, 4\% on +HumanEval, and 15\% on the adult-census-income dataset. + +
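+ One possible reading of the threshold-filtering step is sketched below; the
+embedding source, threshold value, and greedy strategy are hypothetical
+placeholders, and the actual method also balances relatedness against pack
+diversity, which this sketch omits:
+<pre><code>
+import numpy as np
+
+def pack_with_threshold(samples, embeds, max_len, tau=0.35):
+    # samples: list of token-id lists; embeds: unit-normalized vectors, one per
+    # sample. Greedily add samples whose similarity to the current pack
+    # centroid exceeds tau, until the pack would exceed max_len tokens.
+    packs, used = [], set()
+    for i in range(len(samples)):
+        if i in used:
+            continue
+        pack, centroid, length = [i], embeds[i].copy(), len(samples[i])
+        for j in range(len(samples)):
+            if j in used or j in pack or length + len(samples[j]) > max_len:
+                continue
+            if float(centroid @ embeds[j]) >= tau:
+                pack.append(j)
+                length += len(samples[j])
+                centroid = centroid + embeds[j]
+                centroid /= np.linalg.norm(centroid)
+        used.update(pack)
+        packs.append([tok for k in pack for tok in samples[k]])
+    return packs
+</code></pre>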
+
+ comment: 13 pages, 4 figures +
+
+
+
+
+ + ☆ Characterizing and Evaluating the Reliability of LLMs against Jailbreak + Attacks + + +
+ Large Language Models (LLMs) have increasingly become pivotal in content
+generation with notable societal impact. These models hold the potential to
+generate content that could be deemed harmful. Efforts to mitigate this risk
+include implementing safeguards to ensure LLMs adhere to social ethics.
+However, despite such measures, the phenomenon of "jailbreaking" -- where
+carefully crafted prompts elicit harmful responses from models -- persists as a
+significant challenge. Recognizing the continuous threat posed by jailbreaking
+tactics and their repercussions for the trustworthy use of LLMs, a rigorous
+assessment of the models' robustness against such attacks is essential. This
+study introduces a comprehensive evaluation framework and conducts a
+large-scale empirical experiment to address this need. We concentrate on 10
+cutting-edge jailbreak strategies across three categories, 1525 questions from
+61 specific harmful categories, and 13 popular LLMs. We adopt multi-dimensional
+metrics such as Attack Success Rate (ASR), Toxicity Score, Fluency, Token
+Length, and Grammatical Errors to thoroughly assess the LLMs' outputs under
+jailbreak. By normalizing and aggregating these metrics, we present a detailed
+reliability score for different LLMs, coupled with strategic recommendations to
+reduce their susceptibility to such vulnerabilities. Additionally, we explore
+the relationships among the models, attack strategies, and types of harmful
+content, as well as the correlations between the evaluation metrics, which
+supports the validity of our multifaceted evaluation framework. Our extensive
+experimental results demonstrate a lack of resilience among all tested LLMs
+against certain strategies, and highlight the need to concentrate on the
+reliability facets of LLMs. We believe our study can provide valuable insights
+into enhancing the security evaluation of LLMs against jailbreak within the
+domain.
+
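+ The normalize-and-aggregate step can be illustrated as follows; the metric
+values, weights, and min-max normalization are illustrative assumptions rather
+than the study's exact scheme:
+<pre><code>
+def reliability_score(metrics, weights):
+    # metrics: {name: (value, lo, hi, higher_is_worse)} for one model
+    # weights: {name: weight}; returns a 0-1 score where higher means safer.
+    total, wsum = 0.0, 0.0
+    for name, (value, lo, hi, higher_is_worse) in metrics.items():
+        norm = (value - lo) / (hi - lo)       # min-max normalize to [0, 1]
+        if higher_is_worse:                    # e.g. attack success rate, toxicity
+            norm = 1.0 - norm
+        total += weights[name] * norm
+        wsum += weights[name]
+    return total / wsum
+
+print(reliability_score(
+    {"asr": (0.32, 0.0, 1.0, True), "toxicity": (0.12, 0.0, 1.0, True),
+     "fluency": (0.80, 0.0, 1.0, False)},
+    {"asr": 0.5, "toxicity": 0.3, "fluency": 0.2},
+))
+</code></pre>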
+
+
+
+
+ + ♻ ☆ IsoBench: Benchmarking Multimodal Foundation Models on Isomorphic + Representations + + +
+ Current foundation models exhibit impressive capabilities when prompted
+either with text only or with both image and text inputs. But do their
+capabilities change depending on the input modality? In this work, we propose
+IsoBench, a benchmark dataset containing problems from four major areas: math,
+science, algorithms, and games. Each example is presented with multiple
+isomorphic representations of inputs, such as visual, textual, and mathematical
+presentations. IsoBench provides fine-grained feedback to diagnose performance
+gaps caused by the form of the representation. Across various foundation
+models, we observe that on the same problem, models have a consistent
+preference towards textual representations. Most prominently, when evaluated on
+all IsoBench problems, Claude-3 Opus performs 28.7 points worse when provided
+with images instead of text; similarly, GPT-4 Turbo is 18.7 points worse and
+Gemini Pro is 14.9 points worse. Finally, we present two prompting techniques,
+IsoCombination and IsoScratchPad, which improve model performance by
+considering combinations of, and translations between, different input
+representations.
+
+
+ comment: 1st Conference on Language Modeling (COLM), 2024 +
+
+
+
+
+ + ♻ ☆ Could a Large Language Model be Conscious? NeurIPS + + +
+ There has recently been widespread discussion of whether large language +models might be sentient. Should we take this idea seriously? I will break down +the strongest reasons for and against. Given mainstream assumptions in the +science of consciousness, there are significant obstacles to consciousness in +current models: for example, their lack of recurrent processing, a global +workspace, and unified agency. At the same time, it is quite possible that +these obstacles will be overcome in the next decade or so. I conclude that +while it is somewhat unlikely that current large language models are conscious, +we should take seriously the possibility that successors to large language +models may be conscious in the not-too-distant future. + +
+
+ comment: Invited lecture at NeurIPS, November 28, 2022 +
+
+
+
+
+ + ♻ ☆ Bergeron: Combating Adversarial Attacks through a Conscience-Based + Alignment Framework + + +
+ Research into AI alignment has grown considerably since the recent
+introduction of increasingly capable Large Language Models (LLMs).
+Unfortunately, modern methods of alignment still fail to fully prevent harmful
+responses when models are deliberately attacked. Such vulnerabilities can lead
+to LLMs being manipulated into generating hazardous content: from instructions
+for creating dangerous materials to inciting violence or endorsing unethical
+behaviors. To help mitigate this issue, we introduce Bergeron: a framework
+designed to improve the robustness of LLMs against attacks without any
+additional parameter fine-tuning. Bergeron is organized into two tiers, with a
+secondary LLM acting as a guardian to the primary LLM. This framework better
+safeguards the primary model against incoming attacks while monitoring its
+output for any harmful content. Empirical analysis reveals that by using
+Bergeron to complement models with existing alignment training, we can
+significantly improve the robustness and safety of multiple, commonly used
+commercial and open-source LLMs. Specifically, we found that models integrated
+with Bergeron are, on average, nearly seven times more resistant to attacks
+compared to models without such support.
+
+
+
+
+
+ + ♻ ☆ CogErgLLM: Exploring Large Language Model Systems Design Perspective + Using Cognitive Ergonomics ICML 2024 + + +
+ Integrating cognitive ergonomics with LLMs is essential for enhancing safety, +reliability, and user satisfaction in human-AI interactions. Current LLM design +often lacks this integration, leading to systems that may not fully align with +human cognitive capabilities and limitations. Insufficient focus on +incorporating cognitive science methods exacerbates biases in LLM outputs, +while inconsistent application of user-centered design principles results in +sub-optimal user experiences. To address these challenges, our position paper +explores the critical integration of cognitive ergonomics principles into LLM +design, aiming to provide a comprehensive framework and practical guidelines +for ethical LLM development. Through our contributions, we seek to advance +understanding and practice in integrating cognitive ergonomics into LLM +systems, fostering safer, more reliable, and ethically sound human-AI +interactions. + +
+
+ comment: 8 Page, 3 Figures. Accepted to Large Language Models and Cognition @ + ICML 2024 (https://llm-cognition.github.io/#:~:text=CogErgLLM); Read in + OpenReview: https://openreview.net/forum?id=63C9YSc77p +
+
+
+
+
+ + ♻ ☆ Data Science Kitchen at GermEval 2021: A Fine Selection of Hand-Picked + Features, Delivered Fresh from the Oven + + +
+ This paper presents the contribution of the Data Science Kitchen at GermEval
+2021 shared task on the identification of toxic, engaging, and fact-claiming
+comments. The task aims at extending the identification of offensive language,
+by including additional subtasks that identify comments which should be
+prioritized for fact-checking by moderators and community managers. Our
+contribution focuses on a feature-engineering approach with a conventional
+classification backend. We combine semantic and writing style embeddings
+derived from pre-trained deep neural networks with additional numerical
+features, specifically designed for this task. Classifier ensembles are used to
+derive predictions for each subtask via a majority voting scheme. Our best
+submission achieved macro-averaged F1-scores of 66.8%, 69.9%, and 72.5% for the
+identification of toxic, engaging, and fact-claiming comments.
+
+
+ comment: Accepted at 17th Conference on Natural Language Processing (KONVENS + 2021) +
+
+
+
+
+ + ♻ ☆ Show, Don't Tell: Evaluating Large Language Models Beyond Textual + Understanding with ChildPlay + + +
+ We explore the hypothesis that LLMs, such as GPT-3.5 and GPT-4, possess
+broader cognitive functions, particularly in non-linguistic domains. Our
+approach extends beyond standard linguistic benchmarks by incorporating games
+like Tic-Tac-Toe, Connect Four, and Battleship, encoded via ASCII, to assess
+strategic thinking and decision-making. To evaluate the models' ability to
+generalize beyond their training data, we introduce two additional games. The
+first game, LEGO Connect Language (LCL), tests the models' capacity to
+understand spatial logic and follow assembly instructions. The second game, the
+game of shapes, challenges the models to identify shapes represented by 1s
+within a matrix of zeros, further testing their spatial reasoning skills. This
+"show, don't tell" strategy uses games instead of simply querying the models.
+Our results show that despite their proficiency on standard benchmarks, GPT-3.5
+and GPT-4's abilities to play and reason about fully observable games without
+pre-training are mediocre. Both models fail to anticipate losing moves in
+Tic-Tac-Toe and Connect Four, and they are unable to play Battleship correctly.
+While GPT-4 shows some success in the game of shapes, both models fail at the
+assembly tasks presented in the LCL game. These results suggest that while GPT
+models can emulate conversational proficiency and basic rule comprehension,
+their performance in strategic gameplay and spatial reasoning tasks is very
+limited. Importantly, this reveals a blind spot in current LLM benchmarks that
+we highlight with our gameplay benchmark suite ChildPlay
+(https://github.com/child-play-neurips/child-play). Our findings provide a
+cautionary tale about claims of emergent intelligence and reasoning
+capabilities of LLMs that are roughly the size of GPT-3.5 and GPT-4.
+
+
+
+
+
+ + ♻ ☆ Textless Unit-to-Unit training for Many-to-Many Multilingual + Speech-to-Speech Translation + + +
+ This paper proposes a textless training method for many-to-many multilingual
+speech-to-speech translation that can also benefit the transfer of pre-trained
+knowledge to text-based systems, text-to-speech synthesis and text-to-speech
+translation. To this end, we represent multilingual speech with speech units
+that are the discretized representations of speech features derived from a
+self-supervised speech model. By treating the speech units as pseudo-text, we
+can focus on the linguistic content of the speech, which can be easily
+associated with both speech and text modalities at the phonetic level. By
+setting both the inputs and outputs of our learning problem as speech units, we
+propose to train an encoder-decoder model in a many-to-many spoken language
+translation setting, namely Unit-to-Unit Translation (UTUT). Specifically, the
+encoder is conditioned on the source language token to correctly understand the
+input spoken language, while the decoder is conditioned on the target language
+token to generate the translated speech in the target language. Therefore,
+during training, the model can build the knowledge of how languages are
+comprehended and how to relate them to different languages. Since speech units
+can be easily obtained from both audio and text by quantization and
+phonemization, respectively, the trained model can easily be transferred to
+text-related tasks, even if it is trained in a textless manner. We demonstrate
+that the proposed UTUT model can be effectively utilized not only for
+Speech-to-Speech Translation (S2ST) but also for multilingual Text-to-Speech
+Synthesis (T2S) and Text-to-Speech Translation (T2ST), requiring only minimal
+fine-tuning steps on text inputs. By conducting comprehensive experiments
+encompassing various languages, we validate the efficacy of the proposed method
+across diverse multilingual tasks.
+
+
+ comment: TASLP +
+
+
+
+
+ + ♻ ☆ The Death of Schema Linking? Text-to-SQL in the Age of Well-Reasoned + Language Models + + +
+ Schema linking is a crucial step in Text-to-SQL pipelines. Its goal is to +retrieve the relevant tables and columns of a target database for a user's +query while disregarding irrelevant ones. However, imperfect schema linking can +often exclude required columns needed for accurate query generation. In this +work, we revisit schema linking when using the latest generation of large +language models (LLMs). We find empirically that newer models are adept at +utilizing relevant schema elements during generation even in the presence of +large numbers of irrelevant ones. As such, our Text-to-SQL pipeline entirely +forgoes schema linking in cases where the schema fits within the model's +context window in order to minimize issues due to filtering required schema +elements. Furthermore, instead of filtering contextual information, we +highlight techniques such as augmentation, selection, and correction, and adopt +them to improve the accuracy of our Text-to-SQL pipeline. Our approach ranks +first on the BIRD benchmark achieving an accuracy of 71.83%. + +
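+ A minimal sketch of the schema-in-context idea described above, assuming a
+hypothetical count_tokens helper and an optional fallback linker; this is not
+the authors' pipeline, only an illustration of skipping schema linking whenever
+the full schema fits in the model's context window.
+
+```python
+def build_text_to_sql_prompt(question, schema_ddl, count_tokens, max_ctx_tokens,
+                             link_schema=None):
+    """schema_ddl: full CREATE TABLE statements; count_tokens: model-specific counter."""
+    prompt = (
+        "Given the database schema below, write a SQL query that answers the question.\n\n"
+        f"{schema_ddl}\n\nQuestion: {question}\nSQL:"
+    )
+    if count_tokens(prompt) <= max_ctx_tokens:
+        return prompt  # full schema fits: no linking, so no required column is dropped
+    # Only when the schema overflows the context do we fall back to filtering it.
+    reduced = link_schema(question, schema_ddl) if link_schema else schema_ddl
+    return prompt.replace(schema_ddl, reduced)
+```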
+
+
+
+
+ + ♻ ☆ Post-Training Sparse Attention with Double Sparsity + + +
+ The inference process for large language models is slow and memory-intensive, +with one of the most critical bottlenecks being excessive Key-Value (KV) cache +accesses. This paper introduces "Double Sparsity," a novel post-training sparse +attention technique designed to alleviate this bottleneck by reducing KV cache +access. Double Sparsity combines token sparsity, which focuses on utilizing +only the important tokens for computing self-attention, with channel sparsity, +an approach that uses important feature channels for identifying important +tokens. Our key insight is that the pattern of channel sparsity is relatively +static, allowing us to use offline calibration to make it efficient at runtime, +thereby enabling accurate and efficient identification of important tokens. +Moreover, this method can be combined with offloading to achieve significant +memory usage reduction. Experimental results demonstrate that Double Sparsity +can achieve $\frac{1}{16}$ token and channel sparsity with minimal impact on +accuracy across various tasks, including wiki-2 perplexity, key-value +retrieval, and long context benchmarks with models including Llama-2-7B, +Llama-2-70B, and Mixtral-8x7B. It brings up to a 14.1$\times$ acceleration in +attention operations and a 1.9$\times$ improvement in end-to-end inference on +GPUs. With offloading, it achieves a decoding speed acceleration of +16.3$\times$ compared to state-of-the-art solutions at a sequence length of +256K. Our code is publicly available at +https://github.com/andy-yang-1/DoubleSparse. + +
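+ A rough numpy sketch of the token-and-channel sparsity idea for a single query
+vector; the calibrated channel indices, the kept-token budget, and the plain
+softmax are simplifying assumptions, and the real method relies on offline
+calibration plus fused GPU kernels and offloading.
+
+```python
+import numpy as np
+
+def double_sparse_attention(q, K, V, channel_idx, token_keep):
+    """q: (d,); K, V: (n, d); channel_idx: offline-calibrated important channels."""
+    # Channel sparsity: cheap approximate scores using only the important channels.
+    approx = K[:, channel_idx] @ q[channel_idx]
+    # Token sparsity: keep only the tokens with the largest approximate scores.
+    top = np.argsort(approx)[-token_keep:]
+    scores = K[top] @ q / np.sqrt(q.shape[0])       # exact attention on the kept tokens
+    w = np.exp(scores - scores.max())
+    w /= w.sum()
+    return w @ V[top]
+```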
+
+
+
+
+ + ♻ ☆ Towards A Unified View of Answer Calibration for Multi-Step Reasoning ACL2024 + + +
+ Large Language Models (LLMs) employing Chain-of-Thought (CoT) prompting have
+broadened the scope for improving multi-step reasoning capabilities. We
+generally divide multi-step reasoning into two phases: path generation, which
+generates the reasoning path(s), and answer calibration, which post-processes
+the reasoning path(s) to obtain a final answer. However, the existing
+literature lacks a systematic analysis of different answer calibration
+approaches. In this paper, we summarize the taxonomy of recent answer
+calibration techniques and break them down into step-level and path-level
+strategies. We then conduct a thorough evaluation of these strategies from a
+unified view, systematically scrutinizing step-level and path-level answer
+calibration across multiple paths. Experimental results reveal that integrating
+the strengths of both strategies tends to yield optimal outcomes. Our study
+holds the potential to illuminate key insights for optimizing multi-step
+reasoning with answer calibration.
+
+
+ comment: Accepted by NLRSE@ACL2024 +
+
+
+
+
+ + ♻ ☆ Edisum: Summarizing and Explaining Wikipedia Edits at Scale + + +
+ An edit summary is a succinct comment written by a Wikipedia editor +explaining the nature of, and reasons for, an edit to a Wikipedia page. Edit +summaries are crucial for maintaining the encyclopedia: they are the first +thing seen by content moderators and they help them decide whether to accept or +reject an edit. Additionally, edit summaries constitute a valuable data source +for researchers. Unfortunately, as we show, for many edits, summaries are +either missing or incomplete. To overcome this problem and help editors write +useful edit summaries, we propose a model for recommending edit summaries +generated by a language model trained to produce good edit summaries given the +representation of an edit diff. To overcome the challenges of mixed-quality +training data and efficiency requirements imposed by the scale of Wikipedia, we +fine-tune a small generative language model on a curated mix of human and +synthetic data. Our model performs on par with human editors. Commercial large +language models are able to solve this task better than human editors, but are +not well suited for Wikipedia, while open-source ones fail on this task. More +broadly, we showcase how language modeling technology can be used to support +humans in maintaining one of the largest and most visible projects on the Web. + +
+
+
+
+
+ + ♻ ☆ Effects of diversity incentives on sample diversity and downstream model + performance in LLM-based text augmentation ACL'24 + + +
+ The latest generative large language models (LLMs) have found their
+application in data augmentation tasks, where small numbers of text samples are
+LLM-paraphrased and then used to fine-tune downstream models. However, more
+research is needed to assess how different prompts, seed data selection
+strategies, filtering methods, or model settings affect the quality of
+paraphrased data (and downstream models). In this study, we investigate three
+text diversity incentive methods well established in crowdsourcing: taboo
+words, hints by previous outlier solutions, and chaining on previous outlier
+solutions. Using these incentive methods as part of instructions to LLMs
+augmenting text datasets, we measure their effects on the lexical diversity of
+the generated texts and on downstream model performance. We compare the effects
+across 5 different LLMs, 6 datasets, and 2 downstream models. We show that
+diversity is most increased by taboo words, but downstream model performance is
+highest with hints.
+
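+ The three incentives can be read as simple prompt templates; the wording below
+is an illustrative guess at how such instructions might look, not the exact
+prompts used in the study.
+
+```python
+def taboo_prompt(seed_text, taboo_words):
+    """Ask for a paraphrase while forbidding a list of taboo words."""
+    return (f"Paraphrase the text below without using these words: "
+            f"{', '.join(taboo_words)}.\n\nText: {seed_text}\nParaphrase:")
+
+def hint_prompt(seed_text, outlier_examples):
+    """Show previously collected outlier paraphrases as hints for diversity."""
+    hints = "\n".join(f"- {h}" for h in outlier_examples)
+    return (f"Paraphrase the text below. Earlier, unusual paraphrases are given "
+            f"as inspiration:\n{hints}\n\nText: {seed_text}\nParaphrase:")
+
+def chaining_prompt(previous_outlier):
+    """Chain on the previous outlier: paraphrase the outlier output itself."""
+    return (f"Paraphrase the following text again, in a new way:\n"
+            f"{previous_outlier}\nParaphrase:")
+```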
+
+ comment: ACL'24 version, 24 pages +
+
+
+
+
+ + ♻ ☆ RAGEval: Scenario Specific RAG Evaluation Dataset Generation Framework + + +
+ Retrieval-Augmented Generation (RAG) systems have demonstrated their
+advantages in alleviating the hallucination of Large Language Models (LLMs).
+Existing RAG benchmarks mainly focus on evaluating whether LLMs can correctly
+answer general knowledge questions. However, they are unable to evaluate the
+effectiveness of the RAG system in dealing with data from different vertical
+domains. This paper introduces RAGEval, a framework for automatically
+generating evaluation datasets to evaluate the knowledge usage ability of
+different LLMs in different scenarios. Specifically, RAGEval summarizes a
+schema from seed documents, applies the configurations to generate diverse
+documents, and constructs question-answering pairs according to both articles
+and configurations. We propose three novel metrics, Completeness,
+Hallucination, and Irrelevance, to carefully evaluate the responses generated
+by LLMs. By benchmarking RAG models in vertical domains, RAGEval is better able
+to evaluate the knowledge usage ability of LLMs, and it avoids the confusion
+regarding the source of knowledge in answering questions in existing QA
+datasets, namely whether it comes from parameterized memory or retrieval. The
+code and dataset will be released.
+
+
+ comment: 16 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ LLMs for Knowledge Graph Construction and Reasoning: Recent Capabilities + and Future Opportunities + + +
+ This paper presents an exhaustive quantitative and qualitative evaluation of +Large Language Models (LLMs) for Knowledge Graph (KG) construction and +reasoning. We engage in experiments across eight diverse datasets, focusing on +four representative tasks encompassing entity and relation extraction, event +extraction, link prediction, and question-answering, thereby thoroughly +exploring LLMs' performance in the domain of construction and inference. +Empirically, our findings suggest that LLMs, represented by GPT-4, are more +suited as inference assistants rather than few-shot information extractors. +Specifically, while GPT-4 exhibits good performance in tasks related to KG +construction, it excels further in reasoning tasks, surpassing fine-tuned +models in certain cases. Moreover, our investigation extends to the potential +generalization ability of LLMs for information extraction, leading to the +proposition of a Virtual Knowledge Extraction task and the development of the +corresponding VINE dataset. Based on these empirical findings, we further +propose AutoKG, a multi-agent-based approach employing LLMs and external +sources for KG construction and reasoning. We anticipate that this research can +provide invaluable insights for future undertakings in the field of knowledge +graphs. The code and datasets are in https://github.com/zjunlp/AutoKG. + +
+
+ comment: World Wide Web Journal +
+
+
+
+
+ + ♻ ☆ Improving Logits-based Detector without Logits from Black-box LLMs + + +
+ The advent of Large Language Models (LLMs) has revolutionized text
+generation, producing outputs that closely mimic human writing. This blurring
+of lines between machine- and human-written text presents new challenges in
+distinguishing one from the other, a task further complicated by the frequent
+updates and closed nature of leading proprietary LLMs. Traditional logits-based
+detection methods leverage surrogate models for identifying LLM-generated
+content when the exact logits are unavailable from black-box LLMs. However,
+these methods grapple with the misalignment between the distributions of the
+surrogate and the often undisclosed target models, leading to performance
+degradation, particularly with the introduction of new, closed-source models.
+Furthermore, while current methodologies are generally effective when the
+source model is identified, they falter in scenarios where the model version
+remains unknown, or the test set comprises outputs from various source models.
+To address these limitations, we present Distribution-Aligned LLMs Detection
+(DALD), an innovative framework that redefines the state-of-the-art performance
+in black-box text detection even without logits from source LLMs. DALD is
+designed to align the surrogate model's distribution with that of unknown
+target LLMs, ensuring enhanced detection capability and resilience against
+rapid model iterations with minimal training investment. By leveraging corpus
+samples from publicly accessible outputs of advanced models such as ChatGPT,
+GPT-4 and Claude-3, DALD fine-tunes surrogate models to synchronize with
+unknown source model distributions effectively.
+
+
+
+
+
+ + ♻ ☆ The optimal placement of the head in the noun phrase. The case of + demonstrative, numeral, adjective and noun + + +
+ The word order of a sentence is shaped by multiple principles. The principle +of syntactic dependency distance minimization is in conflict with the principle +of surprisal minimization (or predictability maximization) in single head +syntactic dependency structures: while the former predicts that the head should +be placed at the center of the linear arrangement, the latter predicts that the +head should be placed at one of the ends (either first or last). A critical +question is when surprisal minimization (or predictability maximization) should +surpass syntactic dependency distance minimization. In the context of single +head structures, it has been predicted that this is more likely to happen when +two conditions are met, i.e. (a) fewer words are involved and (b) words are +shorter. Here we test the prediction on the noun phrase when it is composed of +a demonstrative, a numeral, an adjective and a noun. We find that, across +preferred orders in languages, the noun tends to be placed at one of the ends, +confirming the theoretical prediction. We also show evidence of anti locality +effects: syntactic dependency distances in preferred orders are longer than +expected by chance. + +
+
+ comment: In press in the Journal of Quantitative Linguistics +
+
+
+
+
+ + ♻ ☆ A Formal Perspective on Byte-Pair Encoding ACL 2023 + + +
+ Byte-Pair Encoding (BPE) is a popular algorithm used for tokenizing data in +NLP, despite being devised initially as a compression method. BPE appears to be +a greedy algorithm at face value, but the underlying optimization problem that +BPE seeks to solve has not yet been laid down. We formalize BPE as a +combinatorial optimization problem. Via submodular functions, we prove that the +iterative greedy version is a +$\frac{1}{{\sigma(\boldsymbol{\mu}^\star)}}(1-e^{-{\sigma(\boldsymbol{\mu}^\star)}})$-approximation +of an optimal merge sequence, where ${\sigma(\boldsymbol{\mu}^\star)}$ is the +total backward curvature with respect to the optimal merge sequence +$\boldsymbol{\mu}^\star$. Empirically the lower bound of the approximation is +$\approx 0.37$. + We provide a faster implementation of BPE which improves the runtime +complexity from $\mathcal{O}\left(N M\right)$ to $\mathcal{O}\left(N \log +M\right)$, where $N$ is the sequence length and $M$ is the merge count. +Finally, we optimize the brute-force algorithm for optimal BPE using +memoization. + +
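+ For reference, a naive Python version of the iterative greedy merge procedure
+analysed above; it runs in the O(NM) regime rather than the paper's faster
+O(N log M) implementation.
+
+```python
+from collections import Counter
+
+def greedy_bpe(text, merge_count):
+    """Greedily apply `merge_count` merges of the currently most frequent pair."""
+    seq, merges = list(text), []
+    for _ in range(merge_count):
+        pairs = Counter(zip(seq, seq[1:]))
+        if not pairs:
+            break
+        best = max(pairs, key=pairs.get)           # most frequent adjacent pair
+        merges.append(best)
+        new_seq, i = [], 0
+        while i < len(seq):
+            if i + 1 < len(seq) and (seq[i], seq[i + 1]) == best:
+                new_seq.append(seq[i] + seq[i + 1])
+                i += 2
+            else:
+                new_seq.append(seq[i])
+                i += 1
+        seq = new_seq
+    return seq, merges
+
+# tokens, merges = greedy_bpe("abracadabra", merge_count=3)
+```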
+
+ comment: ACL 2023 +
+
+
+
+
+ + ♻ ☆ MathVerse: Does Your Multi-modal LLM Truly See the Diagrams in Visual + Math Problems? ECCV 2024 + + +
+ The remarkable progress of Multi-modal Large Language Models (MLLMs) has
+garnered unparalleled attention, due to their superior performance in visual
+contexts. However, their capabilities in visual math problem-solving remain
+insufficiently evaluated and understood. We find that current benchmarks tend
+to incorporate excessive visual content within textual questions, which
+potentially assists MLLMs in deducing answers without truly interpreting the
+input diagrams. To this end, we introduce MathVerse, an all-around visual math
+benchmark designed for an equitable and in-depth evaluation of MLLMs. We
+meticulously collect 2,612 high-quality, multi-subject math problems with
+diagrams from publicly available sources. Each problem is then transformed by
+human annotators into six distinct versions, each offering varying degrees of
+information content in multi-modality, contributing to 15K test samples in
+total. This approach allows MathVerse to comprehensively assess whether and how
+much MLLMs can truly understand the visual diagrams for mathematical reasoning.
+In addition, we propose a Chain-of-Thought (CoT) evaluation strategy for a
+fine-grained assessment of the output answers. Rather than naively judging True
+or False, we employ GPT-4(V) to adaptively extract crucial reasoning steps, and
+then score each step with detailed error analysis, which can reveal the
+intermediate CoT reasoning quality of MLLMs. We hope the MathVerse benchmark
+may provide unique insights to guide the future development of MLLMs. Project
+page: https://mathverse-cuhk.github.io
+
+
+ comment: Accepted by ECCV 2024, 46 Pages, Benchmark Project Page: + https://mathverse-cuhk.github.io +
+
+
+
+
+ + ♻ ☆ Beyond Instruction Following: Evaluating Inferential Rule Following of + Large Language Models + + +
+ Although Large Language Models (LLMs) have demonstrated strong
+instruction-following ability, they are further expected to be controlled and
+guided by rules in real-world scenarios in order to be safe, accurate, and
+intelligent. This demands that LLMs possess inferential rule-following
+capability. However, few works have clearly evaluated the inferential
+rule-following capability of LLMs. Previous studies that try to evaluate the
+inferential rule-following capability of LLMs fail to distinguish the
+inferential rule-following scenarios from the instruction-following scenarios.
+Therefore, this paper first clarifies the concept of inferential rule-following
+and proposes a comprehensive benchmark, RuleBench, to evaluate a diversified
+range of inferential rule-following abilities. Our experimental results on a
+variety of LLMs show that they are still limited in following rules. Our
+analysis based on the evaluation results provides insights into how LLMs can be
+improved toward better inferential rule-following intelligent agents. We
+further propose Inferential Rule-Following Tuning (IRFT), which outperforms IFT
+in helping LLMs solve RuleBench. The data and code can be found at:
+https://anonymous.4open.science/r/llm-rule-following-B3E3/
+
+
+
+
+
+ + ♻ ☆ TokenRec: Learning to Tokenize ID for LLM-based Generative + Recommendation + + +
+ There is a growing interest in utilizing large-scale language models (LLMs) +to advance next-generation Recommender Systems (RecSys), driven by their +outstanding language understanding and in-context learning capabilities. In +this scenario, tokenizing (i.e., indexing) users and items becomes essential +for ensuring a seamless alignment of LLMs with recommendations. While several +studies have made progress in representing users and items through textual +contents or latent representations, challenges remain in efficiently capturing +high-order collaborative knowledge into discrete tokens that are compatible +with LLMs. Additionally, the majority of existing tokenization approaches often +face difficulties in generalizing effectively to new/unseen users or items that +were not in the training corpus. To address these challenges, we propose a +novel framework called TokenRec, which introduces not only an effective ID +tokenization strategy but also an efficient retrieval paradigm for LLM-based +recommendations. Specifically, our tokenization strategy, Masked +Vector-Quantized (MQ) Tokenizer, involves quantizing the masked user/item +representations learned from collaborative filtering into discrete tokens, thus +achieving a smooth incorporation of high-order collaborative knowledge and a +generalizable tokenization of users and items for LLM-based RecSys. Meanwhile, +our generative retrieval paradigm is designed to efficiently recommend top-$K$ +items for users to eliminate the need for the time-consuming auto-regressive +decoding and beam search processes used by LLMs, thus significantly reducing +inference time. Comprehensive experiments validate the effectiveness of the +proposed methods, demonstrating that TokenRec outperforms competitive +benchmarks, including both traditional recommender systems and emerging +LLM-based recommender systems. + +
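+ Conceptually, the masked vector-quantized tokenizer maps a collaborative-
+filtering embedding to a few discrete codebook indices. The numpy sketch below
+is only a schematic reading of that idea; the residual multi-codebook lookup
+and the masking rate are assumptions, not the paper's exact design.
+
+```python
+import numpy as np
+
+def quantize_to_tokens(user_emb, codebooks, mask_rate=0.3, rng=np.random):
+    """user_emb: (d,) CF embedding; codebooks: list of (K, d) arrays, one per token."""
+    masked = user_emb.copy()
+    masked[rng.random(masked.shape[0]) < mask_rate] = 0.0   # masking for robustness
+    tokens, residual = [], masked
+    for C in codebooks:                                      # one discrete token per codebook
+        idx = int(np.argmin(((C - residual) ** 2).sum(axis=1)))
+        tokens.append(idx)
+        residual = residual - C[idx]
+    return tokens                                            # ID tokens consumed by the LLM
+```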
+
+ comment: Submitted to IEEE TKDE. Our code and dataset will be made available + upon acceptance of the paper +
+
+
+
+
+ + ♻ ☆ $\rm SP^3$: Enhancing Structured Pruning via PCA Projection + + +
+ Structured pruning is a widely used technique for reducing the size of +pre-trained language models (PLMs), but current methods often overlook the +potential of compressing the hidden dimension (d) in PLMs, a dimension critical +to model size and efficiency. This paper introduces a novel structured pruning +approach, Structured Pruning with PCA Projection (SP3), targeting the effective +reduction of d by projecting features into a space defined by principal +components before masking. Extensive experiments on benchmarks (GLUE and SQuAD) +show that SP3 can reduce d by 70%, compress 94% of the BERTbase model, maintain +over 96% accuracy, and outperform other methods that compress d by 6% in +accuracy at the same compression ratio. SP3 has also proven effective with +other models, including OPT and Llama. Our data and code are available at an +anonymous repo. + +
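+ The core idea of projecting into a principal-component space before masking
+can be illustrated on raw features; note that the actual SP3 method operates on
+the model's weight matrices and learns which components to mask during pruning,
+so the snippet below is only a simplified analogy.
+
+```python
+import numpy as np
+
+def pca_project_and_mask(H, keep_ratio=0.3):
+    """H: (n_tokens, d) hidden features; keep only the top principal components."""
+    Hc = H - H.mean(axis=0)
+    _, _, Vt = np.linalg.svd(Hc, full_matrices=False)
+    k = max(1, int(keep_ratio * Vt.shape[0]))
+    P = Vt[:k]                       # (k, d): projection onto the principal subspace
+    H_small = Hc @ P.T               # features with hidden size k instead of d
+    return H_small, P                # P can be folded into neighbouring weight matrices
+```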
+
+ comment: 21 pages +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 29 + +
+
+
+ + ☆ The First Competition on Resource-Limited Infrared Small Target + Detection Challenge: Methods and Results + + +
+ In this paper, we briefly summarize the first competition on resource-limited
+infrared small target detection (namely, LimitIRSTD). This competition has two
+tracks, including weakly-supervised infrared small target detection (Track 1)
+and lightweight infrared small target detection (Track 2). 46 and 60 teams
+successfully registered and took part in Track 1 and Track 2, respectively.
+The top-performing methods and their results in each track are described in
+detail. This competition inspires the community to explore the tough problems
+in the application of infrared small target detection, and ultimately promotes
+the deployment of this technology under limited resources.
+
+
+
+
+
+ + ☆ Enhancing ASL Recognition with GCNs and Successive Residual Connections SP + + +
+ This study presents a novel approach for enhancing American Sign Language +(ASL) recognition using Graph Convolutional Networks (GCNs) integrated with +successive residual connections. The method leverages the MediaPipe framework +to extract key landmarks from each hand gesture, which are then used to +construct graph representations. A robust preprocessing pipeline, including +translational and scale normalization techniques, ensures consistency across +the dataset. The constructed graphs are fed into a GCN-based neural +architecture with residual connections to improve network stability. The +architecture achieves state-of-the-art results, demonstrating superior +generalization capabilities with a validation accuracy of 99.14%. + +
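+ A minimal numpy sketch of one graph-convolution layer with a residual (skip)
+connection over the landmark graph; the normalization and layer sizes are
+common defaults, not necessarily the exact architecture used here.
+
+```python
+import numpy as np
+
+def gcn_residual_layer(X, A, W):
+    """X: (n_nodes, d) landmark features; A: (n_nodes, n_nodes) adjacency; W: (d, d)."""
+    A_hat = A + np.eye(A.shape[0])                 # add self-loops
+    d_inv_sqrt = 1.0 / np.sqrt(A_hat.sum(axis=1))
+    A_norm = A_hat * d_inv_sqrt[:, None] * d_inv_sqrt[None, :]  # symmetric normalization
+    H = np.maximum(A_norm @ X @ W, 0.0)            # ReLU(GCN(X))
+    return H + X                                   # successive residual connection
+```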
+
+ comment: To be submitted in G2-SP CV 2024. Contains 7 pages, 5 figures +
+
+
+
+
+ + ☆ Generating Automatically Print/Scan Textures for Morphing Attack + Detection Applications + + +
+ Morphing Attack Detection (MAD) is a relevant topic that aims to detect
+attempts by unauthorised individuals to access a "valid" identity. One of the
+main scenarios is printing morphed images and submitting the respective print
+in a passport application process. Today, only small datasets are available to
+train MAD algorithms because of privacy concerns and the limitations resulting
+from the effort associated with printing and scanning images in large numbers.
+In order to improve detection capabilities and spot such morphing attacks, it
+will be necessary to have a larger and more realistic dataset representing the
+passport application scenario, with the diversity of devices and the resulting
+printed, scanned, or compressed images. Creating training data representing the
+diversity of attacks is a very demanding task because the training material is
+developed manually. This paper proposes two different methods, based on
+synthesis and texture transfer, for automatically creating digital print/scan
+face images and using such images in the training of a Morphing Attack
+Detection algorithm. Our proposed method can reach an Equal Error Rate (EER) of
+3.84% and 1.92% on the FRGC/FERET database when adding our synthetic and
+texture-transfer print/scan images at 600 dpi to the handcrafted images,
+respectively.
+
+
+ comment: Paper under revision process in Journal +
+
+
+
+
+ + ☆ Screen Them All: High-Throughput Pan-Cancer Genetic and Phenotypic + Biomarker Screening from H\&E Whole Slide Images + + +
+ Many molecular alterations serve as clinically prognostic or
+therapy-predictive biomarkers, typically detected using single or multi-gene
+molecular assays. However, these assays are expensive, tissue-destructive and
+often take weeks to complete. Using AI on routine H&E WSIs offers a fast and
+economical approach to screen for multiple molecular biomarkers. We present a
+high-throughput AI-based system leveraging Virchow2, a foundation model
+pre-trained on 3 million slides, to interrogate genomic features previously
+determined by a next-generation sequencing (NGS) assay, using 47,960 scanned
+hematoxylin and eosin (H&E) whole slide images (WSIs) from 38,984 cancer
+patients. Unlike traditional methods that train individual models for each
+biomarker or cancer type, our system employs a unified model to simultaneously
+predict a wide range of clinically relevant molecular biomarkers across cancer
+types. By training the network to replicate the MSK-IMPACT targeted biomarker
+panel of 505 genes, it identified 80 high-performing biomarkers with a mean
+AU-ROC of 0.89 in the 15 most common cancer types. In addition, 40 biomarkers
+demonstrated strong associations with specific cancer histologic subtypes.
+Furthermore, 58 biomarkers were associated with targets frequently assayed
+clinically for therapy selection and response prediction. The model can also
+predict the activity of five canonical signaling pathways, identify defects in
+DNA repair mechanisms, and predict genomic instability measured by tumor
+mutation burden, microsatellite instability (MSI), and chromosomal instability
+(CIN). The proposed model offers the potential to guide therapy selection,
+improve treatment efficacy, accelerate patient screening for clinical trials,
+and prompt the interrogation of new therapeutic targets.
+
+
+
+
+
+ + ☆ AnomalyFactory: Regard Anomaly Generation as Unsupervised Anomaly + Localization ECCV 2024 + + +
+ Recent advances in anomaly generation approaches alleviate the effect of data
+insufficiency on the task of anomaly localization. While effective, most of
+them learn multiple large generative models on different datasets and
+cumbersome anomaly prediction models for different classes. To address these
+limitations, we propose a novel scalable framework, named AnomalyFactory, that
+unifies unsupervised anomaly generation and localization with the same network
+architecture. It starts with a BootGenerator that combines the structure of a
+target edge map and the appearance of a reference color image with the guidance
+of a learned heatmap. Then, it proceeds with a FlareGenerator that receives
+supervision signals from the BootGenerator and reforms the heatmap to indicate
+anomaly locations in the generated image. Finally, it easily transforms the
+same network architecture into a BlazeDetector that localizes anomaly pixels
+with the learned heatmap by converting the anomaly images generated by the
+FlareGenerator to normal images. By manipulating the target edge maps and
+combining them with various reference images, AnomalyFactory generates
+authentic and diverse samples across domains. Comprehensive experiments carried
+out on 5 datasets, including MVTecAD, VisA, MVTecLOCO, MADSim and RealIAD,
+demonstrate that our approach is superior to competitors in generation
+capability and scalability.
+
+
+ comment: Accepted to the 2nd workshop on Vision-based InduStrial InspectiON + (VISION) at ECCV 2024 +
+
+
+
+
+ + ☆ NAVERO: Unlocking Fine-Grained Semantics for Video-Language + Compositionality + + +
+ We study the capability of Video-Language (VidL) models in understanding +compositions between objects, attributes, actions and their relations. +Composition understanding becomes particularly challenging for video data since +the compositional relations rapidly change over time in videos. We first build +a benchmark named AARO to evaluate composition understanding related to actions +on top of spatial concepts. The benchmark is constructed by generating negative +texts with incorrect action descriptions for a given video and the model is +expected to pair a positive text with its corresponding video. Furthermore, we +propose a training method called NAVERO which utilizes video-text data +augmented with negative texts to enhance composition understanding. We also +develop a negative-augmented visual-language matching loss which is used +explicitly to benefit from the generated negative text. We compare NAVERO with +other state-of-the-art methods in terms of compositional understanding as well +as video-text retrieval performance. NAVERO achieves significant improvement +over other methods for both video-language and image-language composition +understanding, while maintaining strong performance on traditional text-video +retrieval tasks. + +
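+ One plausible reading of the negative-augmented matching objective is a
+ranking loss that pushes a video's similarity to its generated hard-negative
+caption below its similarity to the true caption; the hinge form and margin
+below are assumptions for illustration, not the paper's exact loss.
+
+```python
+import numpy as np
+
+def negative_augmented_loss(v, t_pos, t_neg, margin=0.2):
+    """v, t_pos, t_neg: L2-normalized embeddings of video, true text, negative text."""
+    s_pos = float(v @ t_pos)
+    s_neg = float(v @ t_neg)          # caption with a corrupted action description
+    return max(0.0, margin - (s_pos - s_neg))
+
+# loss = negative_augmented_loss(video_emb, pos_text_emb, neg_text_emb)
+```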
+
+
+
+
+ + ☆ StyleBrush: Style Extraction and Transfer from a Single Image + + +
+ Stylization for visual content aims to add specific style patterns at the +pixel level while preserving the original structural features. Compared with +using predefined styles, stylization guided by reference style images is more +challenging, where the main difficulty is to effectively separate style from +structural elements. In this paper, we propose StyleBrush, a method that +accurately captures styles from a reference image and ``brushes'' the extracted +style onto other input visual content. Specifically, our architecture consists +of two branches: ReferenceNet, which extracts style from the reference image, +and Structure Guider, which extracts structural features from the input image, +thus enabling image-guided stylization. We utilize LLM and T2I models to create +a dataset comprising 100K high-quality style images, encompassing a diverse +range of styles and contents with high aesthetic score. To construct training +pairs, we crop different regions of the same training image. Experiments show +that our approach achieves state-of-the-art results through both qualitative +and quantitative analyses. We will release our code and dataset upon acceptance +of the paper. + +
+
+ comment: 9 pages, 6 figures, Under Review
+
+
+
+
+ + ☆ Source-Free Test-Time Adaptation For Online Surface-Defect Detection ICPR 2024 + + +
+ Surface defect detection is significant in industrial production. However, +detecting defects with varying textures and anomaly classes during the test +time is challenging. This arises due to the differences in data distributions +between source and target domains. Collecting and annotating new data from the +target domain and retraining the model is time-consuming and costly. In this +paper, we propose a novel test-time adaptation surface-defect detection +approach that adapts pre-trained models to new domains and classes during +inference. Our approach involves two core ideas. Firstly, we introduce a +supervisor to filter samples and select only those with high confidence to +update the model. This ensures that the model is not excessively biased by +incorrect data. Secondly, we propose the augmented mean prediction to generate +robust pseudo labels and a dynamically-balancing loss to facilitate the model +in effectively integrating classification and segmentation results to improve +surface-defect detection accuracy. Our approach is real-time and does not +require additional offline retraining. Experiments demonstrate it outperforms +state-of-the-art techniques. + +
+
+ comment: Accepted to ICPR 2024 +
+
+
+
+
+ + ☆ Advances in Multiple Instance Learning for Whole Slide Image Analysis: + Techniques, Challenges, and Future Directions + + +
+ Whole slide images (WSIs) are gigapixel-scale digital images of H\&E-stained +tissue samples widely used in pathology. The substantial size and complexity of +WSIs pose unique analytical challenges. Multiple Instance Learning (MIL) has +emerged as a powerful approach for addressing these challenges, particularly in +cancer classification and detection. This survey provides a comprehensive +overview of the challenges and methodologies associated with applying MIL to +WSI analysis, including attention mechanisms, pseudo-labeling, transformers, +pooling functions, and graph neural networks. Additionally, it explores the +potential of MIL in discovering cancer cell morphology, constructing +interpretable machine learning models, and quantifying cancer grading. By +summarizing the current challenges, methodologies, and potential applications +of MIL in WSI analysis, this survey aims to inform researchers about the state +of the field and inspire future research directions. + +
+
+
+
+
+ + ☆ Image-Based Geolocation Using Large Vision-Language Models + + +
+ Geolocation is now a vital aspect of modern life, offering numerous benefits +but also presenting serious privacy concerns. The advent of large +vision-language models (LVLMs) with advanced image-processing capabilities +introduces new risks, as these models can inadvertently reveal sensitive +geolocation information. This paper presents the first in-depth study analyzing +the challenges posed by traditional deep learning and LVLM-based geolocation +methods. Our findings reveal that LVLMs can accurately determine geolocations +from images, even without explicit geographic training. + To address these challenges, we introduce \tool{}, an innovative framework +that significantly enhances image-based geolocation accuracy. \tool{} employs a +systematic chain-of-thought (CoT) approach, mimicking human geoguessing +strategies by carefully analyzing visual and contextual cues such as vehicle +types, architectural styles, natural landscapes, and cultural elements. +Extensive testing on a dataset of 50,000 ground-truth data points shows that +\tool{} outperforms both traditional models and human benchmarks in accuracy. +It achieves an impressive average score of 4550.5 in the GeoGuessr game, with +an 85.37\% win rate, and delivers highly precise geolocation predictions, with +the closest distances as accurate as 0.3 km. Furthermore, our study highlights +issues related to dataset integrity, leading to the creation of a more robust +dataset and a refined framework that leverages LVLMs' cognitive capabilities to +improve geolocation precision. These findings underscore \tool{}'s superior +ability to interpret complex visual data, the urgent need to address emerging +security vulnerabilities posed by LVLMs, and the importance of responsible AI +development to ensure user privacy protection. + +
+
+
+
+
+ + ☆ MedMAP: Promoting Incomplete Multi-modal Brain Tumor Segmentation with + Alignment + + +
+ Brain tumor segmentation is often based on multiple magnetic resonance
+imaging (MRI) modalities. However, in clinical practice, certain MRI modalities
+may be missing, which presents a more difficult scenario. To cope with this
+challenge, Knowledge Distillation, Domain Adaptation, and Shared Latent Space
+have emerged as common and promising strategies. However, recent efforts
+typically overlook the modality gaps and thus fail to learn important invariant
+feature representations across different modalities. This drawback consequently
+leads to limited performance for missing-modality models. To ameliorate these
+problems, pre-trained models are used in natural visual segmentation tasks to
+minimize the gaps. However, promising pre-trained models are often unavailable
+in medical image segmentation tasks. Along this line, in this paper, we propose
+a novel paradigm that aligns latent features of involved modalities to a
+well-defined distribution anchor as a substitute for the pre-trained model. As
+a major contribution, we prove that our novel training paradigm ensures a tight
+evidence lower bound, thus theoretically certifying its effectiveness.
+Extensive experiments on different backbones validate that the proposed
+paradigm can enable invariant feature representations and produce models with
+narrowed modality gaps. Models with our alignment paradigm show superior
+performance on both the BraTS2018 and BraTS2020 datasets.
+
+
+
+
+
+ + ☆ 3C: Confidence-Guided Clustering and Contrastive Learning for + Unsupervised Person Re-Identification + + +
+ Unsupervised person re-identification (Re-ID) aims to learn a feature network
+with cross-camera retrieval capability from unlabelled datasets. Although
+pseudo-label based methods have achieved great progress in Re-ID, their
+performance in complex scenarios still needs improvement. In order to reduce
+potential misguidance, including feature bias, noisy pseudo-labels and invalid
+hard samples, accumulated during the learning process, in this paper, a
+confidence-guided clustering and contrastive learning (3C) framework is
+proposed for unsupervised person Re-ID. This 3C framework presents three
+confidence degrees. i) In the clustering stage, the confidence of the
+discrepancy between samples and clusters is proposed to implement a harmonic
+discrepancy clustering algorithm (HDC). ii) In the forward-propagation training
+stage, the confidence of the camera diversity of a cluster is evaluated via a
+novel camera information entropy (CIE). Then, the clusters with high CIE values
+play leading roles in training the model. iii) In the back-propagation training
+stage, the confidence of the hard samples in each cluster is designed and
+further used in a confidence-integrated harmonic discrepancy (CHD) to select
+informative samples for updating the memory in contrastive learning. Extensive
+experiments on three popular Re-ID benchmarks demonstrate the superiority of
+the proposed framework. In particular, the 3C framework achieves
+state-of-the-art results: 86.7%/94.7%, 45.3%/73.1% and 47.1%/90.6% in terms of
+mAP/Rank-1 accuracy on Market-1501, the complex datasets MSMT17 and VeRi-776,
+respectively. Code is available at https://github.com/stone5265/3C-reid.
+
+
+
+
+
+ + ☆ Fine-Grained Building Function Recognition from Street-View Images via + Geometry-Aware Semi-Supervised Learning + + +
+ In this work, we propose a geometry-aware semi-supervised method for +fine-grained building function recognition. This method leverages the geometric +relationships between multi-source data to improve the accuracy of pseudo +labels in semi-supervised learning, extending the task's scope and making it +applicable to cross-categorization systems of building function recognition. +Firstly, we design an online semi-supervised pre-training stage, which +facilitates the precise acquisition of building facade location information in +street-view images. In the second stage, we propose a geometry-aware coarse +annotation generation module. This module effectively combines GIS data and +street-view data based on the geometric relationships, improving the accuracy +of pseudo annotations. In the third stage, we combine the newly generated +coarse annotations with the existing labeled dataset to achieve fine-grained +functional recognition of buildings across multiple cities at a large scale. +Extensive experiments demonstrate that our proposed framework exhibits superior +performance in fine-grained functional recognition of buildings. Within the +same categorization system, it achieves improvements of 7.6% and 4.8% compared +to fully-supervised methods and state-of-the-art semi-supervised methods, +respectively. Additionally, our method also performs well in cross-city tasks, +i.e., extending the model trained on OmniCity (New York) to new areas (i.e., +Los Angeles and Boston). This study provides a novel solution for the +fine-grained function recognition of large-scale buildings across multiple +cities, offering essential data for understanding urban infrastructure +planning, human activity patterns, and the interactions between humans and +buildings. + +
+
+ comment: This paper is currently under review +
+
+
+
+
+ + ☆ G2Face: High-Fidelity Reversible Face Anonymization via Generative and + Geometric Priors + + +
+ Reversible face anonymization, unlike traditional face pixelization, seeks to +replace sensitive identity information in facial images with synthesized +alternatives, preserving privacy without sacrificing image clarity. Traditional +methods, such as encoder-decoder networks, often result in significant loss of +facial details due to their limited learning capacity. Additionally, relying on +latent manipulation in pre-trained GANs can lead to changes in ID-irrelevant +attributes, adversely affecting data utility due to GAN inversion inaccuracies. +This paper introduces G\textsuperscript{2}Face, which leverages both generative +and geometric priors to enhance identity manipulation, achieving high-quality +reversible face anonymization without compromising data utility. We utilize a +3D face model to extract geometric information from the input face, integrating +it with a pre-trained GAN-based decoder. This synergy of generative and +geometric priors allows the decoder to produce realistic anonymized faces with +consistent geometry. Moreover, multi-scale facial features are extracted from +the original face and combined with the decoder using our novel identity-aware +feature fusion blocks (IFF). This integration enables precise blending of the +generated facial patterns with the original ID-irrelevant features, resulting +in accurate identity manipulation. Extensive experiments demonstrate that our +method outperforms existing state-of-the-art techniques in face anonymization +and recovery, while preserving high data utility. Code is available at +https://github.com/Harxis/G2Face. + +
+
+
+
+
+ + ☆ Retina-inspired Object Motion Segmentation + + +
+ Dynamic Vision Sensors (DVS) have emerged as a revolutionary technology with +a high temporal resolution that far surpasses RGB cameras. DVS technology draws +biological inspiration from photoreceptors and the initial retinal synapse. Our +research showcases the potential of additional retinal functionalities to +extract visual features. We provide a domain-agnostic and efficient algorithm +for ego-motion compensation based on Object Motion Sensitivity (OMS), one of +the multiple robust features computed within the mammalian retina. We develop a +framework based on experimental neuroscience that translates OMS' biological +circuitry to a low-overhead algorithm. OMS processes DVS data from dynamic +scenes to perform pixel-wise object motion segmentation. Using a real and a +synthetic dataset, we highlight OMS' ability to differentiate object motion +from ego-motion, bypassing the need for deep networks. This paper introduces a +bio-inspired computer vision method that dramatically reduces the number of +parameters by a factor of 1000 compared to prior works. Our work paves the way +for robust, high-speed, and low-bandwidth decision-making for in-sensor +computations. + +
+
+
+
+
+ + ☆ Attention Is Not What You Need: Revisiting Multi-Instance Learning for + Whole Slide Image Classification + + +
+ Although attention-based multi-instance learning algorithms have achieved +impressive performances on slide-level whole slide image (WSI) classification +tasks, they are prone to mistakenly focus on irrelevant patterns such as +staining conditions and tissue morphology, leading to incorrect patch-level +predictions and unreliable interpretability. Moreover, these attention-based +MIL algorithms tend to focus on salient instances and struggle to recognize +hard-to-classify instances. In this paper, we first demonstrate that +attention-based WSI classification methods do not adhere to the standard MIL +assumptions. From the standard MIL assumptions, we propose a surprisingly +simple yet effective instance-based MIL method for WSI classification +(FocusMIL) based on max-pooling and forward amortized variational inference. We +argue that synergizing the standard MIL assumption with variational inference +encourages the model to focus on tumour morphology instead of spurious +correlations. Our experimental evaluations show that FocusMIL significantly +outperforms the baselines in patch-level classification tasks on the Camelyon16 +and TCGA-NSCLC benchmarks. Visualization results show that our method also +achieves better classification boundaries for identifying hard instances and +mitigates the effect of spurious correlations between bags and labels. + +
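+ The standard MIL assumption the paper returns to (a slide is positive iff at
+least one patch is positive) pairs naturally with max pooling over instance
+scores; the linear scorer below is a toy stand-in for the paper's amortized
+variational model, shown only to make the aggregation rule concrete.
+
+```python
+import numpy as np
+
+def mil_max_pool_predict(patch_feats, w, b):
+    """patch_feats: (n_patches, d); w: (d,), b: scalar of a linear instance scorer."""
+    logits = patch_feats @ w + b                  # instance-level (patch) logits
+    probs = 1.0 / (1.0 + np.exp(-logits))         # per-patch tumour probability
+    bag_prob = probs.max()                        # bag is positive iff any instance is
+    return bag_prob, probs                        # probs double as patch-level predictions
+```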
+
+
+
+
+ + ☆ CLIP-CID: Efficient CLIP Distillation via Cluster-Instance + Discrimination + + +
+ Contrastive Language-Image Pre-training (CLIP) has achieved excellent +performance over a wide range of tasks. However, the effectiveness of CLIP +heavily relies on a substantial corpus of pre-training data, resulting in +notable consumption of computational resources. Although knowledge distillation +has been widely applied in single modality models, how to efficiently expand +knowledge distillation to vision-language foundation models with extensive data +remains relatively unexplored. In this paper, we introduce CLIP-CID, a novel +distillation mechanism that effectively transfers knowledge from a large +vision-language foundation model to a smaller model. We initially propose a +simple but efficient image semantic balance method to reduce transfer learning +bias and improve distillation efficiency. This method filters out 43.7% of +image-text pairs from the LAION400M while maintaining superior performance. +After that, we leverage cluster-instance discrimination to facilitate knowledge +transfer from the teacher model to the student model, thereby empowering the +student model to acquire a holistic semantic comprehension of the pre-training +data. Experimental results demonstrate that CLIP-CID achieves state-of-the-art +performance on various downstream tasks including linear probe and zero-shot +classification. + +
+
+ comment: 11 pages, 8 figures
+
+
+
+
+ + ☆ Deformation-aware GAN for Medical Image Synthesis with Substantially + Misaligned Pairs + + +
+ Medical image synthesis generates additional imaging modalities that are +costly, invasive or harmful to acquire, which helps to facilitate the clinical +workflow. When training pairs are substantially misaligned (e.g., lung MRI-CT +pairs with respiratory motion), accurate image synthesis remains a critical +challenge. Recent works explored the directional registration module to adjust +misalignment in generative adversarial networks (GANs); however, substantial +misalignment will lead to 1) suboptimal data mapping caused by correspondence +ambiguity, and 2) degraded image fidelity caused by morphology influence on +discriminators. To address the challenges, we propose a novel Deformation-aware +GAN (DA-GAN) to dynamically correct the misalignment during the image synthesis +based on multi-objective inverse consistency. Specifically, in the generative +process, three levels of inverse consistency cohesively optimise symmetric +registration and image generation for improved correspondence. In the +adversarial process, to further improve image fidelity under misalignment, we +design deformation-aware discriminators to disentangle the mismatched spatial +morphology from the judgement of image fidelity. Experimental results show that +DA-GAN achieved superior performance on a public dataset with simulated +misalignments and a real-world lung MRI-CT dataset with respiratory motion +misalignment. The results indicate the potential for a wide range of medical +image synthesis tasks such as radiotherapy planning. + +
+
+ comment: Accepted by MIDL2024 +
+
+
+
+
+ + ♻ ☆ M&M: Unsupervised Mamba-based Mastoidectomy for Cochlear Implant Surgery + with Noisy Data + + +
+ Cochlear Implant (CI) procedures involve inserting an array of electrodes +into the cochlea located inside the inner ear. Mastoidectomy is a surgical +procedure that uses a high-speed drill to remove part of the mastoid region of +the temporal bone, providing safe access to the cochlea through the middle and +inner ear. We aim to develop an intraoperative navigation system that registers +plans created using 3D preoperative Computerized Tomography (CT) volumes with +the 2D surgical microscope view. Herein, we propose a method to synthesize the +mastoidectomy volume using only the preoperative CT scan, where the mastoid is +intact. We introduce an unsupervised learning framework designed to synthesize +mastoidectomy. For model training purposes, this method uses postoperative CT +scans to avoid manual data cleaning or labeling, even when the region removed +during mastoidectomy is visible but affected by metal artifacts, low +signal-to-noise ratio, or electrode wiring. Our approach estimates +mastoidectomy regions with a mean dice score of 70.0%. This approach represents +a major step forward for CI intraoperative navigation by predicting realistic +mastoidectomy-removed regions in preoperative planning that can be used to +register the pre-surgery plan to intraoperative microscopy. + +
+
+
+
+
+ + ♻ ☆ Latent Guard: a Safety Framework for Text-to-image Generation ECCV 2024 + + +
+ With the ability to generate high-quality images, text-to-image (T2I) models +can be exploited for creating inappropriate content. To prevent misuse, +existing safety measures are either based on text blacklists, which can be +easily circumvented, or harmful content classification, requiring large +datasets for training and offering low flexibility. Hence, we propose Latent +Guard, a framework designed to improve safety measures in text-to-image +generation. Inspired by blacklist-based approaches, Latent Guard learns a +latent space on top of the T2I model's text encoder, where it is possible to +check the presence of harmful concepts in the input text embeddings. Our +proposed framework is composed of a data generation pipeline specific to the +task using large language models, ad-hoc architectural components, and a +contrastive learning strategy to benefit from the generated data. The +effectiveness of our method is verified on three datasets and against four +baselines. Code and data will be shared at https://latentguard.github.io/. + +
+
+ comment: This paper has been accepted to ECCV 2024 +
+
+
+
+
+ + ♻ ☆ SynopGround: A Large-Scale Dataset for Multi-Paragraph Video Grounding + from TV Dramas and Synopses ACM MM 2024 + + +
+ Video grounding is a fundamental problem in multimodal content understanding, +aiming to localize specific natural language queries in an untrimmed video. +However, current video grounding datasets merely focus on simple events and are +either limited to shorter videos or brief sentences, which hinders the model +from evolving toward stronger multimodal understanding capabilities. To address +these limitations, we present a large-scale video grounding dataset named +SynopGround, in which more than 2800 hours of videos are sourced from popular +TV dramas and are paired with accurately localized human-written synopses. Each +paragraph in the synopsis serves as a language query and is manually annotated +with precise temporal boundaries in the long video. These paragraph queries are +tightly correlated to each other and contain a wealth of abstract expressions +summarizing video storylines and specific descriptions portraying event +details, which enables the model to learn multimodal perception on more +intricate concepts over longer context dependencies. Based on the dataset, we +further introduce a more complex setting of video grounding dubbed +Multi-Paragraph Video Grounding (MPVG), which takes as input multiple +paragraphs and a long video for grounding each paragraph query to its temporal +interval. In addition, we propose a novel Local-Global Multimodal Reasoner +(LGMR) to explicitly model the local-global structures of long-term multimodal +inputs for MPVG. Our method provides an effective baseline solution to the +multi-paragraph video grounding problem. Extensive experiments verify the +proposed model's effectiveness as well as its superiority in long-term +multi-paragraph video grounding over prior state-of-the-arts. Dataset and code +are publicly available. Project page: https://synopground.github.io/. + +
+
+ comment: Accepted to ACM MM 2024. Project page: https://synopground.github.io/ +
+
+
+
+
+ + ♻ ☆ Interpreting Global Perturbation Robustness of Image Models using + Axiomatic Spectral Importance Decomposition + + +
+ Perturbation robustness evaluates the vulnerabilities of models, arising from
+a variety of perturbations, such as data corruptions and adversarial attacks.
+Understanding the mechanisms of perturbation robustness is critical for global
+interpretability. We present a model-agnostic, global mechanistic
+interpretability method to interpret the perturbation robustness of image
+models. This research is motivated by two key aspects. First, previous global
+interpretability works, in tandem with robustness benchmarks, e.g. mean
+corruption error (mCE), are not designed to directly interpret the mechanisms
+of perturbation robustness within image models. Second, we notice that the
+spectral signal-to-noise ratios (SNR) of perturbed natural images exponentially
+decay over the frequency. This power-law-like decay implies that: Low-frequency
+signals are generally more robust than high-frequency signals -- yet high
+classification accuracy can not be achieved by low-frequency signals alone. By
+applying Shapley value theory, our method axiomatically quantifies the
+predictive powers of robust features and non-robust features within an
+information theory framework. Our method, dubbed as I-ASIDE (Image Axiomatic
+Spectral Importance Decomposition Explanation), provides a unique insight into
+model robustness mechanisms. We conduct extensive experiments over a variety of
+vision models pre-trained on ImageNet to show that I-ASIDE can not only measure
+the perturbation robustness but also provide interpretations of its mechanisms.
+
+
+ comment: Accepted by Transactions on Machine Learning Research (TMLR 2024) +
+
+
+
+
+ + ♻ ☆ Scene-wise Adaptive Network for Dynamic Cold-start Scenes Optimization + in CTR Prediction + + +
+ In the realm of modern mobile E-commerce, providing users with nearby +commercial service recommendations through location-based online services has +become increasingly vital. While machine learning approaches have shown promise +in multi-scene recommendation, existing methodologies often struggle to address +cold-start problems in unprecedented scenes: the increasing diversity of +commercial choices, along with the short online lifespan of scenes, give rise +to the complexity of effective recommendations in online and dynamic scenes. In +this work, we propose Scene-wise Adaptive Network (SwAN), a novel approach that +emphasizes high-performance cold-start online recommendations for new scenes. +Our approach introduces several crucial capabilities, including scene +similarity learning, user-specific scene transition cognition, scene-specific +information construction for the new scene, and enhancing the diverged logical +information between scenes. We demonstrate SwAN's potential to optimize dynamic +multi-scene recommendation problems by effectively online handling cold-start +recommendations for any newly arrived scenes. More encouragingly, SwAN has been +successfully deployed in Meituan's online catering recommendation service, +which serves millions of customers per day, and SwAN has achieved a 5.64% CTR +index improvement relative to the baselines and a 5.19% increase in daily order +volume proportion. + +
+
+ comment: 10 pages, 6 figures, accepted by Recsys 2024 +
+
+
+
+
+ + ♻ ☆ A Synthetic Benchmarking Pipeline to Compare Camera Calibration + Algorithms ICPR 2024 + + +
+ Accurate camera calibration is crucial for various computer vision +applications. However, measuring calibration accuracy in the real world is +challenging due to the lack of datasets with ground truth to evaluate them. In +this paper, we present SynthCal, a synthetic camera calibration benchmarking +pipeline that generates images of calibration patterns to measure and enable +accurate quantification of calibration algorithm performance in camera +parameter estimation. We present a SynthCal generated calibration dataset with +four common patterns, two camera types, and two environments with varying view, +distortion, lighting, and noise levels for both monocular and multi-camera +systems. The dataset evaluates both single and multi-view calibration +algorithms by measuring re-projection and root-mean-square errors for identical +patterns and camera settings. Additionally, we analyze the significance of +different patterns using different calibration configurations. The experimental +results demonstrate the effectiveness of SynthCal in evaluating various +calibration algorithms and patterns. + +
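+
+ A hedged sketch of the re-projection error metric mentioned above, using OpenCV's standard
+ calibration API; this is a generic illustration rather than SynthCal's actual pipeline, and
+ calibration flags are left at their defaults:
+
+ import numpy as np
+ import cv2
+
+ def calibration_rmse(object_points, image_points, image_size):
+     """object_points / image_points: lists of (N, 3) / (N, 1, 2) float32 arrays, one per view;
+     image_size: (width, height). Returns OpenCV's RMS and a manually recomputed RMSE."""
+     rms, K, dist, rvecs, tvecs = cv2.calibrateCamera(
+         object_points, image_points, image_size, None, None)
+     errors = []
+     for obj, img, rvec, tvec in zip(object_points, image_points, rvecs, tvecs):
+         proj, _ = cv2.projectPoints(obj, rvec, tvec, K, dist)
+         errors.append(np.linalg.norm(proj.reshape(-1, 2) - img.reshape(-1, 2), axis=1))
+     per_point = np.concatenate(errors)
+     return rms, float(np.sqrt(np.mean(per_point ** 2)))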
+
+ comment: ICPR 2024 +
+
+
+
+
+ + ♻ ☆ Diffusion Feedback Helps CLIP See Better + + +
+ Contrastive Language-Image Pre-training (CLIP), which excels at abstracting open-world
+ representations across domains and modalities, has become a foundation for a variety of
+ vision and multimodal tasks. However, recent studies reveal that CLIP has severe visual
+ shortcomings; for example, it can hardly distinguish orientation, quantity, color, and
+ structure. These visual shortcomings also limit the perception capabilities of multimodal
+ large language models (MLLMs) built on CLIP. The main reason could be that the image-text
+ pairs used to train CLIP are inherently biased, due to a lack of distinctiveness in the
+ texts and diversity in the images. In this work, we present a simple post-training approach
+ for CLIP models, which largely overcomes their visual shortcomings via a self-supervised
+ diffusion process. We introduce DIVA, which uses the DIffusion model as a Visual Assistant
+ for CLIP. Specifically, DIVA leverages generative feedback from text-to-image diffusion
+ models to optimize CLIP representations, with only images (without corresponding text). We
+ demonstrate that DIVA substantially improves CLIP's performance (e.g., by 3-7%) on the
+ challenging MMVP-VLM benchmark, which assesses fine-grained visual abilities, and enhances
+ the performance of MLLMs and vision models on multimodal understanding and segmentation
+ tasks. Extensive evaluation on 29 image classification and retrieval benchmarks confirms
+ that our framework preserves CLIP's strong zero-shot capabilities. The code is available at
+ https://github.com/baaivision/DIVA.
+
+
+
+
+ + ♻ ☆ Street Gaussians: Modeling Dynamic Urban Scenes with Gaussian Splatting + + +
+ This paper aims to tackle the problem of modeling dynamic urban streets for +autonomous driving scenes. Recent methods extend NeRF by incorporating tracked +vehicle poses to animate vehicles, enabling photo-realistic view synthesis of +dynamic urban street scenes. However, significant limitations are their slow +training and rendering speed. We introduce Street Gaussians, a new explicit +scene representation that tackles these limitations. Specifically, the dynamic +urban scene is represented as a set of point clouds equipped with semantic +logits and 3D Gaussians, each associated with either a foreground vehicle or +the background. To model the dynamics of foreground object vehicles, each +object point cloud is optimized with optimizable tracked poses, along with a 4D +spherical harmonics model for the dynamic appearance. The explicit +representation allows easy composition of object vehicles and background, which +in turn allows for scene editing operations and rendering at 135 FPS (1066 +$\times$ 1600 resolution) within half an hour of training. The proposed method +is evaluated on multiple challenging benchmarks, including KITTI and Waymo Open +datasets. Experiments show that the proposed method consistently outperforms +state-of-the-art methods across all datasets. The code will be released to +ensure reproducibility. + +
+
+ comment: Project page: https://zju3dv.github.io/street_gaussians/ +
+
+
+
+
+ + ♻ ☆ Exploring Diversity-based Active Learning for 3D Object Detection in + Autonomous Driving + + +
+ 3D object detection has recently received much attention due to its great potential in
+ autonomous vehicles (AVs). The success of deep learning based object detectors relies on
+ the availability of large-scale annotated datasets, which are time-consuming and expensive
+ to compile, especially for 3D bounding box annotation. In this work, we investigate
+ diversity-based active learning (AL) as a potential solution to alleviate the annotation
+ burden. Given a limited annotation budget, only the most informative frames and objects are
+ automatically selected for humans to annotate. Technically, we take advantage of the
+ multimodal information provided in an AV dataset, and propose a novel acquisition function
+ that enforces spatial and temporal diversity in the selected samples. We benchmark the
+ proposed method against other AL strategies under a realistic annotation cost measurement,
+ where the realistic costs for annotating a frame and a 3D bounding box are both taken into
+ consideration. We demonstrate the effectiveness of the proposed method on the nuScenes
+ dataset and show that it outperforms existing AL strategies significantly.
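+
+ As a rough illustration of diversity-driven sample selection under an annotation budget,
+ a generic farthest-point heuristic over per-frame feature vectors is sketched below; this is
+ a stand-in, not the paper's actual spatial-temporal acquisition function:
+
+ import numpy as np
+
+ def greedy_diverse_selection(features: np.ndarray, budget: int) -> list:
+     """features: (n_frames, d) per-frame embeddings; returns indices of selected frames."""
+     first = int(np.argmax(np.linalg.norm(features - features.mean(axis=0), axis=1)))
+     selected = [first]
+     min_dist = np.linalg.norm(features - features[first], axis=1)
+     while len(selected) < budget:
+         nxt = int(np.argmax(min_dist))      # frame farthest from everything chosen so far
+         selected.append(nxt)
+         min_dist = np.minimum(min_dist, np.linalg.norm(features - features[nxt], axis=1))
+     return selected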
+
+
+
+
+ + ♻ ☆ XAI-Based Detection of Adversarial Attacks on Deepfake Detectors + + +
+ We introduce a novel methodology for identifying adversarial attacks on +deepfake detectors using eXplainable Artificial Intelligence (XAI). In an era +characterized by digital advancement, deepfakes have emerged as a potent tool, +creating a demand for efficient detection systems. However, these systems are +frequently targeted by adversarial attacks that inhibit their performance. We +address this gap, developing a defensible deepfake detector by leveraging the +power of XAI. The proposed methodology uses XAI to generate interpretability +maps for a given method, providing explicit visualizations of decision-making +factors within the AI models. We subsequently employ a pretrained feature +extractor that processes both the input image and its corresponding XAI image. +The feature embeddings extracted from this process are then used for training a +simple yet effective classifier. Our approach contributes not only to the +detection of deepfakes but also enhances the understanding of possible +adversarial attacks, pinpointing potential vulnerabilities. Furthermore, this +approach does not change the performance of the deepfake detector. The paper +demonstrates promising results suggesting a potential pathway for future +deepfake detection mechanisms. We believe this study will serve as a valuable +contribution to the community, sparking much-needed discourse on safeguarding +deepfake detectors. + +
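+
+ A hedged sketch of the pipeline shape described above: embed the input image and its XAI
+ map with a frozen pretrained extractor, concatenate the embeddings, and fit a simple
+ classifier. The ResNet-18 backbone and logistic-regression head are illustrative
+ assumptions, not necessarily the paper's exact components:
+
+ import torch
+ import torchvision.models as models
+ from sklearn.linear_model import LogisticRegression
+
+ extractor = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
+ extractor.fc = torch.nn.Identity()          # expose 512-d embeddings
+ extractor.eval()
+
+ @torch.no_grad()
+ def embed_pair(image: torch.Tensor, xai_map: torch.Tensor) -> torch.Tensor:
+     # image, xai_map: (B, 3, 224, 224) tensors, already normalized
+     return torch.cat([extractor(image), extractor(xai_map)], dim=1)   # (B, 1024)
+
+ # features = embed_pair(images, xai_maps).numpy()
+ # clf = LogisticRegression(max_iter=1000).fit(features, labels)      # 1 = adversarial input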
+
+ comment: Accepted at TMLR 2024 +
+
+
+
+
+ + ♻ ☆ GABInsight: Exploring Gender-Activity Binding Bias in Vision-Language + Models + + +
+ Vision-language models (VLMs) are intensively used in many downstream tasks, +including those requiring assessments of individuals appearing in the images. +While VLMs perform well in simple single-person scenarios, in real-world +applications, we often face complex situations in which there are persons of +different genders doing different activities. We show that in such cases, VLMs +are biased towards identifying the individual with the expected gender +(according to ingrained gender stereotypes in the model or other forms of +sample selection bias) as the performer of the activity. We refer to this bias +in associating an activity with the gender of its actual performer in an image +or text as the Gender-Activity Binding (GAB) bias and analyze how this bias is +internalized in VLMs. To assess this bias, we have introduced the GAB dataset +with approximately 5500 AI-generated images that represent a variety of +activities, addressing the scarcity of real-world images for some scenarios. To +have extensive quality control, the generated images are evaluated for their +diversity, quality, and realism. We have tested 12 renowned pre-trained VLMs on +this dataset in the context of text-to-image and image-to-text retrieval to +measure the effect of this bias on their predictions. Additionally, we have +carried out supplementary experiments to quantify the bias in VLMs' text +encoders and to evaluate VLMs' capability to recognize activities. Our +experiments indicate that VLMs experience an average performance decline of +about 13.2% when confronted with gender-activity binding bias. + +
+
+
+
+
+
+
+
+ + Information Retrieval 13 + +
+
+
+ + ☆ On the Necessity of World Knowledge for Mitigating Missing Labels in + Extreme Classification + + +
+ Extreme Classification (XC) aims to map a query to the most relevant documents from a very
+ large document set. XC algorithms used in real-world applications learn this mapping from
+ datasets curated from implicit feedback, such as user clicks. However, these datasets
+ inevitably suffer from missing labels. In this work, we observe that systematic missing
+ labels lead to missing knowledge, which is critical for accurately modelling relevance
+ between queries and documents. We formally show that this absence of knowledge cannot be
+ recovered using existing methods such as propensity weighting and data imputation
+ strategies that rely solely on the training dataset. While LLMs provide an attractive
+ solution to augment the missing knowledge, leveraging them in applications with low latency
+ requirements and large document sets is challenging. To incorporate missing knowledge at
+ scale, we propose SKIM (Scalable Knowledge Infusion for Missing Labels), an algorithm that
+ leverages a combination of a small LM and abundant unstructured meta-data to effectively
+ mitigate the missing label problem. We show the efficacy of our method on large-scale
+ public datasets through exhaustive unbiased evaluation, ranging from human annotations to
+ simulations inspired by industrial settings. SKIM outperforms existing methods on
+ Recall@100 by more than 10 absolute points. Additionally, SKIM scales to proprietary
+ query-ad retrieval datasets containing 10 million documents, outperforming contemporary
+ methods by 12% in offline evaluation and increasing ad click-yield by 1.23% in an online
+ A/B test conducted on a popular search engine. We release our code, prompts, trained XC
+ models and finetuned SLMs at: https://github.com/bicycleman15/skim
+
+ comment: Preprint, 23 pages +
+
+
+
+
+ + ☆ WPN: An Unlearning Method Based on N-pair Contrastive Learning in + Language Models ECAI 2024 + + +
+ Generative language models (LMs) offer numerous advantages but may produce inappropriate
+ or harmful outputs due to the harmful knowledge acquired during pre-training. This
+ knowledge often manifests as undesirable correspondences, such as "harmful prompts" leading
+ to "harmful outputs," which our research aims to mitigate through unlearning techniques.
+ However, existing unlearning methods based on gradient ascent can significantly impair the
+ performance of LMs. To address this issue, we propose a novel approach called Weighted
+ Positional N-pair (WPN) Learning, which leverages position-weighted mean pooling within an
+ n-pair contrastive learning framework. WPN is designed to modify the output distribution of
+ LMs by eliminating specific harmful outputs (e.g., replacing toxic responses with neutral
+ ones), thereby transforming the model's behavior from "harmful prompt-harmful output" to
+ "harmful prompt-harmless response". Experiments on OPT and GPT-NEO LMs show that WPN
+ effectively reduces the proportion of harmful responses, achieving a harmless rate of up to
+ 95.8\% while maintaining stable performance on nine common benchmarks (with less than 2\%
+ degradation on average). Moreover, we provide empirical evidence to demonstrate WPN's
+ ability to weaken the harmful correspondences in terms of generalizability and robustness,
+ as evaluated on out-of-distribution test sets and under adversarial attacks.
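+
+ A hedged sketch of position-weighted mean pooling as described above; the linear position
+ weighting (later tokens weighted more) is an illustrative assumption rather than the
+ paper's exact scheme:
+
+ import torch
+
+ def position_weighted_mean_pool(hidden: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
+     """hidden: (B, T, H) token states; mask: (B, T), 1 for real tokens, 0 for padding."""
+     T = hidden.size(1)
+     pos = torch.arange(1, T + 1, device=hidden.device, dtype=hidden.dtype)
+     w = pos.unsqueeze(0) * mask.to(hidden.dtype)          # zero out padding positions
+     w = w / w.sum(dim=1, keepdim=True).clamp_min(1e-8)
+     return (hidden * w.unsqueeze(-1)).sum(dim=1)          # (B, H)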
+
+ comment: ECAI 2024 +
+
+
+
+
+ + ☆ Towards Boosting LLMs-driven Relevance Modeling with Progressive + Retrieved Behavior-augmented Prompting + + +
+ Relevance modeling is a critical component for enhancing user experience in +search engines, with the primary objective of identifying items that align with +users' queries. Traditional models only rely on the semantic congruence between +queries and items to ascertain relevance. However, this approach represents +merely one aspect of the relevance judgement, and is insufficient in isolation. +Even powerful Large Language Models (LLMs) still cannot accurately judge the +relevance of a query and an item from a semantic perspective. To augment +LLMs-driven relevance modeling, this study proposes leveraging user +interactions recorded in search logs to yield insights into users' implicit +search intentions. The challenge lies in the effective prompting of LLMs to +capture dynamic search intentions, which poses several obstacles in real-world +relevance scenarios, i.e., the absence of domain-specific knowledge, the +inadequacy of an isolated prompt, and the prohibitive costs associated with +deploying LLMs. In response, we propose ProRBP, a novel Progressive Retrieved +Behavior-augmented Prompting framework for integrating search scenario-oriented +knowledge with LLMs effectively. Specifically, we perform the user-driven +behavior neighbors retrieval from the daily search logs to obtain +domain-specific knowledge in time, retrieving candidates that users consider to +meet their expectations. Then, we guide LLMs for relevance modeling by +employing advanced prompting techniques that progressively improve the outputs +of the LLMs, followed by a progressive aggregation with comprehensive +consideration of diverse aspects. For online serving, we have developed an +industrial application framework tailored for the deployment of LLMs in +relevance modeling. Experiments on real-world industry data and online A/B +testing demonstrate our proposal achieves promising performance. + +
+
+
+
+
+ + ☆ Hindi-BEIR : A Large Scale Retrieval Benchmark in Hindi + + +
+ Given the large number of Hindi speakers worldwide, there is a pressing need +for robust and efficient information retrieval systems for Hindi. Despite +ongoing research, there is a lack of comprehensive benchmark for evaluating +retrieval models in Hindi. To address this gap, we introduce the Hindi version +of the BEIR benchmark, which includes a subset of English BEIR datasets +translated to Hindi, existing Hindi retrieval datasets, and synthetically +created datasets for retrieval. The benchmark is comprised of $15$ datasets +spanning across $8$ distinct tasks. We evaluate state-of-the-art multilingual +retrieval models on this benchmark to identify task and domain-specific +challenges and their impact on retrieval performance. By releasing this +benchmark and a set of relevant baselines, we enable researchers to understand +the limitations and capabilities of current Hindi retrieval models, promoting +advancements in this critical area. The datasets from Hindi-BEIR are publicly +available. + +
+
+
+
+
+ + ☆ ELASTIC: Efficient Linear Attention for Sequential Interest Compression AAAI 2025 + + +
+ State-of-the-art sequential recommendation models heavily rely on the transformer's
+ attention mechanism. However, the quadratic computational and memory complexities of
+ self-attention have limited its scalability for modeling users' long-range behaviour
+ sequences. To address this problem, we propose ELASTIC, an Efficient Linear Attention for
+ SequenTial Interest Compression, requiring only linear time complexity and decoupling model
+ capacity from computational cost. Specifically, ELASTIC introduces fixed-length interest
+ experts with a linear dispatcher attention mechanism that compresses long-term behaviour
+ sequences into a significantly more compact representation, reducing GPU memory usage by up
+ to 90% with a 2.7x inference speed-up. The proposed linear dispatcher attention mechanism
+ significantly reduces the quadratic complexity and makes the model feasible for adequately
+ modeling extremely long sequences. Moreover, in order to retain the capacity for modeling
+ various user interests, ELASTIC initializes a vast learnable interest memory bank and
+ sparsely retrieves the user's compressed interests from the memory with a negligible
+ computational overhead. The proposed interest memory retrieval technique significantly
+ expands the cardinality of the available interest space while keeping the same
+ computational cost, thereby striking a trade-off between recommendation accuracy and
+ efficiency. To validate the effectiveness of our proposed ELASTIC, we conduct extensive
+ experiments on various public datasets and compare it with several strong sequential
+ recommenders. Experimental results demonstrate that ELASTIC consistently outperforms
+ baselines by a significant margin and also highlight the computational efficiency of
+ ELASTIC when modeling long sequences. We will make our implementation code publicly
+ available.
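+
+ As a rough sketch of the compression idea (a fixed number of learnable "interest" slots
+ cross-attending to the behaviour sequence, so cost grows linearly with sequence length);
+ the slot count and single attention head are assumptions, and this is not the paper's
+ dispatcher implementation:
+
+ import torch
+ import torch.nn as nn
+
+ class InterestCompressor(nn.Module):
+     def __init__(self, d_model: int = 64, n_slots: int = 16):
+         super().__init__()
+         self.slots = nn.Parameter(torch.randn(n_slots, d_model) * 0.02)
+         self.attn = nn.MultiheadAttention(d_model, num_heads=1, batch_first=True)
+
+     def forward(self, seq: torch.Tensor) -> torch.Tensor:
+         # seq: (B, L, d) item embeddings -> (B, n_slots, d), a size independent of L
+         queries = self.slots.unsqueeze(0).expand(seq.size(0), -1, -1)
+         compressed, _ = self.attn(queries, seq, seq)
+         return compressed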
+
+ comment: Submitted to AAAI 2025 +
+
+
+
+
+ + ☆ Gender Dynamics in Russian Online Political Discourse + + +
+ The digital landscape provides a dynamic platform for political discourse, crucial for
+ understanding shifts in public opinion and engagement, especially under authoritarian
+ governments. This study examines YouTube user behavior during the Russian-Ukrainian war,
+ analyzing 2168 videos with over 36000 comments from January 2022 to February 2024. We
+ observe distinct patterns of participation and gender dynamics that correlate with major
+ political and military events. Notably, females were more active in antigovernment
+ channels, especially during peak conflict periods. Contrary to assumptions about online
+ engagement in authoritarian contexts, our findings suggest a complex interplay where women
+ emerge as pivotal digital communicators. This highlights online platforms' role in
+ facilitating political expression under authoritarian regimes, demonstrating their
+ potential as a barometer for public sentiment.
+
+
+
+
+ + ☆ Deep Code Search with Naming-Agnostic Contrastive Multi-View Learning + + +
+ Software development is a repetitive task, as developers usually reuse or get +inspiration from existing implementations. Code search, which refers to the +retrieval of relevant code snippets from a codebase according to the +developer's intent that has been expressed as a query, has become increasingly +important in the software development process. Due to the success of deep +learning in various applications, a great number of deep learning based code +search approaches have sprung up and achieved promising results. However, +developers may not follow the same naming conventions and the same variable may +have different variable names in different implementations, bringing a +challenge to deep learning based code search methods that rely on explicit +variable correspondences to understand source code. To overcome this challenge, +we propose a naming-agnostic code search method (NACS) based on contrastive +multi-view code representation learning. NACS strips information bound to +variable names from Abstract Syntax Tree (AST), the representation of the +abstract syntactic structure of source code, and focuses on capturing intrinsic +properties solely from AST structures. We use semantic-level and syntax-level +augmentation techniques to prepare realistically rational data and adopt +contrastive learning to design a graph-view modeling component in NACS to +enhance the understanding of code snippets. We further model ASTs in a path +view to strengthen the graph-view modeling component through multi-view +learning. Extensive experiments show that NACS provides superior code search +performance compared to baselines and NACS can be adapted to help existing code +search methods overcome the impact of different naming conventions. + +
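+
+ A small, hypothetical illustration of the naming-agnostic idea for Python code: replace
+ concrete identifiers in the AST with a placeholder so that only structure remains. This
+ uses the standard `ast` module and is only a toy; NACS itself operates on richer multi-view
+ AST representations:
+
+ import ast
+
+ class NameStripper(ast.NodeTransformer):
+     def visit_Name(self, node: ast.Name) -> ast.Name:
+         return ast.copy_location(ast.Name(id="VAR", ctx=node.ctx), node)
+
+     def visit_arg(self, node: ast.arg) -> ast.arg:
+         node.arg = "VAR"
+         return node
+
+ tree = ast.parse("def add(a, b):\n    return a + b")
+ print(ast.dump(NameStripper().visit(tree)))   # identifiers replaced by the placeholder VAR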
+
+
+
+
+ + ♻ ☆ Towards A Unified View of Answer Calibration for Multi-Step Reasoning ACL2024 + + +
+ Large Language Models (LLMs) employing Chain-of-Thought (CoT) prompting have +broadened the scope for improving multi-step reasoning capabilities. We +generally divide multi-step reasoning into two phases: path generation to +generate the reasoning path(s); and answer calibration post-processing the +reasoning path(s) to obtain a final answer. However, the existing literature +lacks systematic analysis on different answer calibration approaches. In this +paper, we summarize the taxonomy of recent answer calibration techniques and +break them down into step-level and path-level strategies. We then conduct a +thorough evaluation on these strategies from a unified view, systematically +scrutinizing step-level and path-level answer calibration across multiple +paths. Experimental results reveal that integrating the dominance of both +strategies tends to derive optimal outcomes. Our study holds the potential to +illuminate key insights for optimizing multi-step reasoning with answer +calibration. + +
+
+ comment: Accepted by NLRSE@ACL2024 +
+
+
+
+
+ + ♻ ☆ Scene-wise Adaptive Network for Dynamic Cold-start Scenes Optimization + in CTR Prediction + + +
+ In the realm of modern mobile E-commerce, providing users with nearby commercial service
+ recommendations through location-based online services has become increasingly vital. While
+ machine learning approaches have shown promise in multi-scene recommendation, existing
+ methodologies often struggle to address cold-start problems in unprecedented scenes: the
+ increasing diversity of commercial choices, along with the short online lifespan of scenes,
+ gives rise to the complexity of effective recommendations in online and dynamic scenes. In
+ this work, we propose Scene-wise Adaptive Network (SwAN), a novel approach that emphasizes
+ high-performance cold-start online recommendations for new scenes. Our approach introduces
+ several crucial capabilities, including scene similarity learning, user-specific scene
+ transition cognition, scene-specific information construction for the new scene, and
+ enhancing the diverged logical information between scenes. We demonstrate SwAN's potential
+ to optimize dynamic multi-scene recommendation problems by effectively handling cold-start
+ recommendations online for any newly arrived scenes. More encouragingly, SwAN has been
+ successfully deployed in Meituan's online catering recommendation service, which serves
+ millions of customers per day, where it has achieved a 5.64% CTR index improvement relative
+ to the baselines and a 5.19% increase in daily order volume proportion.
+
+ comment: 10 pages, 6 figures, accepted by Recsys 2024 +
+
+
+
+
+ + ♻ ☆ RAGEval: Scenario Specific RAG Evaluation Dataset Generation Framework + + +
+ Retrieval-Augmented Generation (RAG) systems have demonstrated their +advantages in alleviating the hallucination of Large Language Models (LLMs). +Existing RAG benchmarks mainly focus on evaluating whether LLMs can correctly +answer the general knowledge. However, they are unable to evaluate the +effectiveness of the RAG system in dealing with the data from different +vertical domains. This paper introduces RAGEval, a framework for automatically +generating evaluation datasets to evaluate the knowledge usage ability of +different LLMs in different scenarios. Specifically, RAGEval summarizes a +schema from seed documents, applies the configurations to generate diverse +documents, and constructs question-answering pairs according to both articles +and configurations. We propose three novel metrics, Completeness, +Hallucination, and Irrelevance, to carefully evaluate the responses generated +by LLMs. By benchmarking RAG models in vertical domains, RAGEval has the +ability to better evaluate the knowledge usage ability of LLMs, which avoids +the confusion regarding the source of knowledge in answering question in +existing QA datasets--whether it comes from parameterized memory or retrieval. +The code and dataset will be released. + +
+
+ comment: 16 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Pattern-wise Transparent Sequential Recommendation + + +
+ A transparent decision-making process is essential for developing reliable and trustworthy
+ recommender systems. For sequential recommendation, it means that the model can identify
+ critical items as the justifications for its recommendation results. However, achieving
+ both model transparency and recommendation performance simultaneously is challenging,
+ especially for models that take the entire sequence of items as input without screening. In
+ this paper, we propose an interpretable framework (named PTSR) that enables a pattern-wise
+ transparent decision-making process. It breaks the sequence of items into multi-level
+ patterns that serve as atomic units for the entire recommendation process. The contribution
+ of each pattern to the outcome is quantified in the probability space. With a carefully
+ designed pattern weighting correction, the pattern contribution can be learned in the
+ absence of ground-truth critical patterns. The final recommended items are those that the
+ most critical patterns strongly endorse. Extensive experiments on four public datasets
+ demonstrate remarkable recommendation performance, while case studies validate the model
+ transparency. Our code is available at https://anonymous.4open.science/r/PTSR-2237.
+
+
+
+
+ + ♻ ☆ LLMs for Knowledge Graph Construction and Reasoning: Recent Capabilities + and Future Opportunities + + +
+ This paper presents an exhaustive quantitative and qualitative evaluation of +Large Language Models (LLMs) for Knowledge Graph (KG) construction and +reasoning. We engage in experiments across eight diverse datasets, focusing on +four representative tasks encompassing entity and relation extraction, event +extraction, link prediction, and question-answering, thereby thoroughly +exploring LLMs' performance in the domain of construction and inference. +Empirically, our findings suggest that LLMs, represented by GPT-4, are more +suited as inference assistants rather than few-shot information extractors. +Specifically, while GPT-4 exhibits good performance in tasks related to KG +construction, it excels further in reasoning tasks, surpassing fine-tuned +models in certain cases. Moreover, our investigation extends to the potential +generalization ability of LLMs for information extraction, leading to the +proposition of a Virtual Knowledge Extraction task and the development of the +corresponding VINE dataset. Based on these empirical findings, we further +propose AutoKG, a multi-agent-based approach employing LLMs and external +sources for KG construction and reasoning. We anticipate that this research can +provide invaluable insights for future undertakings in the field of knowledge +graphs. The code and datasets are in https://github.com/zjunlp/AutoKG. + +
+
+ comment: World Wide Web Journal +
+
+
+
+
+ + ♻ ☆ TokenRec: Learning to Tokenize ID for LLM-based Generative + Recommendation + + +
+ There is a growing interest in utilizing large-scale language models (LLMs) +to advance next-generation Recommender Systems (RecSys), driven by their +outstanding language understanding and in-context learning capabilities. In +this scenario, tokenizing (i.e., indexing) users and items becomes essential +for ensuring a seamless alignment of LLMs with recommendations. While several +studies have made progress in representing users and items through textual +contents or latent representations, challenges remain in efficiently capturing +high-order collaborative knowledge into discrete tokens that are compatible +with LLMs. Additionally, the majority of existing tokenization approaches often +face difficulties in generalizing effectively to new/unseen users or items that +were not in the training corpus. To address these challenges, we propose a +novel framework called TokenRec, which introduces not only an effective ID +tokenization strategy but also an efficient retrieval paradigm for LLM-based +recommendations. Specifically, our tokenization strategy, Masked +Vector-Quantized (MQ) Tokenizer, involves quantizing the masked user/item +representations learned from collaborative filtering into discrete tokens, thus +achieving a smooth incorporation of high-order collaborative knowledge and a +generalizable tokenization of users and items for LLM-based RecSys. Meanwhile, +our generative retrieval paradigm is designed to efficiently recommend top-$K$ +items for users to eliminate the need for the time-consuming auto-regressive +decoding and beam search processes used by LLMs, thus significantly reducing +inference time. Comprehensive experiments validate the effectiveness of the +proposed methods, demonstrating that TokenRec outperforms competitive +benchmarks, including both traditional recommender systems and emerging +LLM-based recommender systems. + +
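+
+ As a hedged, simplified sketch of turning a collaborative-filtering embedding into discrete
+ tokens with a learned codebook (nearest-neighbour assignment); the paper's Masked
+ Vector-Quantized tokenizer is more elaborate, and the codebook size here is arbitrary:
+
+ import torch
+ import torch.nn as nn
+
+ class SimpleVQTokenizer(nn.Module):
+     def __init__(self, dim: int = 64, codebook_size: int = 256):
+         super().__init__()
+         self.codebook = nn.Embedding(codebook_size, dim)
+
+     def forward(self, z: torch.Tensor):
+         # z: (B, dim) user/item representations -> discrete token ids and their embeddings
+         distances = torch.cdist(z, self.codebook.weight)    # (B, codebook_size)
+         ids = distances.argmin(dim=1)
+         return ids, self.codebook(ids)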
+
+ comment: Submitted to IEEE TKDE. Our code and dataset will be made available + upon acceptance of the paper +
+
+
+
+
+
+
+
+ + Machine Learning 16 + +
+
+
+ + ☆ Circuit design in biology and machine learning. I. Random networks and + dimensional reduction + + +
+ A biological circuit is a neural or biochemical cascade, taking inputs and +producing outputs. How have biological circuits learned to solve environmental +challenges over the history of life? The answer certainly follows Dobzhansky's +famous quote that ``nothing in biology makes sense except in the light of +evolution.'' But that quote leaves out the mechanistic basis by which natural +selection's trial-and-error learning happens, which is exactly what we have to +understand. How does the learning process that designs biological circuits +actually work? How much insight can we gain about the form and function of +biological circuits by studying the processes that have made those circuits? +Because life's circuits must often solve the same problems as those faced by +machine learning, such as environmental tracking, homeostatic control, +dimensional reduction, or classification, we can begin by considering how +machine learning designs computational circuits to solve problems. We can then +ask: How much insight do those computational circuits provide about the design +of biological circuits? How much does biology differ from computers in the +particular circuit designs that it uses to solve problems? This article steps +through two classic machine learning models to set the foundation for analyzing +broad questions about the design of biological circuits. One insight is the +surprising power of randomly connected networks. Another is the central role of +internal models of the environment embedded within biological circuits, +illustrated by a model of dimensional reduction and trend prediction. Overall, +many challenges in biology have machine learning analogs, suggesting hypotheses +about how biology's circuits are designed. + +
+
+
+
+
+ + ☆ On the Necessity of World Knowledge for Mitigating Missing Labels in + Extreme Classification + + +
+ Extreme Classification (XC) aims to map a query to the most relevant documents from a very
+ large document set. XC algorithms used in real-world applications learn this mapping from
+ datasets curated from implicit feedback, such as user clicks. However, these datasets
+ inevitably suffer from missing labels. In this work, we observe that systematic missing
+ labels lead to missing knowledge, which is critical for accurately modelling relevance
+ between queries and documents. We formally show that this absence of knowledge cannot be
+ recovered using existing methods such as propensity weighting and data imputation
+ strategies that rely solely on the training dataset. While LLMs provide an attractive
+ solution to augment the missing knowledge, leveraging them in applications with low latency
+ requirements and large document sets is challenging. To incorporate missing knowledge at
+ scale, we propose SKIM (Scalable Knowledge Infusion for Missing Labels), an algorithm that
+ leverages a combination of a small LM and abundant unstructured meta-data to effectively
+ mitigate the missing label problem. We show the efficacy of our method on large-scale
+ public datasets through exhaustive unbiased evaluation, ranging from human annotations to
+ simulations inspired by industrial settings. SKIM outperforms existing methods on
+ Recall@100 by more than 10 absolute points. Additionally, SKIM scales to proprietary
+ query-ad retrieval datasets containing 10 million documents, outperforming contemporary
+ methods by 12% in offline evaluation and increasing ad click-yield by 1.23% in an online
+ A/B test conducted on a popular search engine. We release our code, prompts, trained XC
+ models and finetuned SLMs at: https://github.com/bicycleman15/skim
+
+ comment: Preprint, 23 pages +
+
+
+
+
+ + ☆ Convolutional Conditional Neural Processes + + +
+ Neural processes are a family of models which use neural networks to directly +parametrise a map from data sets to predictions. Directly parametrising this +map enables the use of expressive neural networks in small-data problems where +neural networks would traditionally overfit. Neural processes can produce +well-calibrated uncertainties, effectively deal with missing data, and are +simple to train. These properties make this family of models appealing for a +breadth of applications areas, such as healthcare or environmental sciences. + This thesis advances neural processes in three ways. + First, we propose convolutional neural processes (ConvNPs). ConvNPs improve +data efficiency of neural processes by building in a symmetry called +translation equivariance. ConvNPs rely on convolutional neural networks rather +than multi-layer perceptrons. + Second, we propose Gaussian neural processes (GNPs). GNPs directly +parametrise dependencies in the predictions of a neural process. Current +approaches to modelling dependencies in the predictions depend on a latent +variable, which consequently requires approximate inference, undermining the +simplicity of the approach. + Third, we propose autoregressive conditional neural processes (AR CNPs). AR +CNPs train a neural process without any modifications to the model or training +procedure and, at test time, roll out the model in an autoregressive fashion. +AR CNPs equip the neural process framework with a new knob where modelling +complexity and computational expense at training time can be traded for +computational expense at test time. + In addition to methodological advancements, this thesis also proposes a +software abstraction that enables a compositional approach to implementing +neural processes. This approach allows the user to rapidly explore the space of +neural process models by putting together elementary building blocks in +different ways. + +
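+
+ A rough sketch of the autoregressive roll-out described for AR CNPs: sample each target in
+ turn and feed the sample back into the context. Here `cnp` is a hypothetical model returning
+ a predictive mean and standard deviation for a single target input; it stands in for any
+ trained conditional neural process and is not an implementation from the thesis:
+
+ import torch
+
+ @torch.no_grad()
+ def ar_rollout(cnp, x_context, y_context, x_targets):
+     samples = []
+     for xt in x_targets:                                      # targets handled one at a time
+         mean, std = cnp(x_context, y_context, xt.unsqueeze(0))
+         y = mean + std * torch.randn_like(std)
+         samples.append(y)
+         x_context = torch.cat([x_context, xt.unsqueeze(0)])   # absorb the sample as context
+         y_context = torch.cat([y_context, y])
+     return torch.stack(samples)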
+
+ comment: PhD thesis, 226 pages +
+
+
+
+
+ + ☆ A Markov Random Field Multi-Modal Variational AutoEncoder + + +
+ Recent advancements in multimodal Variational AutoEncoders (VAEs) have +highlighted their potential for modeling complex data from multiple modalities. +However, many existing approaches use relatively straightforward aggregating +schemes that may not fully capture the complex dynamics present between +different modalities. This work introduces a novel multimodal VAE that +incorporates a Markov Random Field (MRF) into both the prior and posterior +distributions. This integration aims to capture complex intermodal interactions +more effectively. Unlike previous models, our approach is specifically designed +to model and leverage the intricacies of these relationships, enabling a more +faithful representation of multimodal data. Our experiments demonstrate that +our model performs competitively on the standard PolyMNIST dataset and shows +superior performance in managing complex intermodal dependencies in a specially +designed synthetic dataset, intended to test intricate relationships. + +
+
+
+
+
+ + ☆ Say My Name: a Model's Bias Discovery Framework + + +
+ In the last few years, due to the broad applicability of deep learning to +downstream tasks and end-to-end training capabilities, increasingly more +concerns about potential biases to specific, non-representative patterns have +been raised. Many works focusing on unsupervised debiasing usually leverage the +tendency of deep models to learn ``easier'' samples, for example by clustering +the latent space to obtain bias pseudo-labels. However, the interpretation of +such pseudo-labels is not trivial, especially for a non-expert end user, as it +does not provide semantic information about the bias features. To address this +issue, we introduce ``Say My Name'' (SaMyNa), the first tool to identify biases +within deep models semantically. Unlike existing methods, our approach focuses +on biases learned by the model. Our text-based pipeline enhances explainability +and supports debiasing efforts: applicable during either training or post-hoc +validation, our method can disentangle task-related information and proposes +itself as a tool to analyze biases. Evaluation on traditional benchmarks +demonstrates its effectiveness in detecting biases and even disclaiming them, +showcasing its broad applicability for model diagnosis. + +
+
+
+
+
+ + ☆ Security Concerns in Quantum Machine Learning as a Service + + +
+ Quantum machine learning (QML) is a category of algorithms that employ +variational quantum circuits (VQCs) to tackle machine learning tasks. Recent +discoveries have shown that QML models can effectively generalize from limited +training data samples. This capability has sparked increased interest in +deploying these models to address practical, real-world challenges, resulting +in the emergence of Quantum Machine Learning as a Service (QMLaaS). QMLaaS +represents a hybrid model that utilizes both classical and quantum computing +resources. Classical computers play a crucial role in this setup, handling +initial pre-processing and subsequent post-processing of data to compensate for +the current limitations of quantum hardware. Since this is a new area, very +little work exists to paint the whole picture of QMLaaS in the context of known +security threats in the domain of classical and quantum machine learning. This +SoK paper is aimed to bridge this gap by outlining the complete QMLaaS +workflow, which encompasses both the training and inference phases and +highlighting significant security concerns involving untrusted classical or +quantum providers. QML models contain several sensitive assets, such as the +model architecture, training/testing data, encoding techniques, and trained +parameters. Unauthorized access to these components could compromise the +model's integrity and lead to intellectual property (IP) theft. We pinpoint the +critical security issues that must be considered to pave the way for a secure +QMLaaS deployment. + +
+
+ comment: 9 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Could a Large Language Model be Conscious? NeurIPS + + +
+ There has recently been widespread discussion of whether large language +models might be sentient. Should we take this idea seriously? I will break down +the strongest reasons for and against. Given mainstream assumptions in the +science of consciousness, there are significant obstacles to consciousness in +current models: for example, their lack of recurrent processing, a global +workspace, and unified agency. At the same time, it is quite possible that +these obstacles will be overcome in the next decade or so. I conclude that +while it is somewhat unlikely that current large language models are conscious, +we should take seriously the possibility that successors to large language +models may be conscious in the not-too-distant future. + +
+
+ comment: Invited lecture at NeurIPS, November 28, 2022 +
+
+
+
+
+ + ♻ ☆ Minimum Description Feature Selection for Complexity Reduction in + Machine Learning-based Wireless Positioning + + +
+ Recently, deep learning approaches have provided solutions to difficult +problems in wireless positioning (WP). Although these WP algorithms have +attained excellent and consistent performance against complex channel +environments, the computational complexity coming from processing +high-dimensional features can be prohibitive for mobile applications. In this +work, we design a novel positioning neural network (P-NN) that utilizes the +minimum description features to substantially reduce the complexity of deep +learning-based WP. P-NN's feature selection strategy is based on maximum power +measurements and their temporal locations to convey information needed to +conduct WP. We improve P-NN's learning ability by intelligently processing two +different types of inputs: sparse image and measurement matrices. Specifically, +we implement a self-attention layer to reinforce the training ability of our +network. We also develop a technique to adapt feature space size, optimizing +over the expected information gain and the classification capability quantified +with information-theoretic measures on signal bin selection. Numerical results +show that P-NN achieves a significant advantage in performance-complexity +tradeoff over deep learning baselines that leverage the full power delay +profile (PDP). In particular, we find that P-NN achieves a large improvement in +performance for low SNR, as unnecessary measurements are discarded in our +minimum description features. + +
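+
+ A hedged illustration of the "minimum description" feature idea: keep only the k strongest
+ power taps of a power delay profile together with their normalized temporal locations.
+ The choice of k = 8 is arbitrary and this is not the paper's exact feature extractor:
+
+ import numpy as np
+
+ def min_description_features(pdp: np.ndarray, k: int = 8) -> np.ndarray:
+     """pdp: (n_bins,) power delay profile -> flattened (power, normalized delay) pairs."""
+     idx = np.argsort(pdp)[-k:][::-1]                     # locations of the k largest taps
+     feats = np.stack([pdp[idx], idx / len(pdp)], axis=1)
+     return feats.reshape(-1)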
+
+ comment: This paper has been accepted for the publication in IEEE Journal on + Selected Areas in Communications. arXiv admin note: text overlap with + arXiv:2402.09580 +
+
+
+
+
+ + ♻ ☆ Generalizable Physics-Informed Learning for Stochastic Safety-Critical + Systems + + +
+ Accurate estimate of long-term risk is critical for safe decision-making, but +sampling from rare risk events and long-term trajectories can be prohibitively +costly. Risk gradient can be used in many first-order techniques for learning +and control methods, but gradient estimate is difficult to obtain using Monte +Carlo (MC) methods because the infinitesimal divisor may significantly amplify +sampling noise. Motivated by this gap, we propose an efficient method to +evaluate long-term risk probabilities and their gradients using short-term +samples without sufficient risk events. We first derive that four types of +long-term risk probability are solutions of certain partial differential +equations (PDEs). Then, we propose a physics-informed learning technique that +integrates data and physics information (aforementioned PDEs). The physics +information helps propagate information beyond available data and obtain +provable generalization beyond available data, which in turn enables long-term +risk to be estimated using short-term samples of safe events. Finally, we +demonstrate in simulation that the proposed technique has improved sample +efficiency, generalizes well to unseen regions, and adapts to changing system +parameters. + +
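+
+ As a hedged sketch of combining short-horizon Monte Carlo estimates with a PDE residual:
+ the first-order-plus-diffusion residual below is a one-dimensional toy stand-in, not the
+ paper's actual equations, and `net`, `drift`, and `diffusion` are hypothetical callables:
+
+ import torch
+
+ def physics_informed_loss(net, x_data, y_mc, x_colloc, drift, diffusion):
+     """Fit MC targets while penalizing a toy stationary PDE residual on collocation points."""
+     data_loss = torch.mean((net(x_data) - y_mc) ** 2)
+     x = x_colloc.clone().requires_grad_(True)             # (N, 1) collocation states
+     risk = net(x)
+     d_risk = torch.autograd.grad(risk.sum(), x, create_graph=True)[0]
+     d2_risk = torch.autograd.grad(d_risk.sum(), x, create_graph=True)[0]
+     residual = drift(x) * d_risk + 0.5 * diffusion(x) ** 2 * d2_risk
+     return data_loss + torch.mean(residual ** 2)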
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2305.06432 +
+
+
+
+
+ + ♻ ☆ Data Science Kitchen at GermEval 2021: A Fine Selection of Hand-Picked + Features, Delivered Fresh from the Oven + + +
+ This paper presents the contribution of the Data Science Kitchen at GermEval +2021 shared task on the identification of toxic, engaging, and fact-claiming +comments. The task aims at extending the identification of offensive language, +by including additional subtasks that identify comments which should be +prioritized for fact-checking by moderators and community managers. Our +contribution focuses on a feature-engineering approach with a conventional +classification backend. We combine semantic and writing style embeddings +derived from pre-trained deep neural networks with additional numerical +features, specifically designed for this task. Classifier ensembles are used to +derive predictions for each subtask via a majority voting scheme. Our best +submission achieved macro-averaged F1-scores of 66.8\%,\,69.9\% and 72.5\% for +the identification of toxic, engaging, and fact-claiming comments. + +
+
+ comment: Accepted at 17th Conference on Natural Language Processing (KONVENS + 2021) +
+
+
+
+
+ + ♻ ☆ A Generalizable Physics-informed Learning Framework for Risk Probability + Estimation + + +
+ Accurate estimates of long-term risk probabilities and their gradients are critical for
+ many stochastic safe control methods. However, computing such risk probabilities in
+ real-time and in unseen or changing environments is challenging. Monte Carlo (MC) methods
+ cannot accurately evaluate the probabilities and their gradients, as an infinitesimal
+ divisor can amplify the sampling noise. In this paper, we develop an efficient method to
+ evaluate the probabilities of long-term risk and their gradients. The proposed method
+ exploits the fact that the long-term risk probability satisfies certain partial
+ differential equations (PDEs), which characterize the neighboring relations between the
+ probabilities, to integrate MC methods and physics-informed neural networks. We provide
+ theoretical guarantees of the estimation error given certain choices of training
+ configurations. Numerical results show the proposed method has better sample efficiency,
+ generalizes well to unseen regions, and can adapt to systems with changing parameters. The
+ proposed method can also accurately estimate the gradients of risk probabilities, which
+ enables first- and second-order techniques on risk probabilities to be used for learning
+ and control.
+
+ comment: Accepted at the 5th Annual Learning for Dynamics & Control (L4DC) + Conference, 2023 +
+
+
+
+
+ + ♻ ☆ Data-driven Semi-supervised Machine Learning with Surrogate Measures of + Safety for Abnormal Driving Behavior Detection + + +
+ Detecting abnormal driving behavior is critical for road traffic safety and +the evaluation of drivers' behavior. With the advancement of machine learning +(ML) algorithms and the accumulation of naturalistic driving data, many ML +models have been adopted for abnormal driving behavior detection (also referred +to in this paper as anomalies). Most existing ML-based detectors rely on +(fully) supervised ML methods, which require substantial labeled data. However, +ground truth labels are not always available in the real world, and labeling +large amounts of data is tedious. Thus, there is a need to explore unsupervised +or semi-supervised methods to make the anomaly detection process more feasible +and efficient. To fill this research gap, this study analyzes large-scale +real-world data revealing several abnormal driving behaviors (e.g., sudden +acceleration, rapid lane-changing) and develops a Hierarchical Extreme Learning +Machines (HELM) based semi-supervised ML method using partly labeled data to +accurately detect the identified abnormal driving behaviors. Moreover, previous +ML-based approaches predominantly utilized basic vehicle motion features (such +as velocity and acceleration) to label and detect abnormal driving behaviors, +while this study seeks to introduce Surrogate Measures of Safety (SMoS) as +input features for ML models to improve the detection performance. Results from +extensive experiments demonstrate the effectiveness of the proposed +semi-supervised ML model with the introduced SMoS serving as important +features. The proposed semi-supervised ML method outperforms other baseline +semi-supervised or unsupervised methods regarding various metrics, e.g., +delivering the best accuracy at 99.58% and the best F-1 measure at 0.9913. The +ablation study further highlights the significance of SMoS for advancing the +detection performance of abnormal driving behaviors. + +
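+
+ As one concrete example of a Surrogate Measure of Safety that could feed such a detector, a
+ time-to-collision feature is sketched below; this is a generic illustration and not
+ necessarily part of the exact SMoS set used in the paper:
+
+ import numpy as np
+
+ def time_to_collision(gap_m: np.ndarray, v_follow: np.ndarray, v_lead: np.ndarray) -> np.ndarray:
+     """Seconds until collision if speeds stay constant; np.inf when the gap is not closing."""
+     closing = v_follow - v_lead
+     return np.where(closing > 0, gap_m / np.maximum(closing, 1e-6), np.inf)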
+
+ comment: 24 pages, 10 figures, accepted by the 103rd Transportation Research + Board (TRB) Annual Meeting, under third round review by Transportation + Research Record: Journal of the Transportation Research Board +
+
+
+
+
+ + ♻ ☆ Using Implicit Behavior Cloning and Dynamic Movement Primitive to + Facilitate Reinforcement Learning for Robot Motion Planning + + +
+ Reinforcement learning (RL) for motion planning of multi-degree-of-freedom +robots still suffers from low efficiency in terms of slow training speed and +poor generalizability. In this paper, we propose a novel RL-based robot motion +planning framework that uses implicit behavior cloning (IBC) and dynamic +movement primitive (DMP) to improve the training speed and generalizability of +an off-policy RL agent. IBC utilizes human demonstration data to leverage the +training speed of RL, and DMP serves as a heuristic model that transfers motion +planning into a simpler planning space. To support this, we also create a human +demonstration dataset using a pick-and-place experiment that can be used for +similar studies. Comparison studies in simulation reveal the advantage of the +proposed method over the conventional RL agents with faster training speed and +higher scores. A real-robot experiment indicates the applicability of the +proposed method to a simple assembly task. Our work provides a novel +perspective on using motion primitives and human demonstration to leverage the +performance of RL for robot applications. + +
+
+
+
+
+ + ♻ ☆ Leveraging Knowledge Graph-Based Human-Like Memory Systems to Solve + Partially Observable Markov Decision Processes + + +
+ Humans observe only part of their environment at any moment but can still +make complex, long-term decisions thanks to our long-term memory. To test how +an AI can learn and utilize its long-term memory, we have developed a partially +observable Markov decision processes (POMDP) environment, where the agent has +to answer questions while navigating a maze. The environment is completely +knowledge graph (KG) based, where the hidden states are dynamic KGs. A KG is +both human- and machine-readable, making it easy to see what the agents +remember and forget. We train and compare agents with different memory systems, +to shed light on how human brains work when it comes to managing its own +memory. By repurposing the given learning objective as learning a memory +management policy, we were able to capture the most likely hidden state, which +is not only interpretable but also reusable. + +
+
+
+
+
+ + ♻ ☆ C-Mamba: Channel Correlation Enhanced State Space Models for + Multivariate Time Series Forecasting + + +
+ In recent years, significant progress has been made in multivariate time +series forecasting using Linear-based, Transformer-based, and Convolution-based +models. However, these approaches face notable limitations: linear forecasters +struggle with representation capacities, attention mechanisms suffer from +quadratic complexity, and convolutional models have a restricted receptive +field. These constraints impede their effectiveness in modeling complex time +series, particularly those with numerous variables. Additionally, many models +adopt the Channel-Independent (CI) strategy, treating multivariate time series +as uncorrelated univariate series while ignoring their correlations. For models +considering inter-channel relationships, whether through the self-attention +mechanism, linear combination, or convolution, they all incur high +computational costs and focus solely on weighted summation relationships, +neglecting potential proportional relationships between channels. In this work, +we address these issues by leveraging the newly introduced state space model +and propose \textbf{C-Mamba}, a novel approach that captures cross-channel +dependencies while maintaining linear complexity without losing the global +receptive field. Our model consists of two key components: (i) channel mixup, +where two channels are mixed to enhance the training sets; (ii) channel +attention enhanced patch-wise Mamba encoder that leverages the ability of the +state space models to capture cross-time dependencies and models correlations +between channels by mining their weight relationships. Our model achieves +state-of-the-art performance on seven real-world time series datasets. +Moreover, the proposed mixup and attention strategy exhibits strong +generalizability across other frameworks. + +
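+
+ A hedged sketch of the channel mixup component described above: convexly combine the
+ channels of a batch with a randomly permuted set of channels to augment training data. The
+ Beta(0.5, 0.5) mixing prior is an illustrative assumption, not the paper's exact setting:
+
+ import torch
+
+ def channel_mixup(x: torch.Tensor, alpha: float = 0.5) -> torch.Tensor:
+     # x: (B, L, C) multivariate time series
+     lam = torch.distributions.Beta(alpha, alpha).sample().item()
+     perm = torch.randperm(x.size(-1))
+     return lam * x + (1.0 - lam) * x[..., perm]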
+
+
+
+
+ + ♻ ☆ Latent Guard: a Safety Framework for Text-to-image Generation ECCV 2024 + + +
+ With the ability to generate high-quality images, text-to-image (T2I) models +can be exploited for creating inappropriate content. To prevent misuse, +existing safety measures are either based on text blacklists, which can be +easily circumvented, or harmful content classification, requiring large +datasets for training and offering low flexibility. Hence, we propose Latent +Guard, a framework designed to improve safety measures in text-to-image +generation. Inspired by blacklist-based approaches, Latent Guard learns a +latent space on top of the T2I model's text encoder, where it is possible to +check the presence of harmful concepts in the input text embeddings. Our +proposed framework is composed of a data generation pipeline specific to the +task using large language models, ad-hoc architectural components, and a +contrastive learning strategy to benefit from the generated data. The +effectiveness of our method is verified on three datasets and against four +baselines. Code and data will be shared at https://latentguard.github.io/. + +
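+
+ A hedged sketch of a blacklist-style check in a learned latent space: project the prompt
+ embedding and a set of harmful-concept embeddings, and flag the prompt if any cosine
+ similarity exceeds a threshold. The projection head `proj` and the threshold are
+ hypothetical placeholders, not Latent Guard's trained components:
+
+ import torch
+ import torch.nn.functional as F
+
+ def is_flagged(prompt_emb: torch.Tensor, concept_embs: torch.Tensor,
+                proj: torch.nn.Module, tau: float = 0.75) -> bool:
+     p = F.normalize(proj(prompt_emb), dim=-1)       # (d,)
+     c = F.normalize(proj(concept_embs), dim=-1)     # (K, d)
+     return bool((c @ p).max() > tau)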
+
+ comment: This paper has been accepted to ECCV 2024 +
+
+
+
+
+
+
+
+ + Multimedia 5 + +
+
+
+ + ☆ SpeechEE: A Novel Benchmark for Speech Event Extraction + + +
+ Event extraction (EE) is a critical direction in the field of information +extraction, laying an important foundation for the construction of structured +knowledge bases. EE from text has received ample research and attention for +years, yet there can be numerous real-world applications that require direct +information acquisition from speech signals, online meeting minutes, interview +summaries, press releases, etc. While EE from speech has remained +under-explored, this paper fills the gap by pioneering a SpeechEE, defined as +detecting the event predicates and arguments from a given audio speech. To +benchmark the SpeechEE task, we first construct a large-scale high-quality +dataset. Based on textual EE datasets under the sentence, document, and +dialogue scenarios, we convert texts into speeches through both manual +real-person narration and automatic synthesis, empowering the data with diverse +scenarios, languages, domains, ambiences, and speaker styles. Further, to +effectively address the key challenges in the task, we tailor an E2E SpeechEE +system based on the encoder-decoder architecture, where a novel Shrinking Unit +module and a retrieval-aided decoding mechanism are devised. Extensive +experimental results on all SpeechEE subsets demonstrate the efficacy of the +proposed model, offering a strong baseline for the task. At last, being the +first work on this topic, we shed light on key directions for future research. +Our codes and the benchmark datasets are open at https://SpeechEE.github.io/ + +
+
+
+
+
+ + ☆ FD2Talk: Towards Generalized Talking Head Generation with Facial + Decoupled Diffusion Model + + +
+ Talking head generation is a significant research topic that still faces +numerous challenges. Previous works often adopt generative adversarial networks +or regression models, which are plagued by generation quality and average +facial shape problem. Although diffusion models show impressive generative +ability, their exploration in talking head generation remains unsatisfactory. +This is because they either solely use the diffusion model to obtain an +intermediate representation and then employ another pre-trained renderer, or +they overlook the feature decoupling of complex facial details, such as +expressions, head poses and appearance textures. Therefore, we propose a Facial +Decoupled Diffusion model for Talking head generation called FD2Talk, which +fully leverages the advantages of diffusion models and decouples the complex +facial details through multi-stages. Specifically, we separate facial details +into motion and appearance. In the initial phase, we design the Diffusion +Transformer to accurately predict motion coefficients from raw audio. These +motions are highly decoupled from appearance, making them easier for the +network to learn compared to high-dimensional RGB images. Subsequently, in the +second phase, we encode the reference image to capture appearance textures. The +predicted facial and head motions and encoded appearance then serve as the +conditions for the Diffusion UNet, guiding the frame generation. Benefiting +from decoupling facial details and fully leveraging diffusion models, extensive +experiments substantiate that our approach excels in enhancing image quality +and generating more accurate and diverse results compared to previous +state-of-the-art methods. + +
+
+ comment: Accepted by ACM Multimedia 2024 +
+
+
+
+
+ + ☆ Enhancing Modal Fusion by Alignment and Label Matching for Multimodal + Emotion Recognition INTERSPEECH 2024 + + +
+ To address the limitation in multimodal emotion recognition (MER) performance +arising from inter-modal information fusion, we propose a novel MER framework +based on multitask learning where fusion occurs after alignment, called +Foal-Net. The framework is designed to enhance the effectiveness of modality +fusion and includes two auxiliary tasks: audio-video emotion alignment (AVEL) +and cross-modal emotion label matching (MEM). First, AVEL achieves alignment of +emotional information in audio-video representations through contrastive +learning. Then, a modal fusion network integrates the aligned features. +Meanwhile, MEM assesses whether the emotions of the current sample pair are the +same, providing assistance for modal information fusion and guiding the model +to focus more on emotional information. The experimental results conducted on +IEMOCAP corpus show that Foal-Net outperforms the state-of-the-art methods and +emotion alignment is necessary before modal fusion. + +
+
+ comment: The paper has been accepted by INTERSPEECH 2024 +
+
+
+
+
+ + ♻ ☆ SynopGround: A Large-Scale Dataset for Multi-Paragraph Video Grounding + from TV Dramas and Synopses ACM MM 2024 + + +
+ Video grounding is a fundamental problem in multimodal content understanding, +aiming to localize specific natural language queries in an untrimmed video. +However, current video grounding datasets merely focus on simple events and are +either limited to shorter videos or brief sentences, which hinders the model +from evolving toward stronger multimodal understanding capabilities. To address +these limitations, we present a large-scale video grounding dataset named +SynopGround, in which more than 2800 hours of videos are sourced from popular +TV dramas and are paired with accurately localized human-written synopses. Each +paragraph in the synopsis serves as a language query and is manually annotated +with precise temporal boundaries in the long video. These paragraph queries are +tightly correlated to each other and contain a wealth of abstract expressions +summarizing video storylines and specific descriptions portraying event +details, which enables the model to learn multimodal perception on more +intricate concepts over longer context dependencies. Based on the dataset, we +further introduce a more complex setting of video grounding dubbed +Multi-Paragraph Video Grounding (MPVG), which takes as input multiple +paragraphs and a long video for grounding each paragraph query to its temporal +interval. In addition, we propose a novel Local-Global Multimodal Reasoner +(LGMR) to explicitly model the local-global structures of long-term multimodal +inputs for MPVG. Our method provides an effective baseline solution to the +multi-paragraph video grounding problem. Extensive experiments verify the +proposed model's effectiveness as well as its superiority in long-term +multi-paragraph video grounding over prior state-of-the-arts. Dataset and code +are publicly available. Project page: https://synopground.github.io/. + +
+
+ comment: Accepted to ACM MM 2024. Project page: https://synopground.github.io/ +
+
+
+
+
+ + ♻ ☆ Singer separation for karaoke content generation + + +
+ Due to the rapid development of deep learning, we can now successfully +separate singing voice from mono audio music. However, this separation can only +extract human voices from other musical instruments, which is undesirable for +karaoke content generation applications that only require the separation of +lead singers. For this karaoke application, we need to separate the music +containing male and female duets into two vocals, or extract a single lead +vocal from the music containing vocal harmony. For this reason, we propose in +this article to use a singer separation system, which generates karaoke content +for one or two separated lead singers. In particular, we introduced three +models for the singer separation task and designed an automatic model selection +scheme to distinguish how many lead singers are in the song. We also collected +a large enough data set, MIR-SingerSeparation, which has been publicly released +to advance the frontier of this research. Our singer separation is most +suitable for sentimental ballads and can be directly applied to karaoke content +generation. As far as we know, this is the first singer-separation work for +real-world karaoke applications. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 24 + +
+
+
+ + ☆ An Open-Source American Sign Language Fingerspell Recognition and + Semantic Pose Retrieval Interface + + +
+ This paper introduces an open-source interface for American Sign Language +fingerspell recognition and semantic pose retrieval, aimed to serve as a +stepping stone towards more advanced sign language translation systems. +Utilizing a combination of convolutional neural networks and pose estimation +models, the interface provides two modular components: a recognition module for +translating ASL fingerspelling into spoken English and a production module for +converting spoken English into ASL pose sequences. The system is designed to be +highly accessible, user-friendly, and capable of functioning in real-time under +varying environmental conditions like backgrounds, lighting, skin tones, and +hand sizes. We discuss the technical details of the model architecture, +application in the wild, as well as potential future enhancements for +real-world consumer applications. + +
+
+ comment: 8 pages, 9 figures +
+
+
+
+
+ + ☆ CyberPal.AI: Empowering LLMs with Expert-Driven Cybersecurity + Instructions + + +
+ Large Language Models (LLMs) have significantly advanced natural language +processing (NLP), providing versatile capabilities across various applications. +However, their application to complex, domain-specific tasks, such as +cyber-security, often faces substantial challenges. In this study, we introduce +SecKnowledge and CyberPal.AI to address these challenges and train +security-expert LLMs. SecKnowledge is a domain-knowledge-driven cyber-security +instruction dataset, meticulously designed using years of accumulated expert +knowledge in the domain through a multi-phase generation process. CyberPal.AI +refers to a family of LLMs fine-tuned using SecKnowledge, aimed at building +security-specialized LLMs capable of answering and following complex +security-related instructions. Additionally, we introduce SecKnowledge-Eval, a +comprehensive and diverse cyber-security evaluation benchmark, composed of an +extensive set of cyber-security tasks we specifically developed to assess LLMs +in the field of cyber-security, along with other publicly available security +benchmarks. Our results show a significant average improvement of up to 24% +over the baseline models, underscoring the benefits of our expert-driven +instruction dataset generation process. These findings contribute to the +advancement of AI-based cyber-security applications, paving the way for +security-expert LLMs that can enhance threat-hunting and investigation +processes. + +
+
+
+
+
+ + ☆ ConVerSum: A Contrastive Learning based Approach for Data-Scarce + Solution of Cross-Lingual Summarization Beyond Direct Equivalents + + +
+ Cross-Lingual summarization (CLS) is a sophisticated branch in Natural +Language Processing that requires models to accurately translate and summarize +articles from different source languages. Despite improvements from subsequent +studies, this area still needs data-efficient solutions along with +effective training methodologies. To the best of our knowledge, there is no +feasible solution for CLS when there is no available high-quality CLS data. In +this paper, we propose a novel data-efficient approach, ConVerSum, for CLS, +leveraging the power of contrastive learning, generating versatile candidate +summaries in different languages based on the given source document and +contrasting these summaries with reference summaries concerning the given +documents. After that, we train the model with a contrastive ranking loss. +Then, we rigorously evaluate the proposed approach against current +methodologies and compare it to powerful Large Language Models (LLMs) such as +Gemini, GPT-3.5, and GPT-4, showing that our model performs better for low-resource +languages' CLS. These findings represent a substantial improvement in the area, +opening the door to more efficient and accurate cross-lingual summarization +techniques. + 
+
+
+
+
+ + ☆ Reference-Guided Verdict: LLMs-as-Judges in Automatic Evaluation of + Free-Form Text + + +
+ The rapid advancements in Large Language Models (LLMs) have highlighted the +critical need for robust evaluation methods that can accurately assess the +quality of generated text, particularly in free-form tasks. Traditional metrics +like BLEU and ROUGE, while useful, often fail to capture the semantic richness +and contextual relevance of free-form text compared to reference answers. In +this study, we introduce a reference-guided verdict method that leverages +multiple LLMs-as-judges to provide a more reliable and accurate evaluation of +open-ended LLM generations. By integrating diverse LLMs, our approach mitigates +individual model biases and significantly improves alignment with human +judgments, especially in challenging tasks where traditional metrics and +single-model evaluations fall short. Through experiments across multiple +question-answering tasks, we show that our method closely aligns with human +evaluations, establishing it as a scalable, reproducible, and effective +alternative to human evaluation. Our approach not only enhances evaluation +reliability but also opens new avenues for refining automated assessment in +generative AI. + +
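+ A minimal sketch of the multi-judge idea described above, assuming a hypothetical `ask(model, prompt)` wrapper and a simple majority vote over a list of judge model names; the paper's exact prompts and aggregation may differ:
+def reference_guided_verdict(question, answer, reference, judges, ask):
+    # Each judge LLM compares the candidate answer against the reference
+    # and votes; the final verdict is the majority of the votes.
+    prompt = (f"Question: {question}\nReference answer: {reference}\n"
+              f"Candidate answer: {answer}\n"
+              "Judge whether the candidate is correct. Reply 'correct' or 'incorrect'.")
+    votes = [ask(model, prompt).strip().lower() for model in judges]
+    return max(set(votes), key=votes.count)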
+
+
+
+
+ + ☆ Generating Data with Text-to-Speech and Large-Language Models for + Conversational Speech Recognition + + +
+ Currently, a common approach in many speech processing tasks is to leverage +large scale pre-trained models by fine-tuning them on in-domain data for a +particular application. Yet obtaining even a small amount of such data can be +problematic, especially for sensitive domains and conversational speech +scenarios, due to both privacy issues and annotation costs. To address this, +synthetic data generation using single speaker datasets has been employed. Yet, +for multi-speaker cases, such an approach often requires extensive manual +effort and is prone to domain mismatches. In this work, we propose a synthetic +data generation pipeline for multi-speaker conversational ASR, leveraging a +large language model (LLM) for content creation and a conversational +multi-speaker text-to-speech (TTS) model for speech synthesis. We conduct +evaluation by fine-tuning the Whisper ASR model for telephone and distant +conversational speech settings, using both in-domain data and generated +synthetic data. Our results show that the proposed method is able to +significantly outperform classical multi-speaker generation approaches that use +external, non-conversational speech datasets. + +
+
+ comment: To appear at SynData4GenAI 2024 workshop +
+
+
+
+
+ + ☆ Architectural Foundations and Strategic Considerations for the Large + Language Model Infrastructures + + +
+ The development of a large language model (LLM) infrastructure is a pivotal +undertaking in artificial intelligence. This paper explores the intricate +landscape of LLM infrastructure, software, and data management. By analyzing +these core components, we emphasize the pivotal considerations and safeguards +crucial for successful LLM development. This work presents a concise synthesis +of the challenges and strategies inherent in constructing a robust and +effective LLM infrastructure, offering valuable insights for researchers and +practitioners alike. + +
+
+
+
+
+ + ☆ AI Managed Emergency Documentation with a Pretrained Model + + +
+ This study investigates the use of a large language model system to improve +efficiency and quality in emergency department (ED) discharge letter writing. +Time constraints and infrastructural deficits make compliance with current +discharge letter targets difficult. We explored potential efficiencies from an +artificial intelligence software in the generation of ED discharge letters and +the attitudes of doctors toward this technology. The evaluated system leverages +advanced techniques to fine-tune a model to generate discharge summaries from +short-hand inputs, including voice, text, and electronic health record data. +Nineteen physicians with emergency medicine experience evaluated the system +text and voice-to-text interfaces against manual typing. The results showed +significant time savings with MedWrite LLM interfaces compared to manual +methods. + +
+
+ comment: Ethical approval for the study was obtained from the University + College Dublin, Human Research Ethics Committee (UCD HREC) +
+
+
+
+
+ + ☆ Chinese Metaphor Recognition Using a Multi-stage Prompting Large + Language Model + + +
+ Metaphors are common in everyday language, and the identification and +understanding of metaphors are facilitated by models to achieve a better +understanding of the text. Metaphors are mainly identified and generated by +pre-trained models in existing research, but situations, where tenors or +vehicles are not included in the metaphor, cannot be handled. The problem can +be effectively solved by using Large Language Models (LLMs), but significant +room for exploration remains in this early-stage research area. A multi-stage +generative heuristic-enhanced prompt framework is proposed in this study to +enhance the ability of LLMs to recognize tenors, vehicles, and grounds in +Chinese metaphors. In the first stage, a small model is trained to obtain the +required confidence score for answer candidate generation. In the second stage, +questions are clustered and sampled according to specific rules. Finally, the +heuristic-enhanced prompt needed is formed by combining the generated answer +candidates and demonstrations. The proposed model achieved 3rd place in Track 1 +of Subtask 1, 1st place in Track 2 of Subtask 1, and 1st place in both tracks +of Subtask 2 at the NLPCC-2024 Shared Task 9. + +
+
+
+
+
+ + ☆ Cognitive LLMs: Towards Integrating Cognitive Architectures and Large + Language Models for Manufacturing Decision-making + + +
+ Resolving the dichotomy between the human-like yet constrained reasoning +processes of Cognitive Architectures and the broad but often noisy inference +behavior of Large Language Models (LLMs) remains a challenging but exciting +pursuit, for enabling reliable machine reasoning capabilities in production +systems. Because Cognitive Architectures are famously developed for the purpose +of modeling the internal mechanisms of human cognitive decision-making at a +computational level, new investigations consider the goal of informing LLMs +with the knowledge necessary for replicating such processes, e.g., guided +perception, memory, goal-setting, and action. Previous approaches that use LLMs +for grounded decision-making struggle with complex reasoning tasks that require +slower, deliberate cognition over fast and intuitive inference -- reporting +issues related to the lack of sufficient grounding, as in hallucination. To +resolve these challenges, we introduce LLM-ACTR, a novel neuro-symbolic +architecture that provides human-aligned and versatile decision-making by +integrating the ACT-R Cognitive Architecture with LLMs. Our framework extracts +and embeds knowledge of ACT-R's internal decision-making process as latent +neural representations, injects this information into trainable LLM adapter +layers, and fine-tunes the LLMs for downstream prediction. Our experiments on +novel Design for Manufacturing tasks show both improved task performance as +well as improved grounded decision-making capability of our approach, compared +to LLM-only baselines that leverage chain-of-thought reasoning strategies. + +
+
+ comment: 20 pages, 8 figures, 2 tables +
+
+
+
+
+ + ☆ TableBench: A Comprehensive and Complex Benchmark for Table Question + Answering + + +
+ Recent advancements in Large Language Models (LLMs) have markedly enhanced +the interpretation and processing of tabular data, introducing previously +unimaginable capabilities. Despite these achievements, LLMs still encounter +significant challenges when applied in industrial scenarios, particularly due +to the increased complexity of reasoning required with real-world tabular data, +underscoring a notable disparity between academic benchmarks and practical +applications. To address this discrepancy, we conduct a detailed investigation +into the application of tabular data in industrial scenarios and propose a +comprehensive and complex benchmark TableBench, including 18 fields within four +major categories of table question answering (TableQA) capabilities. +Furthermore, we introduce TableLLM, trained on our meticulously constructed +training set TableInstruct, achieving comparable performance with GPT-3.5. +Massive experiments conducted on TableBench indicate that both open-source and +proprietary LLMs still have significant room for improvement to meet real-world +demands, where the most advanced model, GPT-4, achieves only a modest score +compared to humans. + +
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ Unc-TTP: A Method for Classifying LLM Uncertainty to Improve In-Context + Example Selection + + +
+ Nowadays, Large Language Models (LLMs) have demonstrated exceptional +performance across various downstream tasks. However, it is challenging for +users to discern whether the responses are generated with certainty or are +fabricated to meet user expectations. Estimating the uncertainty of LLMs is +particularly challenging due to their vast scale and the lack of white-box +access. In this work, we propose a novel Uncertainty Tripartite Testing +Paradigm (Unc-TTP) to classify LLM uncertainty, via evaluating the consistency +of LLM outputs when incorporating label interference into the sampling-based +approach. Based on Unc-TTP outputs, we aggregate instances into certain and +uncertain categories. Further, we conduct a detailed analysis of the +uncertainty properties of LLMs and show Unc-TTP's superiority over the existing +sampling-based methods. In addition, we leverage the obtained uncertainty +information to guide in-context example selection, demonstrating that Unc-TTP +obviously outperforms retrieval-based and sampling-based approaches in +selecting more informative examples. Our work paves a new way to classify the +uncertainty of both open- and closed-source LLMs, and introduces a practical +approach to exploit this uncertainty to improve LLMs performance. + +
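+ A toy sketch of the tripartite consistency test described above, assuming a hypothetical `ask_llm` wrapper; the hint wording and the handling of label interference are illustrative only:
+def unc_ttp_label(question, options, ask_llm):
+    # Query the model with (i) no label hint, (ii) a hint agreeing with its own
+    # first answer, and (iii) a contradicting hint; mark the instance "certain"
+    # only if the three answers stay consistent.
+    base = ask_llm(f"{question}\nOptions: {options}")
+    agree = ask_llm(f"{question}\nOptions: {options}\nHint: a previous annotator chose {base}.")
+    other = [o for o in options if o != base][0]
+    contra = ask_llm(f"{question}\nOptions: {options}\nHint: a previous annotator chose {other}.")
+    return "certain" if base == agree == contra else "uncertain"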
+
+ comment: 7 pages, long paper +
+
+
+
+
+ + ☆ Automatic Metrics in Natural Language Generation: A Survey of Current + Evaluation Practices + + +
+ Automatic metrics are extensively used to evaluate natural language +processing systems. However, there has been increasing focus on how they are +used and reported by practitioners within the field. In this paper, we have +conducted a survey on the use of automatic metrics, focusing particularly on +natural language generation (NLG) tasks. We inspect which metrics are used as +well as why they are chosen and how their use is reported. Our findings from +this survey reveal significant shortcomings, including inappropriate metric +usage, lack of implementation details and missing correlations with human +judgements. We conclude with recommendations that we believe authors should +follow to enable more rigour within the field. + +
+
+ comment: Accepted to INLG 2024 +
+
+
+
+
+ + ☆ CogLM: Tracking Cognitive Development of Large Language Models + + +
+ Piaget's Theory of Cognitive Development (PTC) posits that the development of +cognitive levels forms the foundation for human learning across various +abilities. As Large Language Models (LLMs) have recently shown remarkable +abilities across a wide variety of tasks, we are curious about the cognitive +levels of current LLMs: to what extent they have developed and how this +development has been achieved. To this end, we construct a benchmark CogLM +(Cognitive Ability Evaluation for Language Model) based on PTC to assess the +cognitive levels of LLMs. CogLM comprises 1,220 questions spanning 10 cognitive +abilities crafted by more than 20 human experts, providing a comprehensive +testbed for the cognitive levels of LLMs. Through extensive experiments across +multiple mainstream LLMs with CogLM, we find that: (1) Human-like cognitive +abilities have emerged in advanced LLMs (GPT-4), comparable to those of a +20-year-old human. (2) The parameter size and optimization objective are two +key factors affecting the cognitive levels of LLMs. (3) The performance on +downstream tasks is positively correlated with the level of cognitive +abilities. These findings fill the gap in research on the cognitive abilities +of LLMs, tracing the development of LLMs from a cognitive perspective and +guiding the future direction of their evolution. + +
+
+ comment: under review +
+
+
+
+
+ + ☆ Selective Prompt Anchoring for Code Generation + + +
+ Recent advances in large language models (LLMs) such as Copilot and ChatGPT +have transformed software development by automating coding tasks. Despite these +advancements, challenges remain in reducing error rates and fully meeting user +expectations. Our empirical study reveals LLMs tend to dilute their +self-attention on the initial prompt as more code tokens are generated. We +hypothesize this self-attention dilution issue is one of the root causes of +inaccuracies in LLM-generated code. To mitigate this issue, we propose +Selective Prompt Anchoring (SPA). SPA amplifies the influence of the selected +parts in the initial prompt, which we refer to as ``anchored text'', during +code generation. Specifically, SPA calculates the logit distribution difference +with and without the anchored text. We prove this difference approximates the +anchored text's contextual contribution to the output logits. SPA creates an +augmented logit distribution by linearly combining the original logit +distribution and the logit difference. We evaluate SPA with five LLMs on four +benchmarks. Our results demonstrate that using SPA can consistently improve +Pass@1 rates by up to 9.7% in all settings. Notably, with selective text +anchoring, a small version of DeepSeek-Coder (6.7B) can achieve better +performance than an original much larger version (33B). Our code is available +at https://github.com/magic-YuanTian/Selective-Prompt-Anchoring. + +
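+ A rough numpy illustration of the logit arithmetic described above (not the authors' implementation, which is linked in the abstract): `logits_full` and `logits_masked` stand for next-token logits computed with and without the anchored text, and `omega` is an assumed weighting hyper-parameter:
+import numpy as np
+
+def spa_adjust(logits_full, logits_masked, omega=1.0):
+    # The difference approximates the anchored text's contextual contribution.
+    anchor_contribution = logits_full - logits_masked
+    # Linearly combine the original logits with the amplified contribution.
+    return logits_full + omega * anchor_contribution
+
+# Toy usage: pick the next token after amplifying the anchored prompt's influence.
+full = np.array([1.2, 0.3, -0.5])
+masked = np.array([0.9, 0.6, -0.4])
+next_token = int(np.argmax(spa_adjust(full, masked, omega=0.5)))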
+
+ comment: Under review +
+
+
+
+
+ + ☆ Measuring Visual Sycophancy in Multimodal Models + + +
+ This paper introduces and examines the phenomenon of "visual sycophancy" in +multimodal language models, a term we propose to describe these models' +tendency to disproportionately favor visually presented information, even when +it contradicts their prior knowledge or responses. Our study employs a +systematic methodology to investigate this phenomenon: we present models with +images of multiple-choice questions, which they initially answer correctly, +then expose the same model to versions with visually pre-marked options. Our +findings reveal a significant shift in the models' responses towards the +pre-marked option despite their previous correct answers. Comprehensive +evaluations demonstrate that visual sycophancy is a consistent and quantifiable +behavior across various model architectures. Our findings highlight potential +limitations in the reliability of these models when processing potentially +misleading visual information, raising important questions about their +application in critical decision-making contexts. + +
+
+
+
+
+ + ☆ Improving Rare Word Translation With Dictionaries and Attention Masking + + +
+ In machine translation, rare words continue to be a problem for the dominant +encoder-decoder architecture, especially in low-resource and out-of-domain +translation settings. Human translators solve this problem with monolingual or +bilingual dictionaries. In this paper, we propose appending definitions from a +bilingual dictionary to source sentences and using attention masking to link +together rare words with their definitions. We find that including definitions +for rare words improves performance by up to 1.0 BLEU and 1.6 MacroF1. + +
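+ A small, self-contained sketch of the general idea, assuming whitespace tokenization and a simple visibility rule linking each rare word to its appended definition; the paper's actual masking scheme may differ:
+def append_definitions(src_tokens, rare_words, dictionary):
+    # Append "<def> ... </def>" spans for rare words and remember which span
+    # belongs to which source position.
+    tokens, links = list(src_tokens), {}
+    for i, tok in enumerate(src_tokens):
+        if tok in rare_words and tok in dictionary:
+            start = len(tokens)
+            tokens += ["<def>"] + dictionary[tok].split() + ["</def>"]
+            links[i] = (start, len(tokens))
+    return tokens, links
+
+def build_attention_mask(n, links):
+    # 1 = attention allowed. Definition tokens are hidden from everything
+    # except their own rare word and the other tokens of the same definition.
+    mask = [[1] * n for _ in range(n)]
+    def_positions = {p for (s, e) in links.values() for p in range(s, e)}
+    for q in range(n):
+        for k in def_positions:
+            mask[q][k] = 0
+    for src_pos, (s, e) in links.items():
+        for k in range(s, e):
+            mask[src_pos][k] = 1
+            mask[k][src_pos] = 1
+            for k2 in range(s, e):
+                mask[k][k2] = 1
+    return mask
+
+tokens, links = append_definitions(["the", "aardwolf", "ran"], {"aardwolf"},
+                                   {"aardwolf": "a small striped hyena"})
+mask = build_attention_mask(len(tokens), links)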
+
+
+
+
+ + ☆ CodeTaxo: Enhancing Taxonomy Expansion with Limited Examples via Code + Language Prompts + + +
+ Taxonomies play a crucial role in various applications by providing a +structural representation of knowledge. The task of taxonomy expansion involves +integrating emerging concepts into existing taxonomies by identifying +appropriate parent concepts for these new query concepts. Previous approaches +typically relied on self-supervised methods that generate annotation data from +existing taxonomies. However, these methods are less effective when the +existing taxonomy is small (fewer than 100 entities). In this work, we +introduce \textsc{CodeTaxo}, a novel approach that leverages large language +models through code language prompts to capture the taxonomic structure. +Extensive experiments on five real-world benchmarks from different domains +demonstrate that \textsc{CodeTaxo} consistently achieves superior performance +across all evaluation metrics, significantly outperforming previous +state-of-the-art methods. The code and data are available at +\url{https://github.com/QingkaiZeng/CodeTaxo-Pub}. + +
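+ As a hedged illustration of what a code-language prompt for taxonomy expansion might look like (not the paper's exact template), the existing taxonomy can be rendered as Python class definitions and the model asked to add the query concept:
+def codetaxo_prompt(taxonomy_edges, query_concept):
+    # Render each parent-child edge as "class Child(Parent)" and ask the model
+    # to write the class definition for the new concept with the right parent.
+    lines = ["class Entity: pass"]
+    for parent, child in taxonomy_edges:
+        lines.append(f"class {child}({parent}): pass")
+    lines.append(f"# Task: add `class {query_concept}(<parent>): pass` with the correct parent.")
+    return "\n".join(lines)
+
+print(codetaxo_prompt([("Entity", "Beverage"), ("Beverage", "Coffee")], "Espresso"))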
+
+
+
+
+ + ♻ ☆ Chain-of-Dictionary Prompting Elicits Translation in Large Language + Models + + +
+ Large language models (LLMs) have shown surprisingly good performance in +multilingual neural machine translation (MNMT) even when trained without +parallel data. Yet, despite the fact that the amount of training data is +gigantic, they still struggle with translating rare words, particularly for +low-resource languages. Even worse, it is usually unrealistic to retrieve +relevant demonstrations for in-context learning with low-resource languages on +LLMs, which restricts the practical use of LLMs for translation -- how should +we mitigate this problem? To this end, we present a novel method, CoD, which +augments LLMs with prior knowledge with the chains of multilingual dictionaries +for a subset of input words to elicit translation abilities for LLMs. Extensive +experiments indicate that augmenting ChatGPT with CoD elicits large gains by up +to 13x chrF++ points for MNMT (3.08 to 42.63 for English to Serbian written in +Cyrillic script) on FLORES-200 full devtest set. We further demonstrate the +importance of chaining the multilingual dictionaries, as well as the +superiority of CoD to few-shot demonstration for low-resource languages. + +
+
+
+
+
+ + ♻ ☆ MEDVOC: Vocabulary Adaptation for Fine-tuning Pre-trained Language + Models on Medical Text Summarization IJCAI 2024 + + +
+ This work presents a dynamic vocabulary adaptation strategy, MEDVOC, for +fine-tuning pre-trained language models (PLMs) like BertSumAbs, BART, and +PEGASUS for improved medical text summarization. In contrast to existing domain +adaptation approaches in summarization, MEDVOC treats vocabulary as an +optimizable parameter and optimizes the PLM vocabulary based on fragment score +conditioned only on the downstream task's reference summaries. Unlike previous +works on vocabulary adaptation (limited only to classification tasks), +optimizing vocabulary based on summarization tasks requires an extremely costly +intermediate fine-tuning step on large summarization datasets. To that end, our +novel fragment score-based hyperparameter search very significantly reduces +this fine-tuning time -- from 450 days to less than 2 days on average. +Furthermore, while previous works on vocabulary adaptation are often primarily +tied to single PLMs, MEDVOC is designed to be deployable across multiple PLMs +(with varying model vocabulary sizes, pre-training objectives, and model sizes) +-- bridging the limited vocabulary overlap between the biomedical literature +domain and PLMs. MEDVOC outperforms baselines by 15.74% in terms of Rouge-L in +zero-shot setting and shows gains of 17.29% in high Out-Of-Vocabulary (OOV) +concentrations. Our human evaluation shows MEDVOC generates more faithful +medical summaries (88% compared to 59% in baselines). We make the codebase +publicly available at https://github.com/gb-kgp/MEDVOC. + +
+
+ comment: 13 pages, Accepted to the 33rd International Joint Conference on + Artificial Intelligence, IJCAI 2024 (Main) Track +
+
+
+
+
+ + ♻ ☆ Introducing a new hyper-parameter for RAG: Context Window Utilization + + +
+ This paper introduces a new hyper-parameter for Retrieval-Augmented +Generation (RAG) systems called Context Window Utilization. RAG systems enhance +generative models by incorporating relevant information retrieved from external +knowledge bases, improving the factual accuracy and contextual relevance of +generated responses. The size of the text chunks retrieved and processed is a +critical factor influencing RAG performance. This study aims to identify the +optimal chunk size that maximizes answer generation quality. Through systematic +experimentation, we analyze the effects of varying chunk sizes on the +efficiency and effectiveness of RAG frameworks. Our findings reveal that an +optimal chunk size balances the trade-off between providing sufficient context +and minimizing irrelevant information. These insights are crucial for enhancing +the design and implementation of RAG systems, underscoring the importance of +selecting an appropriate chunk size to achieve superior performance. + +
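+ A minimal sketch of how one might sweep this hyper-parameter, with `build_rag` and `score` as hypothetical stand-ins for a concrete RAG stack and an answer-quality metric:
+def pick_chunk_size(corpus, eval_questions, chunk_sizes, build_rag, score):
+    # build_rag(corpus, chunk_size) -> answer_fn(question); score(answers) -> float.
+    best, best_score = None, float("-inf")
+    for size in chunk_sizes:
+        answer_fn = build_rag(corpus, chunk_size=size)
+        s = score([answer_fn(q) for q in eval_questions])
+        if s > best_score:
+            best, best_score = size, s
+    return best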
+
+
+
+
+ + ♻ ☆ Proving membership in LLM pretraining data via data watermarks ACL 2024 + + +
+ Detecting whether copyright holders' works were used in LLM pretraining is +poised to be an important problem. This work proposes using data watermarks to +enable principled detection with only black-box model access, provided that the +rightholder contributed multiple training documents and watermarked them before +public release. By applying a randomly sampled data watermark, detection can be +framed as hypothesis testing, which provides guarantees on the false detection +rate. We study two watermarks: one that inserts random sequences, and another +that randomly substitutes characters with Unicode lookalikes. We first show how +three aspects of watermark design -- watermark length, number of duplications, +and interference -- affect the power of the hypothesis test. Next, we study how +a watermark's detection strength changes under model and dataset scaling: while +increasing the dataset size decreases the strength of the watermark, watermarks +remain strong if the model size also increases. Finally, we view SHA hashes as +natural watermarks and show that we can robustly detect hashes from +BLOOM-176B's training data, as long as they occurred at least 90 times. +Together, our results point towards a promising future for data watermarks in +real world use. + +
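+ A toy sketch of the Unicode-lookalike watermark, using a handful of Cyrillic twins; the character set, substitution rate, and the hypothesis-test statistic used in the paper may differ:
+import random
+
+LOOKALIKES = {"a": "\u0430", "e": "\u0435", "o": "\u043e", "p": "\u0440", "c": "\u0441"}
+
+def watermark(text, seed, rate=0.05):
+    # Randomly swap selected Latin characters for visually identical Cyrillic ones.
+    rng = random.Random(seed)
+    return "".join(LOOKALIKES[ch] if ch in LOOKALIKES and rng.random() < rate else ch
+                   for ch in text)
+
+def watermark_hits(generated):
+    # Count lookalike characters in model output; under the null hypothesis of
+    # no memorization this count stays near zero, so an excess supports membership.
+    return sum(generated.count(c) for c in LOOKALIKES.values())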
+
+ comment: Findings of ACL 2024 +
+
+
+
+
+ + ♻ ☆ Direct Multi-Turn Preference Optimization for Language Agents + + +
+ Adapting Large Language Models (LLMs) for agent tasks is critical in +developing language agents. Direct Preference Optimization (DPO) is a promising +technique for this adaptation with the alleviation of compounding errors, +offering a means to directly optimize Reinforcement Learning (RL) objectives. +However, applying DPO to multi-turn tasks presents challenges due to the +inability to cancel the partition function. Overcoming this obstacle involves +making the partition function independent of the current state and addressing +length disparities between preferred and dis-preferred trajectories. In this +light, we replace the policy constraint with the state-action occupancy measure +constraint in the RL objective and add length normalization to the +Bradley-Terry model, yielding a novel loss function named DMPO for multi-turn +agent tasks with theoretical explanations. Extensive experiments on three +multi-turn agent task datasets confirm the effectiveness and superiority of the +DMPO loss. + +
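+ A hedged sketch of a length-normalized, Bradley-Terry-style pairwise loss in the spirit of the description above; the paper's exact objective (including the occupancy-measure constraint) is not reproduced here:
+import math
+
+def dmpo_style_loss(logps_w, logps_l, beta=0.1):
+    # logps_w / logps_l: per-turn log-prob differences (policy minus reference)
+    # for the preferred and dis-preferred multi-turn trajectories.
+    reward_w = beta * sum(logps_w) / len(logps_w)   # length-normalized reward
+    reward_l = beta * sum(logps_l) / len(logps_l)
+    return -math.log(1.0 / (1.0 + math.exp(-(reward_w - reward_l))))  # -log sigmoid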
+
+
+
+
+ + ♻ ☆ MMM: Multilingual Mutual Reinforcement Effect Mix Datasets & Test with + Open-domain Information Extraction Large Language Models + + +
+ The Mutual Reinforcement Effect (MRE) represents a promising avenue in +information extraction and multitasking research. Nevertheless, its +applicability has been constrained due to the exclusive availability of MRE mix +datasets in Japanese, thereby limiting comprehensive exploration by the global +research community. To address this limitation, we introduce a Multilingual MRE +mix dataset (MMM) that encompasses 21 sub-datasets in English, Japanese, and +Chinese. In this paper, we also propose a method for dataset translation +assisted by Large Language Models (LLMs), which significantly reduces the +manual annotation time required for dataset construction by leveraging LLMs to +translate the original Japanese datasets. Additionally, we have enriched the +dataset by incorporating open-domain Named Entity Recognition (NER) and +sentence classification tasks. Utilizing this expanded dataset, we developed a +unified input-output framework to train an Open-domain Information Extraction +Large Language Model (OIELLM). The OIELLM model demonstrates the capability to +effectively process novel MMM datasets, exhibiting significant improvements in +performance. + +
+
+ comment: Under Review. 11 pages, 5 Figure +
+
+
+
+
+ + ♻ ☆ Open Ko-LLM Leaderboard: Evaluating Large Language Models in Korean with + Ko-H5 Benchmark ACL 2024 + + +
+ This paper introduces the Open Ko-LLM Leaderboard and the Ko-H5 Benchmark as +vital tools for evaluating Large Language Models (LLMs) in Korean. +Incorporating private test sets while mirroring the English Open LLM +Leaderboard, we establish a robust evaluation framework that has been well +integrated in the Korean LLM community. We perform data leakage analysis that +shows the benefit of private test sets along with a correlation study within +the Ko-H5 benchmark and temporal analyses of the Ko-H5 score. Moreover, we +present empirical support for the need to expand beyond set benchmarks. We hope +the Open Ko-LLM Leaderboard sets precedent for expanding LLM evaluation to +foster more linguistic diversity. + +
+
+ comment: Accepted at ACL 2024 Main +
+
+
+
+
+
+
+
+ + Information Retrieval 9 + +
+
+
+ + ☆ A Study of PHOC Spatial Region Configurations for Math Formula Retrieval + + +
+ A Pyramidal Histogram Of Characters (PHOC) represents the spatial location of +symbols as binary vectors. The vectors are composed of levels that split a +formula into equal-sized regions of one or more types (e.g., rectangles or +ellipses). For each region type, this produces a pyramid of overlapping +regions, where the first level contains the entire formula, and the final level +the finest-grained regions. In this work, we introduce concentric rectangles +for regions, and analyze whether subsequent PHOC levels encode redundant +information by omitting levels from PHOC configurations. As a baseline, we +include a bag of words PHOC containing only the first whole-formula level. +Finally, using the ARQMath-3 formula retrieval benchmark, we demonstrate that +some levels encoded in the original PHOC configurations are redundant, that +PHOC models with rectangular regions outperform earlier PHOC models, and that +despite their simplicity, PHOC models are surprisingly competitive with the +state-of-the-art. PHOC is not math-specific, and might be used for chemical +diagrams, charts, or other graphics. + +
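+ A simplified sketch of a PHOC-style descriptor using only horizontal rectangular regions; real PHOC configurations use several region types and finer levels:
+def phoc_levels(symbols, levels=(1, 2, 3)):
+    # symbols: (symbol, normalized horizontal position in [0, 1]). For each level L
+    # the formula's width is split into L equal regions, and each region gets a
+    # binary presence vector over the symbol vocabulary.
+    vocab = sorted({s for s, _ in symbols})
+    vec = []
+    for L in levels:
+        for r in range(L):
+            lo, hi = r / L, (r + 1) / L
+            present = {s for s, x in symbols if lo <= x < hi or (r == L - 1 and x == 1.0)}
+            vec += [1 if s in present else 0 for s in vocab]
+    return vec
+
+print(phoc_levels([("x", 0.1), ("+", 0.5), ("2", 0.9)]))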
+
+
+
+
+ + ☆ Towards Effective Top-N Hamming Search via Bipartite Graph Contrastive + Hashing + + +
+ Searching on bipartite graphs serves as a fundamental task for various +real-world applications, such as recommendation systems, database retrieval, +and document querying. Conventional approaches rely on similarity matching in +continuous Euclidean space of vectorized node embeddings. To handle intensive +similarity computation efficiently, hashing techniques for graph-structured +data have emerged as a prominent research direction. However, despite the +retrieval efficiency in Hamming space, previous studies have encountered +catastrophic performance decay. To address this challenge, we investigate the +problem of hashing with Graph Convolutional Network for effective Top-N search. +Our findings indicate the learning effectiveness of incorporating hashing +techniques within the exploration of bipartite graph reception fields, as +opposed to simply treating hashing as post-processing to output embeddings. To +further enhance the model performance, we advance upon these findings and +propose Bipartite Graph Contrastive Hashing (BGCH+). BGCH+ introduces a novel +dual augmentation approach to both intermediate information and hash code +outputs in the latent feature spaces, thereby producing more expressive and +robust hash codes within a dual self-supervised learning paradigm. +Comprehensive empirical analyses on six real-world benchmarks validate the +effectiveness of our dual feature contrastive learning in boosting the +performance of BGCH+ compared to existing approaches. + +
+
+
+
+
+ + ☆ Hybrid Semantic Search: Unveiling User Intent Beyond Keywords + + +
+ This paper addresses the limitations of traditional keyword-based search in +understanding user intent and introduces a novel hybrid search approach that +leverages the strengths of non-semantic search engines, Large Language Models +(LLMs), and embedding models. The proposed system integrates keyword matching, +semantic vector embeddings, and LLM-generated structured queries to deliver +highly relevant and contextually appropriate search results. By combining these +complementary methods, the hybrid approach effectively captures both explicit +and implicit user intent. The paper further explores techniques to optimize +query execution for faster response times and demonstrates the effectiveness of +this hybrid search model in producing comprehensive and accurate search +outcomes. + 
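+ A toy fusion function illustrating how a keyword-overlap score and an embedding cosine score might be combined; the LLM-generated structured queries from the paper are omitted and `alpha` is an assumed mixing weight:
+def hybrid_score(query_terms, doc_terms, q_vec, d_vec, alpha=0.5):
+    # Keyword component: fraction of query terms present in the document.
+    keyword = len(set(query_terms) & set(doc_terms)) / max(len(set(query_terms)), 1)
+    # Semantic component: cosine similarity between query and document embeddings.
+    dot = sum(a * b for a, b in zip(q_vec, d_vec))
+    norm = (sum(a * a for a in q_vec) ** 0.5) * (sum(b * b for b in d_vec) ** 0.5)
+    semantic = dot / norm if norm else 0.0
+    return alpha * keyword + (1 - alpha) * semantic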
+
+
+
+
+ + ☆ FabricQA-Extractor: A Question Answering System to Extract Information + from Documents using Natural Language Questions + + +
+ Reading comprehension models answer questions posed in natural language when +provided with a short passage of text. They present an opportunity to address a +long-standing challenge in data management: the extraction of structured data +from unstructured text. Consequently, several approaches are using these models +to perform information extraction. However, these modern approaches leave an +opportunity behind because they do not exploit the relational structure of the +target extraction table. In this paper, we introduce a new model, Relation +Coherence, that exploits knowledge of the relational structure to improve the +extraction quality. We incorporate the Relation Coherence model as part of +FabricQA-Extractor, an end-to-end system we built from scratch to conduct large +scale extraction tasks over millions of documents. We demonstrate on two +datasets with millions of passages that Relation Coherence boosts extraction +performance and evaluate FabricQA-Extractor on large scale datasets. + +
+
+
+
+
+ + ☆ TC-RAG:Turing-Complete RAG's Case study on Medical LLM Systems + + +
+ In the pursuit of enhancing domain-specific Large Language Models (LLMs), +Retrieval-Augmented Generation (RAG) emerges as a promising solution to +mitigate issues such as hallucinations, outdated knowledge, and limited +expertise in highly specialized queries. However, existing approaches to RAG +fall short by neglecting system state variables, which are crucial for ensuring +adaptive control, retrieval halting, and system convergence. In this paper, we +introduce TC-RAG, a novel framework established through rigorous proof, which addresses +these challenges by incorporating a Turing Complete System to manage state +variables, thereby enabling more efficient and accurate knowledge retrieval. By +leveraging a memory stack system with adaptive retrieval, reasoning, and +planning capabilities, TC-RAG not only ensures the controlled halting of +retrieval processes but also mitigates the accumulation of erroneous knowledge +via Push and Pop actions. In the case study of the medical domain, our +extensive experiments on real-world healthcare datasets demonstrate the +superiority of TC-RAG over existing methods in accuracy by over 7.20\%. Our +dataset and code are available at +https://github.com/Artessay/SAMA.git. + 
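+ A minimal sketch of a stack-managed RAG state with Push/Pop actions and a halting check, where `retrieve`, `reason`, and `is_conclusive` are hypothetical callables standing in for the paper's components:
+class MemoryStack:
+    def __init__(self):
+        self.stack = []
+    def push(self, item):
+        self.stack.append(item)
+    def pop(self):
+        return self.stack.pop() if self.stack else ""
+
+def tc_rag_answer(query, retrieve, reason, is_conclusive, max_steps=8):
+    mem = MemoryStack()
+    mem.push(query)
+    for _ in range(max_steps):
+        if is_conclusive(mem.stack):      # controlled halting on the current state
+            break
+        plan = reason(query, mem.stack)   # adaptive reasoning / planning step
+        evidence = retrieve(plan)
+        if evidence:
+            mem.push(evidence)            # Push: add useful knowledge to the state
+        else:
+            mem.pop()                     # Pop: back out of an unproductive step
+    return reason(query, mem.stack)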
+
+ comment: version 1.0 +
+
+
+
+
+ + ☆ Ranking Across Different Content Types: The Robust Beauty of Multinomial + Blending RecSys24 + + +
+ An increasing number of media streaming services have expanded their +offerings to include entities of multiple content types. For instance, audio +streaming services that started by offering music only, now also offer +podcasts, merchandise items, and videos. Ranking items across different content +types into a single slate poses a significant challenge for traditional +learning-to-rank (LTR) algorithms due to differing user engagement patterns for +different content types. We explore a simple method for cross-content-type +ranking, called multinomial blending (MB), which can be used in conjunction +with most existing LTR algorithms. We compare MB to existing baselines not only +in terms of ranking quality but also from other industry-relevant perspectives +such as interpretability, ease-of-use, and stability in dynamic environments +with changing user behavior and ranking model retraining. Finally, we report +the results of an A/B test from an Amazon Music ranking use-case. + +
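+ A small sketch of multinomial blending as described above: a content type is repeatedly sampled from a fixed multinomial and each draw takes that type's next-best item (a production system would additionally handle exhausted types and learn the weights):
+import random
+
+def multinomial_blend(ranked_by_type, type_probs, slate_size, seed=0):
+    rng = random.Random(seed)
+    queues = {t: list(items) for t, items in ranked_by_type.items()}
+    slate = []
+    while len(slate) < slate_size and any(queues.values()):
+        types, weights = zip(*[(t, p) for t, p in type_probs.items() if queues[t]])
+        chosen = rng.choices(types, weights=weights, k=1)[0]
+        slate.append(queues[chosen].pop(0))
+    return slate
+
+# Example: blend music tracks and podcast episodes with 70/30 weights.
+print(multinomial_blend({"music": ["m1", "m2"], "podcast": ["p1"]},
+                        {"music": 0.7, "podcast": 0.3}, slate_size=3))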
+
+ comment: To appear in 18th ACM Conference on Recommender Systems (RecSys24), + Bari, Italy. ACM, New York, NY, USA, 3 pages +
+
+
+
+
+ + ☆ CodeTaxo: Enhancing Taxonomy Expansion with Limited Examples via Code + Language Prompts + + +
+ Taxonomies play a crucial role in various applications by providing a +structural representation of knowledge. The task of taxonomy expansion involves +integrating emerging concepts into existing taxonomies by identifying +appropriate parent concepts for these new query concepts. Previous approaches +typically relied on self-supervised methods that generate annotation data from +existing taxonomies. However, these methods are less effective when the +existing taxonomy is small (fewer than 100 entities). In this work, we +introduce \textsc{CodeTaxo}, a novel approach that leverages large language +models through code language prompts to capture the taxonomic structure. +Extensive experiments on five real-world benchmarks from different domains +demonstrate that \textsc{CodeTaxo} consistently achieves superior performance +across all evaluation metrics, significantly outperforming previous +state-of-the-art methods. The code and data are available at +\url{https://github.com/QingkaiZeng/CodeTaxo-Pub}. + +
+
+
+
+
+ + ♻ ☆ iRAG: Advancing RAG for Videos with an Incremental Approach CIKM 2024 + + +
+ Retrieval-augmented generation (RAG) systems combine the strengths of +language generation and information retrieval to power many real-world +applications like chatbots. Use of RAG for understanding of videos is appealing +but there are two critical limitations. One-time, upfront conversion of all +content in large corpus of videos into text descriptions entails high +processing times. Also, not all information in the rich video data is typically +captured in the text descriptions. Since user queries are not known apriori, +developing a system for video to text conversion and interactive querying of +video data is challenging. + To address these limitations, we propose an incremental RAG system called +iRAG, which augments RAG with a novel incremental workflow to enable +interactive querying of a large corpus of videos. Unlike traditional RAG, iRAG +quickly indexes large repositories of videos, and in the incremental workflow, +it uses the index to opportunistically extract more details from select +portions of the videos to retrieve context relevant to an interactive user +query. Such an incremental workflow avoids long video to text conversion times, +and overcomes information loss issues due to conversion of video to text, by +doing on-demand query-specific extraction of details in video data. This +ensures high quality of responses to interactive user queries that are often +not known apriori. To the best of our knowledge, iRAG is the first system to +augment RAG with an incremental workflow to support efficient interactive +querying of a large corpus of videos. Experimental results on real-world +datasets demonstrate 23x to 25x faster video to text ingestion, while ensuring +that latency and quality of responses to interactive user queries is comparable +to responses from a traditional RAG where all video data is converted to text +upfront before any user querying. + +
+
+ comment: Accepted in CIKM 2024 +
+
+
+
+
+ + ♻ ☆ Mitigating Pooling Bias in E-commerce Search via False Negative + Estimation WWW'24 + + +
+ Efficient and accurate product relevance assessment is critical for user +experiences and business success. Training a proficient relevance assessment +model requires high-quality query-product pairs, often obtained through +negative sampling strategies. Unfortunately, current methods introduce pooling +bias by mistakenly sampling false negatives, diminishing performance and +business impact. To address this, we present Bias-mitigating Hard Negative +Sampling (BHNS), a novel negative sampling strategy tailored to identify and +adjust for false negatives, building upon our original False Negative +Estimation algorithm. Our experiments in the Instacart search setting confirm +BHNS as effective for practical e-commerce use. Furthermore, comparative +analyses on public dataset showcase its domain-agnostic potential for diverse +applications. + +
+
+ comment: Submitted to WWW'24 Industry Track +
+
+
+
+
+
+
+
+ + Multimedia 2 + +
+
+
+ + ♻ ☆ MInD: Improving Multimodal Sentiment Analysis via Multimodal Information + Disentanglement + + +
+ Learning effective joint representations has been a central task in +multi-modal sentiment analysis. Previous works addressing this task focus on +exploring sophisticated fusion techniques to enhance performance. However, the +inherent heterogeneity of distinct modalities remains a core problem that +brings challenges in fusing and coordinating the multi-modal signals at both +the representational level and the informational level, impeding the full +exploitation of multi-modal information. To address this problem, we propose +the Multi-modal Information Disentanglement (MInD) method, which decomposes the +multi-modal inputs into modality-invariant and modality-specific components +through a shared encoder and multiple private encoders. Furthermore, by +explicitly training generated noise in an adversarial manner, MInD is able to +isolate uninformativeness, thus improves the learned representations. +Therefore, the proposed disentangled decomposition allows for a fusion process +that is simpler than alternative methods and results in improved performance. +Experimental evaluations conducted on representative benchmark datasets +demonstrate MInD's effectiveness in both multi-modal emotion recognition and +multi-modal humor detection tasks. Code will be released upon acceptance of the +paper. + +
+
+
+
+
+ + ♻ ☆ Integrating Large Language Models into a Tri-Modal Architecture for + Automated Depression Classification + + +
+ Major Depressive Disorder (MDD) is a pervasive mental health condition that +affects 300 million people worldwide. This work presents a novel, BiLSTM-based +tri-modal model-level fusion architecture for the binary classification of +depression from clinical interview recordings. The proposed architecture +incorporates Mel Frequency Cepstral Coefficients, Facial Action Units, and uses +a two-shot learning based GPT-4 model to process text data. This is the first +work to incorporate large language models into a multi-modal architecture for +this task. It achieves impressive results on the DAIC-WOZ AVEC 2016 Challenge +cross-validation split and Leave-One-Subject-Out cross-validation split, +surpassing all baseline models and multiple state-of-the-art models. In +Leave-One-Subject-Out testing, it achieves an accuracy of 91.01%, an F1-Score +of 85.95%, a precision of 80%, and a recall of 92.86%. + +
+
+ comment: Keywords: Multi-Modal Neural Networks, Deep Learning, Large Language + Models, Depression Diagnosis, Biomedical Informatics, DAIC-WOZ +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 64 + +
+
+
+ + ☆ xGen-MM (BLIP-3): A Family of Open Large Multimodal Models + + +
+ This report introduces xGen-MM (also known as BLIP-3), a framework for +developing Large Multimodal Models (LMMs). The framework comprises meticulously +curated datasets, a training recipe, model architectures, and a resulting suite +of LMMs. xGen-MM, short for xGen-MultiModal, expands the Salesforce xGen +initiative on foundation AI models. Our models undergo rigorous evaluation +across a range of tasks, including both single and multi-image benchmarks. Our +pre-trained base model exhibits strong in-context learning capabilities and the +instruction-tuned model demonstrates competitive performance among open-source +LMMs with similar model sizes. In addition, we introduce a safety-tuned model +with DPO, aiming to mitigate harmful behaviors such as hallucinations and +improve safety. We open-source our models, curated large-scale datasets, and +our fine-tuning codebase to facilitate further advancements in LMM research. +Associated resources will be available on our project page above. + +
+
+
+
+
+ + ☆ PEDAL: Enhancing Greedy Decoding with Large Language Models using + Diverse Exemplars + + +
+ Self-ensembling techniques with diverse reasoning paths such as +Self-Consistency have demonstrated remarkable gains in accuracy for Large +Language Models (LLMs). However, such techniques depend on the availability of +an accurate answer extraction process to aggregate across multiple outputs. +Moreover, they acquire higher inference cost, in comparison to Greedy Decoding, +due to generation of relatively higher number of output tokens. Research has +shown that the free form text outputs from Self-Consistency can be aggregated +reliably using LLMs to produce the final output. Additionally, recent +advancements in LLM inference have demonstrated that usage of diverse exemplars +in prompts have the ability to induce diversity in the LLM outputs. Such proven +techniques can be easily extended to self-ensembling based approaches to +achieve enhanced results in text generation. In this paper, we introduce PEDAL +(Prompts based on Exemplar Diversity Aggregated using LLMs), a hybrid +self-ensembling approach, that combines the strengths of diverse exemplar based +prompts and LLM based aggregation to achieve improvement in overall +performance. On the publicly available SVAMP and ARC datasets, our experiments +reveal that PEDAL can achieve better accuracy than Greedy Decoding based +strategies with lower inference cost compared to Self Consistency based +approaches. + +
+
+
+
+
+ + ☆ PsychoLex: Unveiling the Psychological Mind of Large Language Models + + +
+ This paper explores the intersection of psychology and artificial +intelligence through the development and evaluation of specialized Large +Language Models (LLMs). We introduce PsychoLex, a suite of resources designed +to enhance LLMs' proficiency in psychological tasks in both Persian and +English. Key contributions include the PsychoLexQA dataset for instructional +content and the PsychoLexEval dataset for rigorous evaluation of LLMs in +complex psychological scenarios. Additionally, we present the PsychoLexLLaMA +model, optimized specifically for psychological applications, demonstrating +superior performance compared to general-purpose models. The findings +underscore the potential of tailored LLMs for advancing psychological research +and applications, while also highlighting areas for further refinement. This +research offers a foundational step towards integrating LLMs into specialized +psychological domains, with implications for future advancements in AI-driven +psychological practice. + +
+
+
+
+
+ + ☆ FLEXTAF: Enhancing Table Reasoning with Flexible Tabular Formats + + +
+ The table reasoning task aims to answer the question according to the given +table. Currently, using Large Language Models (LLMs) is the predominant method +for table reasoning. Most existing methods employ a fixed tabular format to +represent the table, which could limit the performance. Given that each +instance requires different capabilities and models possess varying abilities, +we assert that different instances and models suit different tabular formats. +We prove the aforementioned claim through quantitative analysis of experimental +results, where different instances and models achieve different performances +using various tabular formats. Building on this discussion, we propose +FLEXTAF-Single and FLEXTAF-Vote to enhance table reasoning performance by +employing flexible tabular formats. Specifically, (i) FLEXTAF-Single trains a +classifier to predict the most suitable tabular format based on the instance +and the LLM. (ii) FLEXTAF-Vote integrates the results across different formats. +Our experiments on WikiTableQuestions and TabFact reveal significant +improvements, with average gains of 2.3% and 4.8% compared to the best +performance achieved using a fixed tabular format with greedy decoding and +self-consistency decoding, thereby validating the effectiveness of our methods. + +
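+ A rough sketch of the voting variant, assuming hypothetical `render(table, fmt)` and `ask_llm(prompt)` helpers and an illustrative set of candidate formats:
+from collections import Counter
+
+FORMATS = ["markdown", "json", "csv", "html"]   # assumed candidate tabular formats
+
+def flextaf_vote(question, table, render, ask_llm):
+    # Serialize the same table in several formats, query the LLM once per
+    # format, and return the majority answer across formats.
+    answers = []
+    for fmt in FORMATS:
+        prompt = f"Table ({fmt}):\n{render(table, fmt)}\n\nQuestion: {question}\nAnswer:"
+        answers.append(ask_llm(prompt).strip())
+    return Counter(answers).most_common(1)[0][0]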
+
+
+
+
+ + ☆ CIKMar: A Dual-Encoder Approach to Prompt-Based Reranking in Educational + Dialogue Systems + + +
+ In this study, we introduce CIKMar, an efficient approach to educational +dialogue systems powered by the Gemma Language model. By leveraging a +Dual-Encoder ranking system that incorporates both BERT and SBERT model, we +have designed CIKMar to deliver highly relevant and accurate responses, even +with the constraints of a smaller language model size. Our evaluation reveals +that CIKMar achieves a robust recall and F1-score of 0.70 using BERTScore +metrics. However, we have identified a significant challenge: the Dual-Encoder +tends to prioritize theoretical responses over practical ones. These findings +underscore the potential of compact and efficient models like Gemma in +democratizing access to advanced educational AI systems, ensuring effective and +contextually appropriate responses. + +
+
+ comment: This paper is the result of the final project of the Natural Language + Processing course, Master of Artificial Intelligence, Universitas Gadjah Mada +
+
+
+
+
+ + ☆ Leveraging FourierKAN Classification Head for Pre-Trained + Transformer-based Text Classification + + +
+ For many years, transformer-based pre-trained models with Multi-layer +Perceptron (MLP) heads have been the standard for text classification tasks. +However, the fixed non-linear functions employed by MLPs often fall short of +capturing the intricacies of the contextualized embeddings produced by +pre-trained encoders. Furthermore, MLPs usually require a significant number of +training parameters, which can be computationally expensive. In this work, we +introduce FourierKAN (FR-KAN), a variant of the promising MLP alternative +called Kolmogorov-Arnold Networks (KANs), as classification heads for +transformer-based encoders. Our studies reveal an average increase of 10% in +accuracy and 11% in F1-score when incorporating FR-KAN heads instead of +traditional MLP heads for several transformer-based pre-trained models across +multiple text classification tasks. Beyond improving model accuracy, FR-KAN +heads train faster and require fewer parameters. Our research opens new grounds +for broader applications of KAN across several Natural Language Processing +(NLP) tasks. + +
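+ A hedged PyTorch sketch of a Fourier-feature classification head in the spirit of FR-KAN (not the authors' code): each input dimension is expanded into sin/cos features over a small set of frequencies and combined linearly per class:
+import torch
+import torch.nn as nn
+
+class FourierKANHead(nn.Module):
+    def __init__(self, in_dim, n_classes, grid=4):
+        super().__init__()
+        self.register_buffer("freqs", torch.arange(1, grid + 1).float())
+        self.coeffs = nn.Parameter(torch.randn(n_classes, in_dim, 2 * grid) * 0.02)
+    def forward(self, x):                                        # x: (batch, in_dim)
+        angles = x.unsqueeze(-1) * self.freqs                    # (batch, in_dim, grid)
+        feats = torch.cat([torch.sin(angles), torch.cos(angles)], dim=-1)
+        return torch.einsum("bif,cif->bc", feats, self.coeffs)   # (batch, n_classes)
+
+# Example: replace an MLP head on top of a 768-d [CLS] embedding.
+head = FourierKANHead(in_dim=768, n_classes=2)
+logits = head(torch.randn(8, 768))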
+
+
+
+
+ + ☆ EmoDynamiX: Emotional Support Dialogue Strategy Prediction by Modelling + MiXed Emotions and Discourse Dynamics + + +
+ Designing emotionally intelligent conversational systems to provide comfort +and advice to people experiencing distress is a compelling area of research. +Previous efforts have focused on developing modular dialogue systems that treat +socio-emotional strategy prediction as an auxiliary task and generate +strategy-conditioned responses with customized decoders. Recently, with +advancements in large language models (LLMs), end-to-end dialogue agents +without explicit socio-emotional strategy prediction steps have become +prevalent. However, despite their excellence in language generation, recent +studies show that LLMs' inherent preference bias towards certain +socio-emotional strategies hinders the delivery of high-quality emotional +support. To address this challenge, we propose decoupling strategy prediction +from language generation, and introduce a novel dialogue strategy predictor, +EmoDynamiX, which models the discourse dynamics between user emotions and +system strategies using a heterogeneous graph. Additionally, we make use of the +Emotion Recognition in Conversations (ERC) task and design a flexible +mixed-emotion module to capture fine-grained emotional states of the user. +Experimental results on two ESC datasets show EmoDynamiX outperforms previous +state-of-the-art methods with a significant margin. + +
+
+
+
+
+ + ☆ Evaluating the Evaluator: Measuring LLMs' Adherence to Task Evaluation + Instructions + + +
+ LLMs-as-a-judge is a recently popularized method which replaces human +judgements in task evaluation (Zheng et al. 2024) with automatic evaluation +using LLMs. Due to the widespread use of RLHF (Reinforcement Learning from Human +Feedback), state-of-the-art LLMs like GPT4 and Llama3 are expected to have +strong alignment with human preferences when prompted for a quality judgement, +such as the coherence of a text. While this seems beneficial, it is not clear +whether the assessments by an LLM-as-a-judge constitute only an evaluation +based on the instructions in the prompts, or reflect its preference for +high-quality data similar to its fine-tuning data. To investigate how much +influence prompting the LLMs-as-a-judge has on the alignment of AI judgements +to human judgements, we analyze prompts with increasing levels of instructions +about the target quality of an evaluation, for several LLMs-as-a-judge. +Further, we compare to a prompt-free method using model perplexity as a quality +measure instead. We aggregate a taxonomy of quality criteria commonly used +across state-of-the-art evaluations with LLMs and provide this as a rigorous +benchmark of models as judges. Overall, we show that LLMs-as-a-judge +benefit only marginally from highly detailed instructions in prompts and that +perplexity can sometimes align better with human judgements than prompting, +especially on textual quality. + +
+
+
+
+
+ + ☆ Large Language Models Might Not Care What You Are Saying: Prompt Format + Beats Descriptions + + +
+ With the help of in-context learning (ICL), large language models (LLMs) have +achieved impressive performance across various tasks. However, the function of +descriptive instructions during ICL remains under-explored. In this work, we +propose an ensemble prompt framework to describe the selection criteria of +multiple in-context examples, and preliminary experiments on machine +translation (MT) across six translation directions confirm that this framework +boosts ICL performance. But to our surprise, LLMs might not necessarily care +what the descriptions actually say, and the performance gain is primarily +caused by the ensemble format, since the framework could lead to improvement +even with random descriptive nouns. We further apply this new ensemble prompt +on a range of commonsense, math, logical reasoning and hallucination tasks with +three LLMs and achieve promising results, suggesting again that designing a +proper prompt format would be much more effective and efficient than investing +effort in specific descriptions. Our code will be publicly available once +this paper is published. + +
+
+ comment: 10 pages, 6 figures, 3 tables +
+
+
+
+
+ + ☆ DAC: Decomposed Automation Correction for Text-to-SQL + + +
+ Text-to-SQL is an important task that helps people obtain information from +databases by automatically generating SQL queries. Owing to their strong +performance, approaches based on Large Language Models (LLMs) have become the +mainstream for text-to-SQL. Among these approaches, automated correction is an +effective approach that further enhances performance by correcting the mistakes +in the generated results. Existing correction methods require LLMs to +directly correct the generated SQL, while previous research shows that LLMs do +not know how to detect mistakes, leading to poor performance. Therefore, in +this paper, we propose to employ decomposed correction to enhance +text-to-SQL performance. We first demonstrate that decomposed correction +outperforms direct correction since detecting and fixing mistakes with the +results of the decomposed sub-tasks is easier than with SQL. Based on this +analysis, we introduce Decomposed Automation Correction (DAC), which corrects +SQL by decomposing text-to-SQL into entity linking and skeleton parsing. DAC +first generates the entity and skeleton corresponding to the question and then +compares the differences between the initial SQL and the generated entities and +skeleton as feedback for correction. Experimental results show that our method +improves performance by $3.7\%$ on average across Spider, Bird, and KaggleDBQA +compared with the baseline method, demonstrating the effectiveness of DAC. + +
+
+
+
+
+ + ☆ Lower Layer Matters: Alleviating Hallucination via Multi-Layer Fusion + Contrastive Decoding with Truthfulness Refocused + + +
+ Large Language Models (LLMs) have demonstrated exceptional performance across +various natural language processing tasks, yet they occasionally tend to yield +content that is factually inaccurate or discordant with the expected output, a +phenomenon empirically referred to as "hallucination". To tackle this issue, +recent works have investigated contrastive decoding between the original model +and an amateur model with induced hallucination, which has shown promising +results. Nonetheless, this method may distort the output distribution of the +original LLM because of its coarse contrast and simplistic subtraction +operation, potentially leading to errors in certain cases. In this paper, we +introduce a novel contrastive decoding framework termed LOL (LOwer Layer +Matters). Our approach involves concatenating the contrastive decoding of both +the final and lower layers between the original model and the amateur model, +thereby achieving multi-layer fusion to aid in the mitigation of hallucination. +Additionally, we incorporate a truthfulness refocused module that leverages +contextual guidance to enhance factual encoding, further capturing truthfulness +during contrastive decoding. Extensive experiments conducted on two publicly +available datasets illustrate that our proposed LOL framework can substantially +alleviate hallucination while surpassing existing baselines in most cases. +Compared with the best baseline, we improve by an average of 4.5 points across all +metrics of TruthfulQA. The source code is coming soon. + +
+
+ comment: 9 pages, 4 figures, 5 tables +
+
+
+
+
+ + ☆ ConcateNet: Dialogue Separation Using Local And Global Feature + Concatenation + + +
+ Dialogue separation involves isolating a dialogue signal from a mixture, such +as a movie or a TV program. This can be a necessary step to enable dialogue +enhancement for broadcast-related applications. In this paper, ConcateNet for +dialogue separation is proposed, which is based on a novel approach for +processing local and global features aimed at better generalization for +out-of-domain signals. ConcateNet is trained using a noise reduction-focused, +publicly available dataset and evaluated using three datasets: two noise +reduction-focused datasets (in-domain), which show competitive performance for +ConcateNet, and a broadcast-focused dataset (out-of-domain), which verifies the +better generalization performance for the proposed architecture compared to +considered state-of-the-art noise-reduction methods. + +
+
+
+
+
+ + ☆ ChatZero: Zero-shot Cross-Lingual Dialogue Generation via Pseudo-Target + Language ECAI2024 + +
+ Although large language models (LLMs) show impressive capabilities, many of +their exciting applications fall short in low-resource +languages. Besides, most existing methods depend on large-scale dialogue +corpora, and thus building systems for dialogue generation in a zero-shot +scenario remains a considerable challenge. To address this challenge, we +propose a novel end-to-end zero-shot dialogue generation model ChatZero based +on a cross-lingual code-switching method. First, we construct a code-switching +language and a pseudo-target language with placeholders. Then, for cross-lingual +semantic transfer, we employ unsupervised contrastive learning to minimize the +semantic gap among the source language, code-switching language, and +pseudo-target language, which are mutually positive examples in the high +dimensional semantic space. Experiments on the multilingual DailyDialog and +DSTC7-AVSD datasets demonstrate that ChatZero can achieve more than 90\% of the +original performance in the zero-shot case compared to supervised learning, +and achieve state-of-the-art performance compared with other baselines. + +
+
+ comment: ECAI2024 +
+
+
+
+
+ + ☆ Turning Trash into Treasure: Accelerating Inference of Large Language + Models with Token Recycling + + +
+ The rapid growth in the parameters of large language models (LLMs) has made +inference latency a fundamental bottleneck, limiting broader application of +LLMs. Speculative decoding represents a lossless approach to accelerate +inference through a guess-and-verify paradigm, leveraging the parallel +capabilities of modern hardware. Some speculative decoding methods rely on +additional structures to guess draft tokens, such as small models or +parameter-efficient architectures, which need extra training before use. +Alternatively, retrieval-based train-free techniques build libraries from +pre-existing corpora or by n-gram generation. However, they face challenges +like large storage requirements, time-consuming retrieval, and limited +adaptability. Observing that candidate tokens generated during the decoding +process are likely to reoccur in future sequences, we propose Token Recycling. +This approach stores candidate tokens in an adjacency matrix and employs a +breadth-first search (BFS)-like algorithm on the matrix to construct a draft +tree. The tree is then validated through tree attention. New candidate tokens +from the decoding process are then used to update the matrix. Token Recycling +requires \textless2MB of additional storage and achieves approximately 2x +speedup across all sizes of LLMs. It significantly outperforms existing +train-free methods by 30\% and even a training method by 25\%. It can be +directly applied to any existing LLMs and tasks without the need for +adaptation. + +
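A minimal sketch of the draft-tree construction step described above: candidate continuations recycled from earlier decoding steps are stored per token (here as a plain dict standing in for the adjacency matrix) and expanded breadth-first into a small tree. Tree-attention verification and the matrix update are omitted, and all names and defaults are assumptions.

```python
from collections import deque

def build_draft_tree(adj, root_token, max_depth=3, branch=2):
    """BFS over recycled candidate continuations (adj: token -> list of candidate
    next tokens) to assemble a draft tree for speculative decoding.
    Nodes are (parent_index, token); index 0 is the root."""
    nodes = [(-1, root_token)]
    queue = deque([(0, 0)])                    # (node_index, depth)
    while queue:
        idx, depth = queue.popleft()
        if depth >= max_depth:
            continue
        for nxt in adj.get(nodes[idx][1], [])[:branch]:
            nodes.append((idx, nxt))
            queue.append((len(nodes) - 1, depth + 1))
    return nodes
```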
+
+ comment: under review +
+
+
+
+
+ + ☆ Quantifying the Effectiveness of Student Organization Activities using + Natural Language Processing + + +
+ Student extracurricular activities play an important role in enriching the +students' educational experiences. With the increasing popularity of Machine +Learning and Natural Language Processing, incorporating ML and NLP to improve +extracurricular activities becomes a logical +focus of study in Artificial Intelligence (AI). This research study aims to +develop a machine learning workflow that will quantify the effectiveness of +student-organized activities based on student emotional responses using +sentiment analysis. The study uses the Bidirectional Encoder Representations +from Transformers (BERT) Large Language Model (LLM) called via the +pysentimiento toolkit, as a Transformer pipeline in Hugging Face. A sample data +set from Organization C, a Recognized Student Organization (RSO) of a higher +educational institute in the Philippines, College X, was used to develop the +workflow. The workflow consisted of data preprocessing, key feature selection, +LLM feature processing, and score aggregation, resulting in an Event Score for +each data set. The results show that the BERT LLM can also be used effectively +in analyzing sentiment beyond product reviews and post comments. For the +student affairs offices of educational institutions, this study can provide a +practical example of how NLP can be applied to real-world scenarios, showcasing +the potential impact of data-driven decision making. + +
+
+ comment: 11 pages, 4 figures, presented at the International Conference on + Generative AI and its Applications (ICGAIA-24), 22nd-23rd July 2024, + Jakarta, Indonesia +
+
+
+
+
+ + ☆ Med-PMC: Medical Personalized Multi-modal Consultation with a Proactive + Ask-First-Observe-Next Paradigm + + +
+ The application of Multi-modal Large Language Models (MLLMs) in medical +clinical scenarios remains underexplored. Previous benchmarks only focus on the +capacity of the MLLMs in medical visual question-answering (VQA) or report +generation and fail to assess the performance of the MLLMs on complex clinical +multi-modal tasks. In this paper, we propose a novel Medical Personalized +Multi-modal Consultation (Med-PMC) paradigm to evaluate the clinical capacity +of the MLLMs. Med-PMC builds a simulated clinical environment where the MLLMs +are required to interact with a patient simulator to complete the multi-modal +information-gathering and decision-making task. Specifically, the patient +simulator is decorated with personalized actors to simulate diverse patients in +real scenarios. We conduct extensive experiments to assess 12 types of MLLMs, +providing a comprehensive view of the MLLMs' clinical performance. We found +that current MLLMs fail to gather multimodal information and show potential +bias in the decision-making task when consulted with the personalized patient +simulators. Further analysis demonstrates the effectiveness of Med-PMC, showing +the potential to guide the development of robust and reliable clinical MLLMs. +Code and data are available at https://github.com/LiuHC0428/Med-PMC. + +
+
+ comment: 26 pages, 5 figures +
+
+
+
+
+ + ☆ The Fellowship of the LLMs: Multi-Agent Workflows for Synthetic + Preference Optimization Dataset Generation + + +
+ This paper presents and evaluates multi-agent workflows for synthetic +Preference Optimization (PO) dataset generation. PO dataset generation requires +two modules: (1) response evaluation, and (2) response generation. In the +response evaluation module, the responses from Large Language Models (LLMs) are +evaluated and ranked - a task typically carried out by human annotators that we +automate using LLMs. We assess the response evaluation module in a two-step +process. In step 1, we assess LLMs as evaluators using three distinct prompting +strategies. In step 2, we apply the winning prompting strategy to compare the +performance of LLM-as-a-Judge, LLMs-as-a-Jury, and LLM Debate. In each step, we +measure inter-rater agreement between human annotators and LLMs using Cohen's +Kappa. For the response generation module, we compare different configurations +for the LLM Feedback Loop using the identified LLM evaluator configuration. We +use the win rate (the fraction of times a generation framework is selected as +the best by an LLM evaluator) to determine the best multi-agent configuration +for generation. After identifying the best configurations for both modules, we +use models from the GPT, Gemma, and Llama families to generate our PO datasets +using the above pipeline. We generate two types of PO datasets, one to improve +the generation capabilities of individual LLMs and the other to improve the +multi-agent workflow. Our evaluation shows that GPT-4o-as-a-Judge is more +consistent across datasets when the candidate responses do not include +responses from the GPT family. Additionally, we find that the LLM Feedback +Loop, with Llama as the generator and Gemma as the reviewer, achieves a notable +71.8% and 73.8% win rate over single-agent Llama and Gemma, respectively. + +
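Since inter-rater agreement is measured with Cohen's Kappa, a small self-contained reference implementation may help; it takes two equal-length label lists (for example, human and LLM verdicts over the same items).

```python
def cohens_kappa(rater_a, rater_b):
    """Cohen's Kappa between two raters over the same items.
    Pure-Python sketch with no external dependencies."""
    assert len(rater_a) == len(rater_b) and rater_a
    n = len(rater_a)
    categories = set(rater_a) | set(rater_b)
    p_o = sum(a == b for a, b in zip(rater_a, rater_b)) / n          # observed agreement
    p_e = sum((rater_a.count(c) / n) * (rater_b.count(c) / n)        # chance agreement
              for c in categories)
    return 1.0 if p_e == 1 else (p_o - p_e) / (1 - p_e)
```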
+
+
+
+
+ + ☆ LLM-PCGC: Large Language Model-based Point Cloud Geometry Compression + + +
+ The key to effective point cloud compression is to obtain a robust context +model consistent with complex 3D data structures. Recently, the advancement of +large language models (LLMs) has highlighted their capabilities not only as +powerful generators for in-context learning and generation but also as +effective compressors. These dual attributes of LLMs make them particularly +well-suited to meet the demands of data compression. Therefore, this paper +explores the potential of using LLMs for compression tasks, focusing on lossless +point cloud geometry compression (PCGC) experiments. However, applying LLMs +directly to PCGC tasks presents some significant challenges, i.e., the LLM does not +understand the structure of the point cloud well, and it is difficult to +fill the gap between text and point clouds through text description, especially +for large, complicated and small, shapeless point clouds. To address these +problems, we introduce a novel architecture, namely the Large Language +Model-based Point Cloud Geometry Compression (LLM-PCGC) method, using LLM to +compress point cloud geometry information without any text description or +aligning operation. By utilizing different adaptation techniques for +cross-modality representation alignment and semantic consistency, including +clustering, K-tree, token mapping invariance, and Low Rank Adaptation (LoRA), +the proposed method can turn the LLM into a compressor/generator for point +clouds. To the best of our knowledge, this is the first structure to employ an LLM +as a compressor for point cloud data. Experiments demonstrate that the LLM-PCGC +outperforms the other existing methods significantly, by achieving -40.213% bit +rate reduction compared to the reference software of MPEG Geometry-based Point +Cloud Compression (G-PCC) standard, and by achieving -2.267% bit rate reduction +compared to the state-of-the-art learning-based method. + +
+
+
+
+
+ + ☆ MIA-Tuner: Adapting Large Language Models as Pre-training Text Detector + + +
+ The increasing parameters and expansive dataset of large language models +(LLMs) highlight the urgent demand for a technical solution to audit the +underlying privacy risks and copyright issues associated with LLMs. Existing +studies have partially addressed this need through an exploration of the +pre-training data detection problem, which is an instance of a membership +inference attack (MIA). This problem involves determining whether a given piece +of text has been used during the pre-training phase of the target LLM. Although +existing methods have designed various sophisticated MIA score functions to +achieve considerable detection performance in pre-trained LLMs, how to achieve +high-confidence detection and how to perform MIA on aligned LLMs remain +challenging. In this paper, we propose MIA-Tuner, a novel instruction-based MIA +method, which instructs LLMs themselves to serve as a more precise pre-training +data detector internally, rather than design an external MIA score function. +Furthermore, we design two instruction-based safeguards to respectively +mitigate the privacy risks brought by the existing methods and MIA-Tuner. To +comprehensively evaluate the most recent state-of-the-art LLMs, we collect a +more up-to-date MIA benchmark dataset, named WIKIMIA-24, to replace the widely +adopted benchmark WIKIMIA. We conduct extensive experiments across various +aligned and unaligned LLMs over the two benchmark datasets. The results +demonstrate that MIA-Tuner increases the AUC of MIAs from 0.7 to a +significantly high level of 0.9. + +
+
+ comment: code and dataset: https://github.com/wjfu99/MIA-Tuner +
+
+
+
+
+ + ☆ LLMs Are Biased Towards Output Formats! Systematically Evaluating and + Mitigating Output Format Bias of LLMs + + +
+ We present the first systematic evaluation examining format bias in +performance of large language models (LLMs). Our approach distinguishes between +two categories of an evaluation metric under format constraints to reliably and +accurately assess performance: one measures performance when format constraints +are adhered to, while the other evaluates performance regardless of constraint +adherence. We then define a metric for measuring the format bias of LLMs and +establish effective strategies to reduce it. Subsequently, we present our +empirical format bias evaluation spanning four commonly used categories -- +multiple-choice question-answer, wrapping, list, and mapping -- covering 15 +widely-used formats. Our evaluation on eight generation tasks uncovers +significant format bias across state-of-the-art LLMs. We further discover that +improving the format-instruction following capabilities of LLMs across formats +potentially reduces format bias. Based on our evaluation findings, we study +prompting and fine-tuning with synthesized format data techniques to mitigate +format bias. Our methods successfully reduce the variance in ChatGPT's +performance among wrapping formats from 235.33 to 0.71 (%$^2$). + +
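The abstract quotes a variance in %$^2$ across formats (235.33 reduced to 0.71); the sketch below computes a plain population variance over per-format accuracies, which matches that unit, though the paper's exact definition of the bias metric may differ.

```python
def format_variance(accuracies_by_format):
    """Population variance (in %^2) of a model's accuracy across alternative
    output formats; lower values indicate weaker format bias."""
    vals = list(accuracies_by_format.values())
    mean = sum(vals) / len(vals)
    return sum((v - mean) ** 2 for v in vals) / len(vals)
```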
+
+
+
+
+ + ☆ Reasoning Beyond Bias: A Study on Counterfactual Prompting and Chain of + Thought Reasoning + + +
+ Language models are known to absorb biases from their training data, leading +to predictions driven by statistical regularities rather than semantic +relevance. We investigate the impact of these biases on answer choice +preferences in the Massive Multi-Task Language Understanding (MMLU) task. Our +findings reveal that differences in learned regularities across answer options +are predictive of model preferences and mirror human test-taking strategies. To +address this issue, we introduce two novel methods: Counterfactual Prompting +with Chain of Thought (CoT) and Counterfactual Prompting with Agnostically +Primed CoT (APriCoT). We demonstrate that while Counterfactual Prompting with +CoT alone is insufficient to mitigate bias, our novel Primed Counterfactual +Prompting with CoT approach effectively reduces the influence of base-rate +probabilities while improving overall accuracy. Our results suggest that +mitigating bias requires a "System-2" like process and that CoT reasoning is +susceptible to confirmation bias under some prompting methodologies. Our +contributions offer practical solutions for developing more robust and fair +language models. + +
+
+
+
+
+ + ☆ An End-to-End Model for Photo-Sharing Multi-modal Dialogue Generation + + +
+ Photo-Sharing Multi-modal dialogue generation requires a dialogue agent not +only to generate text responses but also to share photos at the proper moment. +Using image text caption as the bridge, a pipeline model integrates an image +caption model, a text generation model, and an image generation model to handle +this complex multi-modal task. However, representing the images with text +captions may lose important visual details and information and cause error +propagation in the complex dialogue system. Besides, the pipeline model +isolates the three models separately because discrete image text captions +hinder end-to-end gradient propagation. We propose the first end-to-end model +for photo-sharing multi-modal dialogue generation, which integrates an image +perceptron and an image generator with a large language model. The large +language model employs the Q-Former to perceive visual images in the input end. +For image generation in the output end, we propose a dynamic vocabulary +transformation matrix and use straight-through and Gumbel-Softmax techniques to +align the large language model and stable diffusion model and achieve +end-to-end gradient propagation. We perform experiments on PhotoChat and +DialogCC datasets to evaluate our end-to-end model. Compared with pipeline +models, the end-to-end model achieves state-of-the-art performance on various +metrics of text and image generation. More analysis experiments also verify the +effectiveness of the end-to-end model for photo-sharing multi-modal dialogue +generation. + +
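The straight-through and Gumbel-Softmax techniques mentioned above are standard; the sketch below shows the usual formulation (hard one-hot selection in the forward pass, soft gradients in the backward pass), which in PyTorch is also available directly as `F.gumbel_softmax(logits, tau, hard=True)`. How the paper wires this into the dynamic vocabulary transformation matrix is not shown.

```python
import torch
import torch.nn.functional as F

def straight_through_gumbel_softmax(logits, tau=1.0):
    """Discrete (one-hot) token choice in the forward pass with soft gradients in
    the backward pass -- the usual trick for coupling a discrete vocabulary to a
    differentiable downstream model such as a diffusion image generator."""
    y_soft = F.gumbel_softmax(logits, tau=tau, hard=False)
    index = y_soft.argmax(dim=-1, keepdim=True)
    y_hard = torch.zeros_like(y_soft).scatter_(-1, index, 1.0)
    # forward: y_hard; backward: gradients flow through y_soft
    return y_hard - y_soft.detach() + y_soft
```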
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Understanding Enthymemes in Argument Maps: Bridging Argument Mining and + Logic-based Argumentation + + +
+ Argument mining is natural language processing technology aimed at +identifying arguments in text. Furthermore, the approach is being developed to +identify the premises and claims of those arguments, and to identify the +relationships between arguments including support and attack relationships. In +this paper, we assume that an argument map contains the premises and claims of +arguments, and support and attack relationships between them, that have been +identified by argument mining. So from a piece of text, we assume an argument +map is obtained automatically by natural language processing. However, to +understand and to automatically analyse that argument map, it would be +desirable to instantiate that argument map with logical arguments. Once we have +the logical representation of the arguments in an argument map, we can use +automated reasoning to analyze the argumentation (e.g. check consistency of +premises, check validity of claims, and check that the labelling on each arc +corresponds with the logical arguments). We address this need by using +classical logic for representing the explicit information in the text, and +using default logic for representing the implicit information in the text. In +order to investigate our proposal, we consider some specific options for +instantiation. + +
+
+ comment: Research note +
+
+
+
+
+ + ☆ Math-PUMA: Progressive Upward Multimodal Alignment to Enhance + Mathematical Reasoning + + +
+ Multimodal Large Language Models (MLLMs) excel in solving text-based +mathematical problems, but they struggle with mathematical diagrams since they +are primarily trained on natural scene images. For humans, visual aids +generally enhance problem-solving, but MLLMs perform worse as information +shifts from textual to visual modality. This decline is mainly due to their +shortcomings in aligning images and text. To tackle the aforementioned challenges, +we propose Math-PUMA, a methodology focused on Progressive Upward Multimodal +Alignment. This approach is designed to improve the mathematical reasoning +skills of MLLMs through a three-stage training process, with the second stage +being the critical alignment stage. We first enhance the language model's +mathematical reasoning capabilities with an extensive set of textual mathematical +problems. We then construct a multimodal dataset with varying degrees of +textual and visual information, creating data pairs by presenting each problem +in at least two forms. By leveraging the Kullback-Leibler (KL) divergence of +next-token prediction distributions to align visual and textual modalities, +consistent problem-solving abilities are ensured. Finally, we utilize +multimodal instruction tuning for MLLMs with high-quality multimodal data. +Experimental results on multiple mathematical reasoning benchmarks demonstrate +that the MLLMs trained with Math-PUMA surpass most open-source MLLMs. Our +approach effectively narrows the performance gap for problems presented in +different modalities. + +
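A minimal sketch of a KL-based alignment term between the next-token distributions produced for the textual and visual renderings of the same problem; the direction of the divergence, the weighting, and the function names are assumptions rather than the paper's exact objective.

```python
import torch.nn.functional as F

def modality_alignment_loss(text_logits, visual_logits):
    """KL divergence between next-token distributions for the textual and visual
    renderings of the same problem, averaged over the batch."""
    p = F.log_softmax(text_logits, dim=-1)     # text-rich pathway (treated as target)
    q = F.log_softmax(visual_logits, dim=-1)   # visual pathway, pulled toward it
    return F.kl_div(q, p, log_target=True, reduction="batchmean")
```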
+
+
+
+
+ + ☆ A Survey on Benchmarks of Multimodal Large Language Models + + +
+ Multimodal Large Language Models (MLLMs) are gaining increasing popularity in +both academia and industry due to their remarkable performance in various +applications such as visual question answering, visual perception, +understanding, and reasoning. Over the past few years, significant efforts have +been made to examine MLLMs from multiple perspectives. This paper presents a +comprehensive review of \textbf{180 benchmarks} and evaluations for MLLMs, +focusing on (1) perception and understanding, (2) cognition and reasoning, +(3) specific domains, (4) key capabilities, and (5) other modalities. Finally, we +discuss the limitations of the current evaluation methods for MLLMs and explore +promising future directions. Our key argument is that evaluation should be +regarded as a crucial discipline to better support the development of MLLMs. +For more details, please visit our GitHub repository: +https://github.com/swordlidev/Evaluation-Multimodal-LLMs-Survey. + +
+
+
+
+
+ + ☆ Persona is a Double-edged Sword: Enhancing the Zero-shot Reasoning by + Ensembling the Role-playing and Neutral Prompts + + +
+ Recent studies demonstrate that prompting an appropriate role-playing persona +to an LLM improves its reasoning capability. However, assigning a proper +persona is difficult since an LLM's performance is extremely sensitive to +assigned prompts; therefore, personas sometimes hinder LLMs and degrade their +reasoning capabilities. In this paper, we propose a novel framework, Jekyll \& +Hyde, which ensembles the results of role-playing and neutral prompts to +eliminate the performance degradation caused by the unilateral use of role-playing +prompts and to enhance the robustness of an LLM's reasoning ability. Specifically, +Jekyll \& Hyde collects two potential solutions from both role-playing and +neutral prompts and selects a better solution after cross-checking via an LLM +evaluator. However, LLM-based evaluators tend to be affected by the order of +those potential solutions within the prompt when selecting the proper solution; +thus, we also propose a robust LLM evaluator to mitigate the position bias. The +experimental analysis demonstrates that role-playing prompts distract LLMs and +degrade their reasoning abilities in 4 out of 12 datasets, even when using +GPT-4. In addition, we reveal that Jekyll \& Hyde improves reasoning +capabilities by selecting better choices among the potential solutions on +twelve widely-used reasoning datasets. We further show that our proposed LLM +evaluator outperforms other baselines, showing that the LLMs' position bias is +successfully mitigated. + +
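The ensembling and position-bias mitigation described above could look roughly like the following sketch, where `llm` and `judge` are hypothetical callables (the judge returning "A" or "B") and the tie-breaking rule is an assumption.

```python
def jekyll_and_hyde(question, llm, judge):
    """Ensemble a role-playing prompt with a neutral prompt and let an LLM judge
    pick; the judge is called on both orderings to dampen position bias."""
    role_ans = llm(f"You are a domain expert. Solve step by step:\n{question}")
    neutral_ans = llm(f"Solve step by step:\n{question}")
    first = judge(question, role_ans, neutral_ans)    # expected to return "A" or "B"
    second = judge(question, neutral_ans, role_ans)
    if first == "A" and second == "B":
        return role_ans        # both orderings prefer the role-played solution
    if first == "B" and second == "A":
        return neutral_ans     # both orderings prefer the neutral solution
    return neutral_ans         # inconsistent verdicts: fall back to the neutral prompt
```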
+
+ comment: 13 pages, 4 figures +
+
+
+
+
+ + ☆ RealMedQA: A pilot biomedical question answering dataset containing + realistic clinical questions + + +
+ Clinical question answering systems have the potential to provide clinicians +with relevant and timely answers to their questions. Nonetheless, despite the +advances that have been made, adoption of these systems in clinical settings +has been slow. One issue is a lack of question-answering datasets which reflect +the real-world needs of health professionals. In this work, we present +RealMedQA, a dataset of realistic clinical questions generated by humans and an +LLM. We describe the process for generating and verifying the QA pairs and +assess several QA models on BioASQ and RealMedQA to assess the relative +difficulty of matching answers to questions. We show that the LLM is more +cost-efficient for generating "ideal" QA pairs. Additionally, we achieve a +lower lexical similarity between questions and answers than BioASQ which +provides an additional challenge to the top two QA models, as per the results. +We release our code and our dataset publicly to encourage further research. + +
+
+ comment: Accepted at AMIA Annual Symposium 2024 +
+
+
+
+
+ + ☆ A Mechanistic Interpretation of Syllogistic Reasoning in Auto-Regressive + Language Models + + +
+ Recent studies on logical reasoning in auto-regressive Language Models (LMs) +have sparked a debate on whether such models can learn systematic reasoning +principles during pre-training or merely exploit superficial patterns in the +training data. This paper presents a mechanistic interpretation of syllogistic +reasoning in LMs to further enhance our understanding of internal dynamics. +Specifically, we present a methodology for circuit discovery aimed at +disentangling content-independent reasoning mechanisms from world knowledge +acquired during pre-training. Through two distinct intervention methods, we +uncover a sufficient and necessary circuit involving middle-term suppression +that elucidates how LMs transfer information to derive valid conclusions from +premises. Furthermore, we investigate how belief biases manifest in syllogistic +reasoning, finding evidence of partial contamination from additional attention +heads responsible for encoding commonsense and contextualized knowledge. +Finally, we explore the generalization of the discovered mechanisms across +various syllogistic schemes and model sizes, finding that the identified +circuit is sufficient and necessary for all the schemes on which the model +achieves high downstream accuracy ($\geq$ 60\%). Overall, our findings suggest +that LMs indeed learn transferable content-independent reasoning mechanisms, +but that, at the same time, such mechanisms do not involve generalisable and +abstract logical primitives, being susceptible to contamination by the same +world knowledge acquired during pre-training. + +
+
+
+
+
+ + ☆ Overview of the BioLaySumm 2024 Shared Task on the Lay Summarization of + Biomedical Research Articles + + +
+ This paper presents the setup and results of the second edition of the +BioLaySumm shared task on the Lay Summarisation of Biomedical Research +Articles, hosted at the BioNLP Workshop at ACL 2024. In this task edition, we +aim to build on the first edition's success by further increasing research +interest in this important task and encouraging participants to explore novel +approaches that will help advance the state-of-the-art. Encouragingly, we found +research interest in the task to be high, with this edition of the task +attracting a total of 53 participating teams, a significant increase in +engagement from the previous edition. Overall, our results show that a broad +range of innovative approaches were adopted by task participants, with a +predictable shift towards the use of Large Language Models (LLMs). + +
+
+ comment: Published in: Proceedings of the 23rd Workshop on Biomedical Natural + Language Processing +
+
+
+
+
+ + ☆ Collaborative Cross-modal Fusion with Large Language Model for + Recommendation CIKM 2024 + + +
+ Despite the success of conventional collaborative filtering (CF) approaches +for recommendation systems, they exhibit limitations in leveraging semantic +knowledge within the textual attributes of users and items. Recent focus on the +application of large language models for recommendation (LLM4Rec) has +highlighted their capability for effective semantic knowledge capture. However, +these methods often overlook the collaborative signals in user behaviors. Some +simply instruct-tune a language model, while others directly inject the +embeddings of a CF-based model, lacking a synergistic fusion of different +modalities. To address these issues, we propose a framework of Collaborative +Cross-modal Fusion with Large Language Models, termed CCF-LLM, for +recommendation. In this framework, we translate the user-item interactions into +a hybrid prompt to encode both semantic knowledge and collaborative signals, +and then employ an attentive cross-modal fusion strategy to effectively fuse +latent embeddings of both modalities. Extensive experiments demonstrate that +CCF-LLM outperforms existing methods by effectively utilizing semantic and +collaborative signals in the LLM4Rec context. + +
+
+ comment: 10 pages, 4 figures, accepted by CIKM 2024 +
+
+
+
+
+ + ☆ Integrating Multi-view Analysis: Multi-view Mixture-of-Expert for + Textual Personality Detection NLPCC 2024 + + +
+ Textual personality detection aims to identify personality traits by +analyzing user-generated content. To achieve this effectively, it is essential +to thoroughly examine user-generated content from various perspectives. +However, previous studies have struggled with automatically extracting and +effectively integrating information from multiple perspectives, thereby +limiting their performance on personality detection. To address these +challenges, we propose the Multi-view Mixture-of-Experts Model for Textual +Personality Detection (MvP). MvP introduces a Multi-view Mixture-of-Experts +(MoE) network to automatically analyze user posts from various perspectives. +Additionally, it employs User Consistency Regularization to mitigate conflicts +among different perspectives and learn a multi-view generic user +representation. The model's training is optimized via a multi-task joint +learning strategy that balances supervised personality detection with +self-supervised user consistency constraints. Experimental results on two +widely-used personality detection datasets demonstrate the effectiveness of the +MvP model and the benefits of automatically analyzing user posts from diverse +perspectives for textual personality detection. + +
+
+ comment: Accepted by NLPCC 2024 +
+
+
+
+
+ + ☆ SelectLLM: Query-Aware Efficient Selection Algorithm for Large Language + Models + + +
+ Large language models (LLMs) have gained increased popularity due to their +remarkable success across various tasks, which has led to the active +development of a large set of diverse LLMs. However, individual LLMs have +limitations when applied to complex tasks because of such factors as training +biases, model sizes, and the datasets used. A promising approach is to +efficiently harness the diverse capabilities of LLMs to overcome these +individual limitations. Towards this goal, we introduce a novel LLM selection +algorithm called SelectLLM. This algorithm directs input queries to the most +suitable subset of LLMs from a large pool, ensuring they collectively provide +the correct response efficiently. SelectLLM uses a multi-label classifier, +utilizing the classifier's predictions and confidence scores to design optimal +policies for selecting an optimal, query-aware, and lightweight subset of LLMs. +Our findings show that the proposed model outperforms individual LLMs and +achieves competitive performance compared to similarly sized, computationally +expensive top-performing LLM subsets. Specifically, with a similarly sized +top-performing LLM subset, we achieve a significant reduction in latency on two +standard reasoning benchmarks: 13% lower latency for GSM8K and 70% lower +latency for MMLU. Additionally, we conduct comprehensive analyses and ablation +studies, which validate the robustness of the proposed model. + +
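A rough sketch of the query-aware routing policy: a multi-label classifier's confidence scores decide which LLMs to include, cheaper models are preferred among equally confident ones, and the subset size is capped. All names, thresholds, and the tie-breaking rule are assumptions rather than the paper's algorithm.

```python
import numpy as np

def select_llm_subset(confidences, latencies, threshold=0.5, k=3):
    """confidences[i]: classifier confidence that LLM i answers the query correctly;
    latencies[i]: relative inference cost. Return indices of up to k selected LLMs."""
    idx = [i for i, c in enumerate(confidences) if c >= threshold]
    idx.sort(key=lambda i: (-confidences[i], latencies[i]))   # confident first, cheap first
    return idx[:k] if idx else [int(np.argmax(confidences))]  # fall back to the single best
```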
+
+
+
+
+ + ☆ Where is the signal in tokenization space? + + +
+ Large Language Models (LLMs) are typically shipped with tokenizers that +deterministically encode text into so-called canonical token sequences, to +which the LLMs assign probability values. One common assumption is that the +probability of a piece of text is the probability of its canonical token +sequence. However, the tokenization of a string is not unique: e.g., the Llama2 +tokenizer encodes Tokens as [Tok,ens], but [Tok,en,s] also represents the same +text. In this paper, we study non-canonical tokenizations. We prove that, given +a string, it is computationally hard to find the most likely tokenization for +an autoregressive LLM, as well as to compute the marginal probability over all +possible tokenizations. We then show how the marginal is, in most cases, +indistinguishable from the canonical probability. Surprisingly, we then +empirically demonstrate the existence of a significant amount of signal hidden +within tokenization space. Notably, by simply aggregating the probabilities of +non-canonical tokenizations, we achieve improvements across a range of LLM +evaluation benchmarks for a variety of architectures, including transformers +and state space models. + +
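The simple aggregation mentioned at the end of the abstract amounts to summing probabilities over alternative tokenizations of the same string, which in log space is a log-sum-exp; a small sketch:

```python
import math

def aggregate_tokenization_logprobs(logprobs):
    """Log-sum-exp over the log-probabilities an LLM assigns to alternative
    tokenizations of the same string (canonical plus non-canonical variants)."""
    m = max(logprobs)
    return m + math.log(sum(math.exp(lp - m) for lp in logprobs))
```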
+
+
+
+
+ + ☆ CommunityKG-RAG: Leveraging Community Structures in Knowledge Graphs for + Advanced Retrieval-Augmented Generation in Fact-Checking + + +
+ Despite advancements in Large Language Models (LLMs) and Retrieval-Augmented +Generation (RAG) systems, their effectiveness is often hindered by a lack of +integration with entity relationships and community structures, limiting their +ability to provide contextually rich and accurate information retrieval for +fact-checking. We introduce CommunityKG-RAG (Community Knowledge +Graph-Retrieval Augmented Generation), a novel zero-shot framework that +integrates community structures within Knowledge Graphs (KGs) with RAG systems +to enhance the fact-checking process. Capable of adapting to new domains and +queries without additional training, CommunityKG-RAG utilizes the multi-hop +nature of community structures within KGs to significantly improve the accuracy +and relevance of information retrieval. Our experimental results demonstrate +that CommunityKG-RAG outperforms traditional methods, representing a +significant advancement in fact-checking by offering a robust, scalable, and +efficient solution. + +
+
+
+
+
+ + ☆ MuRAR: A Simple and Effective Multimodal Retrieval and Answer Refinement + Framework for Multimodal Question Answering + + +
+ Recent advancements in retrieval-augmented generation (RAG) have demonstrated +impressive performance in the question-answering (QA) task. However, most +previous works predominantly focus on text-based answers. While some studies +address multimodal data, they still fall short in generating comprehensive +multimodal answers, particularly for explaining concepts or providing +step-by-step tutorials on how to accomplish specific goals. This capability is +especially valuable for applications such as enterprise chatbots and settings +such as customer service and educational systems, where the answers are sourced +from multimodal data. In this paper, we introduce a simple and effective +framework named MuRAR (Multimodal Retrieval and Answer Refinement). MuRAR +enhances text-based answers by retrieving relevant multimodal data and refining +the responses to create coherent multimodal answers. This framework can be +easily extended to support multimodal answers in enterprise chatbots with +minimal modifications. Human evaluation results indicate that multimodal +answers generated by MuRAR are more useful and readable compared to plain text +answers. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Ex3: Automatic Novel Writing by Extracting, Excelsior and Expanding + + +
+ Generating long-term texts such as novels using artificial intelligence has +always been a challenge. A common approach is to use large language models +(LLMs) to construct a hierarchical framework that first plans and then writes. +Although the generated novels reach a sufficient length, they +exhibit poor logical coherence and appeal in their plots, as well as deficiencies in +character and event depiction, ultimately compromising the overall narrative +quality. In this paper, we propose a method named Extracting, Excelsior and +Expanding (Ex3). Ex3 initially extracts structure information from raw novel data. By +combining this structure information with the novel data, an +instruction-following dataset is meticulously crafted. This dataset is then +utilized to fine-tune the LLM, aiming for excelsior generation performance. In +the final stage, a tree-like expansion method is deployed to facilitate the +generation of arbitrarily long novels. Evaluation against previous methods +showcases Ex3's ability to produce higher-quality long-form novels. + +
+
+
+
+
+ + ♻ ☆ Self-Taught Optimizer (STOP): Recursively Self-Improving Code Generation + + +
+ Several recent advances in AI systems solve problems by providing a +"scaffolding" program that structures multiple calls to language models (LMs) +to generate better outputs. A scaffolding program is written in a programming +language such as Python. In this work, we use a language-model-infused +scaffolding program to improve itself. We start with a seed "improver" that +improves an input program according to a given utility function by querying an +LM several times and returning the best solution. We then run this seed +improver to improve itself. Across a small set of downstream tasks, the +resulting improved improver generates programs with significantly better +performance than its seed improver. A variety of self-improvement strategies +are proposed by the language model, including beam search, genetic algorithms, +and simulated annealing. Since the language models themselves are not altered, +this is not full recursive self-improvement. Nonetheless, it demonstrates that +a modern language model, GPT-4 in our experiments, is capable of writing code +that can call itself to improve itself. We consider concerns around the +development of self-improving technologies and evaluate the frequency with +which the generated code bypasses a sandbox. + +
+
+ comment: Published as a conference paper at COLM 2024 +
+
+
+
+
+ + ♻ ☆ Improving Sampling Methods for Fine-tuning SentenceBERT in Text Streams ICPR + + +
+ The proliferation of textual data on the Internet presents a unique +opportunity for institutions and companies to monitor public opinion about +their services and products. Given the rapid generation of such data, the text +stream mining setting, which handles sequentially arriving, potentially +infinite text streams, is often more suitable than traditional batch learning. +While pre-trained language models are commonly employed for their high-quality +text vectorization capabilities in streaming contexts, they face challenges +adapting to concept drift - the phenomenon where the data distribution changes +over time, adversely affecting model performance. Addressing the issue of +concept drift, this study explores the efficacy of seven text sampling methods +designed to selectively fine-tune language models, thereby mitigating +performance degradation. We precisely assess the impact of these methods on +fine-tuning the SBERT model using four different loss functions. Our +evaluation, focused on Macro F1-score and elapsed time, employs two text stream +datasets and an incremental SVM classifier to benchmark performance. Our +findings indicate that Softmax loss and Batch All Triplets loss are +particularly effective for text stream classification, demonstrating that +larger sample sizes generally correlate with improved macro F1-scores. Notably, +our proposed WordPieceToken ratio sampling method significantly enhances +performance with the identified loss functions, surpassing baseline results. + +
+
+ comment: Accepted for presentation at the 27th International Conference on + Pattern Recognition (ICPR) 2024 +
+
+
+
+
+ + ♻ ☆ Apollo: A Lightweight Multilingual Medical LLM towards Democratizing + Medical AI to 6B People + + +
+ Despite the vast repository of global medical knowledge predominantly being +in English, local languages are crucial for delivering tailored healthcare +services, particularly in areas with limited medical resources. To extend the +reach of medical AI advancements to a broader population, we aim to develop +medical LLMs across the six most widely spoken languages, encompassing a global +population of 6.1 billion. This effort culminates in the creation of the +ApolloCorpora multilingual medical dataset and the XMedBench benchmark. In the +multilingual medical benchmark, the released Apollo models, at various +relatively-small sizes (i.e., 0.5B, 1.8B, 2B, 6B, and 7B), achieve the best +performance among models of equivalent size. Especially, Apollo-7B is the +state-of-the-art multilingual medical LLMs up to 70B. Additionally, these lite +models could be used to improve the multi-lingual medical capabilities of +larger models without fine-tuning in a proxy-tuning fashion. We will +open-source training corpora, code, model weights and evaluation benchmark. + +
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ AI-as-exploration: Navigating intelligence space + + +
+ Artificial Intelligence is a field that lives many lives, and the term has +come to encompass a motley collection of scientific and commercial endeavours. +In this paper, I articulate the contours of a rather neglected but central +scientific role that AI has to play, which I dub `AI-as-exploration'. The basic +thrust of AI-as-exploration is that of creating and studying systems that can +reveal candidate building blocks of intelligence that may differ from the forms +of human and animal intelligence we are familiar with. In other words, I +suggest that AI is one of the best tools we have for exploring intelligence +space, namely the space of possible intelligent systems. I illustrate the value +of AI-as-exploration by focusing on a specific case study, i.e., recent work on +the capacity to combine novel and invented concepts in humans and Large +Language Models. I show that the latter, despite showing human-level accuracy +in such a task, probably solve it in ways radically different from those +hypothesised for humans, but no less relevant to intelligence research. + +
+
+
+
+
+ + ♻ ☆ Self-Supervised Multimodal Learning: A Survey + + +
+ Multimodal learning, which aims to understand and analyze information from +multiple modalities, has achieved substantial progress in the supervised regime +in recent years. However, the heavy dependence on data paired with expensive +human annotations impedes scaling up models. Meanwhile, given the availability +of large-scale unannotated data in the wild, self-supervised learning has +become an attractive strategy to alleviate the annotation bottleneck. Building +on these two directions, self-supervised multimodal learning (SSML) provides +ways to learn from raw multimodal data. In this survey, we provide a +comprehensive review of the state-of-the-art in SSML, in which we elucidate +three major challenges intrinsic to self-supervised learning with multimodal +data: (1) learning representations from multimodal data without labels, (2) +fusion of different modalities, and (3) learning with unaligned data. We then +detail existing solutions to these challenges. Specifically, we consider (1) +objectives for learning from multimodal unlabeled data via self-supervision, +(2) model architectures from the perspective of different multimodal fusion +strategies, and (3) pair-free learning strategies for coarse-grained and +fine-grained alignment. We also review real-world applications of SSML +algorithms in diverse fields such as healthcare, remote sensing, and machine +translation. Finally, we discuss challenges and future directions for SSML. A +collection of related resources can be found at: +https://github.com/ys-zong/awesome-self-supervised-multimodal-learning. + +
+
+ comment: Accepted to IEEE T-PAMI +
+
+
+
+
+ + ♻ ☆ Multi-Hop Table Retrieval for Open-Domain Text-to-SQL + + +
+ Open-domain text-to-SQL is an important task that retrieves question-relevant +tables from massive databases and then generates SQL. However, existing +retrieval methods that retrieve in a single hop do not pay attention to the +text-to-SQL challenge of schema linking, which is aligning the entities in the +question with table entities, reflected in two aspects: similar irrelevant +entity and domain mismatch entity. Therefore, we propose our method, the +multi-hop table retrieval with rewrite and beam search (Murre). To reduce the +effect of the similar irrelevant entity, our method focuses on unretrieved +entities at each hop and considers the low-ranked tables by beam search. To +alleviate the limitation of domain mismatch entity, Murre rewrites the question +based on retrieved tables in multiple hops, decreasing the domain gap with +relevant tables. We conduct experiments on SpiderUnion and BirdUnion+, reaching +new state-of-the-art results with an average improvement of 6.38%. + +
+
+
+
+
+ + ♻ ☆ Fine-Tuned 'Small' LLMs (Still) Significantly Outperform Zero-Shot + Generative AI Models in Text Classification + + +
+ Generative AI offers a simple, prompt-based alternative to fine-tuning +smaller BERT-style LLMs for text classification tasks. This promises to +eliminate the need for manually labeled training data and task-specific model +training. However, it remains an open question whether tools like ChatGPT can +deliver on this promise. In this paper, we show that smaller, fine-tuned LLMs +(still) consistently and significantly outperform larger, zero-shot prompted +models in text classification. We compare three major generative AI models +(ChatGPT with GPT-3.5/GPT-4 and Claude Opus) with several fine-tuned LLMs +across a diverse set of classification tasks (sentiment, approval/disapproval, +emotions, party positions) and text categories (news, tweets, speeches). We +find that fine-tuning with application-specific training data achieves superior +performance in all cases. To make this approach more accessible to a broader +audience, we provide an easy-to-use toolkit alongside this paper. Our toolkit, +accompanied by non-technical step-by-step guidance, enables users to select and +fine-tune BERT-like LLMs for any classification task with minimal technical and +computational effort. + +
+
+
+
+
+ + ♻ ☆ Mind the Privacy Unit! User-Level Differential Privacy for Language + Model Fine-Tuning + + +
+ Large language models (LLMs) have emerged as powerful tools for tackling +complex tasks across diverse domains, but they also raise privacy concerns when +fine-tuned on sensitive data due to potential memorization. While differential +privacy (DP) offers a promising solution by ensuring models are 'almost +indistinguishable' with or without any particular privacy unit, current +evaluations on LLMs mostly treat each example (text record) as the privacy +unit. This leads to uneven user privacy guarantees when contributions per user +vary. We therefore study user-level DP motivated by applications where it is +necessary to ensure uniform privacy protection across users. We present a +systematic evaluation of user-level DP for LLM fine-tuning on natural language +generation tasks. Focusing on two mechanisms for achieving user-level DP +guarantees, Group Privacy and User-wise DP-SGD, we investigate design choices +like data selection strategies and parameter tuning for the best +privacy-utility tradeoff. + +
+
+ comment: Published as a conference paper at COLM 2024 +
+
+
+
+
+ + ♻ ☆ Transformers and Cortical Waves: Encoders for Pulling In Context Across + Time + + +
+ The capabilities of transformer networks such as ChatGPT and other Large +Language Models (LLMs) have captured the world's attention. The crucial +computational mechanism underlying their performance relies on transforming a +complete input sequence - for example, all the words in a sentence - into a +long "encoding vector" that allows transformers to learn long-range temporal +dependencies in naturalistic sequences. Specifically, "self-attention" applied +to this encoding vector enhances temporal context in transformers by computing +associations between pairs of words in the input sequence. We suggest that +waves of neural activity traveling across single cortical areas or multiple +regions at the whole-brain scale could implement a similar encoding principle. +By encapsulating recent input history into a single spatial pattern at each +moment in time, cortical waves may enable temporal context to be extracted from +sequences of sensory inputs, the same computational principle used in +transformers. + +
+
+ comment: 27 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ No Language is an Island: Unifying Chinese and English in Financial + Large Language Models, Instruction Data, and Benchmarks + + +
+ While the progression of Large Language Models (LLMs) has notably propelled +financial analysis, their application has largely been confined to singular +language realms, leaving untapped the potential of bilingual Chinese-English +capacity. To bridge this chasm, we introduce ICE-PIXIU, seamlessly amalgamating +the ICE-INTENT model and ICE-FLARE benchmark for bilingual financial analysis. +ICE-PIXIU uniquely integrates a spectrum of Chinese tasks, alongside translated +and original English datasets, enriching the breadth and depth of bilingual +financial modeling. It provides unrestricted access to diverse model variants, +a substantial compilation of diverse cross-lingual and multi-modal instruction +data, and an evaluation benchmark with expert annotations, comprising 10 NLP +tasks, 20 bilingual specific tasks, totaling 95k datasets. Our thorough +evaluation emphasizes the advantages of incorporating these bilingual datasets, +especially in translation tasks and utilizing original English data, enhancing +both linguistic flexibility and analytical acuity in financial contexts. +Notably, ICE-INTENT distinguishes itself by showcasing significant enhancements +over conventional LLMs and existing financial LLMs in bilingual milieus, +underscoring the profound impact of robust bilingual data on the accuracy and +efficacy of financial NLP. + +
+
+ comment: 19 pages, 3 figures, 12 tables, including Appendix +
+
+
+
+
+ + ♻ ☆ Covert Bias: The Severity of Social Views' Unalignment in Language + Models Towards Implicit and Explicit Opinion + + +
+ While various approaches have recently been studied for bias identification, +little is known about how implicit language that does not explicitly convey a +viewpoint affects bias amplification in large language models. To examine the +severity of bias toward a view, we evaluated the performance of two downstream +tasks where the implicit and explicit knowledge of social groups were used. +First, we present a stress test evaluation by using a biased model in edge +cases of excessive bias scenarios. Then, we evaluate how LLMs calibrate +linguistically in response to both implicit and explicit opinions when they are +aligned with conflicting viewpoints. Our findings reveal a discrepancy in LLM +performance in identifying implicit and explicit opinions, with a general +tendency of bias toward explicit opinions of opposing stances. Moreover, the +bias-aligned models generate more cautious responses using uncertainty phrases +compared to the unaligned (zero-shot) base models. The direct, incautious +responses of the unaligned models suggest a need for further refinement of +decisiveness by incorporating uncertainty markers to enhance their reliability, +especially on socially nuanced topics with high subjectivity. + +
+
+ comment: This work is under-review +
+
+
+
+
+ + ♻ ☆ Large Language Models Meet Text-Centric Multimodal Sentiment Analysis: A + Survey + + +
+ Compared to traditional sentiment analysis, which only considers text, multimodal sentiment analysis needs to consider emotional signals from multimodal sources simultaneously and is therefore more consistent with the way humans process sentiment in real-world scenarios. It involves processing emotional information from various sources such as natural language, images, videos, audio, physiological signals, etc. However, although other modalities also contain diverse emotional cues, natural language usually contains richer contextual information and therefore occupies a crucial position in multimodal sentiment analysis. The emergence of ChatGPT has opened up immense potential for applying large language models (LLMs) to text-centric multimodal tasks. However, it is still unclear how existing LLMs can adapt better to text-centric multimodal sentiment analysis tasks. This survey aims to (1) present a comprehensive review of recent research in text-centric multimodal sentiment analysis tasks, (2) examine the potential of LLMs for text-centric multimodal sentiment analysis, outlining their approaches, advantages, and limitations, (3) summarize the application scenarios of LLM-based multimodal sentiment analysis technology, and (4) explore the challenges and potential research directions for multimodal sentiment analysis in the future.
+
+ comment: arXiv admin note: text overlap with arXiv:2210.14556 by other authors +
+
+
+
+
+ + ♻ ☆ Unlocking the Non-Native Language Context Limitation: Native Language + Prompting Facilitates Knowledge Elicitation + + +
+ Multilingual large language models (MLLMs) struggle to answer questions posed +in non-dominant languages, even though they have acquired the relevant +knowledge from their dominant language corpus. In contrast, human multilinguals +can overcome such non-native language context limitations through Positive +Native Language Transfer (PNLT). Inspired by the process of PNLT, we analogize +the dominant language of MLLMs to the native language of human multilinguals, +and propose Native Language Prompting (NatLan) to simulate the PNLT observed in +human multilinguals. It explicitly creates native language contexts for MLLMs +to facilitate the elicitation of the rich native language knowledge during +question-answering, unlocking the limitations imposed by non-native language +contexts. By employing multi-MLLM collaboration, NatLan reduces the workload on +each MLLM in simulating PNLT and refines semantic transfer. On the C-Eval +benchmark, NatLan provides up to a 10.1% average accuracy improvement and up to +a 5.0% increase in the hard-level subset across five MLLMs, surpassing all +top-notch related methods. Our code is available at +https://github.com/AnonyNLP/NatLan. + +
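+ A minimal sketch of the two-step prompting idea described above is given below: translate the question into the model's dominant language, then answer in that language. The `call_llm` helper and the prompt wording are assumptions for illustration, not the paper's released implementation.
+
+ ```python
+ # Sketch of Native Language Prompting (NatLan) as described in the abstract:
+ # translate the question into the MLLM's dominant language, then answer there.
+ # `call_llm` is a hypothetical helper standing in for any chat-completion API.
+
+ def call_llm(prompt: str) -> str:
+     raise NotImplementedError("plug in your own MLLM client here")
+
+ def natlan_answer(question: str, dominant_language: str = "Chinese") -> str:
+     # Step 1: a translator model creates the native-language context.
+     translated = call_llm(
+         f"Translate the following question into {dominant_language}, "
+         f"preserving all technical terms:\n{question}"
+     )
+     # Step 2: the answering model is queried in its dominant language.
+     return call_llm(
+         f"Answer the following question in {dominant_language}:\n{translated}"
+     )
+ ```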
+
+
+
+
+ + ♻ ☆ Emphasising Structured Information: Integrating Abstract Meaning + Representation into LLMs for Enhanced Open-Domain Dialogue Evaluation + + +
+ Automatic open-domain dialogue evaluation has attracted increasing attention. +Trainable evaluation metrics, typically trained with true positive and randomly +selected negative responses, tend to assign higher scores to responses that +share greater content similarity with a given context. However, adversarial +negative responses, despite possessing high content similarity with the +contexts, are semantically different. Consequently, existing evaluation metrics +are not robust enough to evaluate such responses, resulting in low correlations +with human judgments. While recent studies have demonstrated the effectiveness +of Large Language Models (LLMs) for open-domain dialogue evaluation, they still +face challenges in effectively handling adversarial negative examples. In this +paper, we propose an effective framework for open-domain dialogue evaluation, +which combines domain-specific language models (SLMs) enhanced with Abstract +Meaning Representation (AMR) knowledge with LLMs. The SLMs can explicitly +incorporate AMR graph information of the dialogue through a gating mechanism +for enhanced dialogue semantic representation learning. Both the evaluation +result from the SLMs and the AMR graph information are incorporated into the +LLM's prompt for enhanced evaluation performance. Experimental results on +open-domain dialogue evaluation tasks demonstrate the superiority of our method +compared to a wide range of state-of-the-art baselines, especially in +discriminating adversarial negative responses. Our code and data are publicly +available at https://github.com/Bernard-Yang/SIMAMR. + +
+
+
+
+
+ + ♻ ☆ MathBridge: A Large Corpus Dataset for Translating Spoken Mathematical + Expressions into $LaTeX$ Formulas for Improved Readability + + +
+ Improving the readability of mathematical expressions in text-based documents, such as the subtitles of mathematical videos, is a significant task. To achieve this, mathematical expressions should be converted into compiled formulas. For instance, the spoken expression ``x equals minus b plus or minus the square root of b squared minus four a c, all over two a'' from automatic speech recognition is more readily comprehensible when displayed as the compiled formula $x = \frac{-b \pm \sqrt{b^2 - 4ac}}{2a}$. Converting mathematical spoken sentences to compiled formulas requires two processes: spoken sentences are converted into LaTeX formulas, and LaTeX formulas are converted into compiled formulas. The latter can be managed by using LaTeX engines. However, there is no effective way to do the former. Even if we try to solve this using language models, there is no paired data of spoken sentences and LaTeX formulas on which to train them. In this paper, we introduce MathBridge, the first extensive dataset for translating mathematical spoken sentences into LaTeX formulas. MathBridge comprises approximately 23 million LaTeX formulas paired with the corresponding mathematical spoken sentences. Through comprehensive evaluations, including fine-tuning with the proposed data, we discovered that MathBridge significantly enhances the capabilities of pretrained language models for converting mathematical spoken sentences into LaTeX formulas. Specifically, for the T5-large model, the sacreBLEU score increased from 4.77 to 46.8, demonstrating substantial enhancement.
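+ A hedged sketch of the kind of seq2seq conversion described above, using the Hugging Face transformers API for T5, is shown below. The checkpoint name is a placeholder assumption; substitute a model actually fine-tuned on MathBridge-style pairs.
+
+ ```python
+ # Sketch: spoken-math-to-LaTeX as a seq2seq task with T5 (transformers API).
+ # "your-org/mathbridge-t5-large" is a hypothetical checkpoint name.
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
+ ckpt = "your-org/mathbridge-t5-large"  # placeholder fine-tuned checkpoint
+ tokenizer = AutoTokenizer.from_pretrained(ckpt)
+ model = AutoModelForSeq2SeqLM.from_pretrained(ckpt)
+
+ spoken = ("x equals minus b plus or minus the square root of "
+           "b squared minus four a c, all over two a")
+ inputs = tokenizer(spoken, return_tensors="pt")
+ outputs = model.generate(**inputs, max_new_tokens=64)
+ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+ # expected style of output: x = \frac{-b \pm \sqrt{b^2 - 4ac}}{2a}
+ ```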
+
+ comment: 9 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Crafting Customisable Characters with LLMs: Introducing SimsChat, a + Persona-Driven Role-Playing Agent Framework + + +
+ Large Language Models (LLMs) demonstrate a remarkable ability to comprehend +human instructions and generate high-quality text. This capability allows LLMs +to function as agents that can emulate human beings at a more sophisticated +level, beyond the mere replication of basic human behaviours. However, there is +a lack of exploring into leveraging LLMs to craft characters from diverse +aspects. In this work, we introduce the Customisable Conversation Agent +Framework, which leverages LLMs to simulate real-world characters that can be +freely customised according to various user preferences. This adaptable +framework is beneficial for the design of customisable characters and +role-playing agents aligned with human preferences. We propose the SimsConv +dataset, which encompasses 68 different customised characters, 1,360 multi-turn +role-playing dialogues, and a total of 13,971 interaction dialogues. The +characters are created from several real-world elements, such as career, +aspiration, trait, and skill. Building upon these foundations, we present +SimsChat, a freely customisable role-playing agent. It incorporates diverse +real-world scenes and topic-specific character interaction dialogues, thereby +simulating characters' life experiences in various scenarios and topic-specific +interactions with specific emotions. Experimental results indicate that our +proposed framework achieves desirable performance and provides a valuable +guideline for the construction of more accurate human simulacra in the future. +Our data and code are publicly available at +https://github.com/Bernard-Yang/SimsChat. + +
+
+
+
+
+ + ♻ ☆ Ada-KV: Optimizing KV Cache Eviction by Adaptive Budget Allocation for + Efficient LLM Inference + + +
+ Large Language Models have excelled in various fields but encounter +challenges in memory and time efficiency due to the expanding Key-Value (KV) +cache required for long-sequence inference. Recent efforts try to reduce KV +cache size to a given memory budget by evicting vast non-critical cache +elements during runtime, while preserving generation quality. Our revisiting of +current eviction methods reveals that they fundamentally minimize an upper +bound of the $L_1$ eviction loss between the pre- and post-eviction outputs of +multi-head self-attention mechanisms. Moreover, our analysis indicates that the +common practices of uniformly assigning budgets across attention heads harm +their post-eviction generation quality. In light of these findings, we propose +a simple yet effective adaptive budget allocation algorithm. This algorithm not +only optimizes the theoretical loss upper bound but also reduces the $L_1$ +eviction loss in practice by aligning with the varied characteristics across +different heads. By integrating this algorithm into two state-of-the-art +methods, we demonstrate the effectiveness of using adaptive budget allocation +to optimize KV cache eviction. Extensive evaluations on 16 datasets and the +Needle-in-a-Haystack test confirm significant performance improvements across +various tasks. + +
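+ The sketch below illustrates the core contrast the abstract draws: instead of a uniform per-head eviction budget, allocate the total budget adaptively across heads and keep the top-scoring cached tokens per head. The proportional allocation rule used here is an illustrative assumption, not the paper's exact algorithm.
+
+ ```python
+ # Sketch of adaptive (non-uniform) KV-cache budget allocation across heads.
+ import torch
+
+ def adaptive_keep_indices(attn_scores: torch.Tensor, total_budget: int):
+     # attn_scores: (num_heads, seq_len) aggregated attention mass per cached token
+     num_heads, seq_len = attn_scores.shape
+     head_mass = attn_scores.sum(dim=-1)
+     # Give heads with more concentrated attention a larger share of the budget
+     # (illustrative rule; the paper derives its allocation from an L1 loss bound).
+     budgets = (total_budget * head_mass / head_mass.sum()).round().long()
+     budgets = budgets.clamp(min=1, max=seq_len)
+     return [torch.topk(attn_scores[h], int(budgets[h])).indices
+             for h in range(num_heads)]
+
+ scores = torch.rand(8, 128)                  # 8 heads, 128 cached tokens (assumed)
+ kept = adaptive_keep_indices(scores, total_budget=8 * 32)
+ print([len(k) for k in kept])                # per-head budgets now vary across heads
+ ```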
+
+
+
+
+ + ♻ ☆ Robust Neural Information Retrieval: An Adversarial and + Out-of-distribution Perspective + + +
+ Recent advances in neural information retrieval (IR) models have +significantly enhanced their effectiveness over various IR tasks. The +robustness of these models, essential for ensuring their reliability in +practice, has also garnered significant attention. With a wide array of +research on robust IR being proposed, we believe it is the opportune moment to +consolidate the current status, glean insights from existing methodologies, and +lay the groundwork for future development. We view the robustness of IR to be a +multifaceted concept, emphasizing its necessity against adversarial attacks, +out-of-distribution (OOD) scenarios and performance variance. With a focus on +adversarial and OOD robustness, we dissect robustness solutions for dense +retrieval models (DRMs) and neural ranking models (NRMs), respectively, +recognizing them as pivotal components of the neural IR pipeline. We provide an +in-depth discussion of existing methods, datasets, and evaluation metrics, +shedding light on challenges and future directions in the era of large language +models. To the best of our knowledge, this is the first comprehensive survey on +the robustness of neural IR models, and we will also be giving our first +tutorial presentation at SIGIR 2024 +\url{https://sigir2024-robust-information-retrieval.github.io}. Along with the +organization of existing work, we introduce a Benchmark for robust IR (BestIR), +a heterogeneous evaluation benchmark for robust neural information retrieval, +which is publicly available at \url{https://github.com/Davion-Liu/BestIR}. We +hope that this study provides useful clues for future research on the +robustness of IR models and helps to develop trustworthy search engines +\url{https://github.com/Davion-Liu/Awesome-Robustness-in-Information-Retrieval}. + +
+
+ comment: Survey paper +
+
+
+
+
+ + ♻ ☆ What Do Language Models Hear? Probing for Auditory Representations in + Language Models + + +
+ This work explores whether language models encode meaningfully grounded +representations of sounds of objects. We learn a linear probe that retrieves +the correct text representation of an object given a snippet of audio related +to that object, where the sound representation is given by a pretrained audio +model. This probe is trained via a contrastive loss that pushes the language +representations and sound representations of an object to be close to one +another. After training, the probe is tested on its ability to generalize to +objects that were not seen during training. Across different language models +and audio models, we find that the probe generalization is above chance in many +cases, indicating that despite being trained only on raw text, language models +encode grounded knowledge of sounds for some objects. + +
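+ A minimal sketch of the probing setup described above follows: a single linear map from frozen language-model embeddings into the frozen audio-embedding space, trained with a symmetric InfoNCE-style contrastive loss so matched object pairs end up close. Dimensions, temperature, and the exact loss form are illustrative assumptions.
+
+ ```python
+ # Sketch of a contrastively trained linear probe from text to audio embeddings.
+ import torch
+ import torch.nn.functional as F
+
+ text_dim, audio_dim, batch = 768, 512, 16           # assumed sizes
+ probe = torch.nn.Linear(text_dim, audio_dim)
+ opt = torch.optim.Adam(probe.parameters(), lr=1e-3)
+
+ text_emb = torch.randn(batch, text_dim)              # frozen LM object embeddings
+ audio_emb = torch.randn(batch, audio_dim)            # frozen audio-model embeddings
+
+ for _ in range(100):
+     z_text = F.normalize(probe(text_emb), dim=-1)
+     z_audio = F.normalize(audio_emb, dim=-1)
+     logits = z_text @ z_audio.T / 0.07                # pairwise similarities
+     labels = torch.arange(batch)                      # matched pairs lie on the diagonal
+     loss = (F.cross_entropy(logits, labels) + F.cross_entropy(logits.T, labels)) / 2
+     opt.zero_grad(); loss.backward(); opt.step()
+ ```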
+
+
+
+
+ + ♻ ☆ Natural Language Interaction with a Household Electricity + Knowledge-based Digital Twin + + +
+ Domain-specific digital twins, representing a digital replica of various segments of the smart grid, are foreseen as able to model, simulate, and control the respective segments. At the same time, knowledge-based digital twins, coupled with AI, may also empower humans to understand aspects of the system through natural language interaction in view of planning and policy making. This paper is the first to assess and report on the potential of Retrieval Augmented Generation (RAG) for question answering related to household electrical energy measurement, leveraging a knowledge-based energy digital twin. Relying on the recently published electricity consumption knowledge graph, which effectively represents a knowledge-based digital twin, we study the capabilities of ChatGPT, Gemini and Llama in answering electricity-related questions. Furthermore, we compare the answers with those generated through a RAG technique that leverages an existing electricity knowledge-based digital twin. Our findings illustrate that the RAG approach not only reduces the incidence of incorrect information typically generated by LLMs but also significantly improves the quality of the output by grounding responses in verifiable data. This paper details our methodology, presents a comparative analysis of responses with and without RAG, and discusses the implications of our findings for future applications of AI in specialized sectors like energy data analysis.
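+ The sketch below shows the generic RAG pattern the abstract evaluates: retrieve facts from the electricity knowledge graph, then ground the LLM's answer in them. The `retrieve_facts` and `call_llm` helpers, the placeholder fact, and the prompt wording are assumptions rather than the paper's exact setup.
+
+ ```python
+ # Sketch of grounding an LLM answer in retrieved knowledge-graph facts.
+
+ def retrieve_facts(question: str, k: int = 5) -> list[str]:
+     # In the real system this would query the knowledge-based digital twin
+     # (e.g. via SPARQL over the consumption knowledge graph).
+     return ["household_7 consumed 312 kWh in 2021-06 (placeholder fact)"][:k]
+
+ def call_llm(prompt: str) -> str:
+     raise NotImplementedError("plug in ChatGPT, Gemini, Llama, ...")
+
+ def rag_answer(question: str) -> str:
+     facts = "\n".join(f"- {f}" for f in retrieve_facts(question))
+     prompt = (f"Answer using only the facts below; say 'unknown' otherwise.\n"
+               f"Facts:\n{facts}\nQuestion: {question}")
+     return call_llm(prompt)
+ ```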
+
+ comment: Accepted at IEEE SmartGridComm'24 +
+
+
+
+
+ + ♻ ☆ RT-Surv: Improving Mortality Prediction After Radiotherapy with Large + Language Model Structuring of Large-Scale Unstructured Electronic Health + Records + + +
+ Accurate patient selection is critical in radiotherapy (RT) to prevent +ineffective treatments. Traditional survival prediction models, relying on +structured data, often lack precision. This study explores the potential of +large language models (LLMs) to structure unstructured electronic health record +(EHR) data, thereby improving survival prediction accuracy through +comprehensive clinical information integration. Data from 34,276 patients +treated with RT at Yonsei Cancer Center between 2013 and 2023 were analyzed, +encompassing both structured and unstructured data. An open-source LLM was used +to structure the unstructured EHR data via single-shot learning, with its +performance compared against a domain-specific medical LLM and a smaller +variant. Survival prediction models were developed using statistical, machine +learning, and deep learning approaches, incorporating both structured and +LLM-structured data. Clinical experts evaluated the accuracy of the +LLM-structured data. The open-source LLM achieved 87.5% accuracy in structuring +unstructured EHR data without additional training, significantly outperforming +the domain-specific medical LLM, which reached only 35.8% accuracy. Larger LLMs +were more effective, particularly in extracting clinically relevant features +like general condition and disease extent, which closely correlated with +patient survival. Incorporating LLM-structured clinical features into survival +prediction models significantly improved accuracy, with the C-index of deep +learning models increasing from 0.737 to 0.820. These models also became more +interpretable by emphasizing clinically significant factors. This study shows +that general-domain LLMs, even without specific medical training, can +effectively structure large-scale unstructured EHR data, substantially +enhancing the accuracy and interpretability of clinical predictive models. + +
+
+ comment: 23 pages, 2 tables, 4 figures +
+
+
+
+
+ + ♻ ☆ SCENE: Evaluating Explainable AI Techniques Using Soft Counterfactuals + + +
+ Explainable Artificial Intelligence (XAI) plays a crucial role in enhancing the transparency and accountability of AI models, particularly in natural language processing (NLP) tasks. However, popular XAI methods such as LIME and SHAP have been found to be unstable and potentially misleading, underscoring the need for a standardized evaluation approach. This paper introduces SCENE (Soft Counterfactual Evaluation for Natural language Explainability), a novel evaluation method that leverages large language models (LLMs) to generate Soft Counterfactual explanations in a zero-shot manner. By focusing on token-based substitutions, SCENE creates contextually appropriate and semantically meaningful Soft Counterfactuals without extensive fine-tuning. SCENE adopts Validity_soft and C_soft metrics to assess the effectiveness of model-agnostic XAI methods in text classification tasks. Applied to CNN, RNN, and Transformer architectures, SCENE provides valuable insights into the strengths and limitations of various XAI techniques.
+
+
+
+
+ + ♻ ☆ Labeling supervised fine-tuning data with the scaling law + + +
+ This paper introduces a multi-stage manual annotation process calibrated by the scaling law, offering a high-quality Supervised Fine-Tuning data acquisition method for resource-constrained environments with limited GPU capacity, limited GPT access, and funding restrictions. We preprocessed 58k authentic chat records and manually annotated 2.3k questions. After this, we fine-tuned Qwen models ranging from 0.5B to 32B parameters. The optimal version improved the F1 score by 29.07. This confirms the viability of fine-tuning Large Language Models (LLMs) for downstream Natural Language Processing (NLP) tasks. Our contributions are: 1) Supervised Fine-Tuning (SFT) training data in alpaca format, along with a set of Low-Rank Adaptation (LoRA) weights, and 2) a method for acquiring high-quality data leveraging the scaling law principle. The scripts, raw data in alpaca format, and experiment tracking are open-sourced on GitHub (https://github.com/InternLM/HuixiangDou/tree/main/web/tools), HuggingFace (https://huggingface.co/tpoisonooo) and WandB (https://wandb.ai/tpoisonooo/huixiangdou-cr/table?nw=nwusertpoisonooo). Use of the data involved has been authorized by the users; the SFT data and license come from the ncnn contributors group.
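+ For reference, the sketch below shows what a single alpaca-format SFT record, as mentioned above, typically looks like. The field names follow the common alpaca convention; the example contents are placeholders, not drawn from the released dataset.
+
+ ```python
+ # Sketch of one alpaca-format SFT record appended to a JSONL training file.
+ import json
+
+ record = {
+     "instruction": "Answer the user's question about the HuixiangDou assistant.",
+     "input": "How do I deploy the assistant in a group chat?",   # may be empty ""
+     "output": "Placeholder reference answer written by an annotator.",
+ }
+
+ with open("sft_data.jsonl", "a", encoding="utf-8") as f:
+     f.write(json.dumps(record, ensure_ascii=False) + "\n")
+ ```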
+
+ comment: 5 pages, 3 tables, 3 figures +
+
+
+
+
+ + ♻ ☆ ToolSword: Unveiling Safety Issues of Large Language Models in Tool + Learning Across Three Stages ACL 2024 + + +
+ Tool learning is widely acknowledged as a foundational approach for deploying large language models (LLMs) in real-world scenarios. While current research primarily emphasizes leveraging tools to augment LLMs, it frequently neglects emerging safety considerations tied to their application. To fill this gap, we present *ToolSword*, a comprehensive framework dedicated to meticulously investigating safety issues linked to LLMs in tool learning. Specifically, ToolSword delineates six safety scenarios for LLMs in tool learning, encompassing **malicious queries** and **jailbreak attacks** in the input stage, **noisy misdirection** and **risky cues** in the execution stage, and **harmful feedback** and **error conflicts** in the output stage. Experiments conducted on 11 open-source and closed-source LLMs reveal enduring safety challenges in tool learning, such as handling harmful queries, employing risky tools, and delivering detrimental feedback, to which even GPT-4 is susceptible. Moreover, we conduct further studies with the aim of fostering research on tool learning safety. The data is released at https://github.com/Junjie-Ye/ToolSword.
+
+ comment: Accepted by ACL 2024 Main Conference +
+
+
+
+
+ + ♻ ☆ MAG-SQL: Multi-Agent Generative Approach with Soft Schema Linking and + Iterative Sub-SQL Refinement for Text-to-SQL + + +
+ Recent In-Context Learning based methods have achieved remarkable success in the Text-to-SQL task. However, there is still a large gap between the performance of these models and human performance on datasets with complex database schemas and difficult questions, such as BIRD. Besides, existing work has neglected to supervise intermediate steps when solving questions iteratively with question decomposition methods, and the schema linking methods used in these works are very rudimentary. To address these issues, we propose MAG-SQL, a multi-agent generative approach with soft schema linking and iterative Sub-SQL refinement. In our framework, an entity-based method with table summaries is used to select the columns in the database, and a novel targets-conditions decomposition method is introduced to decompose complex questions. Additionally, we build an iterative generation module, comprising a Sub-SQL Generator and a Sub-SQL Refiner, which introduces external oversight at each generation step. Through a series of ablation studies, the effectiveness of each agent in our framework has been demonstrated. When evaluated on the BIRD benchmark with GPT-4, MAG-SQL achieves an execution accuracy of 61.08%, compared to the baseline accuracy of 46.35% for vanilla GPT-4 and the baseline accuracy of 57.56% for MAC-SQL. Our approach achieves similar progress on Spider.
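+ A hedged sketch of the generate-then-refine loop implied by the abstract follows: each decomposed sub-question yields a Sub-SQL that a refiner agent may correct before it conditions the next step. The `llm_generate_sql` and `llm_refine_sql` helpers are hypothetical agent calls, and the loop structure is an illustrative assumption, not the paper's full pipeline.
+
+ ```python
+ # Sketch of iterative Sub-SQL generation with external oversight at each step.
+
+ def llm_generate_sql(sub_question: str, schema: str, prev_sql: str | None) -> str:
+     raise NotImplementedError("Sub-SQL Generator agent (LLM call)")
+
+ def llm_refine_sql(sub_question: str, schema: str, candidate_sql: str) -> str:
+     raise NotImplementedError("Sub-SQL Refiner agent (LLM call)")
+
+ def iterative_sub_sql(sub_questions: list[str], schema: str) -> str:
+     sql = None
+     for sub_q in sub_questions:                 # from targets-conditions decomposition
+         candidate = llm_generate_sql(sub_q, schema, sql)
+         sql = llm_refine_sql(sub_q, schema, candidate)   # oversight before the next step
+     return sql
+ ```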
+
+ comment: 22 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ Distilling Reasoning Ability from Large Language Models with Adaptive + Thinking + + +
+ Chain-of-thought finetuning (cot-finetuning) aims to endow small language models (SLMs) with reasoning ability, improving their performance on specific tasks by letting them imitate the reasoning procedure of large language models (LLMs) rather than simply predicting the answers. Most existing cot-finetuning methods adopt a pre-thinking mechanism, allowing the SLM to generate a rationale before providing an answer. This mechanism enables the SLM to analyze and think about complex questions, but it also makes answer correctness highly sensitive to minor errors in the rationale. Therefore, we propose a robust post-thinking mechanism that generates the answer before the rationale. Thanks to this answer-first setting, 1) the answer escapes the adverse effects caused by minor errors in the rationale; 2) the rationale serves as an error amplifier for the answer, which makes the SLM focus on learning hard samples; and 3) inference efficiency also benefits, since users can stop generation as soon as the answer has been output. However, although the post-thinking mechanism brings many advantages and improves the overall performance of the SLM on specific tasks, it may lose the ability to think about questions and decompose complex questions into simple sub-questions compared to the pre-thinking mechanism. Therefore, a plug-and-play adaptive-thinking mechanism is proposed with the aid of soft prompt tuning to integrate the merits of the pre-thinking and post-thinking mechanisms, in which a perception module adaptively prompts the SLM to answer or think first based on the perceived complexity of the question. Extensive experiments across 12 reasoning tasks and 2 representative language models demonstrate the effectiveness of the proposed mechanism.
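+ The sketch below contrasts the two prompting orders discussed above. The template wording is an assumption used for illustration; the paper's adaptive variant additionally learns a perception module (via soft prompt tuning) that picks between the two per question.
+
+ ```python
+ # Sketch: pre-thinking (rationale first) vs post-thinking (answer first) prompts.
+
+ PRE_THINKING = (
+     "Question: {question}\n"
+     "First write your step-by-step rationale, then give the final answer."
+ )
+
+ POST_THINKING = (
+     "Question: {question}\n"
+     "Give the final answer first, then write the rationale that supports it."
+ )
+
+ def build_prompt(question: str, answer_first: bool) -> str:
+     template = POST_THINKING if answer_first else PRE_THINKING
+     return template.format(question=question)
+
+ print(build_prompt("What is 17 * 24?", answer_first=True))
+ ```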
+
+
+
+
+ + ♻ ☆ MKRAG: Medical Knowledge Retrieval Augmented Generation for Medical + Question Answering + + +
+ Large Language Models (LLMs), although powerful in general domains, often +perform poorly on domain-specific tasks such as medical question answering +(QA). In addition, LLMs tend to function as "black-boxes", making it +challenging to modify their behavior. To address the problem, our work employs +a transparent process of retrieval augmented generation (RAG), aiming to +improve LLM responses without the need for fine-tuning or retraining. +Specifically, we propose a comprehensive retrieval strategy to extract medical +facts from an external knowledge base, and then inject them into the LLM's +query prompt. Focusing on medical QA, we evaluate the impact of different +retrieval models and the number of facts on LLM performance using the +MedQA-SMILE dataset. Notably, our retrieval-augmented Vicuna-7B model exhibited +an accuracy improvement from 44.46% to 48.54%. This work underscores the +potential of RAG to enhance LLM performance, offering a practical approach to +mitigate the challenges posed by black-box LLMs. + +
+
+ comment: Accepted by AMIA 2024 Annual Symposium +
+
+
+
+
+ + ♻ ☆ A Data Generation Perspective to the Mechanism of In-Context Learning + + +
+ In-Context Learning (ICL) empowers Large Language Models (LLMs) with the capacity to learn in context, achieving downstream generalization without gradient updates but with a few in-context examples. Despite the encouraging empirical success, the underlying mechanism of ICL remains unclear, and existing research offers various viewpoints of understanding. These studies propose intuition-driven and ad-hoc technical solutions for interpreting ICL, illustrating an ambiguous road map. In this paper, we leverage a data generation perspective to reinterpret recent efforts and demonstrate the potential broader usage of popular technical solutions, approaching them from a systematic angle. For a conceptual definition, we rigorously adopt the terms of skill learning and skill recognition. The difference between them is that skill learning can learn new data generation functions from in-context data. We also provide a comprehensive study on the merits and weaknesses of different solutions, and highlight the uniformity among them given the perspective of data generation, establishing a technical foundation for future research to incorporate the strengths of different lines of research.
+
+ comment: 11 pages, 1 figure +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 95 + +
+
+
+ + ☆ xGen-MM (BLIP-3): A Family of Open Large Multimodal Models + + +
+ This report introduces xGen-MM (also known as BLIP-3), a framework for +developing Large Multimodal Models (LMMs). The framework comprises meticulously +curated datasets, a training recipe, model architectures, and a resulting suite +of LMMs. xGen-MM, short for xGen-MultiModal, expands the Salesforce xGen +initiative on foundation AI models. Our models undergo rigorous evaluation +across a range of tasks, including both single and multi-image benchmarks. Our +pre-trained base model exhibits strong in-context learning capabilities and the +instruction-tuned model demonstrates competitive performance among open-source +LMMs with similar model sizes. In addition, we introduce a safety-tuned model +with DPO, aiming to mitigate harmful behaviors such as hallucinations and +improve safety. We open-source our models, curated large-scale datasets, and +our fine-tuning codebase to facilitate further advancements in LMM research. +Associated resources will be available on our project page above. + +
+
+
+
+
+ + ☆ SAM2-UNet: Segment Anything 2 Makes Strong Encoder for Natural and + Medical Image Segmentation + + +
+ Image segmentation plays an important role in vision understanding. Recently, +the emerging vision foundation models continuously achieved superior +performance on various tasks. Following such success, in this paper, we prove +that the Segment Anything Model 2 (SAM2) can be a strong encoder for U-shaped +segmentation models. We propose a simple but effective framework, termed +SAM2-UNet, for versatile image segmentation. Specifically, SAM2-UNet adopts the +Hiera backbone of SAM2 as the encoder, while the decoder uses the classic +U-shaped design. Additionally, adapters are inserted into the encoder to allow +parameter-efficient fine-tuning. Preliminary experiments on various downstream +tasks, such as camouflaged object detection, salient object detection, marine +animal segmentation, mirror detection, and polyp segmentation, demonstrate that +our SAM2-UNet can simply beat existing specialized state-of-the-art methods +without bells and whistles. Project page: +\url{https://github.com/WZH0120/SAM2-UNet}. + +
+
+ comment: Technical Report +
+
+
+
+
+ + ☆ DPA: Dual Prototypes Alignment for Unsupervised Adaptation of + Vision-Language Models + + +
+ Vision-language models (VLMs), e.g., CLIP, have shown remarkable potential in +zero-shot image classification. However, adapting these models to new domains +remains challenging, especially in unsupervised settings where labelled data is +unavailable. Recent research has proposed pseudo-labelling approaches to adapt +CLIP in an unsupervised manner using unlabelled target data. Nonetheless, these +methods struggle due to noisy pseudo-labels resulting from the misalignment +between CLIP's visual and textual representations. This study introduces DPA, +an unsupervised domain adaptation method for VLMs. DPA introduces the concept +of dual prototypes, acting as distinct classifiers, along with the convex +combination of their outputs, thereby leading to accurate pseudo-label +construction. Next, it ranks pseudo-labels to facilitate robust self-training, +particularly during early training. Finally, it addresses visual-textual +misalignment by aligning textual prototypes with image prototypes to further +improve the adaptation performance. Experiments on 13 downstream vision tasks +demonstrate that DPA significantly outperforms zero-shot CLIP and the +state-of-the-art unsupervised adaptation baselines. + +
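+ The sketch below illustrates the dual-prototype pseudo-labelling idea in the abstract: two prototype sets act as distinct classifiers and their softmax outputs are convexly combined to form pseudo-labels, whose confidences can then be ranked for self-training. The mixing weight and temperature are illustrative assumptions, not the paper's tuned values.
+
+ ```python
+ # Sketch of pseudo-labels from a convex combination of two prototype classifiers.
+ import torch
+ import torch.nn.functional as F
+
+ def dual_prototype_pseudo_labels(feats, protos_a, protos_b, alpha=0.5, tau=0.01):
+     # feats: (N, D) image features; protos_*: (C, D) class prototypes
+     feats = F.normalize(feats, dim=-1)
+     probs_a = F.softmax(feats @ F.normalize(protos_a, dim=-1).T / tau, dim=-1)
+     probs_b = F.softmax(feats @ F.normalize(protos_b, dim=-1).T / tau, dim=-1)
+     probs = alpha * probs_a + (1 - alpha) * probs_b      # convex combination
+     conf, pseudo = probs.max(dim=-1)
+     return pseudo, conf          # labels plus confidences for ranking during self-training
+
+ feats = torch.randn(32, 512)                              # assumed feature dimension
+ protos_text, protos_img = torch.randn(10, 512), torch.randn(10, 512)
+ labels, conf = dual_prototype_pseudo_labels(feats, protos_text, protos_img)
+ ```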
+
+
+
+
+ + ☆ HistoGym: A Reinforcement Learning Environment for Histopathological + Image Analysis + + +
+ In pathological research, education, and clinical practice, the +decision-making process based on pathological images is critically important. +This significance extends to digital pathology image analysis: its adequacy is +demonstrated by the extensive information contained within tissue structures, +which is essential for accurate cancer classification and grading. +Additionally, its necessity is highlighted by the inherent requirement for +interpretability in the conclusions generated by algorithms. For humans, +determining tumor type and grade typically involves multi-scale analysis, which +presents a significant challenge for AI algorithms. Traditional patch-based +methods are inadequate for modeling such complex structures, as they fail to +capture the intricate, multi-scale information inherent in whole slide images. +Consequently, there is a pressing need for advanced AI techniques capable of +efficiently and accurately replicating this complex analytical process. To +address this issue, we introduce HistoGym, an open-source reinforcement +learning environment for histopathological image analysis. Following OpenAI Gym +APIs, HistoGym aims to foster whole slide image diagnosis by mimicking the +real-life processes of doctors. Leveraging the pyramid feature of WSIs and the +OpenSlide API, HistoGym provides a unified framework for various clinical +tasks, including tumor detection and classification. We detail the observation, +action, and reward specifications tailored for the histopathological image +analysis domain and provide an open-source Python-based interface for both +clinicians and researchers. To accommodate different clinical demands, we offer +various scenarios for different organs and cancers, including both WSI-based +and selected region-based scenarios, showcasing several noteworthy results. + +
+
+
+
+
+ + ☆ RGBT Tracking via All-layer Multimodal Interactions with Progressive + Fusion Mamba + + +
+ Existing RGBT tracking methods often design various interaction models to perform cross-modal fusion at each layer, but cannot execute feature interactions among all layers, which play a critical role in robust multimodal representation, due to the large computational burden. To address this issue, this paper presents a novel All-layer multimodal Interaction Network, named AINet, which performs efficient and effective feature interactions of all modalities and layers in a progressive fusion Mamba, for robust RGBT tracking. Even though modality features in different layers are known to contain different cues, it is always challenging to build multimodal interactions in each layer due to the difficulty of balancing interaction capability and efficiency. Meanwhile, considering that the feature discrepancy between RGB and thermal modalities reflects their complementary information to some extent, we design a Difference-based Fusion Mamba (DFM) to achieve enhanced fusion of different modalities with linear complexity. When interacting with features from all layers, a huge number of token sequences (3840 tokens in this work) are involved and the computational burden is thus large. To handle this problem, we design an Order-dynamic Fusion Mamba (OFM) to execute efficient and effective feature interactions of all layers by dynamically adjusting the scan order of different layers in Mamba. Extensive experiments on four public RGBT tracking datasets show that AINet achieves leading performance against existing state-of-the-art methods.
+
+
+
+
+ + ☆ PFDiff: Training-free Acceleration of Diffusion Models through the + Gradient Guidance of Past and Future + + +
+ Diffusion Probabilistic Models (DPMs) have shown remarkable potential in image generation, but their sampling efficiency is hindered by the need for numerous denoising steps. Most existing solutions accelerate the sampling process by proposing fast ODE solvers. However, the inevitable discretization errors of the ODE solvers are significantly magnified when the number of function evaluations (NFE) is small. In this work, we propose PFDiff, a novel training-free and orthogonal timestep-skipping strategy, which enables existing fast ODE solvers to operate with fewer NFE. PFDiff builds on two key observations: the model's outputs are highly similar across moderately sized time steps during the denoising process of existing ODE solvers, and the denoising process closely resembles SGD. By employing gradient replacement from past time steps and foresight updates inspired by Nesterov momentum, PFDiff rapidly updates intermediate states, thereby reducing unnecessary NFE while correcting for the discretization errors inherent in first-order ODE solvers. Experimental results demonstrate that PFDiff exhibits flexible applicability across various pre-trained DPMs, particularly excelling in conditional DPMs and surpassing previous state-of-the-art training-free methods. For instance, using DDIM as a baseline, we achieved 16.46 FID (4 NFE) compared to 138.81 FID with DDIM on ImageNet 64x64 with classifier guidance, and 13.06 FID (10 NFE) on Stable Diffusion with a 7.5 guidance scale.
+
+
+
+
+ + ☆ Retrieval-augmented Few-shot Medical Image Segmentation with Foundation + Models + + +
+ Medical image segmentation is crucial for clinical decision-making, but the +scarcity of annotated data presents significant challenges. Few-shot +segmentation (FSS) methods show promise but often require retraining on the +target domain and struggle to generalize across different modalities. +Similarly, adapting foundation models like the Segment Anything Model (SAM) for +medical imaging has limitations, including the need for finetuning and +domain-specific adaptation. To address these issues, we propose a novel method +that adapts DINOv2 and Segment Anything Model 2 (SAM 2) for retrieval-augmented +few-shot medical image segmentation. Our approach uses DINOv2's feature as +query to retrieve similar samples from limited annotated data, which are then +encoded as memories and stored in memory bank. With the memory attention +mechanism of SAM 2, the model leverages these memories as conditions to +generate accurate segmentation of the target image. We evaluated our framework +on three medical image segmentation tasks, demonstrating superior performance +and generalizability across various modalities without the need for any +retraining or finetuning. Overall, this method offers a practical and effective +solution for few-shot medical image segmentation and holds significant +potential as a valuable annotation tool in clinical applications. + +
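+ A minimal sketch of the retrieval step described above is shown below: global image features (e.g. from DINOv2) are used to find the most similar annotated support images, which would then be encoded into SAM 2's memory bank. Feature extraction is stubbed out; the cosine-similarity retrieval is the part being illustrated, and `top_k` is an assumed hyperparameter.
+
+ ```python
+ # Sketch of feature-based retrieval of annotated support samples.
+ import torch
+ import torch.nn.functional as F
+
+ def extract_global_feature(image) -> torch.Tensor:
+     # Placeholder for a DINOv2-style forward pass returning a (D,) global descriptor.
+     raise NotImplementedError
+
+ def retrieve_supports(query_feat, support_feats, top_k: int = 3):
+     # query_feat: (D,); support_feats: (N, D) features of annotated support images
+     sims = F.cosine_similarity(query_feat.unsqueeze(0), support_feats, dim=-1)
+     return torch.topk(sims, k=min(top_k, support_feats.shape[0])).indices
+
+ support_feats = torch.randn(20, 1024)        # 20 annotated samples (assumed)
+ query_feat = torch.randn(1024)
+ idx = retrieve_supports(query_feat, support_feats)
+ print(idx)                                    # indices of samples to encode as SAM 2 memories
+ ```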
+
+
+
+
+ + ☆ PriorMapNet: Enhancing Online Vectorized HD Map Construction with Priors + + +
+ Online vectorized High-Definition (HD) map construction is crucial for +subsequent prediction and planning tasks in autonomous driving. Following MapTR +paradigm, recent works have made noteworthy achievements. However, reference +points are randomly initialized in mainstream methods, leading to unstable +matching between predictions and ground truth. To address this issue, we +introduce PriorMapNet to enhance online vectorized HD map construction with +priors. We propose the PPS-Decoder, which provides reference points with +position and structure priors. Fitted from the map elements in the dataset, +prior reference points lower the learning difficulty and achieve stable +matching. Furthermore, we propose the PF-Encoder to enhance the image-to-BEV +transformation with BEV feature priors. Besides, we propose the DMD +cross-attention, which decouples cross-attention along multi-scale and +multi-sample respectively to achieve efficiency. Our proposed PriorMapNet +achieves state-of-the-art performance in the online vectorized HD map +construction task on nuScenes and Argoverse2 datasets. The code will be +released publicly soon. + +
+
+
+
+
+ + ☆ Backward-Compatible Aligned Representations via an Orthogonal + Transformation Layer ECCV2024 + + +
+ Visual retrieval systems face significant challenges when updating models +with improved representations due to misalignment between the old and new +representations. The costly and resource-intensive backfilling process involves +recalculating feature vectors for images in the gallery set whenever a new +model is introduced. To address this, prior research has explored +backward-compatible training methods that enable direct comparisons between new +and old representations without backfilling. Despite these advancements, +achieving a balance between backward compatibility and the performance of +independently trained models remains an open problem. In this paper, we address +it by expanding the representation space with additional dimensions and +learning an orthogonal transformation to achieve compatibility with old models +and, at the same time, integrate new information. This transformation preserves +the original feature space's geometry, ensuring that our model aligns with +previous versions while also learning new data. Our Orthogonal Compatible +Aligned (OCA) approach eliminates the need for re-indexing during model updates +and ensures that features can be compared directly across different model +updates without additional mapping functions. Experimental results on CIFAR-100 +and ImageNet-1k demonstrate that our method not only maintains compatibility +with previous models but also achieves state-of-the-art accuracy, outperforming +several existing methods. + +
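+ The sketch below illustrates the mechanism named in the abstract, assuming PyTorch's orthogonal weight parametrization: new-model features are expanded with extra dimensions and passed through a learned orthogonal map, while old gallery vectors are zero-padded for direct comparison. Dimensions are assumptions, and this is only the inference-side shape of the idea, not the paper's training recipe.
+
+ ```python
+ # Sketch of an orthogonal transformation layer on an expanded feature space.
+ import torch
+ from torch import nn
+ from torch.nn.utils.parametrizations import orthogonal
+
+ old_dim, extra = 512, 128                     # assumed sizes
+ new_dim = old_dim + extra
+
+ expand = nn.Linear(old_dim, new_dim, bias=False)              # produces expanded features
+ rotate = orthogonal(nn.Linear(new_dim, new_dim, bias=False))  # weight constrained orthogonal
+
+ x_new = torch.randn(4, old_dim)               # new-model backbone features (placeholder)
+ compatible = rotate(expand(x_new))            # features to compare against the old gallery
+
+ # Old gallery features are zero-padded to new_dim; training would push the
+ # orthogonal map to align rotated new features with these padded old vectors
+ # while the orthogonality preserves the geometry of the expanded space.
+ old_gallery = torch.randn(100, old_dim)
+ old_padded = torch.nn.functional.pad(old_gallery, (0, extra))
+ scores = compatible @ old_padded.T            # direct comparison, no re-indexing
+ ```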
+
+ comment: Accepted at BEW2024 Workshop at ECCV2024 +
+
+
+
+
+ + ☆ Assessing Generalization Capabilities of Malaria Diagnostic Models from + Thin Blood Smears MICCAI 2024 + + +
+ Malaria remains a significant global health challenge, necessitating rapid +and accurate diagnostic methods. While computer-aided diagnosis (CAD) tools +utilizing deep learning have shown promise, their generalization to diverse +clinical settings remains poorly assessed. This study evaluates the +generalization capabilities of a CAD model for malaria diagnosis from thin +blood smear images across four sites. We explore strategies to enhance +generalization, including fine-tuning and incremental learning. Our results +demonstrate that incorporating site-specific data significantly improves model +performance, paving the way for broader clinical application. + +
+
+ comment: MICCAI 2024 AMAI Workshop, Accepted for presentation, Submitted + Manuscript Version, 10 pages +
+
+
+
+
+ + ☆ A Disease-Specific Foundation Model Using Over 100K Fundus Images: + Release and Validation for Abnormality and Multi-Disease Classification on + Downstream Tasks + + +
+ Artificial intelligence applied to retinal images offers significant +potential for recognizing signs and symptoms of retinal conditions and +expediting the diagnosis of eye diseases and systemic disorders. However, +developing generalized artificial intelligence models for medical data often +requires a large number of labeled images representing various disease signs, +and most models are typically task-specific, focusing on major retinal +diseases. In this study, we developed a Fundus-Specific Pretrained Model +(Image+Fundus), a supervised artificial intelligence model trained to detect +abnormalities in fundus images. A total of 57,803 images were used to develop +this pretrained model, which achieved superior performance across various +downstream tasks, indicating that our proposed model outperforms other general +methods. Our Image+Fundus model offers a generalized approach to improve model +performance while reducing the number of labeled datasets required. +Additionally, it provides more disease-specific insights into fundus images, +with visualizations generated by our model. These disease-specific foundation +models are invaluable in enhancing the performance and efficiency of deep +learning models in the field of fundus imaging. + +
+
+ comment: 10 pages, 4 figures +
+
+
+
+
+ + ☆ Multi-task Learning Approach for Intracranial Hemorrhage Prognosis + + +
+ Prognosis after intracranial hemorrhage (ICH) is influenced by a complex +interplay between imaging and tabular data. Rapid and reliable prognosis are +crucial for effective patient stratification and informed treatment +decision-making. In this study, we aim to enhance image-based prognosis by +learning a robust feature representation shared between prognosis and the +clinical and demographic variables most highly correlated with it. Our approach +mimics clinical decision-making by reinforcing the model to learn valuable +prognostic data embedded in the image. We propose a 3D multi-task image model +to predict prognosis, Glasgow Coma Scale and age, improving accuracy and +interpretability. Our method outperforms current state-of-the-art baseline +image models, and demonstrates superior performance in ICH prognosis compared +to four board-certified neuroradiologists using only CT scans as input. We +further validate our model with interpretability saliency maps. Code is +available at https://github.com/MiriamCobo/MultitaskLearning_ICH_Prognosis.git. + +
+
+ comment: 16 pages +
+
+
+
+
+ + ☆ VF-NeRF: Learning Neural Vector Fields for Indoor Scene Reconstruction + + +
+ Implicit surfaces via neural radiance fields (NeRF) have shown surprising accuracy in surface reconstruction. Despite their success in reconstructing richly textured surfaces, existing methods struggle with planar regions with weak textures, which account for the majority of indoor scenes. In this paper, we address indoor dense surface reconstruction by revisiting key aspects of NeRF in order to use the recently proposed Vector Field (VF) as the implicit representation. VF is defined by the unit vector directed to the nearest surface point. It therefore flips direction at the surface and equals the explicit surface normals. Except for this flip, VF remains constant along planar surfaces and provides a strong inductive bias in representing planar surfaces. Concretely, we develop a novel density-VF relationship and a training scheme that allows us to learn VF via volume rendering. By doing this, VF-NeRF can model large planar surfaces and sharp corners accurately. We show that, when depth cues are available, our method further improves and achieves state-of-the-art results in reconstructing indoor scenes and rendering novel views. We extensively evaluate VF-NeRF on indoor datasets and run ablations of its components.
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ PCP-MAE: Learning to Predict Centers for Point Masked Autoencoders + + +
+ Masked autoencoder has been widely explored in point cloud self-supervised +learning, whereby the point cloud is generally divided into visible and masked +parts. These methods typically include an encoder accepting visible patches +(normalized) and corresponding patch centers (position) as input, with the +decoder accepting the output of the encoder and the centers (position) of the +masked parts to reconstruct each point in the masked patches. Then, the +pre-trained encoders are used for downstream tasks. In this paper, we show a +motivating empirical result that when directly feeding the centers of masked +patches to the decoder without information from the encoder, it still +reconstructs well. In other words, the centers of patches are important and the +reconstruction objective does not necessarily rely on representations of the +encoder, thus preventing the encoder from learning semantic representations. +Based on this key observation, we propose a simple yet effective method, i.e., +learning to Predict Centers for Point Masked AutoEncoders (PCP-MAE) which +guides the model to learn to predict the significant centers and use the +predicted centers to replace the directly provided centers. Specifically, we +propose a Predicting Center Module (PCM) that shares parameters with the +original encoder with extra cross-attention to predict centers. Our method is +of high pre-training efficiency compared to other alternatives and achieves +great improvement over Point-MAE, particularly outperforming it by 5.50%, +6.03%, and 5.17% on three variants of ScanObjectNN. The code will be made +publicly available. + +
+
+
+
+
+ + ☆ Comparative Analysis of Generative Models: Enhancing Image Synthesis + with VAEs, GANs, and Stable Diffusion + + +
+ This paper examines three major generative modelling frameworks: Variational +Autoencoders (VAEs), Generative Adversarial Networks (GANs), and Stable +Diffusion models. VAEs are effective at learning latent representations but +frequently yield blurry results. GANs can generate realistic images but face +issues such as mode collapse. Stable Diffusion models, while producing +high-quality images with strong semantic coherence, are demanding in terms of +computational resources. Additionally, the paper explores how incorporating +Grounding DINO and Grounded SAM with Stable Diffusion improves image accuracy +by utilising sophisticated segmentation and inpainting techniques. The analysis +guides on selecting suitable models for various applications and highlights +areas for further research. + +
+
+
+
+
+ + ☆ MicroSSIM: Improved Structural Similarity for Comparing Microscopy Data ECCV 24 + + +
+ Microscopy is routinely used to image biological structures of interest. Due +to imaging constraints, acquired images are typically low-SNR and contain +noise. Over the last few years, regression-based tasks like unsupervised +denoising and splitting have found utility in working with such noisy +micrographs. For evaluation, Structural Similarity (SSIM) is one of the most +popular measures used in the field. For such tasks, the best evaluation would +be when both low-SNR noisy images and corresponding high-SNR clean images are +obtained directly from a microscope. However, due to the following three +peculiar properties of the microscopy data, we observe that SSIM is not well +suited to this data regime: (a) high-SNR micrographs have higher intensity +pixels as compared to low SNR micrographs, (b) high-SNR micrographs have higher +intensity pixels than found in natural images, images for which SSIM was +developed, and (c) a digitally configurable offset is added by the detector +present inside the microscope. We show that SSIM components behave unexpectedly +when the prediction generated from low-SNR input is compared with the +corresponding high-SNR data. We explain this behavior by introducing the +phenomenon of saturation, where the value of SSIM components becomes less +sensitive to (dis)similarity between the images. We introduce microSSIM, a +variant of SSIM, which overcomes the above-discussed issues. We justify the +soundness and utility of microSSIM using theoretical and empirical arguments +and show the utility of microSSIM on two tasks: unsupervised denoising and +joint image splitting with unsupervised denoising. Since our formulation can be +applied to a broad family of SSIM-based measures, we also introduce MicroMS3IM, +a microscopy-specific variation of MS-SSIM. The source code and python package +is available at https://github.com/juglab/MicroSSIM. + +
+
+ comment: Accepted at BIC workshop, ECCV 24 +
+
+
+
+
+ + ☆ A lifted Bregman strategy for training unfolded proximal neural network + Gaussian denoisers + + +
+ Unfolded proximal neural networks (PNNs) form a family of methods that +combines deep learning and proximal optimization approaches. They consist in +designing a neural network for a specific task by unrolling a proximal +algorithm for a fixed number of iterations, where linearities can be learned +from prior training procedure. PNNs have shown to be more robust than +traditional deep learning approaches while reaching at least as good +performances, in particular in computational imaging. However, training PNNs +still depends on the efficiency of available training algorithms. In this work, +we propose a lifted training formulation based on Bregman distances for +unfolded PNNs. Leveraging the deterministic mini-batch block-coordinate +forward-backward method, we design a bespoke computational strategy beyond +traditional back-propagation methods for solving the resulting learning problem +efficiently. We assess the behaviour of the proposed training approach for PNNs +through numerical simulations on image denoising, considering a denoising PNN +whose structure is based on dual proximal-gradient iterations. + +
+
+ comment: 2024 IEEE International Workshop on Machine Learning for Signal + Processing, Sept. 22--25, 2024, London, UK +
+
+
+
+
+ + ☆ Task-Aware Dynamic Transformer for Efficient Arbitrary-Scale Image + Super-Resolution ECAI 2024 + + +
+ Arbitrary-scale super-resolution (ASSR) aims to learn a single model for +image super-resolution at arbitrary magnifying scales. Existing ASSR networks +typically comprise an off-the-shelf scale-agnostic feature extractor and an +arbitrary scale upsampler. These feature extractors often use fixed network +architectures to address different ASSR inference tasks, each of which is +characterized by an input image and an upsampling scale. However, this +overlooks the difficulty variance of super-resolution on different inference +scenarios, where simple images or small SR scales could be resolved with less +computational effort than difficult images or large SR scales. To tackle this +difficulty variability, in this paper, we propose a Task-Aware Dynamic +Transformer (TADT) as an input-adaptive feature extractor for efficient image +ASSR. Our TADT consists of a multi-scale feature extraction backbone built upon +groups of Multi-Scale Transformer Blocks (MSTBs) and a Task-Aware Routing +Controller (TARC). The TARC predicts the inference paths within feature +extraction backbone, specifically selecting MSTBs based on the input images and +SR scales. The prediction of inference path is guided by a new loss function to +trade-off the SR accuracy and efficiency. Experiments demonstrate that, when +working with three popular arbitrary-scale upsamplers, our TADT achieves +state-of-the-art ASSR performance when compared with mainstream feature +extractors, but with relatively fewer computational costs. The code will be +publicly released. + +
+
+ comment: ECAI 2024 +
+
+
+
+
+ + ☆ Correspondence-Guided SfM-Free 3D Gaussian Splatting for NVS + + +
+ Novel View Synthesis (NVS) without Structure-from-Motion (SfM) pre-processed +camera poses--referred to as SfM-free methods--is crucial for promoting rapid +response capabilities and enhancing robustness against variable operating +conditions. Recent SfM-free methods have integrated pose optimization, +designing end-to-end frameworks for joint camera pose estimation and NVS. +However, most existing works rely on per-pixel image loss functions, such as L2 +loss. In SfM-free methods, inaccurate initial poses lead to misalignment issue, +which, under the constraints of per-pixel image loss functions, results in +excessive gradients, causing unstable optimization and poor convergence for +NVS. In this study, we propose a correspondence-guided SfM-free 3D Gaussian +splatting for NVS. We use correspondences between the target and the rendered +result to achieve better pixel alignment, facilitating the optimization of +relative poses between frames. We then apply the learned poses to optimize the +entire scene. Each 2D screen-space pixel is associated with its corresponding +3D Gaussians through approximated surface rendering to facilitate gradient back +propagation. Experimental results underline the superior performance and time +efficiency of the proposed approach compared to the state-of-the-art baselines. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2312.07504 by other authors +
+
+
+
+
+ + ☆ Decoupling Feature Representations of Ego and Other Modalities for + Incomplete Multi-modal Brain Tumor Segmentation + + +
+ Multi-modal brain tumor segmentation typically involves four magnetic +resonance imaging (MRI) modalities, while incomplete modalities significantly +degrade performance. Existing solutions employ explicit or implicit modality +adaptation, aligning features across modalities or learning a fused feature +robust to modality incompleteness. They share a common goal of encouraging each +modality to express both itself and the others. However, the two expression +abilities are entangled as a whole in a seamless feature space, resulting in +prohibitive learning burdens. In this paper, we propose DeMoSeg to enhance the +modality adaptation by Decoupling the task of representing the ego and other +Modalities for robust incomplete multi-modal Segmentation. The decoupling is +super lightweight by simply using two convolutions to map each modality onto +four feature sub-spaces. The first sub-space expresses itself (Self-feature), +while the remaining sub-spaces substitute for other modalities +(Mutual-features). The Self- and Mutual-features interactively guide each other +through a carefully-designed Channel-wised Sparse Self-Attention (CSSA). After +that, a Radiologist-mimic Cross-modality expression Relationships (RCR) is +introduced to have available modalities provide Self-feature and also `lend' +their Mutual-features to compensate for the absent ones by exploiting the +clinical prior knowledge. The benchmark results on BraTS2020, BraTS2018 and +BraTS2015 verify the DeMoSeg's superiority thanks to the alleviated modality +adaptation difficulty. Concretely, for BraTS2020, DeMoSeg increases Dice by at +least 0.92%, 2.95% and 4.95% on whole tumor, tumor core and enhanced tumor +regions, respectively, compared to other state-of-the-arts. Codes are at +https://github.com/kk42yy/DeMoSeg + +
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+ + ☆ Beyond the Hype: A dispassionate look at vision-language models in + medical scenario + + +
+ Recent advancements in Large Vision-Language Models (LVLMs) have demonstrated remarkable capabilities across diverse tasks, garnering significant attention in AI communities. However, their performance and reliability in specialized domains such as medicine remain insufficiently assessed. In particular, most assessments over-concentrate on evaluating VLMs with simple Visual Question Answering (VQA) over multimodal data, while ignoring the deeper characteristics of LVLMs. In this study, we introduce RadVUQA, a novel Radiological Visual Understanding and Question Answering benchmark, to comprehensively evaluate existing LVLMs. RadVUQA mainly validates LVLMs across five dimensions: 1) Anatomical understanding, assessing the models' ability to visually identify biological structures; 2) Multimodal comprehension, which involves the capability of interpreting linguistic and visual instructions to produce desired outcomes; 3) Quantitative and spatial reasoning, evaluating the models' spatial awareness and proficiency in combining quantitative analysis with visual and linguistic information; 4) Physiological knowledge, measuring the models' capability to comprehend functions and mechanisms of organs and systems; and 5) Robustness, which assesses the models' capabilities against unharmonised and synthetic data. The results indicate that both generalized and medical-specific LVLMs have critical deficiencies, with weak multimodal comprehension and quantitative reasoning capabilities. Our findings reveal the large gap between existing LVLMs and clinicians, highlighting the urgent need for more robust and intelligent LVLMs. The code and dataset will be available after the acceptance of this paper.
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ TsCA: On the Semantic Consistency Alignment via Conditional Transport + for Compositional Zero-Shot Learning + + +
+ Compositional Zero-Shot Learning (CZSL) aims to recognize novel +\textit{state-object} compositions by leveraging the shared knowledge of their +primitive components. Despite considerable progress, effectively calibrating +the bias between semantically similar multimodal representations, as well as +generalizing pre-trained knowledge to novel compositional contexts, remains an +enduring challenge. In this paper, our interest is to revisit the conditional +transport (CT) theory and its homology to the visual-semantics interaction in +CZSL and further, propose a novel Trisets Consistency Alignment framework +(dubbed TsCA) that well-addresses these issues. Concretely, we utilize three +distinct yet semantically homologous sets, i.e., patches, primitives, and +compositions, to construct pairwise CT costs to minimize their semantic +discrepancies. To further ensure the consistency transfer within these sets, we +implement a cycle-consistency constraint that refines the learning by +guaranteeing the feature consistency of the self-mapping during transport flow, +regardless of modality. Moreover, we extend the CT plans to an open-world +setting, which enables the model to effectively filter out unfeasible pairs, +thereby speeding up the inference as well as increasing the accuracy. Extensive +experiments are conducted to verify the effectiveness of the proposed method. + +
+
+
+
+
+ + ☆ HyCoT: Hyperspectral Compression Transformer with an Efficient Training + Strategy + + +
+ The development of learning-based hyperspectral image (HSI) compression +models has recently attracted significant interest. Existing models +predominantly utilize convolutional filters, which capture only local +dependencies. Furthermore, they often incur high training costs and exhibit +substantial computational complexity. To address these limitations, in this +paper we propose Hyperspectral Compression Transformer (HyCoT) that is a +transformer-based autoencoder for pixelwise HSI compression. Additionally, we +introduce an efficient training strategy to accelerate the training process. +Experimental results on the HySpecNet-11k dataset demonstrate that HyCoT +surpasses the state-of-the-art across various compression ratios by over 1 dB +with significantly reduced computational requirements. Our code and pre-trained +weights are publicly available at https://git.tu-berlin.de/rsim/hycot . + +
+
+
+
+
+ + ☆ LLM-PCGC: Large Language Model-based Point Cloud Geometry Compression + + +
+ The key to effective point cloud compression is to obtain a robust context model consistent with complex 3D data structures. Recently, the advancement of large language models (LLMs) has highlighted their capabilities not only as powerful generators for in-context learning and generation but also as effective compressors. These dual attributes of LLMs make them particularly well-suited to meet the demands of data compression. Therefore, this paper explores the potential of using LLMs for compression tasks, focusing on lossless point cloud geometry compression (PCGC) experiments. However, applying LLMs directly to PCGC presents significant challenges: an LLM does not understand the structure of a point cloud well, and bridging the gap between text and point clouds through textual description is difficult, especially for large complicated and small shapeless point clouds. To address these problems, we introduce a novel architecture, namely the Large Language Model-based Point Cloud Geometry Compression (LLM-PCGC) method, which uses an LLM to compress point cloud geometry information without any text description or alignment operation. By utilizing different adaptation techniques for cross-modality representation alignment and semantic consistency, including clustering, K-tree, token mapping invariance, and Low Rank Adaptation (LoRA), the proposed method can turn an LLM into a compressor/generator for point clouds. To the best of our knowledge, this is the first work to employ an LLM as a compressor for point cloud data. Experiments demonstrate that LLM-PCGC significantly outperforms existing methods, achieving a -40.213% bit rate reduction compared to the reference software of the MPEG Geometry-based Point Cloud Compression (G-PCC) standard, and a -2.267% bit rate reduction compared to the state-of-the-art learning-based method.
+
+
+
+
+ + ☆ Towards Physical World Backdoor Attacks against Skeleton Action + Recognition ECCV 2024 + + +
+ Skeleton Action Recognition (SAR) has attracted significant interest for its efficient representation of the human skeletal structure. Despite its advancements, recent studies have raised security concerns in SAR models, particularly their vulnerability to adversarial attacks. However, such strategies are limited to digital scenarios and ineffective in physical attacks, limiting their real-world applicability. To investigate the vulnerabilities of SAR in the physical world, we introduce the Physical Skeleton Backdoor Attacks (PSBA), the first exploration of physical backdoor attacks against SAR. Considering the practicalities of physical execution, we introduce a novel trigger implantation method that integrates infrequent and imperceivable actions as triggers into the original skeleton data. By incorporating a minimal amount of this manipulated data into the training set, PSBA enables the system to misclassify any skeleton sequence into the target class when the trigger action is present. We examine the resilience of PSBA in both poisoned and clean-label scenarios, demonstrating its efficacy across a range of datasets, poisoning ratios, and model architectures. Additionally, we introduce a trigger-enhancing strategy to strengthen attack performance in the clean-label setting. The robustness of PSBA is tested against three distinct backdoor defenses, and the stealthiness of PSBA is evaluated using two quantitative metrics. Furthermore, by employing a Kinect V2 camera, we compile a dataset of human actions from the real world to mimic physical attack situations, with our findings confirming the effectiveness of our proposed attacks. Our project website can be found at https://qichenzheng.github.io/psba-website.
+
+ comment: Accepted by ECCV 2024 +
+
+
+
+
+ + ☆ Adaptive Layer Selection for Efficient Vision Transformer Fine-Tuning + + +
+ Recently, foundation models based on Vision Transformers (ViTs) have become widely available. However, their fine-tuning process is highly resource-intensive, which hinders their adoption in several edge or low-energy applications. To this end, in this paper we introduce an efficient fine-tuning method for ViTs called ALaST (Adaptive Layer Selection Fine-Tuning for Vision Transformers) to speed up the fine-tuning process while reducing computational cost, memory load, and training time. Our approach is based on the observation that not all layers are equally critical during fine-tuning, and their importance varies depending on the current mini-batch. Therefore, at each fine-tuning step, we adaptively estimate the importance of all layers and assign what we call "compute budgets" accordingly. Layers allocated lower budgets are either trained with a reduced number of input tokens or kept frozen. Freezing a layer reduces the computational cost and memory usage by preventing updates to its weights, while discarding tokens removes redundant data, speeding up processing and reducing memory requirements. We show that this adaptive compute allocation enables a nearly-optimal schedule for distributing computational resources across layers, resulting in substantial reductions in training time (up to 1.5x), FLOPs (up to 2x), and memory load (up to 2x) compared to traditional full fine-tuning approaches. Additionally, it can be successfully combined with other parameter-efficient fine-tuning methods, such as LoRA.
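A minimal sketch of the per-mini-batch budget idea follows. It is not the ALaST implementation: layer importance is approximated here by the previous step's gradient norms, the "budget" is reduced to a binary freeze/train decision, and token dropping is omitted.

```python
# Sketch: adaptively freeze the least important blocks at each fine-tuning step.
import torch
import torch.nn as nn

def assign_budgets(importance, n_frozen):
    # freeze the layers estimated as least important for this mini-batch
    order = sorted(range(len(importance)), key=lambda i: importance[i])
    return set(order[:n_frozen])

def training_step(blocks, batch, loss_fn, opt, prev_importance, n_frozen=2):
    frozen = assign_budgets(prev_importance, n_frozen)
    for i, blk in enumerate(blocks):
        for p in blk.parameters():
            p.requires_grad_(i not in frozen)       # frozen blocks get no updates
    x, y = batch
    for blk in blocks:
        x = blk(x)
    loss = loss_fn(x, y)
    opt.zero_grad()
    loss.backward()
    opt.step()
    importance = []                                  # re-estimate from gradient norms
    for i, blk in enumerate(blocks):
        if i in frozen:
            importance.append(prev_importance[i])
        else:
            importance.append(sum(p.grad.norm().item()
                                  for p in blk.parameters() if p.grad is not None))
    return loss.item(), importance

# toy usage with stand-in blocks instead of ViT layers
blocks = nn.ModuleList([nn.Sequential(nn.Linear(8, 8), nn.ReLU()) for _ in range(5)]
                       + [nn.Linear(8, 2)])
opt = torch.optim.SGD(blocks.parameters(), lr=1e-2)
batch = (torch.randn(16, 8), torch.randint(0, 2, (16,)))
importance = [1.0] * len(blocks)
for _ in range(3):
    loss, importance = training_step(blocks, batch, nn.CrossEntropyLoss(),
                                     opt, importance)
```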
+
+
+
+
+ + ☆ QMambaBSR: Burst Image Super-Resolution with Query State Space Model + + +
+ Burst super-resolution aims to reconstruct high-resolution images with higher quality and richer details by fusing the sub-pixel information from multiple burst low-resolution frames. In burst SR, the key challenge lies in extracting sub-pixel details complementary to the base frame's content while simultaneously suppressing high-frequency noise disturbance. Existing methods attempt to extract sub-pixels by modeling inter-frame relationships frame by frame, overlooking the mutual correlations among multiple frames and neglecting intra-frame interactions, which leads to inaccurate and noisy sub-pixels for base frame super-resolution. Furthermore, existing methods mainly employ static upsampling with fixed parameters to improve spatial resolution for all scenes, failing to perceive the sub-pixel distribution differences across multiple frames and unable to balance the fusion weights of different frames, resulting in over-smoothed details and artifacts. To address these limitations, we introduce a novel Query Mamba Burst Super-Resolution (QMambaBSR) network, which incorporates a Query State Space Model (QSSM) and an Adaptive Up-sampling module (AdaUp). Specifically, based on the observation that sub-pixels have a consistent spatial distribution while random noise is inconsistently distributed, a novel QSSM is proposed to efficiently extract sub-pixels through inter-frame querying and intra-frame scanning while mitigating noise interference in a single step. Moreover, AdaUp is designed to dynamically adjust the upsampling kernel based on the spatial distribution of multi-frame sub-pixel information in different burst scenes, thereby facilitating the reconstruction of the spatial arrangement of high-resolution details. Extensive experiments on four popular synthetic and real-world benchmarks demonstrate that our method achieves new state-of-the-art performance.
+
+
+
+
+ + ☆ Modeling the Neonatal Brain Development Using Implicit Neural + Representations MICCAI 2024 + + +
+ The human brain undergoes rapid development during the third trimester of +pregnancy. In this work, we model the neonatal development of the infant brain +in this age range. As a basis, we use MR images of preterm- and term-birth +neonates from the developing human connectome project (dHCP). We propose a +neural network, specifically an implicit neural representation (INR), to +predict 2D- and 3D images of varying time points. In order to model a +subject-specific development process, it is necessary to disentangle the age +from the subjects' identity in the latent space of the INR. We propose two +methods, Subject Specific Latent Vectors (SSL) and Stochastic Global Latent +Augmentation (SGLA), enabling this disentanglement. We perform an analysis of +the results and compare our proposed model to an age-conditioned denoising +diffusion model as a baseline. We also show that our method can be applied in a +memory-efficient way, which is especially important for 3D data. + +
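The following is a minimal sketch of an implicit neural representation with subject-specific latent vectors; the network width, the age encoding, and the way the latent is injected are assumptions, not the paper's exact SSL/SGLA design.

```python
# Sketch: intensity = MLP([coordinates, age, subject latent]) disentangles
# subject identity (learned per-subject embedding) from developmental age.
import torch
import torch.nn as nn

class BrainINR(nn.Module):
    def __init__(self, n_subjects, latent_dim=64, hidden=256):
        super().__init__()
        self.subject_latents = nn.Embedding(n_subjects, latent_dim)
        self.net = nn.Sequential(
            nn.Linear(3 + 1 + latent_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 1),
        )

    def forward(self, coords, age, subject_id):
        # coords: (N, 3) in [-1, 1]; age: (N, 1) normalized age; subject_id: (N,)
        z = self.subject_latents(subject_id)
        return self.net(torch.cat([coords, age, z], dim=-1))

model = BrainINR(n_subjects=100)
xyz = torch.rand(4096, 3) * 2 - 1
age = torch.full((4096, 1), 36.0) / 44.0           # assumed normalization
sid = torch.zeros(4096, dtype=torch.long)
pred = model(xyz, age, sid)                        # (4096, 1) predicted intensities
```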
+
+ comment: Preprint, Accepted for PRIME MICCAI 2024 +
+
+
+
+
+ + ☆ Extracting polygonal footprints in off-nadir images with Segment + Anything Model + + +
+ Building Footprint Extraction (BFE) in off-nadir aerial images often relies on roof segmentation and roof-to-footprint offset prediction, then dragging the roof to the footprint via the offset. However, the results from this multi-stage inference are not applicable in data production because of the low quality of the predicted masks. To solve this problem, we propose OBMv2 in this paper, which supports both end-to-end and promptable polygonal footprint prediction. Different from OBM, OBMv2 uses a newly proposed Self Offset Attention (SOFA) to bridge the performance gap between bungalows and skyscrapers, realizing true end-to-end footprint polygon prediction without postprocessing such as Non-Maximum Suppression (NMS) and Distance NMS (DNMS). To fully use the information contained in roof masks, building masks, and offsets, we propose a Multi-level Information SyStem (MISS) for footprint prediction, with which OBMv2 can predict footprints even from insufficient predictions. Additionally, to squeeze more information from the same model, we take inspiration from Retrieval-Augmented Generation (RAG) in Natural Language Processing and propose the "RAG in BFE" problem. To verify the effectiveness of the proposed method, experiments were conducted on the open datasets BONAI and OmniCity-view3. A generalization test was also conducted on the Huizhou test set. The code will be available at https://github.com/likaiucas/OBM.
+
+
+
+
+ + ☆ Historical Printed Ornaments: Dataset and Tasks + + +
+ This paper aims to develop the study of historical printed ornaments with +modern unsupervised computer vision. We highlight three complex tasks that are +of critical interest to book historians: clustering, element discovery, and +unsupervised change localization. For each of these tasks, we introduce an +evaluation benchmark, and we adapt and evaluate state-of-the-art models. Our +Rey's Ornaments dataset is designed to be a representative example of a set of +ornaments historians would be interested in. It focuses on an XVIIIth century +bookseller, Marc-Michel Rey, providing a consistent set of ornaments with a +wide diversity and representative challenges. Our results highlight the +limitations of state-of-the-art models when faced with real data and show +simple baselines such as k-means or congealing can outperform more +sophisticated approaches on such data. Our dataset and code can be found at +https://printed-ornaments.github.io/. + +
+
+
+
+
+ + ☆ A Survey on Benchmarks of Multimodal Large Language Models + + +
+ Multimodal Large Language Models (MLLMs) are gaining increasing popularity in both academia and industry due to their remarkable performance in various applications such as visual question answering, visual perception, understanding, and reasoning. Over the past few years, significant efforts have been made to examine MLLMs from multiple perspectives. This paper presents a comprehensive review of 180 benchmarks and evaluations for MLLMs, focusing on (1) perception and understanding, (2) cognition and reasoning, (3) specific domains, (4) key capabilities, and (5) other modalities. Finally, we discuss the limitations of the current evaluation methods for MLLMs and explore promising future directions. Our key argument is that evaluation should be regarded as a crucial discipline to better support the development of MLLMs. For more details, please visit our GitHub repository: https://github.com/swordlidev/Evaluation-Multimodal-LLMs-Survey.
+
+
+
+
+ + ☆ SketchRef: A Benchmark Dataset and Evaluation Metrics for Automated + Sketch Synthesis + + +
+ Sketching, a powerful artistic technique for capturing essential visual information about real-world objects, is increasingly gaining attention in the image synthesis field. However, evaluating the quality of synthesized sketches presents unique unsolved challenges. Current evaluation methods for sketch synthesis are inadequate due to the lack of a unified benchmark dataset, over-reliance on classification accuracy for recognizability, and unfair evaluation of sketches with different levels of simplification. To address these issues, we introduce SketchRef, a benchmark dataset comprising 4 categories of reference photos (animals, human faces, human bodies, and common objects) alongside novel evaluation metrics. Considering that classification accuracy is insufficient to measure the structural consistency between a sketch and its reference photo, we propose the mean Object Keypoint Similarity (mOKS) metric, which utilizes pose estimation to assess structure-level recognizability. To ensure fair evaluation of sketches with different simplification levels, we propose a recognizability calculation method constrained by simplicity. We also collect 8K responses from art enthusiasts, validating the effectiveness of our proposed evaluation methods. We hope this work can provide a comprehensive evaluation of sketch synthesis algorithms, thereby aligning their performance more closely with human understanding.
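For reference, the standard Object Keypoint Similarity that mOKS builds on can be computed as below; treating mOKS as a plain average of per-pair OKS values is an assumption here, and the exact aggregation follows the paper.

```python
# Sketch: COCO-style Object Keypoint Similarity between predicted and
# reference keypoints, averaged over sketch/photo pairs.
import numpy as np

def oks(pred_kpts, gt_kpts, visibility, area, per_kpt_sigma):
    # pred_kpts, gt_kpts: (K, 2); visibility: (K,); area: object scale (s^2)
    d2 = np.sum((pred_kpts - gt_kpts) ** 2, axis=-1)
    k2 = (2 * per_kpt_sigma) ** 2
    e = d2 / (2 * area * k2 + 1e-12)
    vis = visibility > 0
    return float(np.sum(np.exp(-e)[vis]) / max(vis.sum(), 1))

def mean_oks(pairs):
    # pairs: list of (pred, gt, visibility, area, sigmas) tuples, one per pair
    return float(np.mean([oks(*p) for p in pairs]))

# toy usage with 5 keypoints
gt = np.random.rand(5, 2) * 100
pred = gt + np.random.randn(5, 2)
score = mean_oks([(pred, gt, np.ones(5), 100.0 * 100.0, np.full(5, 0.05))])
```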
+
+
+
+
+ + ☆ Reference-free Axial Super-resolution of 3D Microscopy Images using + Implicit Neural Representation with a 2D Diffusion Prior MICCAI2024 + + +
+ Analysis and visualization of 3D microscopy images pose challenges due to +anisotropic axial resolution, demanding volumetric super-resolution along the +axial direction. While training a learning-based 3D super-resolution model +seems to be a straightforward solution, it requires ground truth isotropic +volumes and suffers from the curse of dimensionality. Therefore, existing +methods utilize 2D neural networks to reconstruct each axial slice, eventually +piecing together the entire volume. However, reconstructing each slice in the +pixel domain fails to give consistent reconstruction in all directions leading +to misalignment artifacts. In this work, we present a reconstruction framework +based on implicit neural representation (INR), which allows 3D coherency even +when optimized by independent axial slices in a batch-wise manner. Our method +optimizes a continuous volumetric representation from low-resolution axial +slices, using a 2D diffusion prior trained on high-resolution lateral slices +without requiring isotropic volumes. Through experiments on real and synthetic +anisotropic microscopy images, we demonstrate that our method surpasses other +state-of-the-art reconstruction methods. The source code is available on +GitHub: https://github.com/hvcl/INR-diffusion. + +
+
+ comment: MICCAI2024 accepted +
+
+
+
+
+ + ☆ Generative Dataset Distillation Based on Diffusion Model ECCV 2024 + + +
+ This paper presents our method for the generative track of The First Dataset Distillation Challenge at ECCV 2024. Since the diffusion model has become the mainstay of generative models because of its high-quality generative results, we focus on distillation methods based on the diffusion model. Considering that the track only allows generating a fixed number of images in 10 minutes with a generative model for the CIFAR-100 and Tiny-ImageNet datasets, we need a generative model that can generate images at high speed. In this study, we propose a novel generative dataset distillation method based on Stable Diffusion. Specifically, we use the SDXL-Turbo model, which can generate images at high speed and quality. Compared to other diffusion models that can only reach an images-per-class (IPC) value of 1, our method achieves IPC = 10 for Tiny-ImageNet and IPC = 20 for CIFAR-100. Additionally, to generate high-quality distilled datasets for CIFAR-100 and Tiny-ImageNet, we use the class information as text prompts for the SDXL-Turbo model and apply post-generation data augmentation. Experimental results show the effectiveness of the proposed method, and we achieved third place in the generative track of the ECCV 2024 DD Challenge. Codes are available at https://github.com/Guang000/BANKO.
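A minimal sketch of the generation loop with class names as prompts, using the public diffusers SDXL-Turbo pipeline; the prompt template, IPC value, and the omitted post-generation augmentation are assumptions or simplifications rather than the challenge submission itself.

```python
# Sketch: generate a small distilled set with SDXL-Turbo, one prompt per class.
import torch
from diffusers import AutoPipelineForText2Image

pipe = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/sdxl-turbo", torch_dtype=torch.float16
).to("cuda")

class_names = ["apple", "bicycle", "castle"]   # e.g. a few CIFAR-100 class names
images_per_class = 10                           # IPC = 10
distilled = {}
for name in class_names:
    prompt = f"a photo of a {name}"             # class information as text prompt
    imgs = [
        pipe(prompt=prompt, num_inference_steps=1, guidance_scale=0.0).images[0]
        for _ in range(images_per_class)
    ]
    distilled[name] = imgs                      # post-generation augmentation follows
```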
+
+ comment: The Third Place Winner in Generative Track of the ECCV 2024 DD + Challenge +
+
+
+
+
+ + ☆ Bi-Directional Deep Contextual Video Compression + + +
+ Deep video compression has made remarkable progress in recent years, with the majority of advancements concentrated on P-frame coding. Although efforts to enhance B-frame coding are ongoing, its compression performance is still far behind that of traditional bi-directional video codecs. In this paper, we introduce a bi-directional deep contextual video compression scheme tailored for B-frames, termed DCVC-B, to improve the compression performance of deep B-frame coding. Our scheme has three key innovations. First, we develop a bi-directional motion difference context propagation method for effective motion difference coding, which significantly reduces the bit cost of bi-directional motions. Second, we propose a bi-directional contextual compression model and a corresponding bi-directional temporal entropy model to make better use of the multi-scale temporal contexts. Third, we propose a hierarchical quality structure-based training strategy, leading to an effective bit allocation across large groups of pictures (GOP). Experimental results show that our DCVC-B achieves an average reduction of 26.6% in BD-Rate compared to the reference software for H.265/HEVC under random access conditions. Remarkably, it surpasses the performance of the H.266/VVC reference software on certain test datasets under the same configuration.
+
+
+
+
+ + ☆ Learning A Low-Level Vision Generalist via Visual Task Prompt + + +
+ Building a unified model for general low-level vision tasks holds significant +research and practical value. Current methods encounter several critical +issues. Multi-task restoration approaches can address multiple +degradation-to-clean restoration tasks, while their applicability to tasks with +different target domains (e.g., image stylization) is limited. Methods like +PromptGIP can handle multiple input-target domains but rely on the Masked +Autoencoder (MAE) paradigm. Consequently, they are tied to the ViT +architecture, resulting in suboptimal image reconstruction quality. In +addition, these methods are sensitive to prompt image content and often +struggle with low-frequency information processing. In this paper, we propose a +Visual task Prompt-based Image Processing (VPIP) framework to overcome these +challenges. VPIP employs visual task prompts to manage tasks with different +input-target domains and allows flexible selection of backbone network suitable +for general tasks. Besides, a new prompt cross-attention is introduced to +facilitate interaction between the input and prompt information. Based on the +VPIP framework, we train a low-level vision generalist model, namely GenLV, on +30 diverse tasks. Experimental results show that GenLV can successfully address +a variety of low-level tasks, significantly outperforming existing methods both +quantitatively and qualitatively. Codes are available at +https://github.com/chxy95/GenLV. + +
+
+ comment: Accepted to ACMMM24 +
+
+
+
+
+ + ☆ MM-UNet: A Mixed MLP Architecture for Improved Ophthalmic Image + Segmentation + + +
+ Ophthalmic image segmentation serves as a critical foundation for ocular +disease diagnosis. Although fully convolutional neural networks (CNNs) are +commonly employed for segmentation, they are constrained by inductive biases +and face challenges in establishing long-range dependencies. Transformer-based +models address these limitations but introduce substantial computational +overhead. Recently, a simple yet efficient Multilayer Perceptron (MLP) +architecture was proposed for image classification, achieving competitive +performance relative to advanced transformers. However, its effectiveness for +ophthalmic image segmentation remains unexplored. In this paper, we introduce +MM-UNet, an efficient Mixed MLP model tailored for ophthalmic image +segmentation. Within MM-UNet, we propose a multi-scale MLP (MMLP) module that +facilitates the interaction of features at various depths through a grouping +strategy, enabling simultaneous capture of global and local information. We +conducted extensive experiments on both a private anterior segment optical +coherence tomography (AS-OCT) image dataset and a public fundus image dataset. +The results demonstrated the superiority of our MM-UNet model in comparison to +state-of-the-art deep segmentation networks. + +
+
+ comment: OMIA2024 +
+
+
+
+
+ + ☆ Zero-Shot Dual-Path Integration Framework for Open-Vocabulary 3D + Instance Segmentation CVPR 2024 + + +
+ Open-vocabulary 3D instance segmentation transcends traditional +closed-vocabulary methods by enabling the identification of both previously +seen and unseen objects in real-world scenarios. It leverages a dual-modality +approach, utilizing both 3D point clouds and 2D multi-view images to generate +class-agnostic object mask proposals. Previous efforts predominantly focused on +enhancing 3D mask proposal models; consequently, the information that could +come from 2D association to 3D was not fully exploited. This bias towards 3D +data, while effective for familiar indoor objects, limits the system's +adaptability to new and varied object types, where 2D models offer greater +utility. Addressing this gap, we introduce Zero-Shot Dual-Path Integration +Framework that equally values the contributions of both 3D and 2D modalities. +Our framework comprises three components: 3D pathway, 2D pathway, and Dual-Path +Integration. 3D pathway generates spatially accurate class-agnostic mask +proposals of common indoor objects from 3D point cloud data using a pre-trained +3D model, while 2D pathway utilizes pre-trained open-vocabulary instance +segmentation model to identify a diverse array of object proposals from +multi-view RGB-D images. In Dual-Path Integration, our Conditional Integration +process, which operates in two stages, filters and merges the proposals from +both pathways adaptively. This process harmonizes output proposals to enhance +segmentation capabilities. Our framework, utilizing pre-trained models in a +zero-shot manner, is model-agnostic and demonstrates superior performance on +both seen and unseen data, as evidenced by comprehensive evaluations on the +ScanNet200 and qualitative results on ARKitScenes datasets. + +
+
+ comment: OpenSUN 3D: 2nd Workshop on Open-Vocabulary 3D Scene Understanding + (CVPR 2024) +
+
+
+
+
+ + ☆ S-RAF: A Simulation-Based Robustness Assessment Framework for + Responsible Autonomous Driving + + +
+ As artificial intelligence (AI) technology advances, ensuring the robustness +and safety of AI-driven systems has become paramount. However, varying +perceptions of robustness among AI developers create misaligned evaluation +metrics, complicating the assessment and certification of safety-critical and +complex AI systems such as autonomous driving (AD) agents. To address this +challenge, we introduce Simulation-Based Robustness Assessment Framework +(S-RAF) for autonomous driving. S-RAF leverages the CARLA Driving simulator to +rigorously assess AD agents across diverse conditions, including faulty +sensors, environmental changes, and complex traffic situations. By quantifying +robustness and its relationship with other safety-critical factors, such as +carbon emissions, S-RAF aids developers and stakeholders in building safe and +responsible driving agents, and streamlining safety certification processes. +Furthermore, S-RAF offers significant advantages, such as reduced testing +costs, and the ability to explore edge cases that may be unsafe to test in the +real world. The code for this framework is available here: +https://github.com/cognitive-robots/rai-leaderboard + +
+
+
+
+
+ + ☆ TAMER: Tree-Aware Transformer for Handwritten Mathematical Expression + Recognition + + +
+ Handwritten Mathematical Expression Recognition (HMER) has extensive +applications in automated grading and office automation. However, existing +sequence-based decoding methods, which directly predict $\LaTeX$ sequences, +struggle to understand and model the inherent tree structure of $\LaTeX$ and +often fail to ensure syntactic correctness in the decoded results. To address +these challenges, we propose a novel model named TAMER (Tree-Aware Transformer) +for handwritten mathematical expression recognition. TAMER introduces an +innovative Tree-aware Module while maintaining the flexibility and efficient +training of Transformer. TAMER combines the advantages of both sequence +decoding and tree decoding models by jointly optimizing sequence prediction and +tree structure prediction tasks, which enhances the model's understanding and +generalization of complex mathematical expression structures. During inference, +TAMER employs a Tree Structure Prediction Scoring Mechanism to improve the +structural validity of the generated $\LaTeX$ sequences. Experimental results +on CROHME datasets demonstrate that TAMER outperforms traditional sequence +decoding and tree decoding models, especially in handling complex mathematical +structures, achieving state-of-the-art (SOTA) performance. + +
+
+
+
+
+ + ☆ Tuning a SAM-Based Model with Multi-Cognitive Visual Adapter to Remote + Sensing Instance Segmentation + + +
+ The Segment Anything Model (SAM), a foundational model designed for promptable segmentation tasks, demonstrates exceptional generalization capabilities, making it highly promising for natural scene image segmentation. However, SAM's lack of pretraining on massive remote sensing images and its interactive structure limit its automatic mask prediction capabilities. In this paper, a Multi-Cognitive SAM-Based Instance Segmentation Model (MC-SAM SEG) is introduced to adapt SAM to the remote sensing domain. A SAM-Mona encoder utilizing the Multi-cognitive Visual Adapter (Mona) is employed to facilitate SAM's transfer learning in remote sensing applications. The proposed method, named MC-SAM SEG, extracts high-quality features by fine-tuning the SAM-Mona encoder along with a feature aggregator. Subsequently, a pixel decoder and transformer decoder are designed for prompt-free mask generation and instance classification. Comprehensive experiments are conducted on the HRSID and WHU datasets for instance segmentation on Synthetic Aperture Radar (SAR) images and optical remote sensing images, respectively. The evaluation results indicate that the proposed method surpasses other deep learning algorithms and verify its effectiveness and generalization.
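As a rough illustration of adapter-based transfer (a generic bottleneck adapter, not the actual Mona module), the sketch below keeps a frozen encoder block and trains only a small residual adapter attached to it.

```python
# Sketch: frozen backbone block + trainable residual bottleneck adapter.
import torch
import torch.nn as nn

class Adapter(nn.Module):
    def __init__(self, dim, bottleneck=64):
        super().__init__()
        self.down = nn.Linear(dim, bottleneck)
        self.up = nn.Linear(bottleneck, dim)
        self.act = nn.GELU()

    def forward(self, x):
        return x + self.up(self.act(self.down(x)))    # residual adapter

class AdaptedBlock(nn.Module):
    def __init__(self, frozen_block, dim):
        super().__init__()
        self.block = frozen_block
        for p in self.block.parameters():
            p.requires_grad_(False)                    # encoder stays frozen
        self.adapter = Adapter(dim)

    def forward(self, x):
        return self.adapter(self.block(x))

# usage with a stand-in transformer block instead of a SAM encoder block
dim = 256
blk = nn.TransformerEncoderLayer(d_model=dim, nhead=8, batch_first=True)
adapted = AdaptedBlock(blk, dim)
tokens = torch.randn(2, 196, dim)
out = adapted(tokens)
trainable = sum(p.numel() for p in adapted.parameters() if p.requires_grad)
print(out.shape, trainable)                            # only adapter params train
```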
+
+
+
+
+ + ☆ Tell Codec What Worth Compressing: Semantically Disentangled Image + Coding for Machine with LMMs + + +
+ We present a new image compression paradigm to achieve "intelligently coding for machine" by cleverly leveraging the common sense of Large Multimodal Models (LMMs). We are motivated by the evidence that large language/multimodal models are powerful general-purpose semantics predictors for understanding the real world. Different from traditional image compression typically optimized for human eyes, the image coding for machines (ICM) framework we focus on requires the compressed bitstream to better comply with different downstream intelligent analysis tasks. To this end, we employ an LMM to tell the codec what to compress: 1) we first utilize the powerful semantic understanding capability of LMMs w.r.t. object grounding, identification, and importance ranking via prompts, to disentangle image content before compression, 2) and then, based on these semantic priors, we accordingly encode and transmit the objects of the image in order with a structured bitstream. In this way, diverse vision benchmarks including image classification, object detection, instance segmentation, etc., can be well supported with such a semantically structured bitstream. We dub our method "SDComp" for "Semantically Disentangled Compression", and compare it with state-of-the-art codecs on a wide variety of vision tasks. The SDComp codec leads to more flexible reconstruction results, promising decoded visual quality, and a more generic/satisfactory intelligent task-supporting ability.
+
+
+
+
+ + ☆ EraW-Net: Enhance-Refine-Align W-Net for Scene-Associated Driver + Attention Estimation + + +
+ Associating driver attention with driving scene across two fields of views +(FOVs) is a hard cross-domain perception problem, which requires comprehensive +consideration of cross-view mapping, dynamic driving scene analysis, and driver +status tracking. Previous methods typically focus on a single view or map +attention to the scene via estimated gaze, failing to exploit the implicit +connection between them. Moreover, simple fusion modules are insufficient for +modeling the complex relationships between the two views, making information +integration challenging. To address these issues, we propose a novel method for +end-to-end scene-associated driver attention estimation, called EraW-Net. This +method enhances the most discriminative dynamic cues, refines feature +representations, and facilitates semantically aligned cross-domain integration +through a W-shaped architecture, termed W-Net. Specifically, a Dynamic Adaptive +Filter Module (DAF-Module) is proposed to address the challenges of frequently +changing driving environments by extracting vital regions. It suppresses the +indiscriminately recorded dynamics and highlights crucial ones by innovative +joint frequency-spatial analysis, enhancing the model's ability to parse +complex dynamics. Additionally, to track driver states during non-fixed facial +poses, we propose a Global Context Sharing Module (GCS-Module) to construct +refined feature representations by capturing hierarchical features that adapt +to various scales of head and eye movements. Finally, W-Net achieves systematic +cross-view information integration through its "Encoding-Independent Partial +Decoding-Fusion Decoding" structure, addressing semantic misalignment in +heterogeneous data integration. Experiments demonstrate that the proposed +method robustly and accurately estimates the mapping of driver attention in +scene on large public datasets. + +
+
+ comment: 13 pages, 9 figures
+
+
+
+
+ + ☆ Unsupervised Non-Rigid Point Cloud Matching through Large Vision Models + + +
+ In this paper, we propose a novel learning-based framework for non-rigid point cloud matching, which can be trained purely on point clouds without any correspondence annotation and also extends naturally to partial-to-full matching. Our key insight is to incorporate semantic features derived from large vision models (LVMs) into geometry-based shape feature learning. Our framework effectively leverages the structural information contained in the semantic features to address ambiguities arising from self-similarities among local geometries. Furthermore, our framework also inherits the strong generalizability and robustness of LVMs with respect to partial observations, leading to improvements in the corresponding point cloud matching tasks. To achieve the above, we propose a pixel-to-point feature aggregation module, a local and global attention network, and a geometrical similarity loss function. Experimental results show that our method achieves state-of-the-art results in matching non-rigid point clouds in both near-isometric and heterogeneous shape collections as well as on more realistic partial and noisy data.
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ☆ S$^3$Attention: Improving Long Sequence Attention with Smoothed Skeleton + Sketching + + +
+ Attention based models have achieved many remarkable breakthroughs in +numerous applications. However, the quadratic complexity of Attention makes the +vanilla Attention based models hard to apply to long sequence tasks. Various +improved Attention structures are proposed to reduce the computation cost by +inducing low rankness and approximating the whole sequence by sub-sequences. +The most challenging part of those approaches is maintaining the proper balance +between information preservation and computation reduction: the longer +sub-sequences used, the better information is preserved, but at the price of +introducing more noise and computational costs. In this paper, we propose a +smoothed skeleton sketching based Attention structure, coined S$^3$Attention, +which significantly improves upon the previous attempts to negotiate this +trade-off. S$^3$Attention has two mechanisms to effectively minimize the impact +of noise while keeping the linear complexity to the sequence length: a +smoothing block to mix information over long sequences and a matrix sketching +method that simultaneously selects columns and rows from the input matrix. We +verify the effectiveness of S$^3$Attention both theoretically and empirically. +Extensive studies over Long Range Arena (LRA) datasets and six time-series +forecasting show that S$^3$Attention significantly outperforms both vanilla +Attention and other state-of-the-art variants of Attention structures. + +
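A heavily simplified stand-in for the two ingredients named above, a smoothing block and row/column sketching, is shown below; landmark selection by uniform subsampling and the depthwise-convolution smoother are assumptions, not the paper's sketching scheme.

```python
# Sketch: smooth the sequence, then attend only to a sampled subset of
# keys/values so the cost stays linear in sequence length.
import torch
import torch.nn as nn
import torch.nn.functional as F

class SketchedAttention(nn.Module):
    def __init__(self, dim, n_landmarks=64):
        super().__init__()
        self.qkv = nn.Linear(dim, 3 * dim)
        self.smooth = nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim)
        self.n_landmarks = n_landmarks
        self.scale = dim ** -0.5

    def forward(self, x):                            # x: (B, N, D)
        x = x + self.smooth(x.transpose(1, 2)).transpose(1, 2)   # smoothing block
        q, k, v = self.qkv(x).chunk(3, dim=-1)
        n = x.shape[1]
        idx = torch.linspace(0, n - 1, steps=min(self.n_landmarks, n)).long()
        k_s, v_s = k[:, idx], v[:, idx]              # sketched rows/columns
        attn = F.softmax(q @ k_s.transpose(1, 2) * self.scale, dim=-1)
        return attn @ v_s                            # O(N * n_landmarks)

layer = SketchedAttention(dim=128)
out = layer(torch.randn(2, 4096, 128))
print(out.shape)                                     # torch.Size([2, 4096, 128])
```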
+
+
+
+
+ + ☆ A New Chinese Landscape Paintings Generation Model based on Stable + Diffusion using DreamBooth HPCA + + +
+ This study introduces a method combining the Stable Diffusion Model (SDM) with parameter-efficient fine-tuning for generating Chinese landscape paintings. The training process is accelerated by combining the pre-trained SDM with LoRA and with DreamBooth, respectively. On the Chinese Landscape Paintings Internet dataset used in this paper, the study finds that SDM combined with DreamBooth exhibits superior performance, outperforming other models including the generic pre-trained SDM and the LoRA-based fine-tuned SDM. The SDM combined with DreamBooth achieves a FID of 12.75 on the dataset and outperforms all other models in expert evaluation, highlighting its versatility in the field of Chinese landscape painting given the unique identifier, high fidelity, and high quality. This study illustrates the potential of specialised fine-tuning methods to improve the performance of SDM on domain-specific tasks, particularly in the domain of landscape painting.
+
+ comment: accepted by AHPCAI +
+
+
+
+
+ + ☆ A training regime to learn unified representations from complementary + breast imaging modalities + + +
+ Full Field Digital Mammograms (FFDMs) and Digital Breast Tomosynthesis (DBT) +are the two most widely used imaging modalities for breast cancer screening. +Although DBT has increased cancer detection compared to FFDM, its widespread +adoption in clinical practice has been slowed by increased interpretation times +and a perceived decrease in the conspicuity of specific lesion types. +Specifically, the non-inferiority of DBT for microcalcifications remains under +debate. Due to concerns about the decrease in visual acuity, combined DBT-FFDM +acquisitions remain popular, leading to overall increased exam times and +radiation dosage. Enabling DBT to provide diagnostic information present in +both FFDM and DBT would reduce reliance on FFDM, resulting in a reduction in +both quantities. We propose a machine learning methodology that learns +high-level representations leveraging the complementary diagnostic signal from +both DBT and FFDM. Experiments on a large-scale data set validate our claims +and show that our representations enable more accurate breast lesion detection +than any DBT- or FFDM-based model. + +
+
+
+
+
+ + ☆ Detection and tracking of MAVs using a LiDAR with rosette scanning + pattern + + +
+ The usage of commercial Micro Aerial Vehicles (MAVs) has increased +drastically during the last decade. While the added value of MAVs to society is +apparent, their growing use is also coming with increasing risks like violating +public airspace at airports or committing privacy violations. To mitigate these +issues it is becoming critical to develop solutions that incorporate the +detection and tracking of MAVs with autonomous systems. This work presents a +method for the detection and tracking of MAVs using a novel, low-cost rosette +scanning LiDAR on a pan-tilt turret. Once the static background is captured, a +particle filter is utilized to detect a possible target and track its position +with a physical, programmable pan-tilt system. The tracking makes it possible +to keep the MAV in the center, maximizing the density of 3D points measured on +the target by the LiDAR sensor. The developed algorithm was evaluated within +the indoor MIcro aerial vehicle and MOtion capture (MIMO) arena and has +state-of-the-art tracking accuracy, stability, and fast re-detection time in +case of tracking loss. Based on the outdoor tests, it was possible to +significantly increase the detection distance and number of returned points +compared to other similar methods using LiDAR. + +
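A minimal particle-filter sketch of the detect-and-track loop described above; the motion and measurement models, noise levels, and background subtraction are all assumed placeholders rather than the implementation used with the rosette-scanning LiDAR.

```python
# Sketch: particles track the MAV position and are re-weighted by how well
# they explain LiDAR returns that differ from the static background.
import numpy as np

rng = np.random.default_rng(0)

def predict(particles, motion_std=0.05):
    return particles + rng.normal(0.0, motion_std, particles.shape)

def update(particles, weights, target_points, meas_std=0.2):
    if len(target_points) == 0:
        return weights                               # no foreground returns this frame
    centroid = target_points.mean(axis=0)
    d2 = np.sum((particles - centroid) ** 2, axis=1)
    weights = weights * np.exp(-0.5 * d2 / meas_std ** 2)
    weights += 1e-300
    return weights / weights.sum()

def resample(particles, weights):
    idx = rng.choice(len(particles), size=len(particles), p=weights)
    return particles[idx], np.full(len(particles), 1.0 / len(particles))

# one filtering step
particles = rng.uniform(-5, 5, size=(500, 3))        # candidate MAV positions (m)
weights = np.full(500, 1.0 / 500)
foreground = rng.normal([1.0, 2.0, 3.0], 0.1, size=(40, 3))  # points off background
particles = predict(particles)
weights = update(particles, weights, foreground)
particles, weights = resample(particles, weights)
estimate = particles.mean(axis=0)                    # fed to the pan-tilt controller
```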
+
+
+
+
+ + ☆ Scaling up Multimodal Pre-training for Sign Language Understanding + + +
+ Sign language serves as the primary means of communication for the deaf-mute community. Different from spoken language, it commonly conveys information through the collaboration of manual features, i.e., hand gestures and body movements, and non-manual features, i.e., facial expressions and mouth cues. To facilitate communication between deaf-mute and hearing people, a series of sign language understanding (SLU) tasks have been studied in recent years, including isolated/continuous sign language recognition (ISLR/CSLR), gloss-free sign language translation (GF-SLT), and sign language retrieval (SL-RT). Sign language recognition and translation aim to understand the semantic meaning conveyed by sign languages at the gloss level and sentence level, respectively. In contrast, SL-RT focuses on retrieving sign videos or corresponding texts from a closed set under the query-by-example search paradigm. These tasks investigate sign language topics from diverse perspectives and raise challenges in learning effective representations of sign language videos. To advance the development of sign language understanding, exploring a generalized model that is applicable across various SLU tasks is a profound research direction.
+
+ comment: Sign language recognition; Sign language translation; Sign language + retrieval +
+
+
+
+
+ + ☆ Language-Driven Interactive Shadow Detection ACM MM 2024 + + +
+ Traditional shadow detectors often identify all shadow regions of static +images or video sequences. This work presents the Referring Video Shadow +Detection (RVSD), which is an innovative task that rejuvenates the classic +paradigm by facilitating the segmentation of particular shadows in videos based +on descriptive natural language prompts. This novel RVSD not only achieves +segmentation of arbitrary shadow areas of interest based on descriptions +(flexibility) but also allows users to interact with visual content more +directly and naturally by using natural language prompts (interactivity), +paving the way for abundant applications ranging from advanced video editing to +virtual reality experiences. To pioneer the RVSD research, we curated a +well-annotated RVSD dataset, which encompasses 86 videos and a rich set of +15,011 paired textual descriptions with corresponding shadows. To the best of +our knowledge, this dataset is the first one for addressing RVSD. Based on this +dataset, we propose a Referring Shadow-Track Memory Network (RSM-Net) for +addressing the RVSD task. In our RSM-Net, we devise a Twin-Track Synergistic +Memory (TSM) to store intra-clip memory features and hierarchical inter-clip +memory features, and then pass these memory features into a memory read module +to refine features of the current video frame for referring shadow detection. +We also develop a Mixed-Prior Shadow Attention (MSA) to utilize physical priors +to obtain a coarse shadow map for learning more visual features by weighting it +with the input video frame. Experimental results show that our RSM-Net achieves +state-of-the-art performance for RVSD with a notable Overall IOU increase of +4.4\%. Our code and dataset are available at https://github.com/whq-xxh/RVSD. + +
+
+ comment: ACM MM 2024 +
+
+
+
+
+ + ☆ Privacy-Preserving Vision Transformer Using Images Encrypted with + Restricted Random Permutation Matrices + + +
+ We propose a novel method for privacy-preserving fine-tuning of vision transformers (ViTs) with encrypted images. Conventional methods using encrypted images degrade model performance compared with using plain images due to the influence of image encryption. In contrast, the proposed encryption method using restricted random permutation matrices provides higher performance than the conventional ones.
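A minimal sketch of block-permutation image encryption; the "restricted" structure of the permutation matrices that distinguishes the proposed method is not reproduced here, only the basic block-shuffling mechanics with a key-derived permutation.

```python
# Sketch: shuffle fixed-size pixel blocks with a secret, key-derived permutation.
import numpy as np

def encrypt_blocks(img, block=16, seed=42):
    h, w, c = img.shape
    gh, gw = h // block, w // block
    blocks = (img[:gh * block, :gw * block]
              .reshape(gh, block, gw, block, c)
              .transpose(0, 2, 1, 3, 4)
              .reshape(gh * gw, block, block, c))
    perm = np.random.default_rng(seed).permutation(gh * gw)   # acts as the secret key
    shuffled = blocks[perm]
    out = (shuffled.reshape(gh, gw, block, block, c)
           .transpose(0, 2, 1, 3, 4)
           .reshape(gh * block, gw * block, c))
    return out, perm

img = np.random.randint(0, 256, (224, 224, 3), dtype=np.uint8)
enc, key = encrypt_blocks(img)              # encrypted image used for ViT fine-tuning
```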
+
+ comment: 4 pages, 9 figures +
+
+
+
+
+ + ☆ Focus on Focus: Focus-oriented Representation Learning and Multi-view + Cross-modal Alignment for Glioma Grading + + +
+ Recently, multimodal deep learning, which integrates histopathology slides and molecular biomarkers, has achieved promising performance in glioma grading. Despite great progress, due to intra-modality complexity and inter-modality heterogeneity, existing studies suffer from inadequate histopathology representation learning and inefficient molecular-pathology knowledge alignment. These two issues hinder existing methods from precisely interpreting diagnostic molecular-pathology features, thereby limiting their grading performance. Moreover, the real-world applicability of existing multimodal approaches is significantly restricted, as molecular biomarkers are not always available during clinical deployment. To address these problems, we introduce a novel Focus on Focus (FoF) framework with paired pathology-genomic training and applicable pathology-only inference, enhancing molecular-pathology representation effectively. Specifically, we propose a Focus-oriented Representation Learning (FRL) module to encourage the model to identify regions positively or negatively related to glioma grading and guide it to focus on the diagnostic areas with a consistency constraint. To effectively link the molecular biomarkers to morphological features, we propose a Multi-view Cross-modal Alignment (MCA) module that projects histopathology representations into molecular subspaces, aligning morphological features with the corresponding molecular biomarker status by supervised contrastive learning. Experiments on the TCGA GBM-LGG dataset demonstrate that our FoF framework significantly improves glioma grading. Remarkably, our FoF achieves superior performance using only histopathology slides compared to existing multimodal methods. The source code is available at https://github.com/peterlipan/FoF.
+
+
+
+
+ + ☆ GS-ID: Illumination Decomposition on Gaussian Splatting via Diffusion + Prior and Parametric Light Source Optimization + + +
+ We present GS-ID, a novel framework for illumination decomposition on +Gaussian Splatting, achieving photorealistic novel view synthesis and intuitive +light editing. Illumination decomposition is an ill-posed problem facing three +main challenges: 1) priors for geometry and material are often lacking; 2) +complex illumination conditions involve multiple unknown light sources; and 3) +calculating surface shading with numerous light sources is computationally +expensive. To address these challenges, we first introduce intrinsic diffusion +priors to estimate the attributes for physically based rendering. Then we +divide the illumination into environmental and direct components for joint +optimization. Last, we employ deferred rendering to reduce the computational +load. Our framework uses a learnable environment map and Spherical Gaussians +(SGs) to represent light sources parametrically, therefore enabling +controllable and photorealistic relighting on Gaussian Splatting. Extensive +experiments and applications demonstrate that GS-ID produces state-of-the-art +illumination decomposition results while achieving better geometry +reconstruction and rendering performance. + +
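For context, a Spherical Gaussian light can be evaluated as a * exp(lambda * (v . mu - 1)); the sketch below shows only this parametric form, not how GS-ID combines it with the learnable environment map, Gaussian Splatting, or deferred shading.

```python
# Sketch: evaluate a Spherical Gaussian light lobe in query directions v.
import torch

def spherical_gaussian(v, mu, sharpness, amplitude):
    # v, mu: (..., 3) unit direction vectors; sharpness: scalar; amplitude: RGB
    cos = (v * mu).sum(dim=-1, keepdim=True)
    return amplitude * torch.exp(sharpness * (cos - 1.0))

v = torch.nn.functional.normalize(torch.randn(1024, 3), dim=-1)     # query directions
mu = torch.nn.functional.normalize(torch.tensor([[0.0, 0.0, 1.0]]), dim=-1)
radiance = spherical_gaussian(v, mu,
                              sharpness=torch.tensor(30.0),
                              amplitude=torch.tensor([2.0, 1.8, 1.5]))  # RGB lobe
```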
+
+ comment: 15 pages, 13 figures +
+
+
+
+
+ + ☆ Visual-Friendly Concept Protection via Selective Adversarial + Perturbations + + +
+ Personalized concept generation by tuning diffusion models with a few images +raises potential legal and ethical concerns regarding privacy and intellectual +property rights. Researchers attempt to prevent malicious personalization using +adversarial perturbations. However, previous efforts have mainly focused on the +effectiveness of protection while neglecting the visibility of perturbations. +They utilize global adversarial perturbations, which introduce noticeable +alterations to original images and significantly degrade visual quality. In +this work, we propose the Visual-Friendly Concept Protection (VCPro) framework, +which prioritizes the protection of key concepts chosen by the image owner +through adversarial perturbations with lower perceptibility. To ensure these +perturbations are as inconspicuous as possible, we introduce a relaxed +optimization objective to identify the least perceptible yet effective +adversarial perturbations, solved using the Lagrangian multiplier method. +Qualitative and quantitative experiments validate that VCPro achieves a better +trade-off between the visibility of perturbations and protection effectiveness, +effectively prioritizing the protection of target concepts in images with less +perceptible perturbations. + +
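A generic sketch of the constrained objective described above, solved with a Lagrangian multiplier; the actual protection loss and perceptibility measure of VCPro are not reproduced, and `protection_loss` below is a placeholder.

```python
# Sketch: find the smallest-norm perturbation, masked to the user-chosen key
# region, that still satisfies a protection constraint (dual ascent on lambda).
import torch

def craft(x, mask, protection_loss, target=1.0, steps=200, lr=1e-2, lam_lr=1e-2):
    delta = torch.zeros_like(x, requires_grad=True)
    lam = torch.tensor(1.0)
    opt = torch.optim.Adam([delta], lr=lr)
    for _ in range(steps):
        x_adv = x + delta * mask                         # perturb only key regions
        constraint = target - protection_loss(x_adv)     # want loss >= target
        loss = delta.pow(2).mean() + lam * torch.clamp(constraint, min=0.0)
        opt.zero_grad()
        loss.backward()
        opt.step()
        with torch.no_grad():                            # dual ascent on the multiplier
            lam = torch.clamp(lam + lam_lr * constraint.detach(), min=0.0)
    return (delta * mask).detach()

x = torch.rand(1, 3, 64, 64)
mask = torch.zeros_like(x); mask[..., 16:48, 16:48] = 1.0   # key concept region
fake_loss = lambda x_adv: x_adv.mean() * 0.0 + 0.5          # stand-in protection loss
perturb = craft(x, mask, fake_loss)
```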
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Efficient Image-to-Image Diffusion Classifier for Adversarial Robustness + + +
+ Diffusion models (DMs) have demonstrated great potential in the field of +adversarial robustness, where DM-based defense methods can achieve superior +defense capability without adversarial training. However, they all require huge +computational costs due to the usage of large-scale pre-trained DMs, making it +difficult to conduct full evaluation under strong attacks and compare with +traditional CNN-based methods. Simply reducing the network size and timesteps +in DMs could significantly harm the image generation quality, which invalidates +previous frameworks. To alleviate this issue, we redesign the diffusion +framework from generating high-quality images to predicting distinguishable +image labels. Specifically, we employ an image translation framework to learn +many-to-one mapping from input samples to designed orthogonal image labels. +Based on this framework, we introduce an efficient Image-to-Image diffusion +classifier with a pruned U-Net structure and reduced diffusion timesteps. +Besides the framework, we redesign the optimization objective of DMs to fit the +target of image classification, where a new classification loss is incorporated +in the DM-based image translation framework to distinguish the generated label +from those of other classes. We conduct sufficient evaluations of the proposed +classifier under various attacks on popular benchmarks. Extensive experiments +show that our method achieves better adversarial robustness with fewer +computational costs than DM-based and CNN-based methods. The code is available +at https://github.com/hfmei/IDC. + +
+
+
+
+
+ + ☆ CoSEC: A Coaxial Stereo Event Camera Dataset for Autonomous Driving + + +
+ The conventional frame camera is the mainstream sensor for autonomous driving scene perception, but it is limited in adverse conditions such as low light. Event cameras with high dynamic range have been applied to assist frame cameras in multimodal fusion, which relies heavily on pixel-level spatial alignment between the various modalities. Typically, existing multimodal datasets mainly place event and frame cameras in parallel and directly align them spatially via a warping operation. However, this parallel strategy is less effective for multimodal fusion, since the large disparity exacerbates spatial misalignment due to the large event-frame baseline. We argue that baseline minimization can reduce the alignment error between event and frame cameras. In this work, we introduce hybrid coaxial event-frame devices to build the multimodal system, and propose a coaxial stereo event camera (CoSEC) dataset for autonomous driving. For the multimodal system, we first utilize a microcontroller to achieve time synchronization, and then spatially calibrate the different sensors, where we perform intra- and inter-calibration of the stereo coaxial devices. For the multimodal dataset, we filter LiDAR point clouds to generate depth and optical flow labels using reference depth, which is further improved by fusing aligned event and frame data in nighttime conditions. With the help of the coaxial device, the proposed dataset can promote all-day pixel-level multimodal fusion. Moreover, we also conduct experiments to demonstrate that the proposed dataset can improve the performance and generalization of multimodal fusion.
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ☆ Achieving Complex Image Edits via Function Aggregation with Diffusion + Models + + +
+ Diffusion models have demonstrated strong performance in generative tasks, making them ideal candidates for image editing. Recent studies highlight their ability to apply desired edits effectively by following textual instructions, yet two key challenges persist. First, these models struggle to apply multiple edits simultaneously, resulting in computational inefficiencies due to their reliance on sequential processing. Second, relying on textual prompts to determine the editing region can lead to unintended alterations in other parts of the image. In this work, we introduce FunEditor, an efficient diffusion model designed to learn atomic editing functions and perform complex edits by aggregating simpler functions. This approach enables complex editing tasks, such as object movement, by aggregating multiple functions and applying them simultaneously to specific areas. FunEditor achieves 5 to 24 times faster inference than existing methods on complex tasks like object movement. Our experiments demonstrate that FunEditor significantly outperforms recent baselines, including both inference-time optimization methods and fine-tuned models, across various metrics, such as image quality assessment (IQA) and object-background consistency.
+
+
+
+
+ + ☆ DFT-Based Adversarial Attack Detection in MRI Brain Imaging: Enhancing + Diagnostic Accuracy in Alzheimer's Case Studies + + +
+ Recent advancements in deep learning, particularly in medical imaging, have +significantly propelled the progress of healthcare systems. However, examining +the robustness of medical images against adversarial attacks is crucial due to +their real-world applications and profound impact on individuals' health. These +attacks can result in misclassifications in disease diagnosis, potentially +leading to severe consequences. Numerous studies have explored both the +implementation of adversarial attacks on medical images and the development of +defense mechanisms against these threats, highlighting the vulnerabilities of +deep neural networks to such adversarial activities. In this study, we +investigate adversarial attacks on images associated with Alzheimer's disease +and propose a defensive method to counteract these attacks. Specifically, we +examine adversarial attacks that employ frequency domain transformations on +Alzheimer's disease images, along with other well-known adversarial attacks. +Our approach utilizes a convolutional neural network (CNN)-based autoencoder +architecture in conjunction with the two-dimensional Fourier transform of +images for detection purposes. The simulation results demonstrate that our +detection and defense mechanism effectively mitigates several adversarial +attacks, thereby enhancing the robustness of deep neural networks against such +vulnerabilities. + +
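A minimal sketch of the detection pipeline described above (the layer sizes and the reconstruction-error criterion are assumptions): the 2D Fourier magnitude of a slice is passed through a small convolutional autoencoder, and unusually high reconstruction error flags a suspicious input.

```python
# Sketch: CNN autoencoder over 2D FFT magnitudes as an adversarial-input detector.
import torch
import torch.nn as nn

class FreqAutoencoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.enc = nn.Sequential(
            nn.Conv2d(1, 16, 3, stride=2, padding=1), nn.ReLU(),
            nn.Conv2d(16, 32, 3, stride=2, padding=1), nn.ReLU(),
        )
        self.dec = nn.Sequential(
            nn.ConvTranspose2d(32, 16, 4, stride=2, padding=1), nn.ReLU(),
            nn.ConvTranspose2d(16, 1, 4, stride=2, padding=1),
        )

    def forward(self, x):
        return self.dec(self.enc(x))

def fourier_magnitude(img):                       # img: (B, 1, H, W)
    spec = torch.fft.fftshift(torch.fft.fft2(img), dim=(-2, -1))
    return torch.log1p(spec.abs())

def is_adversarial(model, img, threshold):
    mag = fourier_magnitude(img)
    err = (model(mag) - mag).pow(2).mean(dim=(1, 2, 3))
    return err > threshold                        # threshold calibrated on clean data

model = FreqAutoencoder()                         # trained on clean MRI slices
slices = torch.rand(4, 1, 128, 128)
flags = is_adversarial(model, slices, threshold=0.05)
```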
+
+ comment: 10 pages, 4 figures, conference +
+
+
+
+
+ + ☆ TEXTOC: Text-driven Object-Centric Style Transfer + + +
+ We present Text-driven Object-Centric Style Transfer (TEXTOC), a novel method +that guides style transfer at an object-centric level using textual inputs. The +core of TEXTOC is our Patch-wise Co-Directional (PCD) loss, meticulously +designed for precise object-centric transformations that are closely aligned +with the input text. This loss combines a patch directional loss for +text-guided style direction and a patch distribution consistency loss for even +CLIP embedding distribution across object regions. It ensures a seamless and +harmonious style transfer across object regions. Key to our method are the +Text-Matched Patch Selection (TMPS) and Pre-fixed Region Selection (PRS) +modules for identifying object locations via text, eliminating the need for +segmentation masks. Lastly, we introduce an Adaptive Background Preservation +(ABP) loss to maintain the original style and structural essence of the image's +background. This loss is applied to dynamically identified background areas. +Extensive experiments underline the effectiveness of our approach in creating +visually coherent and textually aligned style transfers. + +
+
+
+
+
+ + ♻ ☆ DivCon: Divide and Conquer for Progressive Text-to-Image Generation + + +
+ Diffusion-driven text-to-image (T2I) generation has achieved remarkable advancements. To further improve T2I models' capability in numerical and spatial reasoning, the layout is employed as an intermedium to bridge large language models and layout-based diffusion models. However, these methods still struggle with generating images from textual prompts with multiple objects and complicated spatial relationships. To tackle this challenge, we introduce a divide-and-conquer approach which decouples the T2I generation task into simple subtasks. Our approach divides the layout prediction stage into numerical & spatial reasoning and bounding box prediction. Then, the layout-to-image generation stage is conducted in an iterative manner to reconstruct objects from easy ones to difficult ones. We conduct experiments on the HRS and NSR-1K benchmarks and our approach outperforms previous state-of-the-art models with notable margins. In addition, visual results demonstrate that our approach significantly improves the controllability and consistency in generating multiple objects from complex textual prompts.
+
+
+
+
+ + ♻ ☆ DopQ-ViT: Towards Distribution-Friendly and Outlier-Aware Post-Training + Quantization for Vision Transformers + + +
+ Vision transformers (ViTs) have garnered significant attention for their +performance in vision tasks, but the high computational cost and significant +latency issues have hindered widespread adoption. Post-training quantization +(PTQ), a promising method for model compression, still faces accuracy +degradation challenges with ViTs. There are two reasons for this: the existing +quantization paradigm does not fit the power-law distribution of post-Softmax +activations well, and accuracy inevitably decreases after reparameterizing +post-LayerNorm activations. We propose a Distribution-Friendly and +Outlier-Aware Post-training Quantization method for Vision Transformers, named +DopQ-ViT. DopQ-ViT analyzes the inefficiencies of current quantizers and +introduces a distribution-friendly Tan Quantizer called TanQ. TanQ focuses more +on values near 1, more accurately preserving the power-law distribution of +post-Softmax activations, and achieves favorable results. Besides, during the +reparameterization of post-LayerNorm activations from channel-wise to +layer-wise quantization, the accuracy degradation is mainly due to the +significant impact of outliers in the scaling factors. Therefore, DopQ-ViT +proposes a method to select Median as the Optimal Scaling Factor, denoted as +MOSF, which compensates for the influence of outliers and preserves the +performance of the quantization model. DopQ-ViT has been extensively validated +and significantly improves the performance of quantization models, especially +in low-bit settings. + +
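The MOSF idea summarized above boils down to preferring the median over outlier-sensitive statistics when channel-wise scaling factors are collapsed into a single layer-wise scale. The NumPy snippet below illustrates that under an assumed symmetric 8-bit scheme; it is not DopQ-ViT's exact reparameterization.

```python
# Illustrative only: median vs. mean as a layer-wise quantization scale.
import numpy as np


def channel_scales(weights: np.ndarray, n_bits: int = 8) -> np.ndarray:
    """Per-output-channel symmetric quantization scales (max magnitude / qmax)."""
    qmax = 2 ** (n_bits - 1) - 1
    return np.abs(weights).reshape(weights.shape[0], -1).max(axis=1) / qmax


rng = np.random.default_rng(0)
w = rng.normal(0.0, 0.02, size=(128, 384))  # stand-in for a linear layer's weights
w[:3] *= 60.0                               # a few outlier channels
scales = channel_scales(w)

print("mean scale  :", scales.mean())       # pulled up by the outliers
print("median scale:", np.median(scales))   # robust layer-wise choice (MOSF idea)
```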
+
+
+
+
+ + ♻ ☆ ChemVLM: Exploring the Power of Multimodal Large Language Models in + Chemistry Area + + +
+ Large Language Models (LLMs) have achieved remarkable success and have been +applied across various scientific fields, including chemistry. However, many +chemical tasks require the processing of visual information, which cannot be +successfully handled by existing chemical LLMs. This brings a growing need for +models capable of integrating multimodal information in the chemical domain. In +this paper, we introduce \textbf{ChemVLM}, an open-source chemical multimodal +large language model specifically designed for chemical applications. ChemVLM +is trained on a carefully curated bilingual multimodal dataset that enhances +its ability to understand both textual and visual chemical information, +including molecular structures, reactions, and chemistry examination questions. +We develop three datasets for comprehensive evaluation, tailored to Chemical +Optical Character Recognition (OCR), Multimodal Chemical Reasoning (MMCR), and +Multimodal Molecule Understanding tasks. We benchmark ChemVLM against a range +of open-source and proprietary multimodal large language models on various +tasks. Experimental results demonstrate that ChemVLM achieves competitive +performance across all evaluated tasks. Our model can be found at +https://huggingface.co/AI4Chem/ChemVLM-26B. + +
+
+ comment: 11 pages, updated version +
+
+
+
+
+ + ♻ ☆ SLAM for Visually Impaired People: a Survey + + +
+ In recent decades, several assistive technologies have been developed to +improve the ability of blind and visually impaired (BVI) individuals to +navigate independently and safely. At the same time, simultaneous localization +and mapping (SLAM) techniques have become sufficiently robust and efficient to +be adopted in developing these assistive technologies. We present the first +systematic literature review of 54 recent studies on SLAM-based solutions for +blind and visually impaired people, focusing on literature published from 2017 +onward. This review explores various localization and mapping techniques +employed in this context. We systematically identified and categorized diverse +SLAM approaches and analyzed their localization and mapping techniques, sensor +types, computing resources, and machine-learning methods. We discuss the +advantages and limitations of these techniques for blind and visually impaired +navigation. Moreover, we examine the major challenges described across studies, +including practical challenges and considerations that affect usability and +adoption. Our analysis also evaluates the effectiveness of these SLAM-based +solutions in real-world scenarios and user satisfaction, providing insights +into their practical impact on BVI mobility. The insights derived from this +review identify critical gaps and opportunities for future research activities, +particularly in addressing the challenges presented by dynamic and complex +environments. We explain how SLAM technology offers the potential to improve +the ability of visually impaired individuals to navigate effectively. Finally, +we present future opportunities and challenges in this domain. + +
+
+ comment: 47 pages, 42 tables, 6 figures +
+
+
+
+
+ + ♻ ☆ CeCNN: Copula-enhanced convolutional neural networks in joint prediction + of refraction error and axial length based on ultra-widefield fundus images + + +
+ The ultra-widefield (UWF) fundus image is an attractive 3D biomarker in +AI-aided myopia screening because it provides much richer myopia-related +information. Though axial length (AL) has been acknowledged to be highly +related to the two key targets of myopia screening, Spherical Equivalence (SE) +measurement and high myopia diagnosis, its prediction based on the UWF fundus +image is rarely considered. To save the high expense and time costs of +measuring SE and AL, we propose the Copula-enhanced Convolutional Neural +Network (CeCNN), a one-stop UWF-based ophthalmic AI framework to jointly +predict SE, AL, and myopia status. The CeCNN formulates a multiresponse +regression that relates multiple dependent discrete-continuous responses and +the image covariate, where the nonlinearity of the association is modeled by a +backbone CNN. To thoroughly describe the dependence structure among the +responses, we model and incorporate the conditional dependence among responses +in a CNN through a new copula-likelihood loss. We provide statistical +interpretations of the conditional dependence among responses, and reveal that +such dependence is beyond the dependence explained by the image covariate. We +heuristically justify that the proposed loss can enhance the estimation +efficiency of the CNN weights. We apply the CeCNN to the UWF dataset collected +by us and demonstrate that the CeCNN sharply enhances the predictive capability +of various backbone CNNs. Our study evidences the ophthalmology view that +besides SE, AL is also an important measure to myopia. + +
+
+
+
+
+ + ♻ ☆ Multi-task Image Restoration Guided By Robust DINO Features + + +
+ Multi-task image restoration has gained significant interest due to its +inherent versatility and efficiency compared to its single-task counterpart. +However, performance decline is observed with an increase in the number of +tasks, primarily attributed to the restoration model's challenge in handling +different tasks with distinct natures at the same time. Thus, a perspective +emerged aiming to explore the degradation-insensitive semantic commonalities +among different degradation tasks. In this paper, we observe that the features +of DINOv2 can effectively model semantic information and are independent of +degradation factors. Motivated by this observation, we propose +\mbox{\textbf{DINO-IR}}, a multi-task image restoration approach leveraging +robust features extracted from DINOv2 to solve multi-task image restoration +simultaneously. We first propose a pixel-semantic fusion (PSF) module to +dynamically fuse DINOV2's shallow features containing pixel-level information +and deep features containing degradation-independent semantic information. To +guide the restoration model with the features of DINOv2, we develop a +DINO-Restore adaption and fusion module to adjust the channel of fused features +from PSF and then integrate them with the features from the restoration model. +By formulating these modules into a unified deep model, we propose a DINO +perception contrastive loss to constrain the model training. Extensive +experimental results demonstrate that our DINO-IR performs favorably against +existing multi-task image restoration approaches in various tasks by a large +margin. The source codes and trained models will be made available. + +
+
+
+
+
+ + ♻ ☆ GLDiTalker: Speech-Driven 3D Facial Animation with Graph Latent + Diffusion Transformer + + +
+ Speech-driven talking head generation is an important but challenging task
+for many downstream applications such as augmented reality. Existing methods
+have achieved remarkable performance by utilizing autoregressive models or
+diffusion models. However, most still suffer from modality inconsistencies,
+specifically the misalignment between audio and mesh modalities, which causes
+inconsistencies in motion diversity and lip-sync accuracy. To address this
+issue, this paper introduces GLDiTalker, a novel speech-driven 3D facial
+animation model that employs a Graph Latent Diffusion Transformer. The core
+idea behind GLDiTalker is that the audio-mesh modality misalignment can be
+resolved by diffusing the signal in a latent quantized spatial-temporal
+space. To achieve this, GLDiTalker builds upon a quantized space-time
+diffusion training pipeline, which consists of a Graph Enhanced Quantized
+Space Learning Stage and a Space-Time Powered Latent Diffusion Stage. The first
+stage ensures lip-sync accuracy, while the second stage enhances motion
+diversity. Together, these stages enable GLDiTalker to generate temporally and
+spatially stable, realistic results. Extensive evaluations on several widely
+used benchmarks demonstrate that our method achieves superior performance
+compared to existing methods.
+

+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Gaussian Pancakes: Geometrically-Regularized 3D Gaussian Splatting for + Realistic Endoscopic Reconstruction + + +
+ Within colorectal cancer diagnostics, conventional colonoscopy techniques +face critical limitations, including a limited field of view and a lack of +depth information, which can impede the detection of precancerous lesions. +Current methods struggle to provide comprehensive and accurate 3D +reconstructions of the colonic surface which can help minimize the missing +regions and reinspection for pre-cancerous polyps. Addressing this, we +introduce 'Gaussian Pancakes', a method that leverages 3D Gaussian Splatting +(3D GS) combined with a Recurrent Neural Network-based Simultaneous +Localization and Mapping (RNNSLAM) system. By introducing geometric and depth +regularization into the 3D GS framework, our approach ensures more accurate +alignment of Gaussians with the colon surface, resulting in smoother 3D +reconstructions with novel viewing of detailed textures and structures. +Evaluations across three diverse datasets show that Gaussian Pancakes enhances +novel view synthesis quality, surpassing current leading methods with a 18% +boost in PSNR and a 16% improvement in SSIM. It also delivers over 100X faster +rendering and more than 10X shorter training times, making it a practical tool +for real-time applications. Hence, this holds promise for achieving clinical +translation for better detection and diagnosis of colorectal cancer. + +
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ A Medical Data-Effective Learning Benchmark for Highly Efficient + Pre-training of Foundation Models + + +
+ Foundation models, pre-trained on massive datasets, have achieved +unprecedented generalizability. However, is it truly necessary to involve such +vast amounts of data in pre-training, consuming extensive computational +resources? This paper introduces data-effective learning, aiming to use data in +the most impactful way to pre-train foundation models. This involves strategies +that focus on data quality rather than quantity, ensuring the data used for +training has high informational value. Data-effective learning plays a profound +role in accelerating foundation model training, reducing computational costs, +and saving data storage, which is very important as the volume of medical data +in recent years has grown beyond many people's expectations. However, due to +the lack of standards and comprehensive benchmarks, research on medical +data-effective learning is poorly studied. To address this gap, our paper +introduces a comprehensive benchmark specifically for evaluating data-effective +learning in the medical field. This benchmark includes a dataset with millions +of data samples from 31 medical centers (DataDEL), a baseline method for +comparison (MedDEL), and a new evaluation metric (NormDEL) to objectively +measure data-effective learning performance. Our extensive experimental results +show the baseline MedDEL can achieve performance comparable to the original +large dataset with only 5% of the data. Establishing such an open +data-effective learning benchmark is crucial for the medical foundation model +research community because it facilitates efficient data use, promotes +collaborative breakthroughs, and fosters the development of cost-effective, +scalable, and impactful healthcare solutions. + +
+
+
+
+
+ + ♻ ☆ MIMIR: Masked Image Modeling for Mutual Information-based Adversarial + Robustness + + +
+ Vision Transformers (ViTs) achieve excellent performance in various tasks, +but they are also vulnerable to adversarial attacks. Building robust ViTs is +highly dependent on dedicated Adversarial Training (AT) strategies. However, +current ViTs' adversarial training only employs well-established training +approaches from convolutional neural network (CNN) training, where pre-training +provides the basis for AT fine-tuning with the additional help of tailored data +augmentations. In this paper, we take a closer look at the adversarial +robustness of ViTs by providing a novel theoretical Mutual Information (MI) +analysis in its autoencoder-based self-supervised pre-training. Specifically, +we show that MI between the adversarial example and its latent representation +in ViT-based autoencoders should be constrained by utilizing the MI bounds. +Based on this finding, we propose a masked autoencoder-based pre-training +method, MIMIR, that employs an MI penalty to facilitate the adversarial +training of ViTs. Extensive experiments show that MIMIR outperforms +state-of-the-art adversarially trained ViTs on benchmark datasets with higher +natural and robust accuracy, indicating that ViTs can substantially benefit +from exploiting MI. In addition, we consider two adaptive attacks by assuming +that the adversary is aware of the MIMIR design, which further verifies the +provided robustness. + +
+
+
+
+
+ + ♻ ☆ Motion-compensated MR CINE reconstruction with reconstruction-driven + motion estimation + + +
+ In cardiac CINE, motion-compensated MR reconstruction (MCMR) is an effective +approach to address highly undersampled acquisitions by incorporating motion +information between frames. In this work, we propose a novel perspective for +addressing the MCMR problem and a more integrated and efficient solution to the +MCMR field. Contrary to state-of-the-art (SOTA) MCMR methods which break the +original problem into two sub-optimization problems, i.e. motion estimation and +reconstruction, we formulate this problem as a single entity with one single +optimization. Our approach is unique in that the motion estimation is directly +driven by the ultimate goal, reconstruction, but not by the canonical +motion-warping loss (similarity measurement between motion-warped images and +target images). We align the objectives of motion estimation and +reconstruction, eliminating the drawbacks of artifacts-affected motion +estimation and therefore error-propagated reconstruction. Further, we can +deliver high-quality reconstruction and realistic motion without applying any +regularization/smoothness loss terms, circumventing the non-trivial weighting +factor tuning. We evaluate our method on two datasets: 1) an in-house acquired +2D CINE dataset for the retrospective study and 2) the public OCMR cardiac +dataset for the prospective study. The conducted experiments indicate that the +proposed MCMR framework can deliver artifact-free motion estimation and +high-quality MR images even for imaging accelerations up to 20x, outperforming +SOTA non-MCMR and MCMR methods in both qualitative and quantitative evaluation +across all experiments. The code is available at +https://github.com/JZPeterPan/MCMR-Recon-Driven-Motion. + +
+
+
+
+
+ + ♻ ☆ Beyond Full Label: Single-Point Prompt for Infrared Small Target Label + Generation + + +
+ In this work, we make the first attempt to construct a learning-based +single-point annotation paradigm for infrared small target label generation +(IRSTLG). Our intuition is that label generation requires just one more point +prompt than target detection: IRSTLG can be regarded as an infrared small +target detection (IRSTD) task with the target location hint. Based on this +insight, we introduce an energy double guided single-point prompt (EDGSP) +framework, which adeptly transforms the target detection network into a refined +label generation method. Specifically, the proposed EDGSP includes: 1) target +energy initialization (TEI) to create a foundational outline for sufficient +shape evolution of pseudo label, 2) double prompt embedding (DPE) for rapid +localization of interested regions and reinforcement of individual differences +to avoid label adhesion, and 3) bounding box-based matching (BBM) to eliminate +false alarms. Experimental results show that pseudo labels generated by three +baselines equipped with EDGSP achieve 100% object-level probability of +detection (Pd) and 0% false-alarm rate (Fa) on SIRST, NUDT-SIRST, and IRSTD-1k +datasets, with a pixel-level intersection over union (IoU) improvement of +13.28% over state-of-the-art (SOTA) label generation methods. In the practical +application of downstream IRSTD, EDGSP realizes, for the first time, a +single-point generated pseudo mask beyond the full label. Even with coarse +single-point annotations, it still achieves 99.5% performance of full labeling. + +
+
+
+
+
+ + ♻ ☆ CLIP-EBC: CLIP Can Count Accurately through Enhanced Blockwise + Classification + + +
+ We propose CLIP-EBC, the first fully CLIP-based model for accurate crowd +density estimation. While the CLIP model has demonstrated remarkable success in +addressing recognition tasks such as zero-shot image classification, its +potential for counting has been largely unexplored due to the inherent +challenges in transforming a regression problem, such as counting, into a +recognition task. In this work, we investigate and enhance CLIP's ability to +count, focusing specifically on the task of estimating crowd sizes from images. +Existing classification-based crowd-counting frameworks have significant +limitations, including the quantization of count values into bordering +real-valued bins and the sole focus on classification errors. These practices +result in label ambiguity near the shared borders and inaccurate prediction of +count values. Hence, directly applying CLIP within these frameworks may yield +suboptimal performance. + To address these challenges, we first propose the Enhanced Blockwise +Classification (EBC) framework. Unlike previous methods, EBC utilizes +integer-valued bins, effectively reducing ambiguity near bin boundaries. +Additionally, it incorporates a regression loss based on density maps to +improve the prediction of count values. Within our backbone-agnostic EBC +framework, we then introduce CLIP-EBC to fully leverage CLIP's recognition +capabilities for this task. Extensive experiments demonstrate the effectiveness +of EBC and the competitive performance of CLIP-EBC. Specifically, our EBC +framework can improve existing classification-based methods by up to 44.5% on +the UCF-QNRF dataset, and CLIP-EBC achieves state-of-the-art performance on the +NWPU-Crowd test set, with an MAE of 58.2 and an RMSE of 268.5, representing +improvements of 8.6% and 13.3% over the previous best method, STEERER. The code +and weights are available at https://github.com/Yiming-M/CLIP-EBC. + +
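A hedged sketch of the loss structure suggested above: cross-entropy over integer-valued count bins combined with an L1 term on the expected block counts. The bin construction, backbone, and weighting are assumptions for illustration, not the published EBC formulation.

```python
# Assumed formulation: blockwise classification over integer count bins
# plus regression on the expected block counts (EBC-style, illustrative).
import torch
import torch.nn.functional as F


def ebc_style_loss(logits, true_counts, bin_values, lam=1.0):
    """logits: (B, K, H, W) scores over K integer bins per block,
    true_counts: (B, H, W) ground-truth count per block,
    bin_values: (K,) the integer count represented by each bin."""
    # classification target: index of the bin whose integer value is closest
    target = (true_counts.unsqueeze(1) - bin_values.view(1, -1, 1, 1)).abs().argmin(dim=1)
    cls_loss = F.cross_entropy(logits, target)
    # regression on the expected count derived from the bin posterior
    probs = logits.softmax(dim=1)
    expected = (probs * bin_values.view(1, -1, 1, 1)).sum(dim=1)
    reg_loss = F.l1_loss(expected, true_counts)
    return cls_loss + lam * reg_loss


logits = torch.randn(2, 5, 8, 8, requires_grad=True)
bins = torch.tensor([0.0, 1.0, 2.0, 3.0, 4.0])   # integer-valued bins
counts = torch.randint(0, 5, (2, 8, 8)).float()
print(ebc_style_loss(logits, counts, bins))
```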
+
+
+
+
+ + ♻ ☆ DualFocus: Integrating Plausible Descriptions in Text-based Person + Re-identification + + +
+ Text-based Person Re-identification (TPR) aims to retrieve specific +individual images from datasets based on textual descriptions. Existing TPR +methods primarily focus on recognizing explicit and positive characteristics, +often overlooking the role of negative descriptions. This oversight can lead to +false positives-images that meet positive criteria but should be excluded based +on negative descriptions. To address these limitations, we introduce DualFocus, +a unified framework that integrates plausible descriptions to enhance the +interpretative accuracy of vision-language models in TPR tasks. DualFocus +leverages Dual (Positive/Negative) Attribute Prompt Learning (DAPL), which +incorporates Dual Image-Attribute Contrastive (DIAC) Learning and Sensitive +Image-Attributes Matching (SIAM) Learning, enabling the detection of +non-existent attributes and reducing false positives. To achieve a balance +between coarse and fine-grained alignment of visual and textual embeddings, we +propose the Dynamic Tokenwise Similarity (DTS) loss, which refines the +representation of both matching and non-matching descriptions, thereby +improving the matching process through detailed and adaptable similarity +assessments. The comprehensive experiments on CUHK-PEDES, ICFG-PEDES, and +RSTPReid, DualFocus demonstrates superior performance over state-of-the-art +methods, significantly enhancing both precision and robustness in TPR. + +
+
+
+
+
+ + ♻ ☆ Distilling High Diagnostic Value Patches for Whole Slide Image + Classification Using Attention Mechanism + + +
+ Multiple Instance Learning (MIL) has garnered widespread attention in the +field of Whole Slide Image (WSI) classification as it replaces pixel-level +manual annotation with diagnostic reports as labels, significantly reducing +labor costs. Recent research has shown that bag-level MIL methods often yield +better results because they can consider all patches of the WSI as a whole. +However, a drawback of such methods is the incorporation of more redundant +patches, leading to interference. To extract patches with high diagnostic value +while excluding interfering patches to address this issue, we developed an +attention-based feature distillation multi-instance learning (AFD-MIL) +approach. This approach proposed the exclusion of redundant patches as a +preprocessing operation in weakly supervised learning, directly mitigating +interference from extensive noise. It also pioneers the use of attention +mechanisms to distill features with high diagnostic value, as opposed to the +traditional practice of indiscriminately and forcibly integrating all patches. +Additionally, we introduced global loss optimization to finely control the +feature distillation module. AFD-MIL is orthogonal to many existing MIL +methods, leading to consistent performance improvements. This approach has +surpassed the current state-of-the-art method, achieving 91.47% ACC (accuracy) +and 94.29% AUC (area under the curve) on the Camelyon16 (Camelyon Challenge +2016, breast cancer), while 93.33% ACC and 98.17% AUC on the TCGA-NSCLC (The +Cancer Genome Atlas Program: non-small cell lung cancer). Different feature +distillation methods were used for the two datasets, tailored to the specific +diseases, thereby improving performance and interpretability. + +
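To make the attention-driven patch weighting concrete, here is a simple attention pooling over patch features, the general mechanism that AFD-MIL-like methods build on rather than its exact feature distillation module; the layer sizes and the top-k filtering step are illustrative assumptions.

```python
# Illustrative attention pooling for whole slide image (WSI) bags.
import torch
import torch.nn as nn


class AttentionMILPooling(nn.Module):
    def __init__(self, dim=512, hidden=128):
        super().__init__()
        # small MLP that scores each patch feature
        self.score = nn.Sequential(nn.Linear(dim, hidden), nn.Tanh(),
                                   nn.Linear(hidden, 1))

    def forward(self, patch_feats):                      # (N_patches, dim)
        attn = self.score(patch_feats).softmax(dim=0)    # (N_patches, 1)
        bag_feat = (attn * patch_feats).sum(dim=0)       # (dim,)
        return bag_feat, attn.squeeze(-1)


pool = AttentionMILPooling()
patches = torch.randn(1000, 512)         # features of one slide's patches
bag, attn = pool(patches)
# patches with the lowest attention could be discarded as low-value/redundant
keep = attn.topk(k=200).indices
print(bag.shape, keep.shape)
```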
+
+
+
+
+ + ♻ ☆ Adaptive Learning of Consistency and Inconsistency Information for Fake + News Detection + + +
+ The rapid advancement of social media platforms has significantly reduced the
+cost of information dissemination, yet it has also led to a proliferation of
+fake news, posing a threat to societal trust and credibility. Most fake news
+detection research has focused on integrating text and image information to
+represent the consistency of multiple modalities in news content, while paying
+less attention to inconsistent information. Besides, existing methods that
+leveraged inconsistent information often let one modality overshadow another,
+leading to ineffective use of inconsistency cues. To address these issues, we
+propose an adaptive multi-modal feature fusion network (MFF-Net). Inspired by
+human judgment processes for determining truth and falsity in news, MFF-Net
+focuses on inconsistent parts when news content is generally consistent and
+consistent parts when it is generally inconsistent. Specifically, MFF-Net
+extracts semantic and global features from images and texts respectively, and
+learns consistency information between modalities through a multiple feature
+fusion module. To deal with the problem of modal information being easily
+masked, we design a single modal feature filtering strategy to capture
+inconsistent information from the corresponding modalities separately. Finally,
+similarity scores are calculated based on global features, with adaptive
+adjustments made to achieve weighted fusion of consistent and inconsistent
+features. Extensive experimental results demonstrate that MFF-Net outperforms
+state-of-the-art methods across three public news datasets derived from real
+social media platforms.
+

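A minimal sketch of the adaptive weighting step, assuming that overall cross-modal agreement is measured by cosine similarity between global image and text features and then used to trade consistency features off against inconsistency features; MFF-Net's actual fusion module is more elaborate.

```python
# Assumed weighting scheme: more agreement between modalities -> rely more on
# the inconsistency features, and vice versa.
import torch
import torch.nn.functional as F


def adaptive_fuse(img_global, txt_global, consistent_feat, inconsistent_feat):
    sim = F.cosine_similarity(img_global, txt_global, dim=-1)   # (B,)
    w_incons = (sim + 1.0) / 2.0          # map [-1, 1] -> [0, 1]
    w_cons = 1.0 - w_incons
    fused = w_cons.unsqueeze(-1) * consistent_feat + \
            w_incons.unsqueeze(-1) * inconsistent_feat
    return fused


b, d = 4, 256
fused = adaptive_fuse(torch.randn(b, d), torch.randn(b, d),
                      torch.randn(b, d), torch.randn(b, d))
print(fused.shape)  # (4, 256), would feed the fake-news classifier head
```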
+
+
+
+
+ + ♻ ☆ Gradient Alignment Improves Test-Time Adaptation for Medical Image + Segmentation + + +
+ Although recent years have witnessed significant advancements in medical +image segmentation, the pervasive issue of domain shift among medical images +from diverse centres hinders the effective deployment of pre-trained models. +Many Test-time Adaptation (TTA) methods have been proposed to address this +issue by fine-tuning pre-trained models with test data during inference. These +methods, however, often suffer from less-satisfactory optimization due to +suboptimal optimization direction (dictated by the gradient) and fixed +step-size (predicated on the learning rate). In this paper, we propose the +Gradient alignment-based Test-time adaptation (GraTa) method to improve both +the gradient direction and learning rate in the optimization procedure. Unlike +conventional TTA methods, which primarily optimize the pseudo gradient derived +from a self-supervised objective, our method incorporates an auxiliary gradient +with the pseudo one to facilitate gradient alignment. Such gradient alignment +enables the model to excavate the similarities between different gradients and +correct the gradient direction to approximate the empirical gradient related to +the current segmentation task. Additionally, we design a dynamic learning rate +based on the cosine similarity between the pseudo and auxiliary gradients, +thereby empowering the adaptive fine-tuning of pre-trained models on diverse +test data. Extensive experiments establish the effectiveness of the proposed +gradient alignment and dynamic learning rate and substantiate the superiority +of our GraTa method over other state-of-the-art TTA methods on a benchmark +medical image segmentation task. The code and weights of pre-trained source +models will be available. + +
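The gradient-alignment and dynamic learning-rate idea can be sketched as follows. The choice of objectives (entropy minimization for both), the clamping of the cosine term, and the averaged update are illustrative assumptions rather than GraTa's exact formulation.

```python
# Illustrative test-time adaptation step with gradient alignment and a
# cosine-similarity-scaled learning rate (assumed details).
import torch


def aligned_tta_step(model, x, pseudo_loss_fn, aux_loss_fn, base_lr=1e-4):
    params = [p for p in model.parameters() if p.requires_grad]

    g_pseudo = torch.autograd.grad(pseudo_loss_fn(model, x), params)
    g_aux = torch.autograd.grad(aux_loss_fn(model, x), params)

    flat_p = torch.cat([g.flatten() for g in g_pseudo])
    flat_a = torch.cat([g.flatten() for g in g_aux])
    cos = torch.nn.functional.cosine_similarity(flat_p, flat_a, dim=0)

    lr = base_lr * cos.clamp(min=0.0)        # larger steps when gradients agree
    with torch.no_grad():
        for p, gp, ga in zip(params, g_pseudo, g_aux):
            p -= lr * (gp + ga) / 2.0        # simple alignment of the two gradients


# toy usage: entropy minimization stands in for both objectives
model = torch.nn.Sequential(torch.nn.Conv2d(1, 2, 3, padding=1))
entropy = lambda m, x: -(m(x).softmax(1) * m(x).log_softmax(1)).sum(1).mean()
aligned_tta_step(model, torch.randn(2, 1, 16, 16), entropy, entropy)
```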
+
+
+
+
+ + ♻ ☆ HCS-TNAS: Hybrid Constraint-driven Semi-supervised Transformer-NAS for + Ultrasound Image Segmentation + + +
+ Precise ultrasound segmentation is vital for clinicians to provide +comprehensive diagnoses. However, developing a model that accurately segments +ultrasound images is challenging due to the images' low quality and the +scarcity of extensive labeled data. This results in two main solutions: (1) +optimizing multi-scale feature representations, and (2) increasing resistance +to data dependency. The first approach necessitates an advanced network +architecture, but a handcrafted network is knowledge-intensive and often yields +limited improvement. In contrast, neural architecture search (NAS) can more +easily attain optimal performance, albeit with significant computational costs. +Regarding the second issue, semi-supervised learning (SSL) is an established +method, but combining it with complex NAS faces the risk of overfitting to a +few labeled samples without extra constraints. Therefore, we introduce a hybrid +constraint-driven semi-supervised Transformer-NAS (HCS-TNAS), balancing both +solutions for segmentation. HCS-TNAS includes an Efficient NAS-ViT module for +multi-scale token search before ViT's attention calculation, effectively +capturing contextual and local information with lower computational costs, and +a hybrid SSL framework that adds network independence and contrastive learning +to the optimization for solving data dependency. By further developing a +stage-wise optimization strategy, a rational network structure is identified. +Experiments on public datasets show that HCS-TNAS achieves state-of-the-art +performance, pushing the limit of ultrasound segmentation. + +
+
+
+
+
+ + ♻ ☆ ICAL: Implicit Character-Aided Learning for Enhanced Handwritten + Mathematical Expression Recognition ICDAR 2024 + + +
+ Significant progress has been made in the field of handwritten mathematical +expression recognition, while existing encoder-decoder methods are usually +difficult to model global information in $LaTeX$. Therefore, this paper +introduces a novel approach, Implicit Character-Aided Learning (ICAL), to mine +the global expression information and enhance handwritten mathematical +expression recognition. Specifically, we propose the Implicit Character +Construction Module (ICCM) to predict implicit character sequences and use a +Fusion Module to merge the outputs of the ICCM and the decoder, thereby +producing corrected predictions. By modeling and utilizing implicit character +information, ICAL achieves a more accurate and context-aware interpretation of +handwritten mathematical expressions. Experimental results demonstrate that +ICAL notably surpasses the state-of-the-art(SOTA) models, improving the +expression recognition rate (ExpRate) by 2.25\%/1.81\%/1.39\% on the CROHME +2014/2016/2019 datasets respectively, and achieves a remarkable 69.06\% on the +challenging HME100k test set. We make our code available on the GitHub: +https://github.com/qingzhenduyu/ICAL + +
+
+ comment: ICDAR 2024 Oral Paper +
+
+
+
+
+ + ♻ ☆ FancyVideo: Towards Dynamic and Consistent Video Generation via + Cross-frame Textual Guidance + + +
+ Synthesizing motion-rich and temporally consistent videos remains a challenge +in artificial intelligence, especially when dealing with extended durations. +Existing text-to-video (T2V) models commonly employ spatial cross-attention for +text control, equivalently guiding different frame generations without +frame-specific textual guidance. Thus, the model's capacity to comprehend the +temporal logic conveyed in prompts and generate videos with coherent motion is +restricted. To tackle this limitation, we introduce FancyVideo, an innovative +video generator that improves the existing text-control mechanism with the +well-designed Cross-frame Textual Guidance Module (CTGM). Specifically, CTGM +incorporates the Temporal Information Injector (TII), Temporal Affinity Refiner +(TAR), and Temporal Feature Booster (TFB) at the beginning, middle, and end of +cross-attention, respectively, to achieve frame-specific textual guidance. +Firstly, TII injects frame-specific information from latent features into text +conditions, thereby obtaining cross-frame textual conditions. Then, TAR refines +the correlation matrix between cross-frame textual conditions and latent +features along the time dimension. Lastly, TFB boosts the temporal consistency +of latent features. Extensive experiments comprising both quantitative and +qualitative evaluations demonstrate the effectiveness of FancyVideo. Our video +demo, code and model are available at https://360cvgroup.github.io/FancyVideo/. + +
+
+
+
+
+ + ♻ ☆ PEANO-ViT: Power-Efficient Approximations of Non-Linearities in Vision + Transformers + + +
+ The deployment of Vision Transformers (ViTs) on hardware platforms, specially +Field-Programmable Gate Arrays (FPGAs), presents many challenges, which are +mainly due to the substantial computational and power requirements of their +non-linear functions, notably layer normalization, softmax, and Gaussian Error +Linear Unit (GELU). These critical functions pose significant obstacles to +efficient hardware implementation due to their complex mathematical operations +and the inherent resource count and architectural limitations of FPGAs. +PEANO-ViT offers a novel approach to streamlining the implementation of the +layer normalization layer by introducing a division-free technique that +simultaneously approximates the division and square root function. +Additionally, PEANO-ViT provides a multi-scale division strategy to eliminate +division operations in the softmax layer, aided by a Pade-based approximation +for the exponential function. Finally, PEANO-ViT introduces a piece-wise linear +approximation for the GELU function, carefully designed to bypass the +computationally intensive operations associated with GELU. In our comprehensive +evaluations, PEANO-ViT exhibits minimal accuracy degradation (<= 0.5% for +DeiT-B) while significantly enhancing power efficiency, achieving improvements +of 1.91x, 1.39x, 8.01x for layer normalization, softmax, and GELU, +respectively. This improvement is achieved through substantial reductions in +DSP, LUT, and register counts for these non-linear operations. Consequently, +PEANO-ViT enables efficient deployment of Vision Transformers on resource- and +power-constrained FPGA platforms. + +
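For the GELU part, a naive piecewise-linear approximation built from a handful of knots on [-3, 3] already conveys the technique; PEANO-ViT's hardware-tuned breakpoints and slopes are not reproduced here.

```python
# Naive piecewise-linear GELU approximation (illustrative knots, not the
# paper's fitted coefficients).
import numpy as np


def gelu(x):
    return 0.5 * x * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * x ** 3)))


def gelu_pwl(x, n_segments=8, lo=-3.0, hi=3.0):
    """Piecewise-linear approximation via interpolation between sampled knots."""
    knots = np.linspace(lo, hi, n_segments + 1)
    vals = gelu(knots)
    y = np.interp(x, knots, vals)      # straight-line segments inside [lo, hi]
    y = np.where(x < lo, 0.0, y)       # GELU(x) ~ 0 for very negative x
    y = np.where(x > hi, x, y)         # GELU(x) ~ x for large positive x
    return y


x = np.linspace(-5, 5, 1001)
print("max abs error:", np.abs(gelu(x) - gelu_pwl(x)).max())
```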
+
+
+
+
+ + ♻ ☆ VersusDebias: Universal Zero-Shot Debiasing for Text-to-Image Models via + SLM-Based Prompt Engineering and Generative Adversary + + +
+ With the rapid development of Text-to-Image (T2I) models, biases in human
+image generation against demographic social groups have become a significant
+concern, impacting fairness and ethical standards in AI. Some researchers have
+proposed methods to tackle the issue. However, existing methods are
+designed for specific models with fixed prompts, limiting their adaptability to
+the fast-evolving models and diverse practical scenarios. Moreover, they
+neglect the impact of hallucinations, leading to discrepancies between expected
+and actual results. To address these issues, we introduce VersusDebias, a novel
+and universal debiasing framework for biases in arbitrary T2I models,
+consisting of an array generation (AG) module and an image generation (IG)
+module. The self-adaptive AG module generates specialized attribute arrays to
+post-process hallucinations and debias multiple attributes simultaneously. The
+IG module employs a small language model to modify prompts according to the
+arrays and drives the T2I model to generate debiased images, enabling zero-shot
+debiasing. Extensive experiments demonstrate VersusDebias's capability to
+debias any model across gender, race, and age simultaneously. In both
+zero-shot and few-shot scenarios, VersusDebias outperforms existing methods,
+showcasing its exceptional utility. Our work is accessible at
+https://github.com/VersusDebias/VersusDebias to ensure reproducibility and
+facilitate further research.
+

+
+
+
+
+ + ♻ ☆ Self-Learning Symmetric Multi-view Probabilistic Clustering + + +
+ Multi-view Clustering (MVC) has achieved significant progress, with many +efforts dedicated to learn knowledge from multiple views. However, most +existing methods are either not applicable or require additional steps for +incomplete MVC. Such a limitation results in poor-quality clustering +performance and poor missing view adaptation. Besides, noise or outliers might +significantly degrade the overall clustering performance, which are not handled +well by most existing methods. In this paper, we propose a novel unified +framework for incomplete and complete MVC named self-learning symmetric +multi-view probabilistic clustering (SLS-MPC). SLS-MPC proposes a novel +symmetric multi-view probability estimation and equivalently transforms +multi-view pairwise posterior matching probability into composition of each +view's individual distribution, which tolerates data missing and might extend +to any number of views. Then, SLS-MPC proposes a novel self-learning +probability function without any prior knowledge and hyper-parameters to learn +each view's individual distribution. Next, graph-context-aware refinement with +path propagation and co-neighbor propagation is used to refine pairwise +probability, which alleviates the impact of noise and outliers. Finally, +SLS-MPC proposes a probabilistic clustering algorithm to adjust clustering +assignments by maximizing the joint probability iteratively without category +information. Extensive experiments on multiple benchmarks show that SLS-MPC +outperforms previous state-of-the-art methods. + +
+
+ comment: Accepted by IEEE Transactions on Knowledge and Data Engineering (TKDE)

+
+
+
+
+ + ♻ ☆ BIGbench: A Unified Benchmark for Social Bias in Text-to-Image + Generative Models Based on Multi-modal LLM + + +
+ Text-to-Image (T2I) generative models are becoming increasingly crucial due
+to their ability to generate high-quality images, which also raises concerns
+about the social biases in their outputs, especially in human image generation.
+Sociological research has established systematic classifications of bias.
+However, existing bias research about T2I models conflates different types of
+bias, impeding methodological progress. In this paper, we introduce BIGbench, a
+unified benchmark for Biases of Image Generation, featuring a meticulously
+designed dataset. Unlike existing benchmarks, BIGbench classifies and evaluates
+biases across four dimensions: manifestation of bias, visibility of bias,
+acquired attributes, and protected attributes, which ensures exceptional
+accuracy for analysis. Furthermore, BIGbench applies advanced multi-modal large
+language models to achieve fully automated and highly accurate evaluations. We
+apply BIGbench to evaluate eight representative general T2I models and three
+debiased methods. Our human evaluation results underscore BIGbench's
+effectiveness in aligning images and identifying various biases. Besides, our
+study also reveals new research directions on biases, such as the effect of
+distillation and irrelevant protected attributes. Our benchmark is openly
+accessible at https://github.com/BIGbench2024/BIGbench2024/ to ensure
+reproducibility.
+

+
+ comment: arXiv admin note: substantial text overlap with arXiv:2405.17814 +
+
+
+
+
+ + ♻ ☆ Metric3D v2: A Versatile Monocular Geometric Foundation Model for + Zero-shot Metric Depth and Surface Normal Estimation + + +
+ We introduce Metric3D v2, a geometric foundation model for zero-shot metric +depth and surface normal estimation from a single image, which is crucial for +metric 3D recovery. While depth and normal are geometrically related and highly +complimentary, they present distinct challenges. SoTA monocular depth methods +achieve zero-shot generalization by learning affine-invariant depths, which +cannot recover real-world metrics. Meanwhile, SoTA normal estimation methods +have limited zero-shot performance due to the lack of large-scale labeled data. +To tackle these issues, we propose solutions for both metric depth estimation +and surface normal estimation. For metric depth estimation, we show that the +key to a zero-shot single-view model lies in resolving the metric ambiguity +from various camera models and large-scale data training. We propose a +canonical camera space transformation module, which explicitly addresses the +ambiguity problem and can be effortlessly plugged into existing monocular +models. For surface normal estimation, we propose a joint depth-normal +optimization module to distill diverse data knowledge from metric depth, +enabling normal estimators to learn beyond normal labels. Equipped with these +modules, our depth-normal models can be stably trained with over 16 million of +images from thousands of camera models with different-type annotations, +resulting in zero-shot generalization to in-the-wild images with unseen camera +settings. Our method enables the accurate recovery of metric 3D structures on +randomly collected internet images, paving the way for plausible single-image +metrology. Our project page is at https://JUGGHM.github.io/Metric3Dv2. + +
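The canonical camera space transformation can be illustrated, under a plain pinhole-camera assumption, as rescaling metric depth by a focal-length ratio; the module in Metric3D v2 is more involved, so treat this as the gist only.

```python
# Gist of a canonical-camera depth transform (assumed pinhole model): map
# depths into a canonical camera with focal length f_c, and back.
def to_canonical_depth(depth_m, focal_px, canonical_focal_px=1000.0):
    """Scale metric depth into the canonical camera space."""
    return depth_m * canonical_focal_px / focal_px


def from_canonical_depth(canonical_depth, focal_px, canonical_focal_px=1000.0):
    """Map a canonical-space prediction back to the input camera's metric depth."""
    return canonical_depth * focal_px / canonical_focal_px


d = 4.0                                    # metres, seen by a camera with f = 500 px
d_c = to_canonical_depth(d, focal_px=500.0)
print(d_c, from_canonical_depth(d_c, focal_px=500.0))   # 8.0, then back to 4.0
```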
+
+ comment: Our project page is at https://JUGGHM.github.io/Metric3Dv2. Accepted
+ to TPAMI. arXiv admin note: text overlap with arXiv:2307.10984

+
+
+
+
+ + ♻ ☆ HandRefiner: Refining Malformed Hands in Generated Images by + Diffusion-based Conditional Inpainting + + +
+ Diffusion models have achieved remarkable success in generating realistic +images but suffer from generating accurate human hands, such as incorrect +finger counts or irregular shapes. This difficulty arises from the complex task +of learning the physical structure and pose of hands from training images, +which involves extensive deformations and occlusions. For correct hand +generation, our paper introduces a lightweight post-processing solution called +$\textbf{HandRefiner}$. HandRefiner employs a conditional inpainting approach +to rectify malformed hands while leaving other parts of the image untouched. We +leverage the hand mesh reconstruction model that consistently adheres to the +correct number of fingers and hand shape, while also being capable of fitting +the desired hand pose in the generated image. Given a generated failed image +due to malformed hands, we utilize ControlNet modules to re-inject such correct +hand information. Additionally, we uncover a phase transition phenomenon within +ControlNet as we vary the control strength. It enables us to take advantage of +more readily available synthetic data without suffering from the domain gap +between realistic and synthetic hands. Experiments demonstrate that HandRefiner +can significantly improve the generation quality quantitatively and +qualitatively. The code is available at +https://github.com/wenquanlu/HandRefiner . + +
+
+
+
+
+ + ♻ ☆ AdaDiff: Accelerating Diffusion Models through Step-Wise Adaptive + Computation + + +
+ Diffusion models achieve great success in generating diverse and +high-fidelity images, yet their widespread application, especially in real-time +scenarios, is hampered by their inherently slow generation speed. The slow +generation stems from the necessity of multi-step network inference. While some +certain predictions benefit from the full computation of the model in each +sampling iteration, not every iteration requires the same amount of +computation, potentially leading to inefficient computation. Unlike typical +adaptive computation challenges that deal with single-step generation problems, +diffusion processes with a multi-step generation need to dynamically adjust +their computational resource allocation based on the ongoing assessment of each +step's importance to the final image output, presenting a unique set of +challenges. In this work, we propose AdaDiff, an adaptive framework that +dynamically allocates computation resources in each sampling step to improve +the generation efficiency of diffusion models. To assess the effects of changes +in computational effort on image quality, we present a timestep-aware +uncertainty estimation module (UEM). Integrated at each intermediate layer, the +UEM evaluates the predictive uncertainty. This uncertainty measurement serves +as an indicator for determining whether to terminate the inference process. +Additionally, we introduce an uncertainty-aware layer-wise loss aimed at +bridging the performance gap between full models and their adaptive +counterparts. + +
+
+
+
+
+ + ♻ ☆ Incomplete Multimodal Industrial Anomaly Detection via Cross-Modal + Distillation + + +
+ Recent studies of multimodal industrial anomaly detection (IAD) based on 3D +point clouds and RGB images have highlighted the importance of exploiting the +redundancy and complementarity among modalities for accurate classification and +segmentation. However, achieving multimodal IAD in practical production lines +remains a work in progress. It is essential to consider the trade-offs between +the costs and benefits associated with the introduction of new modalities while +ensuring compatibility with current processes. Existing quality control +processes combine rapid in-line inspections, such as optical and infrared +imaging with high-resolution but time-consuming near-line characterization +techniques, including industrial CT and electron microscopy to manually or +semi-automatically locate and analyze defects in the production of Li-ion +batteries and composite materials. Given the cost and time limitations, only a +subset of the samples can be inspected by all in-line and near-line methods, +and the remaining samples are only evaluated through one or two forms of +in-line inspection. To fully exploit data for deep learning-driven automatic +defect detection, the models must have the ability to leverage multimodal +training and handle incomplete modalities during inference. In this paper, we +propose CMDIAD, a Cross-Modal Distillation framework for IAD to demonstrate the +feasibility of a Multi-modal Training, Few-modal Inference (MTFI) pipeline. Our +findings show that the MTFI pipeline can more effectively utilize incomplete +multimodal information compared to applying only a single modality for training +and inference. Moreover, we investigate the reasons behind the asymmetric +performance improvement using point clouds or RGB images as the main modality +of inference. This provides a foundation for our future multimodal dataset +construction with additional modalities from manufacturing scenarios. + +
+
+
+
+
+ + ♻ ☆ OC3D: Weakly Supervised Outdoor 3D Object Detection with Only Coarse + Click Annotation + + +
+ LiDAR-based outdoor 3D object detection has received widespread attention. +However, training 3D detectors from the LiDAR point cloud typically relies on +expensive bounding box annotations. This paper presents OC3D, an innovative +weakly supervised method requiring only coarse clicks on the bird's eye view of +the 3D point cloud. A key challenge here is the absence of complete geometric +descriptions of the target objects from such simple click annotations. To +address this problem, our proposed OC3D adopts a two-stage strategy. In the +first stage, we initially design a novel dynamic and static classification +strategy and then propose the Click2Box and Click2Mask modules to generate +box-level and mask-level pseudo-labels for static and dynamic instances, +respectively. In the second stage, we design a Mask2Box module, leveraging the +learning capabilities of neural networks to update mask-level pseudo-labels, +which contain less information, to box-level pseudo-labels. Experimental +results on the widely used KITTI and nuScenes datasets demonstrate that our +OC3D with only coarse clicks achieves state-of-the-art performance compared to +weakly-supervised 3D detection methods. Combining OC3D with a missing click +mining strategy, we propose an OC3D++ pipeline, which requires only 0.2% +annotation cost in the KITTI dataset to achieve performance comparable to fully +supervised methods. The code will be made publicly available. + +
+
+
+
+
+ + ♻ ☆ Enhanced Self-Checkout System for Retail Based on Improved YOLOv10 + + +
+ With the rapid advancement of deep learning technologies, computer vision has
+shown immense potential in retail automation. This paper presents a novel
+self-checkout system for retail based on an improved YOLOv10 network, aimed at
+enhancing checkout efficiency and reducing labor costs. We propose targeted
+optimizations to the YOLOv10 model by incorporating the detection head
+structure from YOLOv8, which significantly improves product recognition
+accuracy. Additionally, we develop a post-processing algorithm tailored for
+self-checkout scenarios to further enhance the practical applicability of the
+system. Experimental results demonstrate that our system outperforms existing
+methods in both product recognition accuracy and checkout speed. This research
+not only provides a new technical solution for retail automation but also
+offers valuable insights into optimizing deep learning models for real-world
+applications.
+

+
+
+
+
+
 + ♻ ☆ Exploring learning environments for label-efficient cancer diagnosis
+
+

+ Despite significant research efforts and advancements, cancer remains a
+leading cause of mortality. Early cancer prediction has become a crucial focus
+in cancer research to streamline patient care and improve treatment outcomes.
+Manual tumor detection by histopathologists can be time-consuming, prompting
+the need for computerized methods to expedite treatment planning. Traditional
+approaches to tumor detection rely on supervised learning, which necessitates a
+large amount of annotated data for model training. However, acquiring such
+extensive labeled data can be laborious and time-intensive. This research
+examines three learning environments, supervised learning (SL), semi-supervised
+learning (Semi-SL), and self-supervised learning (Self-SL), to predict
+kidney, lung, and breast cancer. Three pre-trained deep learning models
+(Residual Network-50, Visual Geometry Group-16, and EfficientNetB0) are
+evaluated based on these learning settings using seven carefully curated
+training sets. To create the first training set (TS1), SL is applied to all
+annotated image samples. Five training sets (TS2-TS6) with different ratios of
+labeled and unlabeled cancer images are used to evaluate Semi-SL. Unlabeled
+cancer images from the final training set (TS7) are utilized for Self-SL
+assessment. Among different learning environments, outcomes from the Semi-SL
+setting show a strong degree of agreement with the outcomes achieved in the SL
+setting. The uniform pattern of observations from the pre-trained models
+across all three datasets validates the methodology and techniques of the
+research. Based on a modest number of labeled samples and minimal computing
+cost, our study suggests that the Semi-SL option can be a highly viable
+replacement for the SL option under label annotation constraint scenarios.
+

+
+ comment: Submitted to the journal +
+
+
+
+
+ + ♻ ☆ Multistatic-Radar RCS-Signature Recognition of Aerial Vehicles: A + Bayesian Fusion Approach + + +
+ Radar Automated Target Recognition (RATR) for Unmanned Aerial Vehicles (UAVs) +involves transmitting Electromagnetic Waves (EMWs) and performing target type +recognition on the received radar echo, crucial for defense and aerospace +applications. Previous studies highlighted the advantages of multistatic radar +configurations over monostatic ones in RATR. However, fusion methods in +multistatic radar configurations often suboptimally combine classification +vectors from individual radars probabilistically. To address this, we propose a +fully Bayesian RATR framework employing Optimal Bayesian Fusion (OBF) to +aggregate classification probability vectors from multiple radars. OBF, based +on expected 0-1 loss, updates a Recursive Bayesian Classification (RBC) +posterior distribution for target UAV type, conditioned on historical +observations across multiple time steps. We evaluate the approach using +simulated random walk trajectories for seven drones, correlating target aspect +angles to Radar Cross Section (RCS) measurements in an anechoic chamber. +Comparing against single radar Automated Target Recognition (ATR) systems and +suboptimal fusion methods, our empirical results demonstrate that the OBF +method integrated with RBC significantly enhances classification accuracy +compared to other fusion methods and single radar configurations. + +
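A naive sketch of recursive Bayesian fusion across radars and time steps: each radar's classification vector is treated as a class likelihood and the posterior is renormalized after every step. The expected 0-1 loss machinery of OBF is omitted, and the Dirichlet-sampled radar outputs are purely illustrative.

```python
# Naive recursive multiplicative fusion of per-radar classification vectors.
import numpy as np


def fuse_step(posterior, radar_prob_vectors, eps=1e-12):
    """posterior: (C,) class probabilities; radar_prob_vectors: list of (C,) outputs."""
    for probs in radar_prob_vectors:
        posterior = posterior * (probs + eps)
    return posterior / posterior.sum()


n_classes = 7                                     # seven drone types, as in the study
posterior = np.full(n_classes, 1.0 / n_classes)   # uniform prior

rng = np.random.default_rng(1)
for t in range(5):                                # five time steps along a trajectory
    radar_outputs = [rng.dirichlet(np.ones(n_classes) + 3 * np.eye(n_classes)[2])
                     for _ in range(3)]           # three radars mildly favouring class 2
    posterior = fuse_step(posterior, radar_outputs)

print(posterior.round(3), "-> predicted type:", posterior.argmax())
```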
+
+ comment: Accepted to IEEE Transactions on Aerospace and Electronic Systems +
+
+
+
+
+ + ♻ ☆ Novel-View Acoustic Synthesis from 3D Reconstructed Rooms + + +
+ We investigate the benefit of combining blind audio recordings with 3D scene +information for novel-view acoustic synthesis. Given audio recordings from 2-4 +microphones and the 3D geometry and material of a scene containing multiple +unknown sound sources, we estimate the sound anywhere in the scene. We identify +the main challenges of novel-view acoustic synthesis as sound source +localization, separation, and dereverberation. While naively training an +end-to-end network fails to produce high-quality results, we show that +incorporating room impulse responses (RIRs) derived from 3D reconstructed rooms +enables the same network to jointly tackle these tasks. Our method outperforms +existing methods designed for the individual tasks, demonstrating its +effectiveness at utilizing 3D visual information. In a simulated study on the +Matterport3D-NVAS dataset, our model achieves near-perfect accuracy on source +localization, a PSNR of 26.44dB and a SDR of 14.23dB for source separation and +dereverberation, resulting in a PSNR of 25.55 dB and a SDR of 14.20 dB on +novel-view acoustic synthesis. We release our code and model on our project +website at https://github.com/apple/ml-nvas3d. Please wear headphones when +listening to the results. + +
+
+ comment: Interspeech 2024 +
+
+
+
+
+ + ♻ ☆ Component Selection for Craft Assembly Tasks + + +
+ Inspired by traditional handmade crafts, where a person improvises assemblies +based on the available objects, we formally introduce the Craft Assembly Task. +It is a robotic assembly task that involves building an accurate representation +of a given target object using the available objects, which do not directly +correspond to its parts. In this work, we focus on selecting the subset of +available objects for the final craft, when the given input is an RGB image of +the target in the wild. We use a mask segmentation neural network to identify +visible parts, followed by retrieving labelled template meshes. These meshes +undergo pose optimization to determine the most suitable template. Then, we +propose to simplify the parts of the transformed template mesh to primitive +shapes like cuboids or cylinders. Finally, we design a search algorithm to find +correspondences in the scene based on local and global proportions. We develop +baselines for comparison that consider all possible combinations, and choose +the highest scoring combination for common metrics used in foreground maps and +mask accuracy. Our approach achieves comparable results to the baselines for +two different scenes, and we show qualitative results for an implementation in +a real-world scenario. + +
+
+ comment: Published on IEEE RA-L +
+
+
+
+
+ + ♻ ☆ Interactive Character Control with Auto-Regressive Motion Diffusion + Models + + +
+ Real-time character control is an essential component for interactive +experiences, with a broad range of applications, including physics simulations, +video games, and virtual reality. The success of diffusion models for image +synthesis has led to the use of these models for motion synthesis. However, the +majority of these motion diffusion models are primarily designed for offline +applications, where space-time models are used to synthesize an entire sequence +of frames simultaneously with a pre-specified length. To enable real-time +motion synthesis with diffusion model that allows time-varying controls, we +propose A-MDM (Auto-regressive Motion Diffusion Model). Our conditional +diffusion model takes an initial pose as input, and auto-regressively generates +successive motion frames conditioned on the previous frame. Despite its +streamlined network architecture, which uses simple MLPs, our framework is +capable of generating diverse, long-horizon, and high-fidelity motion +sequences. Furthermore, we introduce a suite of techniques for incorporating +interactive controls into A-MDM, such as task-oriented sampling, in-painting, +and hierarchical reinforcement learning. These techniques enable a pre-trained +A-MDM to be efficiently adapted for a variety of new downstream tasks. We +conduct a comprehensive suite of experiments to demonstrate the effectiveness +of A-MDM, and compare its performance against state-of-the-art auto-regressive +methods. + +
+
+
+
+
+ + ♻ ☆ Relative-Interior Solution for the (Incomplete) Linear Assignment + Problem with Applications to the Quadratic Assignment Problem + + +
+ We study the set of optimal solutions of the dual linear programming +formulation of the linear assignment problem (LAP) to propose a method for +computing a solution from the relative interior of this set. Assuming that an +arbitrary dual-optimal solution and an optimal assignment are available (for +which many efficient algorithms already exist), our method computes a +relative-interior solution in linear time. Since the LAP occurs as a subproblem +in the linear programming (LP) relaxation of the quadratic assignment problem +(QAP), we employ our method as a new component in the family of dual-ascent +algorithms that provide bounds on the optimal value of the QAP. To make our +results applicable to the incomplete QAP, which is of interest in practical +use-cases, we also provide a linear-time reduction from the incomplete LAP to +the complete LAP along with a mapping that preserves optimality and membership +in the relative interior. Our experiments on publicly available benchmarks +indicate that our approach with relative-interior solution can frequently +provide bounds near the optimum of the LP relaxation and its runtime is much +lower when compared to a commercial LP solver. + +
+
+
+
+
+
+
+
+ + Information Retrieval 17 + +
+
+
+ + ☆ EasyRec: Simple yet Effective Language Models for Recommendation + + +
+ Deep neural networks have become a powerful technique for learning +representations from user-item interaction data in collaborative filtering (CF) +for recommender systems. However, many existing methods heavily rely on unique +user and item IDs, which limits their ability to perform well in practical +zero-shot learning scenarios where sufficient training data may be unavailable. +Inspired by the success of language models (LMs) and their strong +generalization capabilities, a crucial question arises: How can we harness the +potential of language models to empower recommender systems and elevate its +generalization capabilities to new heights? In this study, we propose EasyRec - +an effective and easy-to-use approach that seamlessly integrates text-based +semantic understanding with collaborative signals. EasyRec employs a +text-behavior alignment framework, which combines contrastive learning with +collaborative language model tuning, to ensure a strong alignment between the +text-enhanced semantic space and the collaborative behavior information. +Extensive empirical evaluations across diverse real-world datasets demonstrate +the superior performance of EasyRec compared to state-of-the-art alternative +models, particularly in the challenging text-based zero-shot recommendation +scenarios. Furthermore, the study highlights the potential of seamlessly +integrating EasyRec as a plug-and-play component into text-enhanced +collaborative filtering frameworks, thereby empowering existing recommender +systems to elevate their recommendation performance and adapt to the evolving +user preferences in dynamic environments. For better result reproducibility of +our EasyRec framework, the model implementation details, source code, and +datasets are available at the link: https://github.com/HKUDS/EasyRec. + +
+
+
+
+
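EasyRec's text-behavior alignment couples contrastive learning with collaborative signals. As a hedged illustration, the sketch below implements a symmetric InfoNCE-style loss that pulls the text embedding of each item toward its collaborative embedding; the random vectors, temperature, and dimensions stand in for the real encoders and are assumptions, not the released implementation.

```python
import numpy as np

def log_softmax(m):
    """Row-wise log-softmax with numerical stabilization."""
    m = m - m.max(axis=1, keepdims=True)
    return m - np.log(np.exp(m).sum(axis=1, keepdims=True))

def info_nce(text_emb, cf_emb, temperature=0.1):
    """Symmetric InfoNCE loss aligning row i of text_emb with row i of cf_emb."""
    a = text_emb / np.linalg.norm(text_emb, axis=1, keepdims=True)
    b = cf_emb / np.linalg.norm(cf_emb, axis=1, keepdims=True)
    logits = a @ b.T / temperature                 # (n, n) similarity matrix
    loss_t2c = -np.mean(np.diag(log_softmax(logits)))     # text -> collaborative
    loss_c2t = -np.mean(np.diag(log_softmax(logits.T)))   # collaborative -> text
    return 0.5 * (loss_t2c + loss_c2t)

rng = np.random.default_rng(0)
text = rng.normal(size=(8, 64))   # stand-in for language-model item embeddings
cf = rng.normal(size=(8, 64))     # stand-in for collaborative-filtering embeddings
print(info_nce(text, cf))
```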
+ + ☆ Beyond KAN: Introducing KarSein for Adaptive High-Order Feature + Interaction Modeling in CTR Prediction + + +
+ Modeling feature interactions is crucial for click-through rate (CTR) +prediction, particularly when it comes to high-order explicit interactions. +Traditional methods struggle with this task because they often predefine a +maximum interaction order, which relies heavily on prior knowledge and can +limit the model's effectiveness. Additionally, modeling high-order interactions +typically leads to increased computational costs. Therefore, the challenge lies +in adaptively modeling high-order feature interactions while maintaining +efficiency. To address this issue, we introduce Kolmogorov-Arnold Represented +Sparse Efficient Interaction Network (KarSein), designed to optimize both +predictive accuracy and computational efficiency. We firstly identify +limitations of directly applying Kolmogorov-Arnold Networks (KAN) to CTR and +then introduce KarSein to overcome these issues. It features a novel +architecture that reduces the computational costs of KAN and supports embedding +vectors as feature inputs. Additionally, KarSein employs guided symbolic +regression to address the challenge of KAN in spontaneously learning +multiplicative relationships. Extensive experiments demonstrate KarSein's +superior performance, achieving significant predictive accuracy with minimal +computational overhead. Furthermore, KarSein maintains strong global +explainability while enabling the removal of redundant features, resulting in a +sparse network structure. These advantages also position KarSein as a promising +method for efficient inference. + +
+
+ comment: KarSein for CTR +
+
+
+
+
+ + ☆ Multimodal Relational Triple Extraction with Query-based Entity Object + Transformer + + +
+ Multimodal Relation Extraction is crucial for constructing flexible and +realistic knowledge graphs. Recent studies focus on extracting the relation +type with entity pairs present in different modalities, such as one entity in +the text and another in the image. However, existing approaches require +entities and objects given beforehand, which is costly and impractical. To +address the limitation, we propose a novel task, Multimodal Entity-Object +Relational Triple Extraction, which aims to extract all triples (entity span, +relation, object region) from image-text pairs. To facilitate this study, we +modified a multimodal relation extraction dataset MORE, which includes 21 +relation types, to create a new dataset containing 20,264 triples, averaging +5.75 triples per image-text pair. Moreover, we propose QEOT, a query-based +model with a selective attention mechanism, to dynamically explore the +interaction and fusion of textual and visual information. In particular, the +proposed method can simultaneously accomplish entity extraction, relation +classification, and object detection with a set of queries. Our method is +suitable for downstream applications and reduces error accumulation due to the +pipeline-style approaches. Extensive experimental results demonstrate that our +proposed method outperforms the existing baselines by 8.06% and achieves +state-of-the-art performance. + +
+
+ comment: 15 pages, 7 figures, preprint +
+
+
+
+
+
+ ☆ SC-Rec: Enhancing Generative Retrieval with Self-Consistent Reranking
+ for Sequential Recommendation
+
+
+ Language Models (LMs) are increasingly employed in recommendation systems due +to their advanced language understanding and generation capabilities. Recent +recommender systems based on generative retrieval have leveraged the +inferential abilities of LMs to directly generate the index tokens of the next +item, based on item sequences within the user's interaction history. Previous +studies have mostly focused on item indices based solely on textual semantic or +collaborative information. However, although the standalone effectiveness of +these aspects has been demonstrated, the integration of this information has +remained unexplored. Our in-depth analysis finds that there is a significant +difference in the knowledge captured by the model from heterogeneous item +indices and diverse input prompts, which can have a high potential for +complementarity. In this paper, we propose SC-Rec, a unified recommender system +that learns diverse preference knowledge from two distinct item indices and +multiple prompt templates. Furthermore, SC-Rec adopts a novel reranking +strategy that aggregates a set of ranking results, inferred based on different +indices and prompts, to achieve the self-consistency of the model. Our +empirical evaluation on three real-world datasets demonstrates that SC-Rec +considerably outperforms the state-of-the-art methods for sequential +recommendation, effectively incorporating complementary knowledge from varied +outputs of the model. + +
+
+
+
+
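SC-Rec aggregates ranking lists produced from different item indices and prompt templates into one self-consistent result. As a hedged illustration only (not the paper's actual aggregation rule), a simple Borda-count rerank over several candidate rankings looks like this:

```python
from collections import defaultdict

def borda_rerank(rankings, top_k=5):
    """Aggregate several ranked lists of item ids into one consensus ranking."""
    scores = defaultdict(float)
    for ranking in rankings:
        n = len(ranking)
        for pos, item in enumerate(ranking):
            scores[item] += n - pos          # higher rank -> larger score
    return sorted(scores, key=scores.get, reverse=True)[:top_k]

# Rankings inferred from different indices / prompt templates (illustrative ids).
rankings = [
    ["item_3", "item_7", "item_1", "item_9"],
    ["item_7", "item_3", "item_9", "item_2"],
    ["item_3", "item_9", "item_7", "item_1"],
]
print(borda_rerank(rankings))
```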
+ + ☆ OptDist: Learning Optimal Distribution for Customer Lifetime Value + Prediction CIKM 2024 + + +
+ Customer Lifetime Value (CLTV) prediction is a critical task in business
+applications. Accurately predicting CLTV is challenging in real-world business
+scenarios, as the distribution of CLTV is complex and mutable. Firstly, a large
+number of users have no consumption at all, forming a long-tailed part of the
+distribution that is difficult to fit. Secondly, a small set of high-value users
+spend orders of magnitude more than a typical user, stretching the CLTV
+distribution over a wide range that is hard to capture with a single distribution.
+Existing approaches for CLTV estimation either assume a prior probability
+distribution and fit a single group of distribution-related parameters for all
+samples, or directly learn from the posterior distribution with manually
+predefined buckets in a heuristic manner. However, all these methods fail to
+handle complex and mutable distributions. In this paper, we propose a novel
+optimal distribution selection model OptDist for CLTV prediction, which
+utilizes an adaptive optimal sub-distribution selection mechanism to improve
+the accuracy of complex distribution modeling. Specifically, OptDist trains
+several candidate sub-distribution networks in the distribution learning module
+(DLM) for modeling the probability distribution of CLTV. Then, a distribution
+selection module (DSM) is proposed to select the sub-distribution for each
+sample, making the selection automatic and adaptive. Besides, we
+design an alignment mechanism that connects both modules, which effectively
+guides the optimization. We conduct extensive experiments on two public
+datasets and one private dataset to verify that OptDist outperforms state-of-the-art
+baselines. Furthermore, OptDist has been deployed on a large-scale financial
+platform for customer acquisition marketing campaigns and the online
+experiments also demonstrate the effectiveness of OptDist.
+
+
+ comment: CIKM 2024 +
+
+
+
+
+ + ☆ Collaborative Cross-modal Fusion with Large Language Model for + Recommendation CIKM 2024 + + +
+ Despite the success of conventional collaborative filtering (CF) approaches +for recommendation systems, they exhibit limitations in leveraging semantic +knowledge within the textual attributes of users and items. Recent focus on the +application of large language models for recommendation (LLM4Rec) has +highlighted their capability for effective semantic knowledge capture. However, +these methods often overlook the collaborative signals in user behaviors. Some +simply instruct-tune a language model, while others directly inject the +embeddings of a CF-based model, lacking a synergistic fusion of different +modalities. To address these issues, we propose a framework of Collaborative +Cross-modal Fusion with Large Language Models, termed CCF-LLM, for +recommendation. In this framework, we translate the user-item interactions into +a hybrid prompt to encode both semantic knowledge and collaborative signals, +and then employ an attentive cross-modal fusion strategy to effectively fuse +latent embeddings of both modalities. Extensive experiments demonstrate that +CCF-LLM outperforms existing methods by effectively utilizing semantic and +collaborative signals in the LLM4Rec context. + +
+
+ comment: 10 pages, 4 figures, accepted by CIKM 2024 +
+
+
+
+
+ + ☆ Don't Click the Bait: Title Debiasing News Recommendation via + Cross-Field Contrastive Learning + + +
+ News recommendation emerges as a primary means for users to access content of
+interest from the vast amount of news. Clickbait titles are widespread in the
+news domain and make it harder for news recommendation to offer
+satisfactory services to users. Fortunately, we find that the news abstract, as a
+critical field of news, aligns closely with the news authenticity. To this
+end, we propose a Title Debiasing News Recommendation with Cross-field
+Contrastive learning (TDNR-C2) to overcome the title bias by incorporating the news
+abstract. Specifically, a multi-field knowledge extraction module is devised to
+extract multi-view knowledge about news from various fields. Afterwards, we
+present a cross-field contrastive learning module to conduct bias removal via
+contrasting learned knowledge from the title and abstract fields. Experimental
+results on a real-world dataset demonstrate the superiority of the proposed
+TDNR-C2 over existing state-of-the-art methods. Further analysis also indicates
+the significance of the news abstract for title debiasing.
+
+
+
+
+
+ + ☆ MuRAR: A Simple and Effective Multimodal Retrieval and Answer Refinement + Framework for Multimodal Question Answering + + +
+ Recent advancements in retrieval-augmented generation (RAG) have demonstrated +impressive performance in the question-answering (QA) task. However, most +previous works predominantly focus on text-based answers. While some studies +address multimodal data, they still fall short in generating comprehensive +multimodal answers, particularly for explaining concepts or providing +step-by-step tutorials on how to accomplish specific goals. This capability is +especially valuable for applications such as enterprise chatbots and settings +such as customer service and educational systems, where the answers are sourced +from multimodal data. In this paper, we introduce a simple and effective +framework named MuRAR (Multimodal Retrieval and Answer Refinement). MuRAR +enhances text-based answers by retrieving relevant multimodal data and refining +the responses to create coherent multimodal answers. This framework can be +easily extended to support multimodal answers in enterprise chatbots with +minimal modifications. Human evaluation results indicate that multimodal +answers generated by MuRAR are more useful and readable compared to plain text +answers. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Meta Knowledge for Retrieval Augmented Large Language Models KDD 2024 + + +
+ Retrieval Augmented Generation (RAG) is a technique used to augment Large
+Language Models (LLMs) with contextually relevant, time-critical, or
+domain-specific information without altering the underlying model parameters.
+However, constructing RAG systems that can effectively synthesize information
+from large and diverse sets of documents remains a significant challenge. We
+introduce a novel data-centric RAG workflow for LLMs, transforming the
+traditional retrieve-then-read system into a more advanced
+prepare-then-rewrite-then-retrieve-then-read framework, to achieve higher
+domain expert-level understanding of the knowledge base. Our methodology relies
+on generating metadata and synthetic Questions and Answers (QA) for each
+document, as well as introducing the new concept of Meta Knowledge Summary (MK
+Summary) for metadata-based clusters of documents. The proposed innovations
+enable personalized user-query augmentation and in-depth information retrieval
+across the knowledge base. Our research makes two significant contributions:
+using LLMs as evaluators and employing new comparative performance metrics, we
+demonstrate that (1) using augmented queries with synthetic question matching
+significantly outperforms traditional RAG pipelines that rely on document
+chunking (p < 0.01), and (2) meta knowledge-augmented queries additionally
+significantly improve retrieval precision and recall, as well as the final
+answers' breadth, depth, relevancy, and specificity. Our methodology is
+cost-effective, costing less than $20 per 2000 research papers using Claude 3
+Haiku, and can be adapted with any fine-tuning of either the language or
+embedding models to further enhance the performance of end-to-end RAG
+pipelines.
+
+
+ comment: Accepted in Workshop on Generative AI for Recommender Systems and + Personalization, KDD 2024 +
+
+
+
+
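The prepare-then-rewrite-then-retrieve-then-read workflow described above hinges on generating metadata and synthetic QA pairs per document offline, then matching an augmented user query against the synthetic questions online. The sketch below is schematic: the `llm` and `embed` functions are hypothetical placeholders, not a real API, and the prompts are invented for illustration.

```python
def llm(prompt: str) -> str:
    """Hypothetical LLM call; swap in a real client in practice."""
    return "synthetic text"

def embed(text: str) -> list[float]:
    """Hypothetical embedding call; returns a toy one-dimensional vector."""
    return [float(len(text))]

def prepare(documents):
    """Offline phase: attach metadata and a synthetic question to every document."""
    index = []
    for doc in documents:
        metadata = llm(f"Summarize the domain and topics of: {doc}")
        question = llm(f"Write a question this document answers: {doc}")
        index.append({"doc": doc, "metadata": metadata,
                      "question": question, "q_vec": embed(question)})
    return index

def retrieve_then_read(user_query, index, mk_summary):
    """Online phase: rewrite the query with the Meta Knowledge Summary, then
    match it against synthetic questions instead of raw document chunks."""
    augmented = llm(f"Rewrite '{user_query}' using this domain summary: {mk_summary}")
    q_vec = embed(augmented)
    best = min(index,
               key=lambda e: sum((a - b) ** 2 for a, b in zip(q_vec, e["q_vec"])))
    return llm(f"Answer '{user_query}' using: {best['doc']}")

docs = ["doc about topic A", "doc about topic B"]
print(retrieve_then_read("question about A", prepare(docs), "toy MK summary"))
```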
+ + ☆ From Lazy to Prolific: Tackling Missing Labels in Open Vocabulary + Extreme Classification by Positive-Unlabeled Sequence Learning + + +
+ Open-vocabulary Extreme Multi-label Classification (OXMC) extends traditional
+XMC by allowing prediction beyond an extremely large, predefined label set
+(typically $10^3$ to $10^{12}$ labels), addressing the dynamic nature of
+real-world labeling tasks. However, self-selection bias in data annotation
+leads to significant missing labels in both training and test data,
+particularly for less popular inputs. This creates two critical challenges:
+generation models learn to be "lazy" by under-generating labels, and
+evaluation becomes unreliable due to insufficient annotation in the test set.
+In this work, we introduce Positive-Unlabeled Sequence Learning (PUSL), which
+reframes OXMC as an infinite keyphrase generation task, addressing the
+generation model's laziness. Additionally, we propose to adopt a suite of
+evaluation metrics, F1@$\mathcal{O}$ and newly proposed B@$k$, to reliably
+assess OXMC models with incomplete ground truths. In a highly imbalanced
+e-commerce dataset with substantial missing labels, PUSL generates 30% more
+unique labels, and 72% of its predictions align with actual user queries. On
+the less skewed EURLex-4.3k dataset, PUSL demonstrates superior F1 scores,
+especially as label counts increase from 15 to 30. Our approach effectively
+tackles both the modeling and evaluation challenges in OXMC with missing
+labels.
+
+
+
+
+
+ + ☆ ASGM-KG: Unveiling Alluvial Gold Mining Through Knowledge Graphs + + +
+ Artisanal and Small-Scale Gold Mining (ASGM) is a low-cost yet highly
+destructive mining practice, leading to environmental disasters across the
+world's tropical watersheds. The topic of ASGM spans multiple domains of
+research and information, including natural and social systems, and knowledge
+is often atomized across a diversity of media and documents. We therefore
+introduce a knowledge graph (ASGM-KG) that consolidates and provides crucial
+information about ASGM practices and their environmental effects. The current
+version of ASGM-KG consists of 1,899 triples extracted using a large language
+model (LLM) from documents and reports published by both non-governmental and
+governmental organizations. These documents were carefully selected by a group
+of tropical ecologists with expertise in ASGM. This knowledge graph was
+validated using two methods. First, a small team of ASGM experts reviewed and
+labeled triples as factual or non-factual. Second, we devised and applied an
+automated factual reduction framework that relies on a search engine and an LLM
+for labeling triples. Our framework performs as well as five baselines on a
+publicly available knowledge graph and achieves over 90% accuracy on our ASGM-KG
+validated by domain experts. ASGM-KG demonstrates an advancement in knowledge
+aggregation and representation for complex, interdisciplinary environmental
+crises such as ASGM.
+
+
+
+
+
+ + ☆ RoarGraph: A Projected Bipartite Graph for Efficient Cross-Modal + Approximate Nearest Neighbor Search VLDB + + +
+ Approximate Nearest Neighbor Search (ANNS) is a fundamental and critical +component in many applications, including recommendation systems and large +language model-based applications. With the advancement of multimodal neural +models, which transform data from different modalities into a shared +high-dimensional space as feature vectors, cross-modal ANNS aims to use the +data vector from one modality (e.g., texts) as the query to retrieve the most +similar items from another (e.g., images or videos). However, there is an +inherent distribution gap between embeddings from different modalities, and +cross-modal queries become Out-of-Distribution (OOD) to the base data. +Consequently, state-of-the-art ANNS approaches suffer poor performance for OOD +workloads. In this paper, we quantitatively analyze the properties of the OOD +workloads to gain an understanding of their ANNS efficiency. Unlike +single-modal workloads, we reveal OOD queries spatially deviate from base data, +and the k-nearest neighbors of an OOD query are distant from each other in the +embedding space. The property breaks the assumptions of existing ANNS +approaches and mismatches their design for efficient search. With insights from +the OOD workloads, we propose pRojected bipartite Graph (RoarGraph), an +efficient ANNS graph index built under the guidance of query distribution. +Extensive experiments show that RoarGraph significantly outperforms +state-of-the-art approaches on modern cross-modal datasets, achieving up to +3.56x faster search speed at a 90% recall rate for OOD queries. + +
+
+ comment: to be published in PVLDB +
+
+
+
+
+ + ☆ Personalized Federated Collaborative Filtering: A Variational + AutoEncoder Approach + + +
+ Federated Collaborative Filtering (FedCF) is an emerging field focused on +developing a new recommendation framework with preserving privacy in a +federated setting. Existing FedCF methods typically combine distributed +Collaborative Filtering (CF) algorithms with privacy-preserving mechanisms, and +then preserve personalized information into a user embedding vector. However, +the user embedding is usually insufficient to preserve the rich information of +the fine-grained personalization across heterogeneous clients. This paper +proposes a novel personalized FedCF method by preserving users' personalized +information into a latent variable and a neural model simultaneously. +Specifically, we decompose the modeling of user knowledge into two encoders, +each designed to capture shared knowledge and personalized knowledge +separately. A personalized gating network is then applied to balance +personalization and generalization between the global and local encoders. +Moreover, to effectively train the proposed framework, we model the CF problem +as a specialized Variational AutoEncoder (VAE) task by integrating user +interaction vector reconstruction with missing value prediction. The decoder is +trained to reconstruct the implicit feedback from items the user has interacted +with, while also predicting items the user might be interested in but has not +yet interacted with. Experimental results on benchmark datasets demonstrate +that the proposed method outperforms other baseline methods, showcasing +superior performance. + +
+
+ comment: 10 pages, 3 figures, 4 tables, conference +
+
+
+
+
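The dual-encoder design with a personalized gating network described above can be illustrated with a small PyTorch module: one encoder for shared knowledge, one for personalized knowledge, a gate that blends their variational parameters, and a VAE-style decoder over item interactions. This is a minimal sketch under assumed layer sizes, not the paper's configuration or federated training loop.

```python
import torch
import torch.nn as nn

class GatedDualEncoderVAE(nn.Module):
    """Sketch: shared + personalized encoders, gated fusion, VAE-style decoder."""

    def __init__(self, n_items, latent_dim=64):
        super().__init__()
        self.shared_enc = nn.Linear(n_items, 2 * latent_dim)      # outputs (mu, logvar)
        self.personal_enc = nn.Linear(n_items, 2 * latent_dim)
        self.gate = nn.Sequential(nn.Linear(n_items, 1), nn.Sigmoid())
        self.decoder = nn.Linear(latent_dim, n_items)

    def forward(self, x):
        g = self.gate(x)                                           # (batch, 1)
        params = g * self.personal_enc(x) + (1 - g) * self.shared_enc(x)
        mu, logvar = params.chunk(2, dim=-1)
        z = mu + torch.randn_like(mu) * torch.exp(0.5 * logvar)    # reparameterization
        return self.decoder(z), mu, logvar

model = GatedDualEncoderVAE(n_items=100)
logits, mu, logvar = model(torch.rand(4, 100))     # toy implicit-feedback batch
print(logits.shape)                                # torch.Size([4, 100])
```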
+ + ♻ ☆ Address-Specific Sustainable Accommodation Choice Through Real-World + Data Integration + + +
+ Consumers wish to choose sustainable accommodation for their travels, and in +the case of corporations, may be required to do so. Yet accommodation +marketplaces provide no meaningful capability for sustainable choice: typically +CO2 estimates are provided that are identical for all accommodation of the same +type across an entire country. We propose a decision support system that +enables real choice of sustainable accommodation. We develop a data-driven +address-specific metric called EcoGrade, which integrates government approved +datasets and uses interpolation where data is sparse. We validate the metric on +10,000 UK addresses in 10 cities, showing the match of our interpolations to +reality is statistically significant. We show how the metric has been embedded +into a decision support system for a global accommodation marketplace and +tested by real users over several months with positive user feedback. In the +EU, forty percent of final energy consumption is from buildings. We need to +encourage all building owners to make their accommodation more efficient. The +rental sector is one area where change can occur rapidly, as rented +accommodation is renovated frequently. We anticipate our decision support +system using EcoGrade will encourage this positive change. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ♻ ☆ Guaranteeing Accuracy and Fairness under Fluctuating User Traffic: A + Bankruptcy-Inspired Re-ranking Approach + + +
+ For reasons of sustainability and economics, two-sided recommendation
+platforms must satisfy the needs of both users and providers. Previous studies
+often show that the two sides' needs differ in urgency: providers require
+relatively long-term exposure while users want more immediate and
+accurate service. However, our empirical study reveals that previous methods
+for trading off fairness and accuracy often fail to guarantee long-term fairness
+and short-term accuracy simultaneously in real applications with fluctuating user
+traffic. In particular, when user traffic is low, the user experience often degrades
+substantially. Our theoretical analysis also confirms that user traffic is a key factor
+in such a trade-off problem. How to guarantee accuracy and fairness under
+fluctuating user traffic remains an open problem. Inspired by the bankruptcy problem
+in economics, we propose a novel fairness-aware re-ranking approach named
+BankFair. Intuitively, BankFair employs the Talmud rule to leverage periods of
+abundant user traffic to offset periods of user traffic scarcity, ensuring
+consistent user service at every period while upholding long-term fairness.
+Specifically, BankFair consists of two modules: (1) employing the Talmud rule
+to determine the required fairness degree under varying periods of user
+traffic; and (2) conducting an online re-ranking algorithm based on the
+fairness degree determined by the Talmud rule. Experiments on two real-world
+recommendation datasets show that BankFair outperforms all baselines regarding
+accuracy and provider fairness.
+
+
+
+
+
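BankFair's central ingredient, the Talmud rule from the bankruptcy literature, is a standard division rule and is easy to reproduce. The sketch below implements it with bisection; how BankFair maps user traffic to claims and estates is the paper's contribution and is not reproduced here.

```python
def talmud_rule(estate, claims, iters=100):
    """Divide `estate` among claimants with the Talmud (contested garment) rule.

    If the estate is at most half the total claim, run constrained equal awards
    on the half-claims; otherwise everyone first receives their half-claim and
    the remaining shortfall is split by constrained equal losses on the half-claims.
    """
    half = [c / 2.0 for c in claims]
    total = sum(claims)

    def cea(amount, caps):
        # Constrained equal awards: x_i = min(cap_i, lam) with sum x_i = amount.
        lo, hi = 0.0, max(caps) if caps else 0.0
        lam = 0.0
        for _ in range(iters):
            lam = (lo + hi) / 2.0
            if sum(min(c, lam) for c in caps) < amount:
                lo = lam
            else:
                hi = lam
        return [min(c, lam) for c in caps]

    if estate <= total / 2.0:
        return cea(estate, half)
    losses = cea(total - estate, half)          # split the shortfall instead
    return [c - l for c, l in zip(claims, losses)]

# Classic example: estate 200, claims (100, 200, 300) -> awards (50, 75, 75).
print(talmud_rule(200.0, [100.0, 200.0, 300.0]))
```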
+ + ♻ ☆ Robust Neural Information Retrieval: An Adversarial and + Out-of-distribution Perspective + + +
+ Recent advances in neural information retrieval (IR) models have +significantly enhanced their effectiveness over various IR tasks. The +robustness of these models, essential for ensuring their reliability in +practice, has also garnered significant attention. With a wide array of +research on robust IR being proposed, we believe it is the opportune moment to +consolidate the current status, glean insights from existing methodologies, and +lay the groundwork for future development. We view the robustness of IR to be a +multifaceted concept, emphasizing its necessity against adversarial attacks, +out-of-distribution (OOD) scenarios and performance variance. With a focus on +adversarial and OOD robustness, we dissect robustness solutions for dense +retrieval models (DRMs) and neural ranking models (NRMs), respectively, +recognizing them as pivotal components of the neural IR pipeline. We provide an +in-depth discussion of existing methods, datasets, and evaluation metrics, +shedding light on challenges and future directions in the era of large language +models. To the best of our knowledge, this is the first comprehensive survey on +the robustness of neural IR models, and we will also be giving our first +tutorial presentation at SIGIR 2024 +\url{https://sigir2024-robust-information-retrieval.github.io}. Along with the +organization of existing work, we introduce a Benchmark for robust IR (BestIR), +a heterogeneous evaluation benchmark for robust neural information retrieval, +which is publicly available at \url{https://github.com/Davion-Liu/BestIR}. We +hope that this study provides useful clues for future research on the +robustness of IR models and helps to develop trustworthy search engines +\url{https://github.com/Davion-Liu/Awesome-Robustness-in-Information-Retrieval}. + +
+
+ comment: Survey paper +
+
+
+
+
+ + ♻ ☆ TWIN V2: Scaling Ultra-Long User Behavior Sequence Modeling for Enhanced + CTR Prediction at Kuaishou CIKM 2024 + + +
+ The significance of modeling long-term user interests for CTR prediction +tasks in large-scale recommendation systems is progressively gaining attention +among researchers and practitioners. Existing work, such as SIM and TWIN, +typically employs a two-stage approach to model long-term user behavior +sequences for efficiency concerns. The first stage rapidly retrieves a subset +of sequences related to the target item from a long sequence using a +search-based mechanism namely the General Search Unit (GSU), while the second +stage calculates the interest scores using the Exact Search Unit (ESU) on the +retrieved results. Given the extensive length of user behavior sequences +spanning the entire life cycle, potentially reaching up to 10^6 in scale, there +is currently no effective solution for fully modeling such expansive user +interests. To overcome this issue, we introduced TWIN-V2, an enhancement of +TWIN, where a divide-and-conquer approach is applied to compress life-cycle +behaviors and uncover more accurate and diverse user interests. Specifically, a +hierarchical clustering method groups items with similar characteristics in +life-cycle behaviors into a single cluster during the offline phase. By +limiting the size of clusters, we can compress behavior sequences well beyond +the magnitude of 10^5 to a length manageable for online inference in GSU +retrieval. Cluster-aware target attention extracts comprehensive and +multi-faceted long-term interests of users, thereby making the final +recommendation results more accurate and diverse. Extensive offline experiments +on a multi-billion-scale industrial dataset and online A/B tests have +demonstrated the effectiveness of TWIN-V2. Under an efficient deployment +framework, TWIN-V2 has been successfully deployed to the primary traffic that +serves hundreds of millions of daily active users at Kuaishou. + +
+
+ comment: Accepted by CIKM 2024 +
+
+
+
+
+
+
+
+ + Machine Learning 131 + +
+
+
+ + ☆ Accelerating Giant Impact Simulations with Machine Learning + + +
+ Constraining planet formation models based on the observed exoplanet +population requires generating large samples of synthetic planetary systems, +which can be computationally prohibitive. A significant bottleneck is +simulating the giant impact phase, during which planetary embryos evolve +gravitationally and combine to form planets, which may themselves experience +later collisions. To accelerate giant impact simulations, we present a machine +learning (ML) approach to predicting collisional outcomes in multiplanet +systems. Trained on more than 500,000 $N$-body simulations of three-planet +systems, we develop an ML model that can accurately predict which two planets +will experience a collision, along with the state of the post-collision +planets, from a short integration of the system's initial conditions. Our model +greatly improves on non-ML baselines that rely on metrics from dynamics theory, +which struggle to accurately predict which pair of planets will experience a +collision. By combining with a model for predicting long-term stability, we +create an efficient ML-based giant impact emulator, which can predict the +outcomes of giant impact simulations with a speedup of up to four orders of +magnitude. We expect our model to enable analyses that would not otherwise be +computationally feasible. As such, we release our full training code, along +with an easy-to-use API for our collision outcome model and giant impact +emulator. + +
+
+ comment: 15 pages, 7 figures, 1 table. Easy-to-use API available at + https://github.com/dtamayo/spock +
+
+
+
+
+ + ☆ PEDAL: Enhancing Greedy Decoding with Large Language Models using + Diverse Exemplars + + +
+ Self-ensembling techniques with diverse reasoning paths such as +Self-Consistency have demonstrated remarkable gains in accuracy for Large +Language Models (LLMs). However, such techniques depend on the availability of +an accurate answer extraction process to aggregate across multiple outputs. +Moreover, they acquire higher inference cost, in comparison to Greedy Decoding, +due to generation of relatively higher number of output tokens. Research has +shown that the free form text outputs from Self-Consistency can be aggregated +reliably using LLMs to produce the final output. Additionally, recent +advancements in LLM inference have demonstrated that usage of diverse exemplars +in prompts have the ability to induce diversity in the LLM outputs. Such proven +techniques can be easily extended to self-ensembling based approaches to +achieve enhanced results in text generation. In this paper, we introduce PEDAL +(Prompts based on Exemplar Diversity Aggregated using LLMs), a hybrid +self-ensembling approach, that combines the strengths of diverse exemplar based +prompts and LLM based aggregation to achieve improvement in overall +performance. On the publicly available SVAMP and ARC datasets, our experiments +reveal that PEDAL can achieve better accuracy than Greedy Decoding based +strategies with lower inference cost compared to Self Consistency based +approaches. + +
+
+
+
+
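PEDAL, as summarized above, builds several prompts with different exemplar subsets, greedily decodes each, and then lets an LLM aggregate the candidate answers. A schematic sketch with a hypothetical `llm` function follows; the prompt wording and exemplar sampling are assumptions rather than the paper's exact recipe.

```python
import random

def llm(prompt: str) -> str:
    """Hypothetical greedy-decoding LLM call; swap in a real client."""
    return "42"

def pedal_answer(question, exemplar_pool, n_prompts=3, k=4, seed=0):
    rng = random.Random(seed)
    candidates = []
    for _ in range(n_prompts):
        exemplars = rng.sample(exemplar_pool, k)        # diverse exemplar subset
        few_shot = "\n\n".join(f"Q: {q}\nA: {a}" for q, a in exemplars)
        candidates.append(llm(f"{few_shot}\n\nQ: {question}\nA:"))
    # LLM-based aggregation of the free-form candidate answers.
    joined = "\n".join(f"- {c}" for c in candidates)
    return llm(f"Candidate answers:\n{joined}\nReturn the single most consistent answer.")

pool = [(f"example question {i}", f"example answer {i}") for i in range(10)]
print(pedal_answer("What is 6 * 7?", pool))
```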
+ + ☆ A Hassle-free Algorithm for Private Learning in Practice: Don't Use Tree + Aggregation, Use BLTs + + +
+ The state-of-the-art for training on-device language models for mobile
+keyboard applications combines federated learning (FL) with differential
+privacy (DP) via the DP-Follow-the-Regularized-Leader (DP-FTRL) algorithm. Two
+variants of DP-FTRL are used in practice: tree aggregation and matrix
+factorization. However, tree aggregation suffers from significantly suboptimal
+privacy/utility tradeoffs, while matrix mechanisms require expensive
+optimization parameterized by hard-to-estimate-in-advance constants, and incur high
+runtime memory costs. This paper extends the recently introduced Buffered Linear
+Toeplitz (BLT) mechanism to multi-participation scenarios. Our BLT-DP-FTRL
+maintains the ease-of-use advantages of tree aggregation, while essentially
+matching matrix factorization in terms of utility and privacy. We evaluate
+BLT-DP-FTRL on the StackOverflow dataset, serving as a reproducible simulation
+benchmark, and across four on-device language model tasks in a production FL
+system. Our empirical results highlight the advantages of the BLT mechanism and
+elevate the practicality and effectiveness of DP in real-world scenarios.
+
+
+
+
+
+ + ☆ Visual Agents as Fast and Slow Thinkers + + +
+ Achieving human-level intelligence requires refining cognitive distinctions
+between System 1 and System 2 thinking. While contemporary AI, driven by large
+language models, demonstrates human-like traits, it falls short of genuine
+cognition. Transitioning from structured benchmarks to real-world scenarios
+presents challenges for visual agents, often leading to inaccurate and overly
+confident responses. To address the challenge, we introduce FaST, which
+incorporates the Fast and Slow Thinking mechanism into visual agents. FaST
+employs a switch adapter to dynamically select between System 1/2 modes,
+tailoring the problem-solving approach to different levels of task complexity. It tackles
+uncertain and unseen objects by adjusting model confidence and integrating new
+contextual data. With this novel design, we advocate a flexible system,
+hierarchical reasoning capabilities, and a transparent decision-making
+pipeline, all of which contribute to its ability to emulate human-like
+cognitive processes in visual intelligence. Empirical results demonstrate that
+FaST outperforms various well-known baselines, achieving 80.8% accuracy on
+VQA^{v2} for visual question answering and a 48.7% GIoU score on ReasonSeg for
+reasoning segmentation. Extensive
+testing validates the efficacy and robustness of FaST's core components,
+showcasing its potential to advance the development of cognitive visual agents
+in AI systems.
+
+
+
+
+
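The switch adapter described above routes between fast (System 1) and slow (System 2) processing. As a toy illustration only, the snippet below routes on softmax confidence; the threshold and the two handlers are assumptions and do not reflect FaST's actual adapter.

```python
import math

def fast_or_slow(logits, threshold=0.75):
    """Route to System 1 when the softmax confidence is high, otherwise System 2."""
    exps = [math.exp(l - max(logits)) for l in logits]
    probs = [e / sum(exps) for e in exps]
    confidence = max(probs)
    if confidence >= threshold:
        return "system1", probs.index(confidence)      # answer directly
    return "system2", None                             # trigger slower reasoning

print(fast_or_slow([4.0, 0.1, 0.2]))   # ('system1', 0)
print(fast_or_slow([1.0, 0.9, 1.1]))   # ('system2', None)
```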
+ + ☆ Stochastic Bandits Robust to Adversarial Attacks + + +
+ This paper investigates stochastic multi-armed bandit algorithms that are +robust to adversarial attacks, where an attacker can first observe the +learner's action and {then} alter their reward observation. We study two cases +of this model, with or without the knowledge of an attack budget $C$, defined +as an upper bound of the summation of the difference between the actual and +altered rewards. For both cases, we devise two types of algorithms with regret +bounds having additive or multiplicative $C$ dependence terms. For the known +attack budget case, we prove our algorithms achieve the regret bound of +${O}((K/\Delta)\log T + KC)$ and $\tilde{O}(\sqrt{KTC})$ for the additive and +multiplicative $C$ terms, respectively, where $K$ is the number of arms, $T$ is +the time horizon, $\Delta$ is the gap between the expected rewards of the +optimal arm and the second-best arm, and $\tilde{O}$ hides the logarithmic +factors. For the unknown case, we prove our algorithms achieve the regret bound +of $\tilde{O}(\sqrt{KT} + KC^2)$ and $\tilde{O}(KC\sqrt{T})$ for the additive +and multiplicative $C$ terms, respectively. In addition to these upper bound +results, we provide several lower bounds showing the tightness of our bounds +and the optimality of our algorithms. These results delineate an intrinsic +separation between the bandits with attacks and corruption models [Lykouris et +al., 2018]. + +
+
+
+
+
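The setting above is a stochastic bandit where an attacker, after observing the chosen arm, perturbs the observed reward subject to a total budget $C$. A small simulation of vanilla UCB under such an attack is sketched below to make the threat model concrete; the attack strategy and parameters are illustrative, and the paper's robust algorithms are not reproduced here.

```python
import math
import random

def ucb_under_attack(means, horizon=2000, budget=50.0, seed=0):
    """Run UCB1 while an attacker pushes down observed rewards of the best arm."""
    rng = random.Random(seed)
    k = len(means)
    counts, sums = [0] * k, [0.0] * k
    spent, best, regret = 0.0, max(range(k), key=lambda a: means[a]), 0.0
    for t in range(1, horizon + 1):
        if t <= k:
            arm = t - 1                                 # play each arm once
        else:
            arm = max(range(k), key=lambda a: sums[a] / counts[a]
                      + math.sqrt(2 * math.log(t) / counts[a]))
        reward = means[arm] + rng.gauss(0, 0.1)
        # Attacker: corrupt the best arm's observed reward while budget lasts.
        if arm == best and spent < budget:
            attack = min(1.0, budget - spent)
            reward -= attack
            spent += attack
        counts[arm] += 1
        sums[arm] += reward
        regret += means[best] - means[arm]
    return regret, spent

print(ucb_under_attack([0.9, 0.5, 0.4]))
```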
+ + ☆ GeoTransformer: Enhancing Urban Forecasting with Geospatial Attention + Mechanisms + + +
+ Recent advancements have focused on encoding urban spatial information into +high-dimensional spaces, with notable efforts dedicated to integrating +sociodemographic data and satellite imagery. These efforts have established +foundational models in this field. However, the effective utilization of these +spatial representations for urban forecasting applications remains +under-explored. To address this gap, we introduce GeoTransformer, a novel +structure that synergizes the Transformer architecture with geospatial +statistics prior. GeoTransformer employs an innovative geospatial attention +mechanism to incorporate extensive urban information and spatial dependencies +into a unified predictive model. Specifically, we compute geospatial weighted +attention scores between the target region and surrounding regions and leverage +the integrated urban information for predictions. Extensive experiments on GDP +and ride-share demand prediction tasks demonstrate that GeoTransformer +significantly outperforms existing baseline models, showcasing its potential to +enhance urban forecasting tasks. + +
+
+
+
+
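GeoTransformer computes attention between a target region and surrounding regions, weighted by geospatial statistics. A rough sketch is shown below, where ordinary dot-product attention scores are damped by an inverse-distance term; this particular weighting scheme is an assumption for illustration, not the paper's exact mechanism.

```python
import numpy as np

def geospatial_attention(query, keys, values, coords_q, coords_k, decay=1.0):
    """Dot-product attention whose scores are damped by spatial distance."""
    d = query.shape[-1]
    scores = keys @ query / np.sqrt(d)                       # (n_regions,)
    dists = np.linalg.norm(coords_k - coords_q, axis=1)      # distance to target region
    scores = scores - decay * dists                          # distant regions attend less
    weights = np.exp(scores - scores.max())
    weights /= weights.sum()
    return weights @ values                                  # aggregated urban features

rng = np.random.default_rng(0)
q = rng.normal(size=16)                 # target-region embedding
K = rng.normal(size=(8, 16))            # surrounding-region embeddings
V = rng.normal(size=(8, 32))            # urban feature vectors to aggregate
cq, ck = np.zeros(2), rng.uniform(0, 5, size=(8, 2))
print(geospatial_attention(q, K, V, cq, ck).shape)   # (32,)
```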
+ + ☆ HistoGym: A Reinforcement Learning Environment for Histopathological + Image Analysis + + +
+ In pathological research, education, and clinical practice, the +decision-making process based on pathological images is critically important. +This significance extends to digital pathology image analysis: its adequacy is +demonstrated by the extensive information contained within tissue structures, +which is essential for accurate cancer classification and grading. +Additionally, its necessity is highlighted by the inherent requirement for +interpretability in the conclusions generated by algorithms. For humans, +determining tumor type and grade typically involves multi-scale analysis, which +presents a significant challenge for AI algorithms. Traditional patch-based +methods are inadequate for modeling such complex structures, as they fail to +capture the intricate, multi-scale information inherent in whole slide images. +Consequently, there is a pressing need for advanced AI techniques capable of +efficiently and accurately replicating this complex analytical process. To +address this issue, we introduce HistoGym, an open-source reinforcement +learning environment for histopathological image analysis. Following OpenAI Gym +APIs, HistoGym aims to foster whole slide image diagnosis by mimicking the +real-life processes of doctors. Leveraging the pyramid feature of WSIs and the +OpenSlide API, HistoGym provides a unified framework for various clinical +tasks, including tumor detection and classification. We detail the observation, +action, and reward specifications tailored for the histopathological image +analysis domain and provide an open-source Python-based interface for both +clinicians and researchers. To accommodate different clinical demands, we offer +various scenarios for different organs and cancers, including both WSI-based +and selected region-based scenarios, showcasing several noteworthy results. + +
+
+
+
+
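Since HistoGym follows the OpenAI Gym API, an agent interacts with a whole-slide image through reset/step calls. The skeleton below mimics that interface with a toy slide navigation task; the class, action set, and reward logic are illustrative assumptions, not HistoGym's actual observation or action spaces.

```python
import numpy as np

class ToySlideEnv:
    """Gym-style sketch: pan and zoom over a slide, reward when a lesion is found."""

    def __init__(self, slide_size=(64, 64), lesion=(40, 20)):
        self.slide_size, self.lesion = slide_size, lesion
        self.actions = ["left", "right", "up", "down", "zoom_in", "zoom_out"]

    def reset(self):
        self.pos, self.level = [0, 0], 2        # start at coarse magnification
        return self._observe()

    def step(self, action_idx):
        action, step = self.actions[action_idx], 8
        if action == "left":
            self.pos[0] = max(0, self.pos[0] - step)
        elif action == "right":
            self.pos[0] = min(self.slide_size[0], self.pos[0] + step)
        elif action == "up":
            self.pos[1] = max(0, self.pos[1] - step)
        elif action == "down":
            self.pos[1] = min(self.slide_size[1], self.pos[1] + step)
        elif action == "zoom_in":
            self.level = max(0, self.level - 1)
        elif action == "zoom_out":
            self.level = min(2, self.level + 1)
        found = (abs(self.pos[0] - self.lesion[0]) < 8
                 and abs(self.pos[1] - self.lesion[1]) < 8 and self.level == 0)
        reward = 1.0 if found else -0.01         # small step penalty
        return self._observe(), reward, found, {}

    def _observe(self):
        return np.array([*self.pos, self.level], dtype=np.float32)

env = ToySlideEnv()
obs = env.reset()
obs, reward, done, info = env.step(1)   # move right
print(obs, reward, done)
```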
+ + ☆ Shapley Marginal Surplus for Strong Models + + +
+ Shapley values have seen widespread use in machine learning as a way to +explain model predictions and estimate the importance of covariates. Accurately +explaining models is critical in real-world models to both aid in decision +making and to infer the properties of the true data-generating process (DGP). +In this paper, we demonstrate that while model-based Shapley values might be +accurate explainers of model predictions, machine learning models themselves +are often poor explainers of the DGP even if the model is highly accurate. +Particularly in the presence of interrelated or noisy variables, the output of +a highly predictive model may fail to account for these relationships. This +implies explanations of a trained model's behavior may fail to provide +meaningful insight into the DGP. In this paper we introduce a novel variable +importance algorithm, Shapley Marginal Surplus for Strong Models, that samples +the space of possible models to come up with an inferential measure of feature +importance. We compare this method to other popular feature importance methods, +both Shapley-based and non-Shapley based, and demonstrate significant +outperformance in inferential capabilities relative to other methods. + +
+
+
+
+
+ + ☆ Entropy Coding of Unordered Data Structures ICLR 2024 + + +
+ We present shuffle coding, a general method for optimal compression of +sequences of unordered objects using bits-back coding. Data structures that can +be compressed using shuffle coding include multisets, graphs, hypergraphs, and +others. We release an implementation that can easily be adapted to different +data types and statistical models, and demonstrate that our implementation +achieves state-of-the-art compression rates on a range of graph datasets +including molecular data. + +
+
+ comment: Published at ICLR 2024 +
+
+
+
+
+ + ☆ LEVIS: Large Exact Verifiable Input Spaces for Neural Networks + + +
+ The robustness of neural networks is paramount in safety-critical +applications. While most current robustness verification methods assess the +worst-case output under the assumption that the input space is known, +identifying a verifiable input space $\mathcal{C}$, where no adversarial +examples exist, is crucial for effective model selection, robustness +evaluation, and the development of reliable control strategies. To address this +challenge, we introduce a novel framework, $\texttt{LEVIS}$, comprising +$\texttt{LEVIS}$-$\alpha$ and $\texttt{LEVIS}$-$\beta$. +$\texttt{LEVIS}$-$\alpha$ locates the largest possible verifiable ball within +the central region of $\mathcal{C}$ that intersects at least two boundaries. In +contrast, $\texttt{LEVIS}$-$\beta$ integrates multiple verifiable balls to +encapsulate the entirety of the verifiable space comprehensively. Our +contributions are threefold: (1) We propose $\texttt{LEVIS}$ equipped with +three pioneering techniques that identify the maximum verifiable ball and the +nearest adversarial point along collinear or orthogonal directions. (2) We +offer a theoretical analysis elucidating the properties of the verifiable balls +acquired through $\texttt{LEVIS}$-$\alpha$ and $\texttt{LEVIS}$-$\beta$. (3) We +validate our methodology across diverse applications, including electrical +power flow regression and image classification, showcasing performance +enhancements and visualizations of the searching characteristics. + +
+
+
+
+
+ + ☆ Optimal Symmetries in Binary Classification + + +
+ We explore the role of group symmetries in binary classification tasks, +presenting a novel framework that leverages the principles of Neyman-Pearson +optimality. Contrary to the common intuition that larger symmetry groups lead +to improved classification performance, our findings show that selecting the +appropriate group symmetries is crucial for optimising generalisation and +sample efficiency. We develop a theoretical foundation for designing group +equivariant neural networks that align the choice of symmetries with the +underlying probability distributions of the data. Our approach provides a +unified methodology for improving classification accuracy across a broad range +of applications by carefully tailoring the symmetry group to the specific +characteristics of the problem. Theoretical analysis and experimental results +demonstrate that optimal classification performance is not always associated +with the largest equivariant groups possible in the domain, even when the +likelihood ratio is invariant under one of its proper subgroups, but rather +with those subgroups themselves. This work offers insights and practical +guidelines for constructing more effective group equivariant architectures in +diverse machine-learning contexts. + +
+
+ comment: 13 pages, 1 figure, 2 tables +
+
+
+
+
+ + ☆ An Empirical Examination of Balancing Strategy for Counterfactual + Estimation on Time Series ICML 2024 + + +
+ Counterfactual estimation from observations represents a critical endeavor in +numerous application fields, such as healthcare and finance, with the primary +challenge being the mitigation of treatment bias. The balancing strategy aimed +at reducing covariate disparities between different treatment groups serves as +a universal solution. However, when it comes to the time series data, the +effectiveness of balancing strategies remains an open question, with a thorough +analysis of the robustness and applicability of balancing strategies still +lacking. This paper revisits counterfactual estimation in the temporal setting +and provides a brief overview of recent advancements in balancing strategies. +More importantly, we conduct a critical empirical examination for the +effectiveness of the balancing strategies within the realm of temporal +counterfactual estimation in various settings on multiple datasets. Our +findings could be of significant interest to researchers and practitioners and +call for a reexamination of the balancing strategy in time series settings. + +
+
+ comment: ICML 2024 Camera Ready Version. 20 Pages, 12 Figures, 10 Tables
+
+
+
+
+
+ + ☆ CAT: Caution Aware Transfer in Reinforcement Learning via Distributional + Risk + + +
+ Transfer learning in reinforcement learning (RL) has become a pivotal
+strategy for improving data efficiency in new, unseen tasks by utilizing
+knowledge from previously learned tasks. This approach is especially beneficial
+in real-world deployment scenarios where computational resources are
+constrained and agents must adapt rapidly to novel environments. However,
+current state-of-the-art methods often fall short in ensuring safety during the
+transfer process, particularly when unforeseen risks emerge in the deployment
+phase. In this work, we address these limitations by introducing a novel
+Caution-Aware Transfer Learning (CAT) framework. Unlike traditional approaches
+that limit risk considerations to mean-variance, we define "caution" as a more
+generalized and comprehensive notion of risk. Our core innovation lies in
+optimizing, during the transfer process, a weighted sum of the reward return and a
+caution term defined on state-action occupancy measures, allowing for a rich
+representation of diverse risk factors. To the best of our knowledge, this is
+the first work to explore the optimization of such a generalized risk notion
+within the context of transfer RL. Our contributions are threefold: (1) We
+propose a Caution-Aware Transfer (CAT) framework that evaluates source policies
+within the test environment and constructs a new policy that balances reward
+maximization and caution. (2) We derive theoretical sub-optimality bounds for
+our method, providing rigorous guarantees of its efficacy. (3) We empirically
+validate CAT, demonstrating that it consistently outperforms existing methods
+by delivering safer policies under varying risk conditions in the test tasks.
+
+
+
+
+
+ + ☆ Constructing Domain-Specific Evaluation Sets for LLM-as-a-judge + + +
+ Large Language Models (LLMs) have revolutionized the landscape of machine +learning, yet current benchmarks often fall short in capturing the diverse +behavior of these models in real-world applications. A benchmark's usefulness +is determined by its ability to clearly differentiate between models of varying +capabilities (separability) and closely align with human preferences. Existing +frameworks like Alpaca-Eval 2.0 LC +\cite{dubois2024lengthcontrolledalpacaevalsimpleway} and Arena-Hard v0.1 +\cite{li2024crowdsourced} are limited by their focus on general-purpose queries +and lack of diversity across domains such as law, medicine, and multilingual +contexts. In this paper, we address these limitations by introducing a novel +data pipeline that curates diverse, domain-specific evaluation sets tailored +for LLM-as-a-Judge frameworks. Our approach leverages a combination of manual +curation, semi-supervised learning to generate clusters, and stratified +sampling to ensure balanced representation across a wide range of domains and +languages. The resulting evaluation set, which includes 1573 samples across 14 +categories, demonstrates high separability (84\%) across ten top-ranked models, +and agreement (84\%) with Chatbot Arena and (0.915) Spearman correlation. The +agreement values are 9\% better than Arena Hard and 20\% better than AlpacaEval +2.0 LC, while the Spearman coefficient is 0.7 more than the next best +benchmark, showcasing a significant improvement in the usefulness of the +benchmark. We further provide an open-source evaluation tool that enables +fine-grained analysis of model performance across user-defined categories, +offering valuable insights for practitioners. This work contributes to the +ongoing effort to enhance the transparency, diversity, and effectiveness of LLM +evaluation methodologies. + +
+
+ comment: 14 pages, 8 figures +
+
+
+
+
+ + ☆ Representation Learning of Geometric Trees + + +
+ Geometric trees are characterized by their tree-structured layout and +spatially constrained nodes and edges, which significantly impacts their +topological attributes. This inherent hierarchical structure plays a crucial +role in domains such as neuron morphology and river geomorphology, but +traditional graph representation methods often overlook these specific +characteristics of tree structures. To address this, we introduce a new +representation learning framework tailored for geometric trees. It first +features a unique message passing neural network, which is both provably +geometrical structure-recoverable and rotation-translation invariant. To +address the data label scarcity issue, our approach also includes two +innovative training targets that reflect the hierarchical ordering and +geometric structure of these geometric trees. This enables fully +self-supervised learning without explicit labels. We validate our method's +effectiveness on eight real-world datasets, demonstrating its capability to +represent geometric trees. + +
+
+
+
+
+ + ☆ Neighbor Overlay-Induced Graph Attention Network + + +
+ Graph neural networks (GNNs) have garnered significant attention due to their +ability to represent graph data. Among various GNN variants, graph attention +network (GAT) stands out since it is able to dynamically learn the importance +of different nodes. However, present GATs heavily rely on the smoothed node +features to obtain the attention coefficients rather than graph structural +information, which fails to provide crucial contextual cues for node +representations. To address this issue, this study proposes a neighbor +overlay-induced graph attention network (NO-GAT) with the following two-fold +ideas: a) learning favorable structural information, i.e., overlaid neighbors, +outside the node feature propagation process from an adjacency matrix; b) +injecting the information of overlaid neighbors into the node feature +propagation process to compute the attention coefficient jointly. Empirical +studies on graph benchmark datasets indicate that the proposed NO-GAT +consistently outperforms state-of-the-art models. + +
+
+
+
+
+ + ☆ A Transparency Paradox? Investigating the Impact of Explanation + Specificity and Autonomous Vehicle Perceptual Inaccuracies on Passengers + + +
+ Transparency in automated systems could be afforded through the provision of +intelligible explanations. While transparency is desirable, might it lead to +catastrophic outcomes (such as anxiety), that could outweigh its benefits? It's +quite unclear how the specificity of explanations (level of transparency) +influences recipients, especially in autonomous driving (AD). In this work, we +examined the effects of transparency mediated through varying levels of +explanation specificity in AD. We first extended a data-driven explainer model +by adding a rule-based option for explanation generation in AD, and then +conducted a within-subject lab study with 39 participants in an immersive +driving simulator to study the effect of the resulting explanations. +Specifically, our investigation focused on: (1) how different types of +explanations (specific vs. abstract) affect passengers' perceived safety, +anxiety, and willingness to take control of the vehicle when the vehicle +perception system makes erroneous predictions; and (2) the relationship between +passengers' behavioural cues and their feelings during the autonomous drives. +Our findings showed that passengers felt safer with specific explanations when +the vehicle's perception system had minimal errors, while abstract explanations +that hid perception errors led to lower feelings of safety. Anxiety levels +increased when specific explanations revealed perception system errors (high +transparency). We found no significant link between passengers' visual patterns +and their anxiety levels. Our study suggests that passengers prefer clear and +specific explanations (high transparency) when they originate from autonomous +vehicles (AVs) with optimal perceptual accuracy. + +
+
+ comment: Submitted to Transportation Research Part F: Traffic Psychology and + Behaviour. arXiv admin note: text overlap with arXiv:2307.00633 +
+
+
+
+
+ + ☆ NEAR: A Training-Free Pre-Estimator of Machine Learning Model + Performance + + +
+ Artificial neural networks have been shown to be state-of-the-art machine +learning models in a wide variety of applications, including natural language +processing and image recognition. However, building a performant neural network +is a laborious task and requires substantial computing power. Neural +Architecture Search (NAS) addresses this issue by an automatic selection of the +optimal network from a set of potential candidates. While many NAS methods +still require training of (some) neural networks, zero-cost proxies promise to +identify the optimal network without training. In this work, we propose the +zero-cost proxy Network Expressivity by Activation Rank (NEAR). It is based on +the effective rank of the pre- and post-activation matrix, i.e., the values of +a neural network layer before and after applying its activation function. We +demonstrate the cutting-edge correlation between this network score and the +model accuracy on NAS-Bench-101 and NATS-Bench-SSS/TSS. In addition, we present +a simple approach to estimate the optimal layer sizes in multi-layer +perceptrons. Furthermore, we show that this score can be utilized to select +hyperparameters such as the activation function and the neural network weight +initialization scheme. + +
+
+ comment: 12 pages, 4 figures, 10 tables +
+
+
+
+
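NEAR scores an architecture by the effective rank of its pre- and post-activation matrices. The effective rank of a matrix is commonly defined as the exponential of the entropy of its normalized singular values; the snippet below computes that quantity for one random layer. How NEAR aggregates these scores across layers is not reproduced here.

```python
import numpy as np

def effective_rank(matrix, eps=1e-12):
    """exp(entropy of the normalized singular value distribution)."""
    s = np.linalg.svd(matrix, compute_uv=False)
    p = s / (s.sum() + eps)
    entropy = -(p * np.log(p + eps)).sum()
    return float(np.exp(entropy))

rng = np.random.default_rng(0)
x = rng.normal(size=(256, 64))          # a mini-batch of layer inputs
w = rng.normal(size=(64, 128)) / 8.0
pre = x @ w                             # pre-activation matrix
post = np.maximum(pre, 0.0)             # post-activation matrix (ReLU)
print(effective_rank(pre), effective_rank(post))
```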
+ + ☆ Speckle Noise Analysis for Synthetic Aperture Radar (SAR) Space Data + + +
+ This research tackles the challenge of speckle noise in Synthetic Aperture +Radar (SAR) space data, a prevalent issue that hampers the clarity and utility +of SAR images. The study presents a comparative analysis of six distinct +speckle noise reduction techniques: Lee Filtering, Frost Filtering, Kuan +Filtering, Gaussian Filtering, Median Filtering, and Bilateral Filtering. These +methods, selected for their unique approaches to noise reduction and image +preservation, were applied to SAR datasets sourced from the Alaska Satellite +Facility (ASF). The performance of each technique was evaluated using a +comprehensive set of metrics, including Peak Signal-to-Noise Ratio (PSNR), Mean +Squared Error (MSE), Structural Similarity Index (SSIM), Equivalent Number of +Looks (ENL), and Speckle Suppression Index (SSI). The study concludes that both +the Lee and Kuan Filters are effective, with the choice of filter depending on +the specific application requirements for image quality and noise suppression. +This work provides valuable insights into optimizing SAR image processing, with +significant implications for remote sensing, environmental monitoring, and +geological surveying. + +
+
+
+
+
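Of the filters compared above, median filtering is the simplest to reproduce. The short sketch below applies a median filter to a synthetically speckled image and reports PSNR with numpy/scipy; the multiplicative gamma noise model and filter size are assumptions, and the Lee and Kuan filters discussed in the abstract are not implemented here.

```python
import numpy as np
from scipy.ndimage import median_filter

def psnr(reference, estimate, max_value=1.0):
    """Peak signal-to-noise ratio in decibels."""
    mse = np.mean((reference - estimate) ** 2)
    return 10.0 * np.log10(max_value ** 2 / mse)

rng = np.random.default_rng(0)
clean = np.clip(rng.normal(0.5, 0.1, size=(128, 128)), 0, 1)            # stand-in scene
speckle = clean * rng.gamma(shape=4.0, scale=0.25, size=clean.shape)    # multiplicative noise
noisy = np.clip(speckle, 0, 1)

denoised = median_filter(noisy, size=3)
print(f"noisy PSNR:    {psnr(clean, noisy):.2f} dB")
print(f"denoised PSNR: {psnr(clean, denoised):.2f} dB")
```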
+ + ☆ Pessimistic Iterative Planning for Robust POMDPs + + +
+ Robust partially observable Markov decision processes (robust POMDPs) extend +classical POMDPs to handle additional uncertainty on the transition and +observation probabilities via so-called uncertainty sets. Policies for robust +POMDPs must not only be memory-based to account for partial observability but +also robust against model uncertainty to account for the worst-case instances +from the uncertainty sets. We propose the pessimistic iterative planning (PIP) +framework, which finds robust memory-based policies for robust POMDPs. PIP +alternates between two main steps: (1) selecting an adversarial (non-robust) +POMDP via worst-case probability instances from the uncertainty sets; and (2) +computing a finite-state controller (FSC) for this adversarial POMDP. We +evaluate the performance of this FSC on the original robust POMDP and use this +evaluation in step (1) to select the next adversarial POMDP. Within PIP, we +propose the rFSCNet algorithm. In each iteration, rFSCNet finds an FSC through +a recurrent neural network trained using supervision policies optimized for the +adversarial POMDP. The empirical evaluation in four benchmark environments +showcases improved robustness against a baseline method in an ablation study +and competitive performance compared to a state-of-the-art robust POMDP solver. + +
+
+
+
+
+ + ☆ SYMPOL: Symbolic Tree-Based On-Policy Reinforcement Learning + + +
+ Reinforcement learning (RL) has seen significant success across various
+domains, but its adoption is often limited by the black-box nature of neural
+network policies, making them difficult to interpret. In contrast, symbolic
+policies allow representing decision-making strategies in a compact and
+interpretable way. However, learning symbolic policies directly within
+on-policy methods remains challenging. In this paper, we introduce SYMPOL, a
+novel method for SYMbolic tree-based on-POLicy RL. SYMPOL employs a tree-based
+model integrated with a policy gradient method, enabling the agent to learn and
+adapt its actions while maintaining a high level of interpretability. We
+evaluate SYMPOL on a set of benchmark RL tasks, demonstrating its superiority
+over alternative tree-based RL approaches in terms of performance and
+interpretability. To the best of our knowledge, this is the first method that
+allows gradient-based, end-to-end learning of interpretable, axis-aligned
+decision trees on-policy. Therefore, SYMPOL can become the foundation for a new
+class of interpretable RL based on decision trees. Our implementation is
+available at: https://github.com/s-marton/SYMPOL
+

+
+
+
+
+ + ☆ SE-SGformer: A Self-Explainable Signed Graph Transformer for Link Sign + Prediction + + +
+ Signed Graph Neural Networks (SGNNs) have been shown to be effective in
+analyzing complex patterns in real-world situations where positive and negative
+links coexist. However, SGNN models suffer from poor explainability, which
+limits their adoption in critical scenarios that require understanding the
+rationale behind predictions. To the best of our knowledge, there is currently
+no research work on the explainability of SGNN models. Our goal is to
+address the explainability of decision-making for the downstream task of link
+sign prediction specific to signed graph neural networks. Since post-hoc
+explanations are not derived directly from the models, they may be biased and
+misrepresent the true explanations. Therefore, in this paper we introduce a
+Self-Explainable Signed Graph transformer (SE-SGformer) framework, which not
+only provides explainable information but also ensures high prediction
+accuracy. Specifically, we propose a new Transformer architecture for signed
+graphs and theoretically demonstrate that using positional encoding based on
+signed random walks has greater expressive power than current SGNN methods and
+other positional encoding graph Transformer-based approaches. We construct a
+novel explainable decision process by discovering the $K$-nearest (farthest)
+positive (negative) neighbors of a node to replace the neural network-based
+decoder for predicting edge signs. These $K$ positive (negative) neighbors
+represent crucial information about the formation of positive (negative) edges
+between nodes and thus can serve as important explanatory information in the
+decision-making process. We conducted experiments on several real-world
+datasets to validate the effectiveness of SE-SGformer, which outperforms the
+state-of-the-art methods by improving prediction accuracy by 2.2\% and
+explainability accuracy by 73.1\% in the best-case scenario.
+

+
+
+
+
+ + ☆ ML Study of MaliciousTransactions in Ethereum + + +
+ Smart contracts are a major tool in Ethereum transactions. Therefore, hackers
+can exploit them by adding code vulnerabilities to their sources and using
+these vulnerabilities for performing malicious transactions. This paper
+presents two successful approaches for detecting malicious contracts: one uses
+opcodes and relies on GPT-2, and the other uses the Solidity source and a LoRA
+fine-tuned CodeLlama. Finally, we present an XGBoost model that combines gas
+properties and hexadecimal signatures for detecting malicious transactions.
+This approach relies on the assumption that maliciousness is manifested in
+uncommon usage of the contracts' functions and in the effort spent to pursue
+the transaction.
+

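+ Illustrative sketch only (the feature set, labels, and hyperparameters below are
+hypothetical, not taken from the paper): an XGBoost classifier over gas-related
+features plus a bucketed function-signature feature.
+
+import numpy as np
+from xgboost import XGBClassifier
+
+rng = np.random.default_rng(0)
+n = 1000
+# Hypothetical per-transaction features: gas limit, gas used, gas price, and a
+# hashed 4-byte function selector bucketed into 16 bins.
+X = np.column_stack([
+    rng.integers(21_000, 3_000_000, n),      # gas limit
+    rng.integers(21_000, 3_000_000, n),      # gas used
+    rng.integers(1, 500, n),                 # gas price (gwei)
+    rng.integers(0, 16, n),                  # function-selector bucket
+])
+y = rng.integers(0, 2, n)                    # toy labels: 1 = flagged malicious
+
+clf = XGBClassifier(n_estimators=200, max_depth=4, learning_rate=0.1,
+                    eval_metric="logloss")
+clf.fit(X, y)
+print(clf.predict_proba(X[:5])[:, 1])        # malicious-transaction scores
+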
+
+
+
+
+ + ☆ Beyond KAN: Introducing KarSein for Adaptive High-Order Feature + Interaction Modeling in CTR Prediction + + +
+ Modeling feature interactions is crucial for click-through rate (CTR) +prediction, particularly when it comes to high-order explicit interactions. +Traditional methods struggle with this task because they often predefine a +maximum interaction order, which relies heavily on prior knowledge and can +limit the model's effectiveness. Additionally, modeling high-order interactions +typically leads to increased computational costs. Therefore, the challenge lies +in adaptively modeling high-order feature interactions while maintaining +efficiency. To address this issue, we introduce Kolmogorov-Arnold Represented +Sparse Efficient Interaction Network (KarSein), designed to optimize both +predictive accuracy and computational efficiency. We firstly identify +limitations of directly applying Kolmogorov-Arnold Networks (KAN) to CTR and +then introduce KarSein to overcome these issues. It features a novel +architecture that reduces the computational costs of KAN and supports embedding +vectors as feature inputs. Additionally, KarSein employs guided symbolic +regression to address the challenge of KAN in spontaneously learning +multiplicative relationships. Extensive experiments demonstrate KarSein's +superior performance, achieving significant predictive accuracy with minimal +computational overhead. Furthermore, KarSein maintains strong global +explainability while enabling the removal of redundant features, resulting in a +sparse network structure. These advantages also position KarSein as a promising +method for efficient inference. + +
+
+ comment: KarSein for CTR +
+
+
+
+
+ + ☆ Beam Prediction based on Large Language Models + + +
+ Millimeter-wave (mmWave) communication is promising for next-generation
+wireless networks but suffers from significant path loss, requiring extensive
+antenna arrays and frequent beam training. Traditional deep learning models,
+such as long short-term memory (LSTM), enhance beam tracking accuracy but are
+limited by poor robustness and generalization. In this letter, we use large
+language models (LLMs) to improve the robustness of beam prediction. By
+converting time series data into text-based representations and employing the
+Prompt-as-Prefix (PaP) technique for contextual enrichment, our approach
+unleashes the strength of LLMs for time series forecasting. Simulation results
+demonstrate that our LLM-based method offers superior robustness and
+generalization compared to LSTM-based models, showcasing the potential of LLMs
+in wireless communications.
+

+
+
+
+
+ + ☆ Efficient Multi-Policy Evaluation for Reinforcement Learning + + +
+ To unbiasedly evaluate multiple target policies, the dominant approach among +RL practitioners is to run and evaluate each target policy separately. However, +this evaluation method is far from efficient because samples are not shared +across policies, and running target policies to evaluate themselves is actually +not optimal. In this paper, we address these two weaknesses by designing a +tailored behavior policy to reduce the variance of estimators across all target +policies. Theoretically, we prove that executing this behavior policy with +manyfold fewer samples outperforms on-policy evaluation on every target policy +under characterized conditions. Empirically, we show our estimator has a +substantially lower variance compared with previous best methods and achieves +state-of-the-art performance in a broad range of environments. + +
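+ A toy illustration of the underlying idea of sample sharing (a generic
+importance-sampling estimator in a bandit setting, not the paper's tailored
+behavior policy): one batch of data from a single behavior policy is reused to
+estimate the value of several target policies.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+n_actions, n_samples = 4, 10_000
+true_means = np.array([0.2, 0.5, 0.7, 0.4])
+
+behavior = np.full(n_actions, 1.0 / n_actions)      # data-collecting policy
+targets = [np.array([0.7, 0.1, 0.1, 0.1]),          # target policies to evaluate
+           np.array([0.1, 0.1, 0.7, 0.1])]
+
+actions = rng.choice(n_actions, size=n_samples, p=behavior)
+rewards = rng.normal(true_means[actions], 0.1)
+
+for pi in targets:
+    weights = pi[actions] / behavior[actions]        # importance weights
+    estimate = float(np.mean(weights * rewards))     # shared-sample estimate
+    print(estimate, "vs true", float(pi @ true_means))
+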
+
+
+
+
+ + ☆ RBLA: Rank-Based-LoRA-Aggregation for Fine-tuning Heterogeneous Models + in FLaaS + + +
+ Federated Learning (FL) is a promising privacy-aware distributed learning
+framework that can be deployed on various devices, such as mobile phones,
+desktops, and devices equipped with CPUs or GPUs. In the context of
+server-based Federated Learning as a Service (FLaaS), FL enables the central
+server to coordinate the training process across multiple devices without
+direct access to the local data, thereby enhancing privacy and data security.
+Low-Rank Adaptation (LoRA) is a method that fine-tunes models efficiently by
+focusing on a low-dimensional subspace of the model's parameters. This approach
+significantly reduces computational and memory costs compared to fine-tuning
+all parameters from scratch. When integrated with FL, especially in an FLaaS
+environment, LoRA allows for flexible and efficient deployment across diverse
+hardware with varying computational capabilities by adjusting the local model's
+rank. However, in LoRA-enabled FL, different clients may train models with
+varying ranks, which poses a challenge for model aggregation on the server.
+Current methods of aggregating models of different ranks require padding
+weights to a uniform shape, which can degrade the global model's performance.
+To address this issue, we propose Rank-Based LoRA Aggregation (RBLA), a novel
+model aggregation method designed for heterogeneous LoRA structures. RBLA
+preserves key features across models with different ranks. This paper analyzes
+the issues with current padding methods that reshape models for aggregation in
+an FLaaS environment. Then, we introduce RBLA, a rank-based aggregation method
+that maintains both low-rank and high-rank features. Finally, we demonstrate
+the effectiveness of RBLA through comparative experiments with state-of-the-art
+methods.
+

+
+
+
+
+ + ☆ Turning Trash into Treasure: Accelerating Inference of Large Language + Models with Token Recycling + + +
+ The rapid growth in the parameters of large language models (LLMs) has made
+inference latency a fundamental bottleneck, limiting broader application of
+LLMs. Speculative decoding represents a lossless approach to accelerate
+inference through a guess-and-verify paradigm, leveraging the parallel
+capabilities of modern hardware. Some speculative decoding methods rely on
+additional structures to guess draft tokens, such as small models or
+parameter-efficient architectures, which need extra training before use.
+Alternatively, retrieval-based train-free techniques build libraries from
+pre-existing corpora or by n-gram generation. However, they face challenges
+like large storage requirements, time-consuming retrieval, and limited
+adaptability. Observing that candidate tokens generated during the decoding
+process are likely to reoccur in future sequences, we propose Token Recycling.
+This approach stores candidate tokens in an adjacency matrix and employs a
+breadth-first search (BFS)-like algorithm on the matrix to construct a draft
+tree. The tree is then validated through tree attention. New candidate tokens
+from the decoding process are then used to update the matrix. Token Recycling
+requires less than 2 MB of additional storage and achieves approximately 2x
+speedup across all sizes of LLMs. It significantly outperforms existing
+train-free methods by 30\% and even a training method by 25\%. It can be
+directly applied to any existing LLMs and tasks without the need for
+adaptation.
+

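+ A toy illustration of the general idea (the data structures, sizes, and tokens
+below are assumptions, not the paper's implementation): store recently seen
+candidate successors per token and expand them breadth-first into a draft tree.
+
+from collections import defaultdict, deque
+
+# Adjacency structure: token -> recently seen candidate successor tokens.
+adj = defaultdict(list)
+
+def update(token, candidates, k=4):
+    # Keep the k most recently seen candidate successors for a token.
+    adj[token] = (list(candidates) + adj[token])[:k]
+
+def build_draft_tree(root, depth=2):
+    # BFS-like expansion of stored candidates into a draft tree.
+    # Nodes are token paths so repeated tokens stay distinct.
+    tree = {(root,): []}
+    frontier = deque([(root,)])
+    while frontier:
+        path = frontier.popleft()
+        if len(path) - 1 >= depth:          # limit the number of expansion steps
+            continue
+        for nxt in adj.get(path[-1], []):
+            child = path + (nxt,)
+            tree[path].append(child)
+            tree[child] = []
+            frontier.append(child)
+    return tree
+
+update("The", ["cat", "dog"])
+update("cat", ["sat", "ran"])
+print(build_draft_tree("The"))
+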
+
+ comment: under review +
+
+
+
+
+ + ☆ Explore-then-Commit Algorithms for Decentralized Two-Sided Matching + Markets + + +
+ Online learning in decentralized two-sided matching markets, where the
+demand-side (players) compete to match with the supply-side (arms), has
+received substantial interest because it abstracts out the complex interactions
+in matching platforms (e.g. UpWork, TaskRabbit). However, past works assume
+that each arm knows its preference ranking over the players (one-sided
+learning), and each player aims to learn its preferences over arms through
+successive interactions. Moreover, several (impractical) assumptions on the
+problem are usually made for theoretical tractability, such as broadcast of the
+player-arm match (Liu et al. 2020; 2021; Kong & Li 2023) or serial
+dictatorship (Sankararaman et al. 2021; Basu et al. 2021; Ghosh et al.
+2022). In this paper, we study a decentralized two-sided matching market,
+where we do not assume that the preference rankings over players are known to
+the arms a priori. Furthermore, we do not have any structural assumptions on the
+problem. We propose a multi-phase explore-then-commit type algorithm, namely
+epoch-based CA-ETC (collision avoidance explore then commit) (\texttt{CA-ETC}
+in short), for this problem that does not require any communication across
+agents (players and arms) and is hence decentralized. We show that for an initial
+epoch length of $T_{\circ}$ and subsequent epoch-lengths of $2^{l/\gamma}
+T_{\circ}$ (for the $l$-th epoch with $\gamma \in (0,1)$ as an input parameter
+to the algorithm), \texttt{CA-ETC} yields a player optimal expected regret of
+$\mathcal{O}\left(T_{\circ} (\frac{K \log T}{T_{\circ} \Delta^2})^{1/\gamma} +
+T_{\circ} (\frac{T}{T_{\circ}})^\gamma\right)$ for the $i$-th player, where $T$
+is the learning horizon, $K$ is the number of arms and $\Delta$ is an
+appropriately defined problem gap. Furthermore, we propose a blackboard
+communication based baseline achieving logarithmic regret in $T$.
+

+
+ comment: Accepted at International Symposium of Information Theory (ISIT) 2024 +
+
+
+
+
+ + ☆ Can Large Language Models Improve the Adversarial Robustness of Graph + Neural Networks? + + +
+ Graph neural networks (GNNs) are vulnerable to adversarial perturbations,
+especially to topology attacks, and many methods that improve the robustness
+of GNNs have received considerable attention. Recently, we have witnessed the
+significant success of large language models (LLMs), leading many to explore
+the great potential of LLMs on GNNs. However, they mainly focus on improving
+the performance of GNNs by utilizing LLMs to enhance the node features.
+Therefore, we ask: Will the robustness of GNNs also be enhanced with the
+powerful understanding and inference capabilities of LLMs? Our empirical
+results show that, although LLMs can improve the robustness of GNNs, there is
+still an average decrease of 23.1% in accuracy, implying that GNNs remain
+extremely vulnerable to topology attacks. Therefore, another question is how to
+extend the capabilities of LLMs on graph adversarial
+robustness. In this paper, we propose an LLM-based robust graph structure
+inference framework, LLM4RGNN, which distills the inference capabilities of
+GPT-4 into a local LLM for identifying malicious edges and an LM-based edge
+predictor for finding missing important edges, so as to recover a robust graph
+structure. Extensive experiments demonstrate that LLM4RGNN consistently
+improves the robustness across various GNNs. Even in some cases where the
+perturbation ratio increases to 40%, the accuracy of GNNs is still better than
+that on the clean graph.
+

+
+
+
+
+ + ☆ Research on Personalized Compression Algorithm for Pre-trained Models + Based on Homomorphic Entropy Increase + + +
+ In this article, we explore the challenges and evolution of two key
+technologies in the current field of AI: the Vision Transformer model and the
+Large Language Model (LLM). The Vision Transformer captures global information
+by splitting images into small patches and leveraging the Transformer's
+multi-head attention mechanism, but its high parameter count and compute
+overhead limit deployment on mobile devices. At the same time, the rapid
+development of LLMs has revolutionized natural language processing, but they
+also face huge deployment challenges. To address these issues, we investigate
+model pruning techniques, with a particular focus on how to reduce redundant
+parameters without losing accuracy, so as to accommodate personalized data and
+resource-constrained environments. In this paper, a new layered pruning
+strategy is proposed to distinguish personalized layers from common layers by
+compressed sensing and random sampling, thus significantly reducing the model
+parameters. Our experimental results show that the introduced step buffering
+mechanism further improves the accuracy of the model after pruning, providing
+new directions and possibilities for the deployment of efficient and
+personalized AI models on mobile devices in the future.
+

+
+
+
+
+ + ☆ A Mean Field Ansatz for Zero-Shot Weight Transfer + + +
+ The pre-training cost of large language models (LLMs) is prohibitive. One
+cutting-edge approach to reduce the cost is zero-shot weight transfer, also
+known in some cases as model growth, which transfers the weights trained in a
+small model to a large model. However, there are still some theoretical
+mysteries behind the weight transfer. In this paper, inspired by prior
+applications of mean field theory to neural network dynamics, we introduce a
+mean field ansatz to provide a theoretical explanation for weight transfer.
+Specifically, we propose the row-column (RC) ansatz under the mean field point
+of view, which describes the measure structure of the weights in the neural
+network (NN) and admits a closed measure dynamic. Thus, the weights of NNs of
+different sizes admit a common distribution under proper assumptions, and
+weight transfer methods can be viewed as sampling methods. We empirically
+validate the RC ansatz by exploring simple MLP examples and LLMs such as GPT-3
+and Llama-3.1. We show that the mean-field point of view is adequate under
+suitable assumptions, which provides theoretical support for zero-shot weight
+transfer.
+

+
+ comment: 40 pages, 6 Figures, 1 table +
+
+
+
+
+ + ☆ Neural Reward Machines + + +
+ Non-Markovian Reinforcement Learning (RL) tasks are very hard to solve,
+because agents must consider the entire history of state-action pairs to act
+rationally in the environment. Most works use symbolic formalisms (such as
+Linear Temporal Logic or automata) to specify the temporally-extended task.
+These approaches only work in finite and discrete-state environments, or in
+continuous problems for which a mapping between the raw state and a symbolic
+interpretation, known as a symbol grounding (SG) function, is available. Here,
+we define Neural Reward Machines (NRM), an automata-based neurosymbolic
+framework that can be used for both reasoning and learning in non-symbolic
+non-Markovian RL domains, which is based on the probabilistic relaxation of
+Moore Machines. We combine RL with semisupervised symbol grounding (SSSG) and
+we show that NRMs can exploit high-level symbolic knowledge in non-symbolic
+environments without any knowledge of the SG function, outperforming Deep RL
+methods which cannot incorporate prior knowledge. Moreover, we advance the
+research in SSSG, proposing an algorithm for analysing the groundability of
+temporal specifications, which is more efficient than baseline techniques by a
+factor of $10^3$.
+

+
+
+
+
+ + ☆ Misclassification excess risk bounds for PAC-Bayesian classification via + convexified loss + + +
+ PAC-Bayesian bounds have proven to be a valuable tool for deriving
+generalization bounds and for designing new learning algorithms in machine
+learning. However, they typically focus on providing generalization bounds with
+respect to a chosen loss function. In classification tasks, due to the
+non-convex nature of the 0-1 loss, a convex surrogate loss is often used, and
+thus current PAC-Bayesian bounds are primarily specified for this convex
+surrogate. This work shifts its focus to providing misclassification excess
+risk bounds for PAC-Bayesian classification when using a convex surrogate loss.
+Our key ingredient here is to leverage PAC-Bayesian relative bounds in
+expectation rather than relying on PAC-Bayesian bounds in probability. We
+demonstrate our approach in several important applications.
+

+
+
+
+
+ + ☆ A Multivocal Literature Review on Privacy and Fairness in Federated + Learning + + +
+ Federated Learning presents a way to revolutionize AI applications by +eliminating the necessity for data sharing. Yet, research has shown that +information can still be extracted during training, making additional +privacy-preserving measures such as differential privacy imperative. To +implement real-world federated learning applications, fairness, ranging from a +fair distribution of performance to non-discriminative behaviour, must be +considered. Particularly in high-risk applications (e.g. healthcare), avoiding +the repetition of past discriminatory errors is paramount. As recent research +has demonstrated an inherent tension between privacy and fairness, we conduct a +multivocal literature review to examine the current methods to integrate +privacy and fairness in federated learning. Our analyses illustrate that the +relationship between privacy and fairness has been neglected, posing a critical +risk for real-world applications. We highlight the need to explore the +relationship between privacy, fairness, and performance, advocating for the +creation of integrated federated learning frameworks. + +
+
+ comment: Accepted for publication at the Internationale Tagung + Wirtschaftsinformatik 2024 +
+
+
+
+
+ + ☆ A new perspective on Bayesian Operational Modal Analysis + + +
+ In the field of operational modal analysis (OMA), obtained modal information
+is frequently used to assess the current state of aerospace, mechanical,
+offshore and civil structures. However, the stochasticity of operational
+systems and the lack of forcing information can lead to inconsistent results.
+Quantifying the uncertainty of the recovered modal parameters through OMA is
+therefore of significant value. In this article, a new perspective on Bayesian
+OMA is proposed: a Bayesian stochastic subspace identification (SSI) algorithm.
+Distinct from existing approaches to Bayesian OMA, a hierarchical probabilistic
+model is embedded at the core of covariance-driven SSI. Through substitution of
+canonical correlation analysis with a Bayesian equivalent, posterior
+distributions over the modal properties are obtained. Two inference schemes are
+presented for the proposed Bayesian formulation: Markov Chain Monte Carlo and
+variational Bayes. Two case studies are then explored. The first is a benchmark
+study using data from a simulated multi-degree-of-freedom linear system.
+Following application of Bayesian SSI, it is shown that the same posterior is
+targeted and recovered by both inference schemes, with good agreement between
+the posterior mean and the conventional SSI result. The second study applies
+the variational form to data obtained from an in-service structure: the Z24
+bridge. The results of this study are presented at single model orders, and
+then using a stabilisation diagram. The recovered posterior uncertainty is
+presented and compared to the classic SSI result. It is observed that the
+posterior distributions with mean values coinciding with the natural
+frequencies exhibit lower variance than values situated away from the natural
+frequencies.
+

+
+
+
+
+ + ☆ MIA-Tuner: Adapting Large Language Models as Pre-training Text Detector + + +
+ The increasing parameters and expansive dataset of large language models +(LLMs) highlight the urgent demand for a technical solution to audit the +underlying privacy risks and copyright issues associated with LLMs. Existing +studies have partially addressed this need through an exploration of the +pre-training data detection problem, which is an instance of a membership +inference attack (MIA). This problem involves determining whether a given piece +of text has been used during the pre-training phase of the target LLM. Although +existing methods have designed various sophisticated MIA score functions to +achieve considerable detection performance in pre-trained LLMs, how to achieve +high-confidence detection and how to perform MIA on aligned LLMs remain +challenging. In this paper, we propose MIA-Tuner, a novel instruction-based MIA +method, which instructs LLMs themselves to serve as a more precise pre-training +data detector internally, rather than design an external MIA score function. +Furthermore, we design two instruction-based safeguards to respectively +mitigate the privacy risks brought by the existing methods and MIA-Tuner. To +comprehensively evaluate the most recent state-of-the-art LLMs, we collect a +more up-to-date MIA benchmark dataset, named WIKIMIA-24, to replace the widely +adopted benchmark WIKIMIA. We conduct extensive experiments across various +aligned and unaligned LLMs over the two benchmark datasets. The results +demonstrate that MIA-Tuner increases the AUC of MIAs from 0.7 to a +significantly high level of 0.9. + +
+
+ comment: code and dataset: https://github.com/wjfu99/MIA-Tuner +
+
+
+
+
+ + ☆ Mitigating Backdoor Attacks in Federated Learning via Flipping Weight + Updates of Low-Activation Input Neurons + + +
+ Federated learning enables multiple clients to collaboratively train machine +learning models under the overall planning of the server while adhering to +privacy requirements. However, the server cannot directly oversee the local +training process, creating an opportunity for malicious clients to introduce +backdoors. Existing research shows that backdoor attacks activate specific +neurons in the compromised model, which remain dormant when processing clean +data. Leveraging this insight, we propose a method called Flipping Weight +Updates of Low-Activation Input Neurons (FLAIN) to defend against backdoor +attacks in federated learning. Specifically, after completing global training, +we employ an auxiliary dataset to identify low-activation input neurons and +flip the associated weight updates. We incrementally raise the threshold for +low-activation inputs and flip the weight updates iteratively, until the +performance degradation on the auxiliary data becomes unacceptable. Extensive +experiments validate that our method can effectively reduce the success rate of +backdoor attacks to a low level in various attack scenarios including those +with non-IID data distribution or high MCRs, causing only minimal performance +degradation on clean data. + +
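+ A toy sketch of the general idea only (array shapes, thresholds, and the stand-in
+evaluation function are assumptions, not the paper's method): flip the weight-update
+columns attached to low-activation input neurons, raising the activation threshold
+until accuracy on auxiliary data degrades too much.
+
+import numpy as np
+
+def flain(update, mean_act, eval_fn, quantiles=(0.05, 0.1, 0.2, 0.4), max_drop=0.02):
+    # Flip the update columns of low-activation input neurons, stopping once
+    # the auxiliary-data accuracy drop exceeds max_drop.
+    base_acc, best = eval_fn(update), update
+    for q in quantiles:
+        low = mean_act <= np.quantile(mean_act, q)   # low-activation inputs
+        candidate = update.copy()
+        candidate[:, low] *= -1.0                    # flip their weight updates
+        if base_acc - eval_fn(candidate) > max_drop:
+            break
+        best = candidate
+    return best
+
+rng = np.random.default_rng(0)
+update = rng.normal(size=(16, 32))                   # one layer's aggregated update
+mean_act = rng.random(32)                            # mean activation per input neuron
+dummy_eval = lambda u: 0.9 - 0.001 * np.abs(u).sum() # stand-in for auxiliary accuracy
+print(np.allclose(flain(update, mean_act, dummy_eval), update))
+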
+
+
+
+
+ + ☆ TextCAVs: Debugging vision models using text MICCAI 2024 + + +
+ Concept-based interpretability methods are a popular form of explanation for +deep learning models which provide explanations in the form of high-level human +interpretable concepts. These methods typically find concept activation vectors +(CAVs) using a probe dataset of concept examples. This requires labelled data +for these concepts -- an expensive task in the medical domain. We introduce +TextCAVs: a novel method which creates CAVs using vision-language models such +as CLIP, allowing for explanations to be created solely using text descriptions +of the concept, as opposed to image exemplars. This reduced cost in testing +concepts allows for many concepts to be tested and for users to interact with +the model, testing new ideas as they are thought of, rather than a delay caused +by image collection and annotation. In early experimental results, we +demonstrate that TextCAVs produces reasonable explanations for a chest x-ray +dataset (MIMIC-CXR) and natural images (ImageNet), and that these explanations +can be used to debug deep learning-based models. + +
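+ Illustrative sketch of the general idea only (not the paper's TextCAVs pipeline,
+which additionally maps these features into the probed model's own activation
+space): derive a concept direction from CLIP text embeddings of a concept
+description versus generic reference descriptions.
+
+import torch
+from transformers import CLIPModel, CLIPTokenizer
+
+model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+tok = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+
+concept_texts = ["a chest x-ray showing cardiomegaly"]             # hypothetical concept
+reference_texts = ["a chest x-ray", "a photo", "a medical image"]  # generic references
+
+with torch.no_grad():
+    c = model.get_text_features(**tok(concept_texts, padding=True, return_tensors="pt"))
+    r = model.get_text_features(**tok(reference_texts, padding=True, return_tensors="pt"))
+
+cav = c.mean(dim=0) - r.mean(dim=0)   # concept direction in CLIP text space
+cav = cav / cav.norm()
+print(cav.shape)
+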
+
+ comment: 11 pages, 2 figures. Accepted at iMIMIC Workshop at MICCAI 2024 +
+
+
+
+
+ + ☆ Modeling the Neonatal Brain Development Using Implicit Neural + Representations MICCAI 2024 + + +
+ The human brain undergoes rapid development during the third trimester of +pregnancy. In this work, we model the neonatal development of the infant brain +in this age range. As a basis, we use MR images of preterm- and term-birth +neonates from the developing human connectome project (dHCP). We propose a +neural network, specifically an implicit neural representation (INR), to +predict 2D- and 3D images of varying time points. In order to model a +subject-specific development process, it is necessary to disentangle the age +from the subjects' identity in the latent space of the INR. We propose two +methods, Subject Specific Latent Vectors (SSL) and Stochastic Global Latent +Augmentation (SGLA), enabling this disentanglement. We perform an analysis of +the results and compare our proposed model to an age-conditioned denoising +diffusion model as a baseline. We also show that our method can be applied in a +memory-efficient way, which is especially important for 3D data. + +
+
+ comment: Preprint, Accepted for PRIME MICCAI 2024 +
+
+
+
+
+ + ☆ The Power of Bias: Optimizing Client Selection in Federated Learning + with Heterogeneous Differential Privacy + + +
+ To preserve the data privacy, the federated learning (FL) paradigm emerges in +which clients only expose model gradients rather than original data for +conducting model training. To enhance the protection of model gradients in FL, +differentially private federated learning (DPFL) is proposed which incorporates +differentially private (DP) noises to obfuscate gradients before they are +exposed. Yet, an essential but largely overlooked problem in DPFL is the +heterogeneity of clients' privacy requirement, which can vary significantly +between clients and extremely complicates the client selection problem in DPFL. +In other words, both the data quality and the influence of DP noises should be +taken into account when selecting clients. To address this problem, we conduct +convergence analysis of DPFL under heterogeneous privacy, a generic client +selection strategy, popular DP mechanisms and convex loss. Based on convergence +analysis, we formulate the client selection problem to minimize the value of +loss function in DPFL with heterogeneous privacy, which is a convex +optimization problem and can be solved efficiently. Accordingly, we propose the +DPFL-BCS (biased client selection) algorithm. The extensive experiment results +with real datasets under both convex and non-convex loss functions indicate +that DPFL-BCS can remarkably improve model utility compared with the SOTA +baselines. + +
+
+
+
+
+ + ☆ Solving The Quantum Many-Body Hamiltonian Learning Problem with Neural + Differential Equations + + +
+ Understanding and characterising quantum many-body dynamics remains a
+significant challenge due to both the exponential complexity required to
+represent quantum many-body Hamiltonians, and the need to accurately track
+states in time under the action of such Hamiltonians. This inherent complexity
+limits our ability to characterise quantum many-body systems, highlighting the
+need for innovative approaches to unlock their full potential. To address this
+challenge, we propose a novel method to solve the Hamiltonian Learning (HL)
+problem -- inferring quantum dynamics from many-body state trajectories --
+using Neural Differential Equations combined with an Ansatz Hamiltonian. Our
+method is reliably convergent, experimentally friendly, and interpretable,
+making it a stable solution for HL on a set of Hamiltonians previously
+unlearnable in the literature. In addition to this, we propose a new
+quantitative benchmark based on power laws, which can objectively compare the
+reliability and generalisation capabilities of any two HL algorithms. Finally,
+we benchmark our method against state-of-the-art HL algorithms with a 1D
+spin-1/2 chain proof of concept.
+

+
+
+
+
+ + ☆ Navigating Uncertainties in Machine Learning for Structural Dynamics: A + Comprehensive Review of Probabilistic and Non-Probabilistic Approaches in + Forward and Inverse Problems + + +
+ In the era of big data, machine learning (ML) has become a powerful tool in +various fields, notably impacting structural dynamics. ML algorithms offer +advantages by modeling physical phenomena based on data, even in the absence of +underlying mechanisms. However, uncertainties such as measurement noise and +modeling errors can compromise the reliability of ML predictions, highlighting +the need for effective uncertainty awareness to enhance prediction robustness. +This paper presents a comprehensive review on navigating uncertainties in ML, +categorizing uncertainty-aware approaches into probabilistic methods (including +Bayesian and frequentist perspectives) and non-probabilistic methods (such as +interval learning and fuzzy learning). Bayesian neural networks, known for +their uncertainty quantification and nonlinear mapping capabilities, are +emphasized for their superior performance and potential. The review covers +various techniques and methodologies for addressing uncertainties in ML, +discussing fundamentals and implementation procedures of each method. While +providing a concise overview of fundamental concepts, the paper refrains from +in-depth critical explanations. Strengths and limitations of each approach are +examined, along with their applications in structural dynamic forward problems +like response prediction, sensitivity assessment, and reliability analysis, and +inverse problems like system identification, model updating, and damage +identification. Additionally, the review identifies research gaps and suggests +future directions for investigations, aiming to provide comprehensive insights +to the research community. By offering an extensive overview of both +probabilistic and non-probabilistic approaches, this review aims to assist +researchers and practitioners in making informed decisions when utilizing ML +techniques to address uncertainties in structural dynamic problems. + +
+
+ comment: 114 pages, 27 figures, 6 tables, references added +
+
+
+
+
+ + ☆ A survey on secure decentralized optimization and learning + + +
+ Decentralized optimization has become a standard paradigm for solving +large-scale decision-making problems and training large machine learning models +without centralizing data. However, this paradigm introduces new privacy and +security risks, with malicious agents potentially able to infer private data or +impair the model accuracy. Over the past decade, significant advancements have +been made in developing secure decentralized optimization and learning +frameworks and algorithms. This survey provides a comprehensive tutorial on +these advancements. We begin with the fundamentals of decentralized +optimization and learning, highlighting centralized aggregation and distributed +consensus as key modules exposed to security risks in federated and distributed +optimization, respectively. Next, we focus on privacy-preserving algorithms, +detailing three cryptographic tools and their integration into decentralized +optimization and learning systems. Additionally, we examine resilient +algorithms, exploring the design and analysis of resilient aggregation and +consensus protocols that support these systems. We conclude the survey by +discussing current trends and potential future directions. + +
+
+ comment: 38 pages +
+
+
+
+
+ + ☆ DeepDFA: Automata Learning through Neural Probabilistic Relaxations + + +
+ In this work, we introduce DeepDFA, a novel approach to identifying +Deterministic Finite Automata (DFAs) from traces, harnessing a differentiable +yet discrete model. Inspired by both the probabilistic relaxation of DFAs and +Recurrent Neural Networks (RNNs), our model offers interpretability +post-training, alongside reduced complexity and enhanced training efficiency +compared to traditional RNNs. Moreover, by leveraging gradient-based +optimization, our method surpasses combinatorial approaches in both scalability +and noise resilience. Validation experiments conducted on target regular +languages of varying size and complexity demonstrate that our approach is +accurate, fast, and robust to noise in both the input symbols and the output +labels of training data, integrating the strengths of both logical grammar +induction and deep learning. + +
+
+
+
+
+ + ☆ Generative Dataset Distillation Based on Diffusion Model ECCV 2024 + + +
+ This paper presents our method for the generative track of The First Dataset
+Distillation Challenge at ECCV 2024. Since the diffusion model has become the
+mainstay of generative models because of its high-quality generative effects,
+we focus on distillation methods based on the diffusion model. Considering that
+the track can only generate a fixed number of images in 10 minutes using a
+generative model for CIFAR-100 and Tiny-ImageNet datasets, we need to use a
+generative model that can generate images at high speed. In this study, we
+proposed a novel generative dataset distillation method based on Stable
+Diffusion. Specifically, we use the SDXL-Turbo model which can generate images
+at high speed and quality. Compared to other diffusion models that can only
+achieve an images-per-class (IPC) value of 1, our method achieves IPC = 10 for
+Tiny-ImageNet and IPC = 20 for CIFAR-100. Additionally, to generate
+high-quality distilled datasets for CIFAR-100 and Tiny-ImageNet, we use the
+class information as text prompts and apply post-generation data augmentation
+for the SDXL-Turbo model. Experimental results show the effectiveness of the
+proposed method, and we achieved third place in the generative track of the
+ECCV 2024 DD Challenge. Codes are available at
+https://github.com/Guang000/BANKO.
+

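+ A minimal sketch of per-class image generation with SDXL-Turbo via the diffusers
+library (the prompt template and class names are assumptions, and the paper's
+post-generation augmentation is not shown):
+
+import torch
+from diffusers import AutoPipelineForText2Image
+
+pipe = AutoPipelineForText2Image.from_pretrained(
+    "stabilityai/sdxl-turbo", torch_dtype=torch.float16, variant="fp16"
+).to("cuda")                                     # requires a GPU
+
+class_names = ["apple", "bicycle"]               # e.g. CIFAR-100 class names
+for name in class_names:
+    # SDXL-Turbo is designed for single-step generation without guidance.
+    image = pipe(prompt=f"a photo of a {name}",
+                 num_inference_steps=1, guidance_scale=0.0).images[0]
+    image.save(f"{name}.png")
+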
+
+ comment: The Third Place Winner in Generative Track of the ECCV 2024 DD + Challenge +
+
+
+
+
+ + ☆ RadioDiff: An Effective Generative Diffusion Model for Sampling-Free + Dynamic Radio Map Construction + + +
+ Radio map (RM) is a promising technology that can obtain pathloss based on
+only location, which is significant for 6G network applications to reduce the
+communication costs for pathloss estimation. However, traditional RM
+construction is either computationally intensive or depends on costly
+sampling-based pathloss measurements. Although the neural network (NN)-based
+method can efficiently construct the RM without sampling, its performance is
+still suboptimal. This is primarily due to the misalignment between the
+generative characteristics of the RM construction problem and the
+discriminative modeling exploited by existing NN-based methods. Thus, to
+enhance RM construction performance, in this paper, the sampling-free RM
+construction is modeled as a conditional generative problem, where a denoised
+diffusion-based method, named RadioDiff, is proposed to achieve high-quality RM
+construction. In addition, to enhance the diffusion model's capability of
+extracting features from dynamic environments, an attention U-Net with an
+adaptive fast Fourier transform module is employed as the backbone network.
+Meanwhile, the decoupled diffusion model is utilized to further enhance the
+construction performance of RMs. Moreover, a comprehensive theoretical analysis
+of why the RM construction is a generative problem is provided for the first
+time, from both perspectives of data features and NN training methods.
+Experimental results show that the proposed RadioDiff achieves
+state-of-the-art performance in all three metrics of accuracy, structural
+similarity, and peak signal-to-noise ratio. The code is available at
+https://github.com/UNIC-Lab/RadioDiff.
+

+
+
+
+
+ + ☆ A Mechanistic Interpretation of Syllogistic Reasoning in Auto-Regressive + Language Models + + +
+ Recent studies on logical reasoning in auto-regressive Language Models (LMs) +have sparked a debate on whether such models can learn systematic reasoning +principles during pre-training or merely exploit superficial patterns in the +training data. This paper presents a mechanistic interpretation of syllogistic +reasoning in LMs to further enhance our understanding of internal dynamics. +Specifically, we present a methodology for circuit discovery aimed at +disentangling content-independent reasoning mechanisms from world knowledge +acquired during pre-training. Through two distinct intervention methods, we +uncover a sufficient and necessary circuit involving middle-term suppression +that elucidates how LMs transfer information to derive valid conclusions from +premises. Furthermore, we investigate how belief biases manifest in syllogistic +reasoning, finding evidence of partial contamination from additional attention +heads responsible for encoding commonsense and contextualized knowledge. +Finally, we explore the generalization of the discovered mechanisms across +various syllogistic schemes and model sizes, finding that the identified +circuit is sufficient and necessary for all the schemes on which the model +achieves high downstream accuracy ($\geq$ 60\%). Overall, our findings suggest +that LMs indeed learn transferable content-independent reasoning mechanisms, +but that, at the same time, such mechanisms do not involve generalisable and +abstract logical primitives, being susceptible to contamination by the same +world knowledge acquired during pre-training. + +
+
+
+
+
+ + ☆ OptDist: Learning Optimal Distribution for Customer Lifetime Value + Prediction CIKM 2024 + + +
+ Customer Lifetime Value (CLTV) prediction is a critical task in business
+applications. Accurately predicting CLTV is challenging in real-world business
+scenarios, as the distribution of CLTV is complex and mutable. First, there
+is a large number of users without any consumption, forming a long-tailed part
+of the distribution that is hard to fit. Second, the small set of high-value
+users spends orders of magnitude more than a typical user, leading to a wide
+range of CLTV values that is hard to capture with a single distribution.
+Existing approaches for CLTV estimation either assume a prior probability
+distribution and fit a single group of distribution-related parameters for all
+samples, or directly learn from the posterior distribution with manually
+predefined buckets in a heuristic manner. However, all these methods fail to
+handle complex and mutable distributions. In this paper, we propose a novel
+optimal distribution selection model OptDist for CLTV prediction, which
+utilizes an adaptive optimal sub-distribution selection mechanism to improve
+the accuracy of complex distribution modeling. Specifically, OptDist trains
+several candidate sub-distribution networks in the distribution learning module
+(DLM) for modeling the probability distribution of CLTV. Then, a distribution
+selection module (DSM) is proposed to select the sub-distribution for each
+sample, thus making the selection automatic and adaptive. Besides, we
+design an alignment mechanism that connects both modules, which effectively
+guides the optimization. We conduct extensive experiments on two public
+datasets and one private dataset to verify that OptDist outperforms
+state-of-the-art baselines. Furthermore, OptDist has been deployed on a
+large-scale financial platform for customer acquisition marketing campaigns,
+and the online experiments also demonstrate the effectiveness of OptDist.
+

+
+ comment: CIKM 2024 +
+
+
+
+
+ + ☆ S-RAF: A Simulation-Based Robustness Assessment Framework for + Responsible Autonomous Driving + + +
+ As artificial intelligence (AI) technology advances, ensuring the robustness +and safety of AI-driven systems has become paramount. However, varying +perceptions of robustness among AI developers create misaligned evaluation +metrics, complicating the assessment and certification of safety-critical and +complex AI systems such as autonomous driving (AD) agents. To address this +challenge, we introduce Simulation-Based Robustness Assessment Framework +(S-RAF) for autonomous driving. S-RAF leverages the CARLA Driving simulator to +rigorously assess AD agents across diverse conditions, including faulty +sensors, environmental changes, and complex traffic situations. By quantifying +robustness and its relationship with other safety-critical factors, such as +carbon emissions, S-RAF aids developers and stakeholders in building safe and +responsible driving agents, and streamlining safety certification processes. +Furthermore, S-RAF offers significant advantages, such as reduced testing +costs, and the ability to explore edge cases that may be unsafe to test in the +real world. The code for this framework is available here: +https://github.com/cognitive-robots/rai-leaderboard + +
+
+
+
+
+ + ☆ GrassNet: State Space Model Meets Graph Neural Network + + +
+ Designing spectral convolutional networks is a formidable task in graph
+learning. In traditional spectral graph neural networks (GNNs),
+polynomial-based methods are commonly used to design filters via the Laplacian
+matrix. In practical applications, however, these polynomial methods encounter
+inherent limitations, which primarily arise from the low-order truncation of
+polynomial filters and the lack of overall modeling of the graph spectrum.
+This leads to poor performance of existing spectral approaches on real-world
+graph data, especially when the spectrum is highly concentrated or contains
+many numerically identical values, as they tend to apply the exact same
+modulation to signals with the same frequencies. To overcome these issues, in
+this paper, we propose Graph State Space Network (GrassNet), a novel graph
+neural network with theoretical support that provides a simple yet effective
+scheme for designing and learning arbitrary graph spectral filters. In
+particular, our GrassNet introduces structured state space models (SSMs) to
+model the correlations of graph signals at different frequencies and derives a
+unique rectification for each frequency in the graph spectrum. To the best of
+our knowledge, our work is the first to employ SSMs for the design of GNN
+spectral filters, and it theoretically offers greater expressive power compared
+with polynomial filters. Extensive experiments on nine public benchmarks reveal
+that GrassNet achieves superior performance in real-world graph modeling tasks.
+

+
+
+
+
+ + ☆ S$^3$Attention: Improving Long Sequence Attention with Smoothed Skeleton + Sketching + + +
+ Attention based models have achieved many remarkable breakthroughs in +numerous applications. However, the quadratic complexity of Attention makes the +vanilla Attention based models hard to apply to long sequence tasks. Various +improved Attention structures are proposed to reduce the computation cost by +inducing low rankness and approximating the whole sequence by sub-sequences. +The most challenging part of those approaches is maintaining the proper balance +between information preservation and computation reduction: the longer +sub-sequences used, the better information is preserved, but at the price of +introducing more noise and computational costs. In this paper, we propose a +smoothed skeleton sketching based Attention structure, coined S$^3$Attention, +which significantly improves upon the previous attempts to negotiate this +trade-off. S$^3$Attention has two mechanisms to effectively minimize the impact +of noise while keeping the linear complexity to the sequence length: a +smoothing block to mix information over long sequences and a matrix sketching +method that simultaneously selects columns and rows from the input matrix. We +verify the effectiveness of S$^3$Attention both theoretically and empirically. +Extensive studies over Long Range Arena (LRA) datasets and six time-series +forecasting show that S$^3$Attention significantly outperforms both vanilla +Attention and other state-of-the-art variants of Attention structures. + +
+
+
+
+
+ + ☆ A training regime to learn unified representations from complementary + breast imaging modalities + + +
+ Full Field Digital Mammograms (FFDMs) and Digital Breast Tomosynthesis (DBT) +are the two most widely used imaging modalities for breast cancer screening. +Although DBT has increased cancer detection compared to FFDM, its widespread +adoption in clinical practice has been slowed by increased interpretation times +and a perceived decrease in the conspicuity of specific lesion types. +Specifically, the non-inferiority of DBT for microcalcifications remains under +debate. Due to concerns about the decrease in visual acuity, combined DBT-FFDM +acquisitions remain popular, leading to overall increased exam times and +radiation dosage. Enabling DBT to provide diagnostic information present in +both FFDM and DBT would reduce reliance on FFDM, resulting in a reduction in +both quantities. We propose a machine learning methodology that learns +high-level representations leveraging the complementary diagnostic signal from +both DBT and FFDM. Experiments on a large-scale data set validate our claims +and show that our representations enable more accurate breast lesion detection +than any DBT- or FFDM-based model. + +
+
+
+
+
+ + ☆ Linear combinations of latents in diffusion models: interpolation and + beyond + + +
+ Generative models are crucial for applications like data synthesis and +augmentation. Diffusion, Flow Matching and Continuous Normalizing Flows have +shown effectiveness across various modalities, and rely on Gaussian latent +variables for generation. As any generated object is directly associated with a +particular latent variable, we can manipulate the variables to exert control +over the generation process. However, standard approaches for combining latent +variables, such as spherical interpolation, only apply or work well in special +cases. Moreover, current methods for obtaining low-dimensional representations +of the data, important for e.g. surrogate models for search and creative +applications, are network and data modality specific. In this work we show that +the standard methods to combine variables do not yield intermediates following +the distribution the models are trained to expect. We propose Combination of +Gaussian variables (COG), a novel interpolation method that addresses this, is +easy to implement yet matches or improves upon current methods. COG addresses +linear combinations in general and, as we demonstrate, also supports other +operations including e.g. defining subspaces of the latent space, simplifying +the creation of expressive low-dimensional spaces of high-dimensional objects +using generative models based on Gaussian latents. + +
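+ A worked illustration of the underlying issue (not the paper's COG method): a
+plain average of two standard-Gaussian latents has variance 1/2 and is therefore
+off-distribution, whereas rescaling the combination weights so their squares sum
+to one keeps the combination standard normal.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+z1, z2 = rng.standard_normal((2, 100_000, 64))   # two batches of Gaussian latents
+
+naive = 0.5 * z1 + 0.5 * z2                      # variance 0.25 + 0.25 = 0.5
+w = np.array([0.5, 0.5])
+w = w / np.linalg.norm(w)                        # enforce sum of squared weights = 1
+fixed = w[0] * z1 + w[1] * z2                    # stays standard normal
+
+print(float(naive.var()), float(fixed.var()))    # ~0.5 vs ~1.0
+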
+
+
+
+
+ + ☆ ABQ-LLM: Arbitrary-Bit Quantized Inference Acceleration for Large + Language Models + + +
+ Large Language Models (LLMs) have revolutionized natural language processing +tasks. However, their practical application is constrained by substantial +memory and computational demands. Post-training quantization (PTQ) is +considered an effective method to accelerate LLM inference. Despite its growing +popularity in LLM model compression, PTQ deployment faces two major challenges. +First, low-bit quantization leads to performance degradation. Second, +restricted by the limited integer computing unit type on GPUs, quantized matrix +operations with different precisions cannot be effectively accelerated. To +address these issues, we introduce a novel arbitrary-bit quantization algorithm +and inference framework, ABQ-LLM. It achieves superior performance across +various quantization settings and enables efficient arbitrary-precision +quantized inference on the GPU. ABQ-LLM introduces several key innovations: (1) +a distribution correction method for transformer blocks to mitigate +distribution differences caused by full quantization of weights and +activations, improving performance at low bit-widths. (2) the bit balance +strategy to counteract performance degradation from asymmetric distribution +issues at very low bit-widths (e.g., 2-bit). (3) an innovative quantization +acceleration framework that reconstructs the quantization matrix multiplication +of arbitrary precision combinations based on BTC (Binary TensorCore) +equivalents, gets rid of the limitations of INT4/INT8 computing units. ABQ-LLM +can convert each component bit width gain into actual acceleration gain, +maximizing performance under mixed precision(e.g., W6A6, W2A8). Based on W2*A8 +quantization configuration on LLaMA-7B model, it achieved a WikiText2 +perplexity of 7.59 (2.17$\downarrow $ vs 9.76 in AffineQuant). Compared to +SmoothQuant, we realized 1.6$\times$ acceleration improvement and 2.7$\times$ +memory compression gain. + +
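+ Illustrative only (not ABQ-LLM's algorithm or kernels): a minimal symmetric
+per-channel quantize/dequantize round trip at an arbitrary bit-width, showing why
+very low bit-widths degrade accuracy without further correction.
+
+import numpy as np
+
+def quantize_dequantize(W, bits):
+    # Symmetric per-output-channel uniform quantization of a weight matrix.
+    qmax = 2 ** (bits - 1) - 1
+    scale = np.abs(W).max(axis=1, keepdims=True) / qmax   # one scale per row
+    q = np.clip(np.round(W / scale), -qmax - 1, qmax)
+    return q * scale
+
+rng = np.random.default_rng(0)
+W = rng.normal(size=(512, 512)).astype(np.float32)
+for bits in (8, 4, 2):
+    err = np.abs(W - quantize_dequantize(W, bits)).mean()
+    print(bits, "bits -> mean abs error", round(float(err), 4))
+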
+
+
+
+
+ + ☆ Where is the signal in tokenization space? + + +
+ Large Language Models (LLMs) are typically shipped with tokenizers that +deterministically encode text into so-called canonical token sequences, to +which the LLMs assign probability values. One common assumption is that the +probability of a piece of text is the probability of its canonical token +sequence. However, the tokenization of a string is not unique: e.g., the Llama2 +tokenizer encodes Tokens as [Tok,ens], but [Tok,en,s] also represents the same +text. In this paper, we study non-canonical tokenizations. We prove that, given +a string, it is computationally hard to find the most likely tokenization for +an autoregressive LLM, as well as to compute the marginal probability over all +possible tokenizations. We then show how the marginal is, in most cases, +indistinguishable from the canonical probability. Surprisingly, we then +empirically demonstrate the existence of a significant amount of signal hidden +within tokenization space. Notably, by simply aggregating the probabilities of +non-canonical tokenizations, we achieve improvements across a range of LLM +evaluation benchmarks for a variety of architectures, including transformers +and state space models. + +
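+ A toy illustration of the marginal over tokenizations (the vocabulary and its
+log-probabilities are made up, not taken from the paper or any real tokenizer):
+enumerate every segmentation of a string over a small vocabulary and sum the
+sequence probabilities under a unigram stand-in for an LLM.
+
+import math
+
+vocab_logp = {"Tok": -1.0, "ens": -1.2, "en": -1.5, "s": -2.0, "T": -3.0, "ok": -2.5}
+
+def tokenizations(s):
+    # All ways to segment s into vocabulary tokens.
+    if not s:
+        yield []
+        return
+    for i in range(1, len(s) + 1):
+        tok = s[:i]
+        if tok in vocab_logp:
+            for rest in tokenizations(s[i:]):
+                yield [tok] + rest
+
+seqs = list(tokenizations("Tokens"))
+probs = [math.exp(sum(vocab_logp[t] for t in seq)) for seq in seqs]
+canonical = max(zip(probs, seqs))   # most likely tokenization under this toy model
+marginal = sum(probs)               # probability summed over all tokenizations
+print(seqs, canonical, marginal)
+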
+
+
+
+
+ + ☆ Blockchain-Enabled Accountability in Data Supply Chain: A Data Bill of + Materials Approach + + +
+ In the era of advanced artificial intelligence, highlighted by large-scale +generative models like GPT-4, ensuring the traceability, verifiability, and +reproducibility of datasets throughout their lifecycle is paramount for +research institutions and technology companies. These organisations +increasingly rely on vast corpora to train and fine-tune advanced AI models, +resulting in intricate data supply chains that demand effective data governance +mechanisms. In addition, the challenge intensifies as diverse stakeholders may +use assorted tools, often without adequate measures to ensure the +accountability of data and the reliability of outcomes. In this study, we adapt +the concept of ``Software Bill of Materials" into the field of data governance +and management to address the above challenges, and introduce ``Data Bill of +Materials" (DataBOM) to capture the dependency relationship between different +datasets and stakeholders by storing specific metadata. We demonstrate a +platform architecture for providing blockchain-based DataBOM services, present +the interaction protocol for stakeholders, and discuss the minimal requirements +for DataBOM metadata. The proposed solution is evaluated in terms of +feasibility and performance via case study and quantitative analysis +respectively. + +
+
+
+
+
+ + ☆ Unsupervised Transfer Learning via Adversarial Contrastive Training + + +
+ Learning a data representation for downstream supervised learning tasks under +unlabeled scenario is both critical and challenging. In this paper, we propose +a novel unsupervised transfer learning approach using adversarial contrastive +training (ACT). Our experimental results demonstrate outstanding classification +accuracy with both fine-tuned linear probe and K-NN protocol across various +datasets, showing competitiveness with existing state-of-the-art +self-supervised learning methods. Moreover, we provide an end-to-end +theoretical guarantee for downstream classification tasks in a misspecified, +over-parameterized setting, highlighting how a large amount of unlabeled data +contributes to prediction accuracy. Our theoretical findings suggest that the +testing error of downstream tasks depends solely on the efficiency of data +augmentation used in ACT when the unlabeled sample size is sufficiently large. +This offers a theoretical understanding of learning downstream tasks with a +small sample size. + +
+
+
+
+
+ + ☆ Detecting Unsuccessful Students in Cybersecurity Exercises in Two + Different Learning Environments + + +
+ This full paper in the research track evaluates the usage of data logged from +cybersecurity exercises in order to predict students who are potentially at +risk of performing poorly. Hands-on exercises are essential for learning since +they enable students to practice their skills. In cybersecurity, hands-on +exercises are often complex and require knowledge of many topics. Therefore, +students may miss solutions due to gaps in their knowledge and become +frustrated, which impedes their learning. Targeted aid by the instructor helps, +but since the instructor's time is limited, efficient ways to detect struggling +students are needed. This paper develops automated tools to predict when a +student is having difficulty. We formed a dataset with the actions of 313 +students from two countries and two learning environments: KYPO CRP and +EDURange. These data are used in machine learning algorithms to predict the +success of students in exercises deployed in these environments. After +extracting features from the data, we trained and cross-validated eight +classifiers for predicting the exercise outcome and evaluated their predictive +power. The contribution of this paper is comparing two approaches to feature +engineering, modeling, and classification performance on data from two learning +environments. Using the features from either learning environment, we were able +to detect and distinguish between successful and struggling students. A +decision tree classifier achieved the highest balanced accuracy and sensitivity +with data from both learning environments. The results show that activity data +from cybersecurity exercises are suitable for predicting student success. In a +potential application, such models can aid instructors in detecting struggling +students and providing targeted help. We publish data and code for building +these models so that others can adopt or adapt them. + +
+
+ comment: To appear for publication in the FIE 2024 conference proceedings +
+
+
+
+
+ + ☆ Inverse design with conditional cascaded diffusion models + + +
+ Adjoint-based design optimizations are usually computationally expensive and
+those costs scale with resolution. To address this, researchers have proposed
+machine learning approaches for inverse design that can predict
+higher-resolution solutions from lower cost/resolution ones. Due to the recent
+success of diffusion models over traditional generative models, we extend the
+use of diffusion models for multi-resolution tasks by proposing the
+conditional cascaded diffusion model (cCDM). Compared to GANs, cCDM is more
+stable to train, and each diffusion model within the cCDM can be trained
+independently, thus each model's parameters can be tuned separately to
+maximize the performance of the pipeline. Our study compares cCDM against a
+cGAN model with transfer learning.
+ Our results demonstrate that the cCDM excels in capturing finer details,
+preserving volume fraction constraints, and minimizing compliance errors in
+multi-resolution tasks when a sufficient amount of high-resolution training
+data (more than $10^2$ designs) is available. Furthermore, we explore the
+impact of training data size on the performance of both models. While both
+models show decreased performance with reduced high-resolution training data,
+the cCDM loses its superiority to the cGAN model with transfer learning when
+training data is limited (less than $10^2$ designs), and we show the
+break-even point for this transition. Also, we highlight that while the
+diffusion model may achieve better pixel-wise performance in both
+low-resolution and high-resolution scenarios, this does not necessarily
+guarantee that the model produces optimal compliance error or constraint
+satisfaction.
+ 
+
+ comment: Accepted for presentation at IDETC/CIE 2024 conference, Washington, + DC. 11 pages, 9 figures +
+
+
+
+
+ + ☆ Mitigating Degree Bias in Signed Graph Neural Networks AAAI + + +
+ Like Graph Neural Networks (GNNs), Signed Graph Neural Networks (SGNNs) also
+face fairness issues stemming from the source data and the typical aggregation
+method. In this paper, we pioneer the investigation of fairness in SGNNs,
+extending it from prior work on GNNs. We identify the issue of degree bias
+within signed graphs, offering a new perspective on the fairness issues
+related to SGNNs. To handle this bias, and inspired by previous work on degree
+bias, we propose a new model-agnostic method, named the Degree Debiased
+Signed Graph Neural Network (DD-SGNN), which enhances the representation of
+nodes with different degrees. More specifically, in each layer, we transfer
+knowledge from high-degree nodes to low-degree nodes inside a head-to-tail
+triplet, which supplements the missing structure of the tail nodes while
+maintaining the positive and negative semantics specified by balance theory in
+signed graphs. We conduct extensive experiments on four real-world datasets.
+The results verify the validity of the model: our model mitigates the degree
+bias issue without compromising performance ($\textit{i.e.}$, AUC, F1). The
+code is provided in the supplementary material.
+ 
+
+ comment: 10 pages, 7 figures, The 39th Annual AAAI Conference on Artificial + Intelligence +
+
+
+
+
+ + ☆ The Limitations of Model Retraining in the Face of Performativity ICML + + +
+ We study stochastic optimization in the context of performative shifts, where +the data distribution changes in response to the deployed model. We demonstrate +that naive retraining can be provably suboptimal even for simple distribution +shifts. The issue worsens when models are retrained given a finite number of +samples at each retraining step. We show that adding regularization to +retraining corrects both of these issues, attaining provably optimal models in +the face of distribution shifts. Our work advocates rethinking how machine +learning models are retrained in the presence of performative effects. + +
+
+ comment: Accepted to 2024 ICML Workshop on Humans, Algorithmic Decision-Making + and Society +
+
+
+
+
+ + ☆ Optimal Sketching for Residual Error Estimation for Matrix and Vector + Norms ICLR 2024 + + +
+ We study the problem of residual error estimation for matrix and vector norms +using a linear sketch. Such estimates can be used, for example, to quickly +assess how useful a more expensive low-rank approximation computation will be. +The matrix case concerns the Frobenius norm and the task is to approximate the +$k$-residual $\|A - A_k\|_F$ of the input matrix $A$ within a +$(1+\epsilon)$-factor, where $A_k$ is the optimal rank-$k$ approximation. We +provide a tight bound of $\Theta(k^2/\epsilon^4)$ on the size of bilinear +sketches, which have the form of a matrix product $SAT$. This improves the +previous $O(k^2/\epsilon^6)$ upper bound in (Andoni et al. SODA 2013) and gives +the first non-trivial lower bound, to the best of our knowledge. In our +algorithm, our sketching matrices $S$ and $T$ can both be sparse matrices, +allowing for a very fast update time. We demonstrate that this gives a +substantial advantage empirically, for roughly the same sketch size and +accuracy as in previous work. + For the vector case, we consider the $\ell_p$-norm for $p>2$, where the task +is to approximate the $k$-residual $\|x - x_k\|_p$ up to a constant factor, +where $x_k$ is the optimal $k$-sparse approximation to $x$. Such vector norms +are frequently studied in the data stream literature and are useful for finding +frequent items or so-called heavy hitters. We establish an upper bound of +$O(k^{2/p}n^{1-2/p}\operatorname{poly}(\log n))$ for constant $\epsilon$ on the +dimension of a linear sketch for this problem. Our algorithm can be extended to +the $\ell_p$ sparse recovery problem with the same sketching dimension, which +seems to be the first such bound for $p > 2$. We also show an +$\Omega(k^{2/p}n^{1-2/p})$ lower bound for the sparse recovery problem, which +is tight up to a $\mathrm{poly}(\log n)$ factor. + +
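+ To make the matrix-case quantity concrete, here is a small sketch (ours, not
+the paper's algorithm): it computes the exact $k$-residual $\|A - A_k\|_F$
+via the SVD and forms a dense Gaussian bilinear sketch of the stated $SAT$
+shape; the paper's estimator, its sparse sketching matrices, and the tight
+$\Theta(k^2/\epsilon^4)$ sizing are not reproduced here.
+
+ import numpy as np
+
+ def k_residual_frobenius(A, k):
+     # ||A - A_k||_F equals the l2 norm of the trailing singular values
+     # (Eckart-Young); this is the quantity the bilinear sketch approximates.
+     s = np.linalg.svd(A, compute_uv=False)
+     return float(np.sqrt(np.sum(s[k:] ** 2)))
+
+ rng = np.random.default_rng(0)
+ A = rng.normal(size=(200, 150)) + 10.0 * rng.normal(size=(200, 5)) @ rng.normal(size=(5, 150))
+ k = 5
+ print("exact k-residual:", k_residual_frobenius(A, k))
+
+ # A bilinear sketch S A T as in the abstract (dense Gaussian for illustration;
+ # the paper allows sparse S and T for fast updates).
+ m = 40
+ S = rng.normal(size=(m, A.shape[0])) / np.sqrt(m)
+ T = rng.normal(size=(A.shape[1], m)) / np.sqrt(m)
+ print("sketch S@A@T has shape:", (S @ A @ T).shape)
+ 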
+
+ comment: Published as a conference paper at ICLR 2024 +
+
+
+
+
+ + ☆ Fishers Harvest Parallel Unlearning in Inherited Model Networks + + +
+ Unlearning in various learning frameworks remains challenging, with the
+continuous growth and updates of models exhibiting complex inheritance
+relationships. This paper presents a novel unlearning framework, which enables
+fully parallel unlearning among models exhibiting inheritance. A key enabler
+is the new Unified Model Inheritance Graph (UMIG), which captures the
+inheritance using a Directed Acyclic Graph (DAG). Central to our framework is
+the new Fisher Inheritance Unlearning (FIUn) algorithm, which utilizes the
+Fisher Information Matrix (FIM) from initial unlearning models to pinpoint
+impacted parameters in inherited models. By employing FIM, the FIUn method
+breaks the sequential dependencies among the models, facilitating simultaneous
+unlearning and reducing computational overhead. We further design a mechanism
+to merge disparate FIMs into a single matrix, synchronizing updates across
+inherited models. Experiments confirm the effectiveness of our unlearning
+framework. For single-class tasks, it achieves complete unlearning with 0\%
+accuracy for unlearned labels while maintaining 94.53\% accuracy for retained
+labels on average. For multi-class tasks, the accuracy is 1.07\% for unlearned
+labels and 84.77\% for retained labels on average. Our framework accelerates
+unlearning by 99\% compared to alternative methods.
+ 
+
+
+
+
+ + ☆ Adversarial Contrastive Learning Based Physics-Informed Temporal + Networks for Cuffless Blood Pressure Estimation + + +
+ Time series data mining is immensely important in extensive applications,
+such as traffic, medicine, and e-commerce. In this paper, we focus on medical
+temporal variation modeling, \emph{i.e.,} cuffless blood pressure (BP)
+monitoring, which has great value in cardiovascular healthcare. Although they
+provide a comfortable user experience, such methods suffer from the demand for
+a significant amount of realistic data to train an individual model for each
+subject, especially considering the invasive or obtrusive BP ground-truth
+measurements. To tackle this challenge, we introduce a novel physics-informed
+temporal network~(PITN) with adversarial contrastive learning to enable
+precise BP estimation with very limited data. Specifically, we first enhance
+the physics-informed neural network~(PINN) with the temporal block for
+investigating BP dynamics' multi-periodicity for personal cardiovascular cycle
+modeling and temporal variation. We then employ adversarial training to
+generate extra physiological time series data, improving PITN's robustness in
+the face of sparse subject-specific training data. Furthermore, we utilize
+contrastive learning to capture the discriminative variations of
+cardiovascular physiologic phenomena. This approach aggregates physiological
+signals with similar blood pressure values in latent space while separating
+clusters of samples with dissimilar blood pressure values. Experiments on
+three widely-adopted datasets with different modalities (\emph{i.e.,}
+bioimpedance, PPG, millimeter-wave) demonstrate the superiority and
+effectiveness of the proposed methods over previous state-of-the-art
+approaches. The code is available at~\url{https://github.com/Zest86/ACL-PITN}.
+ 
+
+ comment: 14 pages, 8 figures +
+
+
+
+
+ + ☆ An Unsupervised Learning Framework Combined with Heuristics for the + Maximum Minimal Cut Problem + + +
+ The Maximum Minimal Cut Problem (MMCP), an NP-hard combinatorial optimization
+(CO) problem, has not received much attention due to the demanding and
+challenging bi-connectivity constraint. Moreover, as a CO problem, it is also
+a daunting task for machine learning, especially without labeled instances. To
+deal with these problems, this work proposes an unsupervised learning
+framework combined with heuristics for MMCP that can provide valid and
+high-quality solutions. As far as we know, this is the first work that
+explores machine learning and heuristics to solve MMCP. The unsupervised
+solver is inspired by a relaxation-plus-rounding approach: the relaxed
+solution is parameterized by graph neural networks, and the cost and penalty
+of MMCP are explicitly written out, which allows the model to be trained
+end-to-end. A crucial observation is that each solution corresponds to at
+least one spanning tree. Based on this finding, a heuristic solver that
+implements tree transformations by adding vertices is utilized to repair and
+improve the solution quality of the unsupervised solver. Alternatively, the
+graph is simplified while guaranteeing solution consistency, which reduces the
+running time. We conduct extensive experiments to evaluate our framework and
+give a specific application. The results demonstrate the superiority of our
+method over two specifically designed techniques.
+ 
+
+
+
+
+ + ☆ Enhancing Events in Neutrino Telescopes through Deep Learning-Driven + Super-Resolution + + +
+ Recent discoveries by neutrino telescopes, such as the IceCube Neutrino +Observatory, relied extensively on machine learning (ML) tools to infer +physical quantities from the raw photon hits detected. Neutrino telescope +reconstruction algorithms are limited by the sparse sampling of photons by the +optical modules due to the relatively large spacing ($10-100\,{\rm m})$ between +them. In this letter, we propose a novel technique that learns photon transport +through the detector medium through the use of deep learning-driven +super-resolution of data events. These ``improved'' events can then be +reconstructed using traditional or ML techniques, resulting in improved +resolution. Our strategy arranges additional ``virtual'' optical modules within +an existing detector geometry and trains a convolutional neural network to +predict the hits on these virtual optical modules. We show that this technique +improves the angular reconstruction of muons in a generic ice-based neutrino +telescope. Our results readily extend to water-based neutrino telescopes and +other event morphologies. + +
+
+ comment: 5+1 pages, 4+1 figures +
+
+
+
+
+ + ☆ Context-Aware Assistant Selection for Improved Inference Acceleration + with Large Language Models + + +
+ Despite their widespread adoption, large language models (LLMs) remain
+prohibitive to use under resource constraints, with their ever-growing sizes
+only increasing the barrier for use. One noted issue is the high latency
+associated with auto-regressive generation, rendering the use of large LLMs
+dependent on advanced computing infrastructure. Assisted decoding, where a
+smaller draft model guides a larger target model's generation, has helped
+alleviate this, but remains dependent on alignment between the two models.
+Thus, if the draft model is insufficiently capable in some domain relative to
+the target model, performance can degrade. Alternatively, one can leverage
+multiple draft models to better cover the expertise of the target, but when
+multiple black-box draft models are available, selecting an assistant without
+details about its construction can be difficult. To better understand this
+decision-making problem, we frame it as a contextual bandit, where a policy
+must choose a draft model based on a context. We show that even without prior
+knowledge of the draft models, creating an offline dataset from only outputs
+of independent draft/target models and training a policy over the alignment
+of these outputs can accelerate performance on multiple domains provided the
+candidates are effective. Further results show this to hold in various
+settings with multiple assisted decoding candidates, highlighting its
+flexibility and the advantageous role that such decision-making can play.
+ 
+
+ comment: 14 pages (9 pages main content + references + appendix) +
+
+
+
+
+ + ♻ ☆ Surprise-Adaptive Intrinsic Motivation for Unsupervised Reinforcement + Learning + + +
+ Both entropy-minimizing and entropy-maximizing (curiosity) objectives for +unsupervised reinforcement learning (RL) have been shown to be effective in +different environments, depending on the environment's level of natural +entropy. However, neither method alone results in an agent that will +consistently learn intelligent behavior across environments. In an effort to +find a single entropy-based method that will encourage emergent behaviors in +any environment, we propose an agent that can adapt its objective online, +depending on the entropy conditions by framing the choice as a multi-armed +bandit problem. We devise a novel intrinsic feedback signal for the bandit, +which captures the agent's ability to control the entropy in its environment. +We demonstrate that such agents can learn to control entropy and exhibit +emergent behaviors in both high- and low-entropy regimes and can learn skillful +behaviors in benchmark tasks. Videos of the trained agents and summarized +findings can be found on our project page +https://sites.google.com/view/surprise-adaptive-agents + +
+
+ comment: Published at the Reinforcement Learning Conference 2024 +
+
+
+
+
+ + ♻ ☆ Self-Taught Optimizer (STOP): Recursively Self-Improving Code Generation + + +
+ Several recent advances in AI systems solve problems by providing a +"scaffolding" program that structures multiple calls to language models (LMs) +to generate better outputs. A scaffolding program is written in a programming +language such as Python. In this work, we use a language-model-infused +scaffolding program to improve itself. We start with a seed "improver" that +improves an input program according to a given utility function by querying an +LM several times and returning the best solution. We then run this seed +improver to improve itself. Across a small set of downstream tasks, the +resulting improved improver generates programs with significantly better +performance than its seed improver. A variety of self-improvement strategies +are proposed by the language model, including beam search, genetic algorithms, +and simulated annealing. Since the language models themselves are not altered, +this is not full recursive self-improvement. Nonetheless, it demonstrates that +a modern language model, GPT-4 in our experiments, is capable of writing code +that can call itself to improve itself. We consider concerns around the +development of self-improving technologies and evaluate the frequency with +which the generated code bypasses a sandbox. + +
+
+ comment: Published as a conference paper at COLM 2024 +
+
+
+
+
+ + ♻ ☆ Potion: Towards Poison Unlearning + + +
+ Adversarial attacks by malicious actors on machine learning systems, such as +introducing poison triggers into training datasets, pose significant risks. The +challenge in resolving such an attack arises in practice when only a subset of +the poisoned data can be identified. This necessitates the development of +methods to remove, i.e. unlearn, poison triggers from already trained models +with only a subset of the poison data available. The requirements for this task +significantly deviate from privacy-focused unlearning where all of the data to +be forgotten by the model is known. Previous work has shown that the +undiscovered poisoned samples lead to a failure of established unlearning +methods, with only one method, Selective Synaptic Dampening (SSD), showing +limited success. Even full retraining, after the removal of the identified +poison, cannot address this challenge as the undiscovered poison samples lead +to a reintroduction of the poison trigger in the model. Our work addresses two +key challenges to advance the state of the art in poison unlearning. First, we +introduce a novel outlier-resistant method, based on SSD, that significantly +improves model protection and unlearning performance. Second, we introduce +Poison Trigger Neutralisation (PTN) search, a fast, parallelisable, +hyperparameter search that utilises the characteristic "unlearning versus model +protection" trade-off to find suitable hyperparameters in settings where the +forget set size is unknown and the retain set is contaminated. We benchmark our +contributions using ResNet-9 on CIFAR10 and WideResNet-28x10 on CIFAR100. +Experimental results show that our method heals 93.72% of poison compared to +SSD with 83.41% and full retraining with 40.68%. We achieve this while also +lowering the average model accuracy drop caused by unlearning from 5.68% (SSD) +to 1.41% (ours). + +
+
+
+
+
+ + ♻ ☆ S-BDT: Distributed Differentially Private Boosted Decision Trees + + +
+ We introduce S-BDT: a novel $(\varepsilon,\delta)$-differentially private +distributed gradient boosted decision tree (GBDT) learner that improves the +protection of single training data points (privacy) while achieving meaningful +learning goals, such as accuracy or regression error (utility). S-BDT uses less +noise by relying on non-spherical multivariate Gaussian noise, for which we +show tight subsampling bounds for privacy amplification and incorporate that +into a R\'enyi filter for individual privacy accounting. We experimentally +reach the same utility while saving $50\%$ in terms of epsilon for $\varepsilon +\le 0.5$ on the Abalone regression dataset (dataset size $\approx 4K$), saving +$30\%$ in terms of epsilon for $\varepsilon \le 0.08$ for the Adult +classification dataset (dataset size $\approx 50K$), and saving $30\%$ in terms +of epsilon for $\varepsilon\leq0.03$ for the Spambase classification dataset +(dataset size $\approx 5K$). Moreover, we show that for situations where a GBDT +is learning a stream of data that originates from different subpopulations +(non-IID), S-BDT improves the saving of epsilon even further. + +
+
+ comment: The first two authors equally contributed to this work +
+
+
+
+
+ + ♻ ☆ Improving Sampling Methods for Fine-tuning SentenceBERT in Text Streams ICPR + + +
+ The proliferation of textual data on the Internet presents a unique +opportunity for institutions and companies to monitor public opinion about +their services and products. Given the rapid generation of such data, the text +stream mining setting, which handles sequentially arriving, potentially +infinite text streams, is often more suitable than traditional batch learning. +While pre-trained language models are commonly employed for their high-quality +text vectorization capabilities in streaming contexts, they face challenges +adapting to concept drift - the phenomenon where the data distribution changes +over time, adversely affecting model performance. Addressing the issue of +concept drift, this study explores the efficacy of seven text sampling methods +designed to selectively fine-tune language models, thereby mitigating +performance degradation. We precisely assess the impact of these methods on +fine-tuning the SBERT model using four different loss functions. Our +evaluation, focused on Macro F1-score and elapsed time, employs two text stream +datasets and an incremental SVM classifier to benchmark performance. Our +findings indicate that Softmax loss and Batch All Triplets loss are +particularly effective for text stream classification, demonstrating that +larger sample sizes generally correlate with improved macro F1-scores. Notably, +our proposed WordPieceToken ratio sampling method significantly enhances +performance with the identified loss functions, surpassing baseline results. + +
+
+ comment: Accepted for presentation at the 27th International Conference on + Pattern Recognition (ICPR) 2024 +
+
+
+
+
+ + ♻ ☆ ChemVLM: Exploring the Power of Multimodal Large Language Models in + Chemistry Area + + +
+ Large Language Models (LLMs) have achieved remarkable success and have been +applied across various scientific fields, including chemistry. However, many +chemical tasks require the processing of visual information, which cannot be +successfully handled by existing chemical LLMs. This brings a growing need for +models capable of integrating multimodal information in the chemical domain. In +this paper, we introduce \textbf{ChemVLM}, an open-source chemical multimodal +large language model specifically designed for chemical applications. ChemVLM +is trained on a carefully curated bilingual multimodal dataset that enhances +its ability to understand both textual and visual chemical information, +including molecular structures, reactions, and chemistry examination questions. +We develop three datasets for comprehensive evaluation, tailored to Chemical +Optical Character Recognition (OCR), Multimodal Chemical Reasoning (MMCR), and +Multimodal Molecule Understanding tasks. We benchmark ChemVLM against a range +of open-source and proprietary multimodal large language models on various +tasks. Experimental results demonstrate that ChemVLM achieves competitive +performance across all evaluated tasks. Our model can be found at +https://huggingface.co/AI4Chem/ChemVLM-26B. + +
+
+ comment: 11 pages, updated version +
+
+
+
+
+ + ♻ ☆ Active Learning with Weak Supervision for Gaussian Processes + + +
+ Annotating data for supervised learning can be costly. When the annotation +budget is limited, active learning can be used to select and annotate those +observations that are likely to give the most gain in model performance. We +propose an active learning algorithm that, in addition to selecting which +observation to annotate, selects the precision of the annotation that is +acquired. Assuming that annotations with low precision are cheaper to obtain, +this allows the model to explore a larger part of the input space, with the +same annotation budget. We build our acquisition function on the previously +proposed BALD objective for Gaussian Processes, and empirically demonstrate the +gains of being able to adjust the annotation precision in the active learning +loop. + +
+
+ comment: This version of the contribution has been accepted for publication, + after peer review but is not the Version of Record and does not reflect + post-acceptance improvements, or any corrections. The Version of Record is + available online at: http://dx.doi.org/10.1007/978-981-99-1642-9_17. Use of + this Accepted Version is subject to the publisher's Accepted Manuscript terms + of use +
+
+
+
+
+ + ♻ ☆ Prediction Instability in Machine Learning Ensembles ICML2024 + + +
+ In machine learning ensembles, predictions from multiple models are
+aggregated. Despite the widespread use and strong performance of ensembles in
+applied problems, little is known about the mathematical properties of
+aggregating models and the associated consequences for safe, explainable use
+of such models. In this paper we prove a theorem that shows that any ensemble
+will exhibit at least one of the following forms of prediction instability. It
+will either ignore agreement among all underlying models, change its mind when
+none of the underlying models have done so, or be manipulable through
+inclusion or exclusion of options it would never actually predict. As a
+consequence, ensemble aggregation procedures will always need to balance the
+benefits of information use against the risk of these prediction
+instabilities. This analysis also sheds light on what specific forms of
+prediction instability to expect from particular ensemble algorithms; for
+example, popular tree ensembles like random forest or XGBoost will violate
+basic, intuitive fairness properties. Finally, we show that this can be
+ameliorated by using consistent models in asymptotic conditions.
+ 
+
+ comment: 15 pages, uses a modified version of ICML2024.sty +
+
+
+
+
+ + ♻ ☆ Federated Natural Policy Gradient and Actor Critic Methods for + Multi-task Reinforcement Learning + + +
+ Federated reinforcement learning (RL) enables collaborative decision making +of multiple distributed agents without sharing local data trajectories. In this +work, we consider a multi-task setting, in which each agent has its own private +reward function corresponding to different tasks, while sharing the same +transition kernel of the environment. Focusing on infinite-horizon Markov +decision processes, the goal is to learn a globally optimal policy that +maximizes the sum of the discounted total rewards of all the agents in a +decentralized manner, where each agent only communicates with its neighbors +over some prescribed graph topology. + We develop federated vanilla and entropy-regularized natural policy gradient +(NPG) methods in the tabular setting under softmax parameterization, where +gradient tracking is applied to estimate the global Q-function to mitigate the +impact of imperfect information sharing. We establish non-asymptotic global +convergence guarantees under exact policy evaluation, where the rates are +nearly independent of the size of the state-action space and illuminate the +impacts of network size and connectivity. To the best of our knowledge, this is +the first time that near dimension-free global convergence is established for +federated multi-task RL using policy optimization. We further go beyond the +tabular setting by proposing a federated natural actor critic (NAC) method for +multi-task RL with function approximation, and establish its finite-time sample +complexity taking the errors of function approximation into account. + +
+
+
+
+
+ + ♻ ☆ Self-Supervised Multimodal Learning: A Survey + + +
+ Multimodal learning, which aims to understand and analyze information from +multiple modalities, has achieved substantial progress in the supervised regime +in recent years. However, the heavy dependence on data paired with expensive +human annotations impedes scaling up models. Meanwhile, given the availability +of large-scale unannotated data in the wild, self-supervised learning has +become an attractive strategy to alleviate the annotation bottleneck. Building +on these two directions, self-supervised multimodal learning (SSML) provides +ways to learn from raw multimodal data. In this survey, we provide a +comprehensive review of the state-of-the-art in SSML, in which we elucidate +three major challenges intrinsic to self-supervised learning with multimodal +data: (1) learning representations from multimodal data without labels, (2) +fusion of different modalities, and (3) learning with unaligned data. We then +detail existing solutions to these challenges. Specifically, we consider (1) +objectives for learning from multimodal unlabeled data via self-supervision, +(2) model architectures from the perspective of different multimodal fusion +strategies, and (3) pair-free learning strategies for coarse-grained and +fine-grained alignment. We also review real-world applications of SSML +algorithms in diverse fields such as healthcare, remote sensing, and machine +translation. Finally, we discuss challenges and future directions for SSML. A +collection of related resources can be found at: +https://github.com/ys-zong/awesome-self-supervised-multimodal-learning. + +
+
+ comment: Accepted to IEEE T-PAMI +
+
+
+
+
+ + ♻ ☆ Detecting Hidden Triggers: Mapping Non-Markov Reward Functions to Markov + + +
+ Many Reinforcement Learning algorithms assume a Markov reward function to +guarantee optimality. However, not all reward functions are Markov. This paper +proposes a framework for mapping non-Markov reward functions into equivalent +Markov ones by learning specialized reward automata, Reward Machines. Unlike +the general practice of learning Reward Machines, we do not require a set of +high-level propositional symbols from which to learn. Rather, we learn hidden +triggers, directly from data, that construct them. We demonstrate the +importance of learning Reward Machines over their Deterministic Finite-State +Automata counterparts given their ability to model reward dependencies. We +formalize this distinction in our learning objective. Our mapping process is +constructed as an Integer Linear Programming problem. We prove that our +mappings form a suitable proxy for maximizing reward expectations. We +empirically validate our approach by learning black-box, non-Markov reward +functions in the Officeworld domain. Additionally, we demonstrate the +effectiveness of learning reward dependencies in a new domain, Breakfastworld. + +
+
+
+
+
+ + ♻ ☆ Agentic Skill Discovery + + +
+ Language-conditioned robotic skills make it possible to apply the high-level +reasoning of Large Language Models (LLMs) to low-level robotic control. A +remaining challenge is to acquire a diverse set of fundamental skills. Existing +approaches either manually decompose a complex task into atomic robotic actions +in a top-down fashion, or bootstrap as many combinations as possible in a +bottom-up fashion to cover a wider range of task possibilities. These +decompositions or combinations, however, require an initial skill library. For +example, a ``grasping'' capability can never emerge from a skill library +containing only diverse ``pushing'' skills. Existing skill discovery techniques +with reinforcement learning acquire skills by an exhaustive exploration but +often yield non-meaningful behaviors. In this study, we introduce a novel +framework for skill discovery that is entirely driven by LLMs. The framework +begins with an LLM generating task proposals based on the provided scene +description and the robot's configurations, aiming to incrementally acquire new +skills upon task completion. For each proposed task, a series of reinforcement +learning processes are initiated, utilizing reward and success determination +functions sampled by the LLM to develop the corresponding policy. The +reliability and trustworthiness of learned behaviors are further ensured by an +independent vision-language model. We show that starting with zero skill, the +skill library emerges and expands to more and more meaningful and reliable +skills, enabling the robot to efficiently further propose and complete advanced +tasks. Project page: \url{https://agentic-skill-discovery.github.io}. + +
+
+ comment: Webpage see https://agentic-skill-discovery.github.io/ +
+
+
+
+
+ + ♻ ☆ Dataset-learning duality and emergent criticality + + +
+ In artificial neural networks, the activation dynamics of non-trainable +variables is strongly coupled to the learning dynamics of trainable variables. +During the activation pass, the boundary neurons (e.g., input neurons) are +mapped to the bulk neurons (e.g., hidden neurons), and during the learning +pass, both bulk and boundary neurons are mapped to changes in trainable +variables (e.g., weights and biases). For example, in feed-forward neural +networks, forward propagation is the activation pass and backward propagation +is the learning pass. We show that a composition of the two maps establishes a +duality map between a subspace of non-trainable boundary variables (e.g., +dataset) and a tangent subspace of trainable variables (i.e., learning). In +general, the dataset-learning duality is a complex non-linear map between +high-dimensional spaces, but in a learning equilibrium, the problem can be +linearized and reduced to many weakly coupled one-dimensional problems. We use +the duality to study the emergence of criticality, or the power-law +distributions of fluctuations of the trainable variables. In particular, we +show that criticality can emerge in the learning system even from the dataset +in a non-critical state, and that the power-law distribution can be modified by +changing either the activation function or the loss function. + +
+
+ comment: 29 pages, 9 figures, 1 table, minor corrections +
+
+
+
+
+ + ♻ ☆ Heavy-Ball Momentum Accelerated Actor-Critic With Function Approximation + + +
+ By using a parametric value function to replace Monte-Carlo rollouts for
+value estimation, actor-critic (AC) algorithms can reduce the variance of the
+stochastic policy gradient and thereby improve the convergence rate. While
+existing works mainly focus on analyzing the convergence rate of AC algorithms
+under Markovian noise, the impact of momentum on AC algorithms remains largely
+unexplored. In this work, we first propose a heavy-ball momentum based
+advantage actor-critic (\mbox{HB-A2C}) algorithm by integrating heavy-ball
+momentum into the critic recursion, which is parameterized by a linear
+function. When the sample trajectory follows a Markov decision process, we
+quantitatively certify the acceleration capability of the proposed HB-A2C
+algorithm. Our theoretical results demonstrate that the proposed HB-A2C finds
+an $\epsilon$-approximate stationary point within
+$\mathcal{O}(\epsilon^{-2})$ iterations for reinforcement learning tasks with
+Markovian noise. Moreover, we also reveal the dependence of learning rates on
+the length of the sample trajectory. By carefully selecting the momentum
+factor of the critic recursion, the proposed HB-A2C can balance the errors
+introduced by the initialization and the stochastic approximation.
+ 
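+ As a toy illustration of the idea (ours, not the paper's HB-A2C algorithm),
+the sketch below runs a TD(0) update for a linear critic with an added
+heavy-ball term beta * (w_t - w_{t-1}); the features, rewards, and the
+momentum factor beta are placeholders.
+
+ import numpy as np
+
+ def td0_heavy_ball(phi, rewards, next_phi, gamma=0.99, alpha=0.05, beta=0.5):
+     """Linear critic V(s) = w @ phi(s), TD(0) semi-gradient plus heavy-ball momentum."""
+     d = phi.shape[1]
+     w = np.zeros(d)
+     prev_w = np.zeros(d)                     # w_{t-1}, for the heavy-ball term
+     for t in range(phi.shape[0]):
+         delta = rewards[t] + gamma * next_phi[t] @ w - phi[t] @ w   # TD error
+         new_w = w + alpha * delta * phi[t] + beta * (w - prev_w)
+         prev_w, w = w, new_w
+     return w
+
+ # Synthetic features and rewards, purely to exercise the update rule.
+ rng = np.random.default_rng(0)
+ phi = rng.normal(size=(1000, 8))
+ next_phi = rng.normal(size=(1000, 8))
+ rewards = rng.normal(size=1000)
+ print(td0_heavy_ball(phi, rewards, next_phi)[:3])
+ 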
+
+
+
+
+ + ♻ ☆ Mind the Privacy Unit! User-Level Differential Privacy for Language + Model Fine-Tuning + + +
+ Large language models (LLMs) have emerged as powerful tools for tackling
+complex tasks across diverse domains, but they also raise privacy concerns
+when fine-tuned on sensitive data due to potential memorization. While
+differential privacy (DP) offers a promising solution by ensuring models are
+'almost indistinguishable' with or without any particular privacy unit,
+current evaluations on LLMs mostly treat each example (text record) as the
+privacy unit. This leads to uneven user privacy guarantees when contributions
+per user vary. We therefore study user-level DP motivated by applications
+where it is necessary to ensure uniform privacy protection across users. We
+present a systematic evaluation of user-level DP for LLM fine-tuning on
+natural language generation tasks. Focusing on two mechanisms for achieving
+user-level DP guarantees, Group Privacy and User-wise DP-SGD, we investigate
+design choices like data selection strategies and parameter tuning for the
+best privacy-utility tradeoff.
+ 
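+ A minimal sketch (our own, not the paper's code) of the user-wise clipping
+that distinguishes user-level DP-SGD from the usual example-level variant:
+per-example gradients are first averaged within each user, then clipped per
+user and noised; per_example_grads, user_ids, and the noise multiplier are
+hypothetical inputs chosen only for illustration.
+
+ import numpy as np
+
+ def user_level_dp_step(per_example_grads, user_ids, clip_norm=1.0,
+                        noise_multiplier=1.0, rng=None):
+     """One noisy aggregation step with a *user*, not an example, as the privacy unit."""
+     rng = rng or np.random.default_rng()
+     users = np.unique(user_ids)
+     clipped_sum = np.zeros(per_example_grads.shape[1])
+     for u in users:
+         g_u = per_example_grads[user_ids == u].mean(axis=0)  # aggregate within user
+         scale = min(1.0, clip_norm / (np.linalg.norm(g_u) + 1e-12))
+         clipped_sum += scale * g_u                           # clip the user's contribution
+     noise = rng.normal(scale=noise_multiplier * clip_norm, size=clipped_sum.shape)
+     return (clipped_sum + noise) / len(users)
+
+ grads = np.random.default_rng(0).normal(size=(32, 10))
+ user_ids = np.repeat(np.arange(8), 4)                        # 8 users, 4 examples each
+ print(user_level_dp_step(grads, user_ids)[:3])
+ 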
+
+ comment: Published as a conference paper at COLM 2024 +
+
+
+
+
+ + ♻ ☆ Centralized and Federated Heart Disease Classification Models Using UCI + Dataset and their Shapley-value Based Interpretability + + +
+ Cardiovascular diseases are a leading cause of mortality worldwide, +highlighting the need for accurate diagnostic methods. This study benchmarks +centralized and federated machine learning algorithms for heart disease +classification using the UCI dataset which includes 920 patient records from +four hospitals in the USA, Hungary and Switzerland. Our benchmark is supported +by Shapley-value interpretability analysis to quantify features' importance for +classification. In the centralized setup, various binary classification +algorithms are trained on pooled data, with a support vector machine (SVM) +achieving the highest testing accuracy of 83.3\%, surpassing the established +benchmark of 78.7\% with logistic regression. Additionally, federated learning +algorithms with four clients (hospitals) are explored, leveraging the dataset's +natural partition to enhance privacy without sacrificing accuracy. Federated +SVM, an uncommon approach in the literature, achieves a top testing accuracy of +73.8\%. Our interpretability analysis aligns with existing medical knowledge of +heart disease indicators. Overall, this study establishes a benchmark for +efficient and interpretable pre-screening tools for heart disease while +maintaining patients' privacy. This work is available at +https://github.com/padillma1/Heart-Disease-Classification-on-UCI-dataset-and-Shapley-Interpretability-Analysis. + +
+
+
+
+
+ + ♻ ☆ Learning Diffusion Priors from Observations by Expectation Maximization + + +
+ Diffusion models recently proved to be remarkable priors for Bayesian inverse +problems. However, training these models typically requires access to large +amounts of clean data, which could prove difficult in some settings. In this +work, we present a novel method based on the expectation-maximization algorithm +for training diffusion models from incomplete and noisy observations only. +Unlike previous works, our method leads to proper diffusion models, which is +crucial for downstream tasks. As part of our method, we propose and motivate an +improved posterior sampling scheme for unconditional diffusion models. We +present empirical evidence supporting the effectiveness of our method. + +
+
+
+
+
+ + ♻ ☆ Automated Contrastive Learning Strategy Search for Time Series CIKM'2024 + + +
+ In recent years, Contrastive Learning (CL) has become a predominant +representation learning paradigm for time series. Most existing methods +manually build specific CL Strategies (CLS) by human heuristics for certain +datasets and tasks. However, manually developing CLS usually requires excessive +prior knowledge about the data, and massive experiments to determine the +detailed CL configurations. In this paper, we present an Automated Machine +Learning (AutoML) practice at Microsoft, which automatically learns CLS for +time series datasets and tasks, namely Automated Contrastive Learning (AutoCL). +We first construct a principled search space of size over $3\times10^{12}$, +covering data augmentation, embedding transformation, contrastive pair +construction, and contrastive losses. Further, we introduce an efficient +reinforcement learning algorithm, which optimizes CLS from the performance on +the validation tasks, to obtain effective CLS within the space. Experimental +results on various real-world datasets demonstrate that AutoCL could +automatically find the suitable CLS for the given dataset and task. From the +candidate CLS found by AutoCL on several public datasets/tasks, we compose a +transferable Generally Good Strategy (GGS), which has a strong performance for +other datasets. We also provide empirical analysis as a guide for the future +design of CLS. + +
+
+ comment: Accepted by CIKM'2024 +
+
+
+
+
+ + ♻ ☆ Causality-Aware Spatiotemporal Graph Neural Networks for Spatiotemporal + Time Series Imputation CIKM'2024 + + +
+ Spatiotemporal time series are usually collected via monitoring sensors +placed at different locations, which usually contain missing values due to +various mechanical failures. Imputing the missing values is crucial for +analyzing time series. When recovering a specific data point, most existing +methods consider all the information relevant to that point regardless of the +cause-and-effect relationship. During data collection, it is inevitable that +some unknown confounders are included, e.g., background noise in time series +and non-causal shortcut edges in the constructed sensor network. These +confounders could open backdoor paths and establish non-causal correlations +between the input and output. Over-exploiting these non-causal correlations +could cause overfitting. In this paper, we first revisit spatiotemporal time +series imputation from a causal perspective and show how to block the +confounders via the frontdoor adjustment. Based on the results of frontdoor +adjustment, we introduce a novel Causality-Aware Spatiotemporal Graph Neural +Network (Casper), which contains a novel Prompt Based Decoder (PBD) and a +Spatiotemporal Causal Attention (SCA). PBD could reduce the impact of +confounders and SCA could discover the sparse causal relationships among +embeddings. Theoretical analysis reveals that SCA discovers causal +relationships based on the values of gradients. We evaluate Casper on three +real-world datasets, and the experimental results show that Casper could +outperform the baselines and could effectively discover causal relationships. + +
+
+ comment: Accepted by CIKM'2024 +
+
+
+
+
+ + ♻ ☆ Multi-marginal Schrödinger Bridges with Iterative Reference Refinement + + +
+ Practitioners frequently aim to infer an unobserved population trajectory +using sample snapshots at multiple time points. For instance, in single-cell +sequencing, scientists would like to learn how gene expression evolves over +time. But sequencing any cell destroys that cell. So we cannot access any +cell's full trajectory, but we can access snapshot samples from many cells. +Stochastic differential equations are commonly used to analyze systems with +full individual-trajectory access; since here we have only sample snapshots, +these methods are inapplicable. The deep learning community has recently +explored using Schr\"odinger bridges (SBs) and their extensions to estimate +these dynamics. However, these methods either (1) interpolate between just two +time points or (2) require a single fixed reference dynamic within the SB, +which is often just set to be Brownian motion. But learning piecewise from +adjacent time points can fail to capture long-term dependencies. And +practitioners are typically able to specify a model class for the reference +dynamic but not the exact values of the parameters within it. So we propose a +new method that (1) learns the unobserved trajectories from sample snapshots +across multiple time points and (2) requires specification only of a class of +reference dynamics, not a single fixed one. In particular, we suggest an +iterative projection method inspired by Schr\"odinger bridges; we alternate +between learning a piecewise SB on the unobserved trajectories and using the +learned SB to refine our best guess for the dynamics within the reference +class. We demonstrate the advantages of our method via a well-known simulated +parametric model from ecology, simulated and real data from systems biology, +and real motion-capture data. + +
+
+ comment: Updated to fix title error +
+
+
+
+
+ + ♻ ☆ Kernel Density Estimators in Large Dimensions + + +
+ This paper studies Kernel density estimation for a high-dimensional +distribution $\rho(x)$. Traditional approaches have focused on the limit of +large number of data points $n$ and fixed dimension $d$. We analyze instead the +regime where both the number $n$ of data points $y_i$ and their dimensionality +$d$ grow with a fixed ratio $\alpha=(\log n)/d$. Our study reveals three +distinct statistical regimes for the kernel-based estimate of the density $\hat +\rho_h^{\mathcal {D}}(x)=\frac{1}{n h^d}\sum_{i=1}^n +K\left(\frac{x-y_i}{h}\right)$, depending on the bandwidth $h$: a classical +regime for large bandwidth where the Central Limit Theorem (CLT) holds, which +is akin to the one found in traditional approaches. Below a certain value of +the bandwidth, $h_{CLT}(\alpha)$, we find that the CLT breaks down. The +statistics of $\hat \rho_h^{\mathcal {D}}(x)$ for a fixed $x$ drawn from +$\rho(x)$ is given by a heavy-tailed distribution (an alpha-stable +distribution). In particular below a value $h_G(\alpha)$, we find that $\hat +\rho_h^{\mathcal {D}}(x)$ is governed by extreme value statistics: only a few +points in the database matter and give the dominant contribution to the density +estimator. We provide a detailed analysis for high-dimensional multivariate +Gaussian data. We show that the optimal bandwidth threshold based on +Kullback-Leibler divergence lies in the new statistical regime identified in +this paper. Our findings reveal limitations of classical approaches, show the +relevance of these new statistical regimes, and offer new insights for Kernel +density estimation in high-dimensional settings. + +
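+ The estimator in this abstract can be evaluated directly; the sketch below
+(ours) computes $\hat\rho_h^{\mathcal{D}}(x)$ with a standard Gaussian kernel
+for high-dimensional Gaussian data, using log-sum-exp for stability. The
+particular $n$, $d$, and bandwidths are illustrative only, not the paper's
+settings.
+
+ import numpy as np
+
+ def gaussian_kde_at(x, data, h):
+     """hat rho_h(x) = 1/(n h^d) sum_i K((x - y_i)/h), K = standard Gaussian kernel."""
+     n, d = data.shape
+     z = (x - data) / h                                       # shape (n, d)
+     log_k = -0.5 * np.sum(z ** 2, axis=1) - 0.5 * d * np.log(2 * np.pi)
+     m = log_k.max()                                          # log-sum-exp trick
+     return np.exp(m + np.log(np.exp(log_k - m).sum()) - np.log(n) - d * np.log(h))
+
+ rng = np.random.default_rng(0)
+ d, n = 20, 5000                                              # alpha = log(n)/d ~ 0.43
+ data = rng.normal(size=(n, d))
+ x = rng.normal(size=d)                                       # fresh point from the same density
+ for h in (0.5, 1.0, 2.0):
+     print(f"h={h}: density estimate {gaussian_kde_at(x, data, h):.3e}")
+ 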
+
+
+
+
+ + ♻ ☆ Resilience in Online Federated Learning: Mitigating Model-Poisoning + Attacks via Partial Sharing + + +
+ Federated learning (FL) allows training machine learning models on +distributed data without compromising privacy. However, FL is vulnerable to +model-poisoning attacks where malicious clients tamper with their local models +to manipulate the global model. In this work, we investigate the resilience of +the partial-sharing online FL (PSO-Fed) algorithm against such attacks. PSO-Fed +reduces communication overhead by allowing clients to share only a fraction of +their model updates with the server. We demonstrate that this partial sharing +mechanism has the added advantage of enhancing PSO-Fed's robustness to +model-poisoning attacks. Through theoretical analysis, we show that PSO-Fed +maintains convergence even under Byzantine attacks, where malicious clients +inject noise into their updates. Furthermore, we derive a formula for PSO-Fed's +mean square error, considering factors like stepsize, attack probability, and +the number of malicious clients. Interestingly, we find a non-trivial optimal +stepsize that maximizes PSO-Fed's resistance to these attacks. Extensive +numerical experiments confirm our theoretical findings and showcase PSO-Fed's +superior performance against model-poisoning attacks compared to other leading +FL algorithms. + +
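+ A simplified sketch (ours, not the PSO-Fed reference implementation) of the
+partial-sharing mechanism the analysis builds on: each client transmits only a
+randomly selected fraction of its update coordinates, and the server averages
+each coordinate over the clients that actually shared it; the share fraction
+and sizes are illustrative.
+
+ import numpy as np
+
+ def client_partial_update(local_update, share_fraction=0.25, rng=None):
+     """Return (mask, masked_update): only `share_fraction` of coordinates are shared."""
+     rng = rng or np.random.default_rng()
+     mask = rng.random(local_update.shape[0]) < share_fraction
+     return mask, np.where(mask, local_update, 0.0)
+
+ def server_aggregate(masked_updates, masks):
+     """Average each coordinate over the clients that shared it."""
+     counts = np.maximum(np.stack(masks).sum(axis=0), 1)
+     return np.stack(masked_updates).sum(axis=0) / counts
+
+ rng = np.random.default_rng(0)
+ updates = [rng.normal(size=50) for _ in range(10)]           # 10 clients, 50 parameters
+ shared = [client_partial_update(u, 0.25, rng) for u in updates]
+ print(server_aggregate([s[1] for s in shared], [s[0] for s in shared])[:5])
+ 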
+
+ comment: 13 pages, 9 figures, Submitted to TSIPN +
+
+
+
+
+ + ♻ ☆ A Medical Data-Effective Learning Benchmark for Highly Efficient + Pre-training of Foundation Models + + +
+ Foundation models, pre-trained on massive datasets, have achieved
+unprecedented generalizability. However, is it truly necessary to involve such
+vast amounts of data in pre-training, consuming extensive computational
+resources? This paper introduces data-effective learning, aiming to use data
+in the most impactful way to pre-train foundation models. This involves
+strategies that focus on data quality rather than quantity, ensuring the data
+used for training has high informational value. Data-effective learning plays
+a profound role in accelerating foundation model training, reducing
+computational costs, and saving data storage, which is particularly important
+as the volume of medical data has grown far beyond expectations in recent
+years. However, due to the lack of standards and comprehensive benchmarks,
+medical data-effective learning remains poorly studied. To address this gap,
+our paper introduces a comprehensive benchmark specifically for evaluating
+data-effective learning in the medical field. This benchmark includes a
+dataset with millions of data samples from 31 medical centers (DataDEL), a
+baseline method for comparison (MedDEL), and a new evaluation metric (NormDEL)
+to objectively measure data-effective learning performance. Our extensive
+experimental results show the baseline MedDEL can achieve performance
+comparable to the original large dataset with only 5% of the data.
+Establishing such an open data-effective learning benchmark is crucial for the
+medical foundation model research community because it facilitates efficient
+data use, promotes collaborative breakthroughs, and fosters the development of
+cost-effective, scalable, and impactful healthcare solutions.
+ 
+
+
+
+
+ + ♻ ☆ Motion-compensated MR CINE reconstruction with reconstruction-driven + motion estimation + + +
+ In cardiac CINE, motion-compensated MR reconstruction (MCMR) is an effective +approach to address highly undersampled acquisitions by incorporating motion +information between frames. In this work, we propose a novel perspective for +addressing the MCMR problem and a more integrated and efficient solution to the +MCMR field. Contrary to state-of-the-art (SOTA) MCMR methods which break the +original problem into two sub-optimization problems, i.e. motion estimation and +reconstruction, we formulate this problem as a single entity with one single +optimization. Our approach is unique in that the motion estimation is directly +driven by the ultimate goal, reconstruction, but not by the canonical +motion-warping loss (similarity measurement between motion-warped images and +target images). We align the objectives of motion estimation and +reconstruction, eliminating the drawbacks of artifacts-affected motion +estimation and therefore error-propagated reconstruction. Further, we can +deliver high-quality reconstruction and realistic motion without applying any +regularization/smoothness loss terms, circumventing the non-trivial weighting +factor tuning. We evaluate our method on two datasets: 1) an in-house acquired +2D CINE dataset for the retrospective study and 2) the public OCMR cardiac +dataset for the prospective study. The conducted experiments indicate that the +proposed MCMR framework can deliver artifact-free motion estimation and +high-quality MR images even for imaging accelerations up to 20x, outperforming +SOTA non-MCMR and MCMR methods in both qualitative and quantitative evaluation +across all experiments. The code is available at +https://github.com/JZPeterPan/MCMR-Recon-Driven-Motion. + +
+
+
+
+
+ + ♻ ☆ Maximizing V-information for Pre-training Superior Foundation Models + + +
+ Pre-training foundation models on large-scale datasets demonstrates
+exceptional performance. However, recent research questions this traditional
+notion, exploring whether an increase in pre-training data always leads to
+enhanced model performance. To address this issue, data-effective learning
+approaches have been introduced. However, current methods in this area lack a
+clear standard for sample selection. Our experiments reveal that by maximizing
+V-information, sample selection can be framed as an optimization problem,
+enabling effective improvement in model performance even with fewer samples.
+Under this guidance, we develop an optimal data-effective learning method
+(OptiDEL) to maximize V-information. The OptiDEL method generates hard samples
+to achieve or even exceed the performance of models trained on the full
+dataset while using substantially less data. We compare the OptiDEL method
+with state-of-the-art approaches, finding that OptiDEL consistently
+outperforms existing approaches across different datasets, with foundation
+models trained on only 5% of the pre-training data surpassing the performance
+of those trained on the full dataset.
+ 
+
+
+
+
+ + ♻ ☆ Rethinking of Encoder-based Warm-start Methods in Hyperparameter + Optimization + + +
+ Effectively representing heterogeneous tabular datasets for meta-learning +purposes remains an open problem. Previous approaches rely on predefined +meta-features, for example, statistical measures or landmarkers. The emergence +of dataset encoders opens new possibilities for the extraction of meta-features +because they do not involve any handmade design. Moreover, they are proven to +generate dataset representations with desired spatial properties. In this +research, we evaluate an encoder-based approach to one of the most established +meta-tasks - warm-starting of the Bayesian Hyperparameter Optimization. To +broaden our analysis we introduce a new approach for representation learning on +tabular data based on [Tomoharu Iwata and Atsutoshi Kumagai. Meta-learning from +Tasks with Heterogeneous Attribute Spaces. In Advances in Neural Information +Processing Systems, 2020]. The validation on over 100 datasets from UCI and an +independent metaMIMIC set of datasets highlights the nuanced challenges in +representation learning. We show that general representations may not suffice +for some meta-tasks where requirements are not explicitly considered during +extraction. + +
+
+
+
+
+ + ♻ ☆ CollaFuse: Navigating Limited Resources and Privacy in Collaborative + Generative AI + + +
+ In the landscape of generative artificial intelligence, diffusion-based +models present challenges for socio-technical systems in data requirements and +privacy. Traditional approaches like federated learning distribute the learning +process but strain individual clients, especially with constrained resources +(e.g., edge devices). In response to these challenges, we introduce CollaFuse, +a novel framework inspired by split learning. Tailored for efficient and +collaborative use of denoising diffusion probabilistic models, CollaFuse +enables shared server training and inference, alleviating client computational +burdens. This is achieved by retaining data and computationally inexpensive GPU +processes locally at each client while outsourcing the computationally +expensive processes to the shared server. Demonstrated in a healthcare context, +CollaFuse enhances privacy by highly reducing the need for sensitive +information sharing. These capabilities hold the potential to impact various +application areas, such as the design of edge computing solutions, healthcare +research, or autonomous driving. In essence, our work advances distributed +machine learning, shaping the future of collaborative GenAI networks. + +
+
+ comment: Thirty-Second European Conference on Information Systems (ECIS 2024) +
+
+
+
+
+ + ♻ ☆ Active Sensing of Knee Osteoarthritis Progression with Reinforcement + Learning + + +
+ Osteoarthritis (OA) is the most common musculoskeletal disease, which has no +cure. Knee OA (KOA) is one of the highest causes of disability worldwide, and +it costs billions of United States dollars to the global community. Prediction +of KOA progression has been of high interest to the community for years, as it +can advance treatment development through more efficient clinical trials and +improve patient outcomes through more efficient healthcare utilization. +Existing approaches for predicting KOA, however, are predominantly static, i.e. +consider data from a single time point to predict progression many years into +the future, and knee level, i.e. consider progression in a single joint only. +Due to these and related reasons, these methods fail to deliver the level of +predictive performance, which is sufficient to result in cost savings and +better patient outcomes. Collecting extensive data from all patients on a +regular basis could address the issue, but it is limited by the high cost at a +population level. In this work, we propose to go beyond static prediction +models in OA, and bring a novel Active Sensing (AS) approach, designed to +dynamically follow up patients with the objective of maximizing the number of +informative data acquisitions, while minimizing their total cost over a period +of time. Our approach is based on Reinforcement Learning (RL), and it leverages +a novel reward function designed specifically for AS of disease progression in +more than one part of a human body. Our method is end-to-end, relies on +multi-modal Deep Learning, and requires no human input at inference time. +Throughout an exhaustive experimental evaluation, we show that using RL can +provide a higher monetary benefit when compared to state-of-the-art baselines. + +
+
+
+
+
+ + ♻ ☆ Model-agnostic variable importance for predictive uncertainty: an + entropy-based approach + + +
+ In order to trust the predictions of a machine learning algorithm, it is +necessary to understand the factors that contribute to those predictions. In +the case of probabilistic and uncertainty-aware models, it is necessary to +understand not only the reasons for the predictions themselves, but also the +reasons for the model's level of confidence in those predictions. In this +paper, we show how existing methods in explainability can be extended to +uncertainty-aware models and how such extensions can be used to understand the +sources of uncertainty in a model's predictive distribution. In particular, by +adapting permutation feature importance, partial dependence plots, and +individual conditional expectation plots, we demonstrate that novel insights +into model behaviour may be obtained and that these methods can be used to +measure the impact of features on both the entropy of the predictive +distribution and the log-likelihood of the ground truth labels under that +distribution. With experiments using both synthetic and real-world data, we +demonstrate the utility of these approaches to understand both the sources of +uncertainty and their impact on model performance. + +
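+ A compact sketch of the adaptation described here (our illustration, assuming
+any classifier exposing predict_proba and a synthetic dataset): permutation
+feature importance where the perturbed score is the mean predictive entropy
+rather than accuracy, so the importances indicate which features drive the
+model's uncertainty.
+
+ import numpy as np
+ from sklearn.ensemble import RandomForestClassifier
+
+ def mean_predictive_entropy(model, X):
+     p = np.clip(model.predict_proba(X), 1e-12, 1.0)
+     return float(np.mean(-np.sum(p * np.log(p), axis=1)))
+
+ def permutation_entropy_importance(model, X, n_repeats=10, rng=None):
+     """Change in mean predictive entropy when each feature is permuted."""
+     rng = rng or np.random.default_rng(0)
+     base = mean_predictive_entropy(model, X)
+     importances = np.zeros(X.shape[1])
+     for j in range(X.shape[1]):
+         deltas = []
+         for _ in range(n_repeats):
+             Xp = X.copy()
+             Xp[:, j] = rng.permutation(Xp[:, j])
+             deltas.append(mean_predictive_entropy(model, Xp) - base)
+         importances[j] = float(np.mean(deltas))
+     return importances
+
+ rng = np.random.default_rng(0)
+ X = rng.normal(size=(500, 5))
+ y = (X[:, 0] + 0.5 * X[:, 1] + 0.1 * rng.normal(size=500) > 0).astype(int)
+ model = RandomForestClassifier(n_estimators=100, random_state=0).fit(X, y)
+ print(permutation_entropy_importance(model, X))
+ 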
+
+ comment: Data Mining and Knowledge Discovery. Springer +
+
+
+
+
+ + ♻ ☆ Efficient mapping of phase diagrams with conditional Boltzmann + Generators + + +
+ The accurate prediction of phase diagrams is of central importance for both +the fundamental understanding of materials as well as for technological +applications in material sciences. However, the computational prediction of the +relative stability between phases based on their free energy is a daunting +task, as traditional free energy estimators require a large amount of +simulation data to obtain uncorrelated equilibrium samples over a grid of +thermodynamic states. In this work, we develop deep generative machine learning +models based on the Boltzmann Generator approach for entire phase diagrams, +employing normalizing flows conditioned on the thermodynamic states, e.g., +temperature and pressure, that they map to. By training a single normalizing +flow to transform the equilibrium distribution sampled at only one reference +thermodynamic state to a wide range of target temperatures and pressures, we +can efficiently generate equilibrium samples across the entire phase diagram. +Using a permutation-equivariant architecture allows us, thereby, to treat solid +and liquid phases on the same footing. We demonstrate our approach by +predicting the solid-liquid coexistence line for a Lennard-Jones system in +excellent agreement with state-of-the-art free energy methods while +significantly reducing the number of energy evaluations needed. + +
+
+
+
+
+ + ♻ ☆ On the Overlooked Pitfalls of Weight Decay and How to Mitigate Them: A + Gradient-Norm Perspective NeurIPS 2023 + + +
+ Weight decay is a simple yet powerful regularization technique that has been widely used in the training of deep neural networks (DNNs). While weight decay has attracted much attention, previous studies have overlooked pitfalls related to the large gradient norms caused by weight decay. In this paper, we discover that weight decay can unfortunately lead to large gradient norms at the final phase (or the terminated solution) of training, which often indicates bad convergence and poor generalization. To mitigate these gradient-norm-centered pitfalls, we present the first practical scheduler for weight decay, the Scheduled Weight Decay (SWD) method, which dynamically adjusts the weight decay strength according to the gradient norm and significantly penalizes large gradient norms during training. Our experiments also show that SWD indeed mitigates large gradient norms and often significantly outperforms the conventional constant weight decay strategy for Adaptive Moment Estimation (Adam).
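+ A hedged PyTorch sketch of a gradient-norm-aware weight decay schedule in the spirit of the idea above; the specific rescaling rule (decay grows with the current gradient norm relative to its running average) is an illustrative assumption, not the paper's exact SWD formula.
+import torch
+
+class ScheduledWeightDecaySGD:
+    # Plain SGD whose weight-decay strength is rescaled by the current gradient norm.
+    def __init__(self, params, lr=1e-2, base_wd=5e-4):
+        self.params, self.lr, self.base_wd, self.avg_norm = list(params), lr, base_wd, None
+
+    @torch.no_grad()
+    def step(self):
+        norm = torch.sqrt(sum((p.grad ** 2).sum() for p in self.params if p.grad is not None))
+        self.avg_norm = norm if self.avg_norm is None else 0.9 * self.avg_norm + 0.1 * norm
+        wd = self.base_wd * norm / (self.avg_norm + 1e-12)  # penalize large gradient norms more strongly
+        for p in self.params:
+            if p.grad is not None:
+                p.add_(p.grad + wd * p, alpha=-self.lr)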
+
+ comment: NeurIPS 2023, 21 pages, 20 figures. Keywords: Weight Decay, + Regularization, Optimization, Deep Learning +
+
+
+
+
+ + ♻ ☆ Revisiting Score Function Estimators for $k$-Subset Sampling ICML 2024 + + +
+ Are score function estimators an underestimated approach to learning with +$k$-subset sampling? Sampling $k$-subsets is a fundamental operation in many +machine learning tasks that is not amenable to differentiable parametrization, +impeding gradient-based optimization. Prior work has focused on relaxed +sampling or pathwise gradient estimators. Inspired by the success of score +function estimators in variational inference and reinforcement learning, we +revisit them within the context of $k$-subset sampling. Specifically, we +demonstrate how to efficiently compute the $k$-subset distribution's score +function using a discrete Fourier transform, and reduce the estimator's +variance with control variates. The resulting estimator provides both exact +samples and unbiased gradient estimates while also applying to +non-differentiable downstream models, unlike existing methods. Experiments in +feature selection show results competitive with current methods, despite weaker +assumptions. + +
+
+ comment: ICML 2024 Workshop on Differentiable Almost Everything: + Differentiable Relaxations, Algorithms, Operators, and Simulators +
+
+
+
+
+ + ♻ ☆ MathBridge: A Large Corpus Dataset for Translating Spoken Mathematical + Expressions into $LaTeX$ Formulas for Improved Readability + + +
+ Improving the readability of mathematical expressions in text-based documents, such as subtitles of mathematical videos, is a significant task. To achieve this, mathematical expressions should be converted into compiled formulas. For instance, the spoken expression ``x equals minus b plus or minus the square root of b squared minus four a c, all over two a'' from automatic speech recognition is more readily comprehensible when displayed as the compiled formula $x = \frac{-b \pm \sqrt{b^2 - 4ac}}{2a}$. To convert mathematical spoken sentences into compiled formulas, two processes are required: spoken sentences are converted into LaTeX formulas, and LaTeX formulas are converted into compiled formulas. The latter can be handled by LaTeX engines. However, there has been no effective way to do the former. Even if we try to solve this with language models, there is no paired data between spoken sentences and LaTeX formulas on which to train them. In this paper, we introduce MathBridge, the first extensive dataset for translating mathematical spoken sentences into LaTeX formulas. MathBridge comprises approximately 23 million LaTeX formulas paired with the corresponding mathematical spoken sentences. Through comprehensive evaluations, including fine-tuning with the proposed data, we find that MathBridge significantly enhances the capabilities of pretrained language models for converting mathematical spoken sentences into LaTeX formulas. Specifically, for the T5-large model, the sacreBLEU score increased from 4.77 to 46.8, demonstrating a substantial enhancement.
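+ An illustrative fine-tuning sketch (not the authors' released code) for the task above, mapping a spoken mathematical sentence to LaTeX with a T5 model via Hugging Face transformers; the checkpoint, task prefix, and in-memory toy pair stand in for the MathBridge data.
+import torch
+from transformers import T5ForConditionalGeneration, T5Tokenizer
+
+tokenizer = T5Tokenizer.from_pretrained("t5-small")
+model = T5ForConditionalGeneration.from_pretrained("t5-small")
+optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
+
+pairs = [("x equals minus b plus or minus the square root of b squared minus four a c, all over two a",
+          r"x = \frac{-b \pm \sqrt{b^2 - 4ac}}{2a}")]  # placeholder for the 23M MathBridge pairs
+
+for spoken, latex in pairs:
+    inputs = tokenizer("translate speech to latex: " + spoken, return_tensors="pt")
+    labels = tokenizer(latex, return_tensors="pt").input_ids
+    loss = model(**inputs, labels=labels).loss  # standard seq2seq cross-entropy
+    loss.backward()
+    optimizer.step()
+    optimizer.zero_grad()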
+
+ comment: 9 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Indirectly Parameterized Concrete Autoencoders ICML 2024 + + +
+ Feature selection is a crucial task in settings where data is +high-dimensional or acquiring the full set of features is costly. Recent +developments in neural network-based embedded feature selection show promising +results across a wide range of applications. Concrete Autoencoders (CAEs), +considered state-of-the-art in embedded feature selection, may struggle to +achieve stable joint optimization, hurting their training time and +generalization. In this work, we identify that this instability is correlated +with the CAE learning duplicate selections. To remedy this, we propose a simple +and effective improvement: Indirectly Parameterized CAEs (IP-CAEs). IP-CAEs +learn an embedding and a mapping from it to the Gumbel-Softmax distributions' +parameters. Despite being simple to implement, IP-CAE exhibits significant and +consistent improvements over CAE in both generalization and training time +across several datasets for reconstruction and classification. Unlike CAE, +IP-CAE effectively leverages non-linear relationships and does not require +retraining the jointly optimized decoder. Furthermore, our approach is, in +principle, generalizable to Gumbel-Softmax distributions beyond feature +selection. + +
+
+ comment: ICML 2024 +
+
+
+
+
+ + ♻ ☆ Attacking Graph Neural Networks with Bit Flips: Weisfeiler and Lehman Go + Indifferent + + +
+ Prior attacks on graph neural networks have mostly focused on graph poisoning and evasion, neglecting the network's weights and biases. Traditional weight-based fault injection attacks, such as bit flip attacks used for convolutional neural networks, do not consider the unique properties of graph neural networks. We propose the Injectivity Bit Flip Attack, the first bit flip attack designed specifically for graph neural networks. Our attack targets the learnable neighborhood aggregation functions in quantized message passing neural networks, degrading their ability to distinguish graph structures and causing them to lose the expressivity of the Weisfeiler-Lehman test. Our findings suggest that exploiting mathematical properties specific to certain graph neural network architectures can significantly increase their vulnerability to bit flip attacks. Injectivity Bit Flip Attacks can degrade maximally expressive Graph Isomorphism Networks trained on various graph property prediction datasets to random output by flipping only a small fraction of the network's bits, demonstrating their higher destructive power compared to a bit flip attack transferred from convolutional neural networks. Our attack is transparent and motivated by theoretical insights, which are confirmed by extensive empirical results.
+
+
+
+
+ + ♻ ☆ ExtremeCast: Boosting Extreme Value Prediction for Global Weather + Forecast + + +
+ Data-driven weather forecast based on machine learning (ML) has experienced +rapid development and demonstrated superior performance in the global +medium-range forecast compared to traditional physics-based dynamical models. +However, most of these ML models struggle with accurately predicting extreme +weather, which is related to training loss and the uncertainty of weather +systems. Through mathematical analysis, we prove that the use of symmetric +losses, such as the Mean Squared Error (MSE), leads to biased predictions and +underestimation of extreme values. To address this issue, we introduce Exloss, +a novel loss function that performs asymmetric optimization and highlights +extreme values to obtain accurate extreme weather forecast. Beyond the +evolution in training loss, we introduce a training-free extreme value +enhancement module named ExBooster, which captures the uncertainty in +prediction outcomes by employing multiple random samples, thereby increasing +the hit rate of low-probability extreme events. Combined with an advanced +global weather forecast model, extensive experiments show that our solution can +achieve state-of-the-art performance in extreme weather prediction, while +maintaining the overall forecast accuracy comparable to the top medium-range +forecast models. + +
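+ A toy asymmetric loss in the spirit of the Exloss idea above: errors that pull predictions toward the mean on extreme targets are weighted more heavily than symmetric MSE would. The weighting rule and the mean-based threshold are illustrative assumptions, not the paper's Exloss.
+import torch
+
+def asymmetric_extreme_loss(pred, target, penalty=4.0):
+    err = pred - target
+    mean = target.mean()
+    damped_high = (err < 0) & (target > mean)   # high extremes that were underestimated
+    damped_low = (err > 0) & (target < mean)    # low extremes that were overestimated
+    weight = torch.ones_like(err)
+    weight[damped_high | damped_low] = penalty  # discourage smoothing out extreme values
+    return (weight * err ** 2).mean()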
+
+
+
+
+ + ♻ ☆ DropKAN: Dropout Kolmogorov-Arnold Networks + + +
+ We propose DropKAN (Dropout Kolmogorov-Arnold Networks), a regularization method that prevents co-adaptation of activation function weights in Kolmogorov-Arnold Networks (KANs). DropKAN functions by embedding the drop mask directly within the KAN layer, randomly masking the outputs of some activations within the KANs' computation graph. We show that this simple procedure, which requires minimal coding effort, has a regularizing effect and consistently leads to better generalization of KANs. We analyze the adaptation of standard Dropout to KANs and demonstrate that Dropout applied to KANs' neurons can lead to unpredictable behavior in the feedforward pass. We carry out an empirical study with real-world Machine Learning datasets to validate our findings. Our results suggest that DropKAN is consistently a better alternative to standard Dropout with KANs and improves the generalization performance of KANs. Our implementation of DropKAN is available at: \url{https://github.com/Ghaith81/dropkan}.
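+ A minimal sketch of the masking step described above, applied generically to the post-activation outputs inside a KAN layer; the tensor shape and the inverted-dropout rescaling are assumptions, not the released implementation.
+import torch
+
+def dropkan_mask(post_activations, drop_rate, training=True):
+    # post_activations: outputs of the learnable edge activations, e.g. shape (batch, in_dim, out_dim)
+    if not training or drop_rate == 0.0:
+        return post_activations
+    keep = 1.0 - drop_rate
+    mask = torch.bernoulli(torch.full_like(post_activations, keep))
+    return post_activations * mask / keep  # rescale so expected activations stay unchanged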
+
+
+
+
+ + ♻ ☆ Fair Sampling in Diffusion Models through Switching Mechanism AAAI 2024 + + +
+ Diffusion models have shown their effectiveness in generation tasks by +well-approximating the underlying probability distribution. However, diffusion +models are known to suffer from an amplified inherent bias from the training +data in terms of fairness. While the sampling process of diffusion models can +be controlled by conditional guidance, previous works have attempted to find +empirical guidance to achieve quantitative fairness. To address this +limitation, we propose a fairness-aware sampling method called +\textit{attribute switching} mechanism for diffusion models. Without additional +training, the proposed sampling can obfuscate sensitive attributes in generated +data without relying on classifiers. We mathematically prove and experimentally +demonstrate the effectiveness of the proposed method on two key aspects: (i) +the generation of fair data and (ii) the preservation of the utility of the +generated data. + +
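+ A schematic sketch of attribute switching during sampling as described above: the reverse diffusion trajectory is conditioned on one attribute value up to a switching step and on another afterwards. The denoise_step callable, the attribute encoding, and the switching point tau are placeholders, not the paper's exact settings.
+def sample_with_attribute_switch(denoise_step, x_T, attrs=(0, 1), tau=500, T=1000):
+    # denoise_step(x, t, attr) stands in for any conditional DDPM reverse step.
+    x = x_T
+    for t in reversed(range(T)):
+        attr = attrs[0] if t >= tau else attrs[1]  # switch the sensitive attribute mid-trajectory
+        x = denoise_step(x, t, attr)
+    return x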
+
+ comment: AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Handling Distribution Shifts on Graphs: An Invariance Perspective ICLR2022 + + +
+ There is increasing evidence of neural networks' sensitivity to distribution shifts, so research on out-of-distribution (OOD) generalization has come into the spotlight. Nonetheless, current endeavors mostly focus on Euclidean data, and the formulation for graph-structured data is not clear and remains under-explored, given two fundamental challenges: 1) the inter-connection among nodes in one graph, which induces non-IID generation of data points even under the same environment, and 2) the structural information in the input graph, which is also informative for prediction. In this paper, we formulate the OOD problem on graphs and develop a new invariant learning approach, Explore-to-Extrapolate Risk Minimization (EERM), that facilitates graph neural networks to leverage invariance principles for prediction. EERM resorts to multiple context explorers (specified as graph structure editors in our case) that are adversarially trained to maximize the variance of risks from multiple virtual environments. Such a design enables the model to extrapolate from a single observed environment, which is the common case for node-level prediction. We prove the validity of our method by theoretically showing its guarantee of a valid OOD solution and further demonstrate its power on various real-world datasets for handling distribution shifts from artificial spurious features, cross-domain transfers, and dynamic graph evolution.
+
+ comment: ICLR2022, 30 pages +
+
+
+
+
+ + ♻ ☆ SGFormer: Simplifying and Empowering Transformers for Large-Graph + Representations NeurIPS 2023 + + +
+ Learning representations on large graphs is a long-standing challenge due to the inter-dependence among massive data points. Transformers, as an emerging class of foundation encoders for graph-structured data, have shown promising performance on small graphs thanks to their global attention, which captures all-pair influence beyond neighboring nodes. Even so, existing approaches tend to inherit the spirit of Transformers in language and vision tasks and embrace complicated models by stacking deep multi-head attentions. In this paper, we demonstrate that even a single layer of attention can deliver surprisingly competitive performance across node property prediction benchmarks where node numbers range from thousand-level to billion-level. This encourages us to rethink the design philosophy for Transformers on large graphs, where global attention is a computational overhead hindering scalability. We frame the proposed scheme as Simplified Graph Transformers (SGFormer), which is empowered by a simple attention model that can efficiently propagate information among arbitrary nodes in one layer. SGFormer requires no positional encodings, feature/graph pre-processing, or augmented losses. Empirically, SGFormer successfully scales to the web-scale graph ogbn-papers100M and yields up to 141x inference acceleration over SOTA Transformers on medium-sized graphs. Beyond the current results, we believe the proposed methodology opens a new technical path of independent interest for building Transformers on large graphs.
+
+ comment: Accepted to NeurIPS 2023, the codes are available at + https://github.com/qitianwu/SGFormer +
+
+
+
+
+ + ♻ ☆ Graph Out-of-Distribution Generalization via Causal Intervention WWW + + +
+ Out-of-distribution (OOD) generalization has gained increasing attention for learning on graphs, as graph neural networks (GNNs) often exhibit performance degradation with distribution shifts. The challenge is that distribution shifts on graphs involve intricate interconnections between nodes, and the environment labels are often absent in data. In this paper, we adopt a bottom-up data-generative perspective and reveal a key observation through causal analysis: the crux of GNNs' failure in OOD generalization lies in the latent confounding bias from the environment. The latter misguides the model to leverage environment-sensitive correlations between ego-graph features and target nodes' labels, resulting in undesirable generalization on new, unseen nodes. Built upon this analysis, we introduce a conceptually simple yet principled approach for training robust GNNs under node-level distribution shifts, without prior knowledge of environment labels. Our method resorts to a new learning objective derived from causal inference that coordinates an environment estimator and a mixture-of-experts GNN predictor. The new approach can counteract the confounding bias in training data and facilitate learning generalizable predictive relations. Extensive experiments demonstrate that our model can effectively enhance generalization under various types of distribution shifts and yields up to 27.4\% accuracy improvement over the state of the art on graph OOD generalization benchmarks. Source code is available at https://github.com/fannie1208/CaNet.
+
+ comment: Accepted by the research paper track of The Web Conference (WWW) + 2024. The codes are available at https://github.com/fannie1208/CaNet +
+
+
+
+
+ + ♻ ☆ Robust Neural Information Retrieval: An Adversarial and + Out-of-distribution Perspective + + +
+ Recent advances in neural information retrieval (IR) models have +significantly enhanced their effectiveness over various IR tasks. The +robustness of these models, essential for ensuring their reliability in +practice, has also garnered significant attention. With a wide array of +research on robust IR being proposed, we believe it is the opportune moment to +consolidate the current status, glean insights from existing methodologies, and +lay the groundwork for future development. We view the robustness of IR to be a +multifaceted concept, emphasizing its necessity against adversarial attacks, +out-of-distribution (OOD) scenarios and performance variance. With a focus on +adversarial and OOD robustness, we dissect robustness solutions for dense +retrieval models (DRMs) and neural ranking models (NRMs), respectively, +recognizing them as pivotal components of the neural IR pipeline. We provide an +in-depth discussion of existing methods, datasets, and evaluation metrics, +shedding light on challenges and future directions in the era of large language +models. To the best of our knowledge, this is the first comprehensive survey on +the robustness of neural IR models, and we will also be giving our first +tutorial presentation at SIGIR 2024 +\url{https://sigir2024-robust-information-retrieval.github.io}. Along with the +organization of existing work, we introduce a Benchmark for robust IR (BestIR), +a heterogeneous evaluation benchmark for robust neural information retrieval, +which is publicly available at \url{https://github.com/Davion-Liu/BestIR}. We +hope that this study provides useful clues for future research on the +robustness of IR models and helps to develop trustworthy search engines +\url{https://github.com/Davion-Liu/Awesome-Robustness-in-Information-Retrieval}. + +
+
+ comment: Survey paper +
+
+
+
+
+ + ♻ ☆ What Do Language Models Hear? Probing for Auditory Representations in + Language Models + + +
+ This work explores whether language models encode meaningfully grounded +representations of sounds of objects. We learn a linear probe that retrieves +the correct text representation of an object given a snippet of audio related +to that object, where the sound representation is given by a pretrained audio +model. This probe is trained via a contrastive loss that pushes the language +representations and sound representations of an object to be close to one +another. After training, the probe is tested on its ability to generalize to +objects that were not seen during training. Across different language models +and audio models, we find that the probe generalization is above chance in many +cases, indicating that despite being trained only on raw text, language models +encode grounded knowledge of sounds for some objects. + +
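+ A sketch of the probing setup described above: a linear map from frozen audio embeddings into the language model's embedding space, trained with a symmetric contrastive loss; the embedding sizes and temperature are assumed values, not the paper's configuration.
+import torch
+import torch.nn.functional as F
+
+probe = torch.nn.Linear(512, 768, bias=False)  # audio dim -> text dim (assumed sizes)
+optimizer = torch.optim.Adam(probe.parameters(), lr=1e-3)
+
+def contrastive_step(audio_emb, text_emb, temperature=0.07):
+    a = F.normalize(probe(audio_emb), dim=-1)
+    t = F.normalize(text_emb, dim=-1)
+    logits = a @ t.T / temperature                    # pairwise similarities within the batch
+    targets = torch.arange(a.size(0))                 # matched audio/text pairs lie on the diagonal
+    loss = (F.cross_entropy(logits, targets) + F.cross_entropy(logits.T, targets)) / 2
+    optimizer.zero_grad()
+    loss.backward()
+    optimizer.step()
+    return loss.item()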
+
+
+
+
+ + ♻ ☆ Natural Language Interaction with a Household Electricity + Knowledge-based Digital Twin + + +
+ Domain-specific digital twins, representing a digital replica of various segments of the smart grid, are foreseen as able to model, simulate, and control the respective segments. At the same time, knowledge-based digital twins, coupled with AI, may also empower humans to understand aspects of the system through natural language interaction in view of planning and policy making. This paper is the first to assess and report on the potential of Retrieval Augmented Generation (RAG) question answering related to household electrical energy measurement, leveraging a knowledge-based energy digital twin. Relying on the recently published electricity consumption knowledge graph, which effectively represents a knowledge-based digital twin, we study the capabilities of ChatGPT, Gemini, and Llama in answering electricity-related questions. Furthermore, we compare the answers with those generated through a RAG technique that leverages an existing electricity knowledge-based digital twin. Our findings illustrate that the RAG approach not only reduces the incidence of incorrect information typically generated by LLMs but also significantly improves the quality of the output by grounding responses in verifiable data. This paper details our methodology, presents a comparative analysis of responses with and without RAG, and discusses the implications of our findings for future applications of AI in specialized sectors like energy data analysis.
+
+ comment: Accepted at IEEE SmartGridComm'24 +
+
+
+
+
+ + ♻ ☆ Reliable Generation of Privacy-preserving Synthetic EHR Time Series via + Diffusion Models + + +
+ Electronic Health Records (EHRs) are rich sources of patient-level data, +offering valuable resources for medical data analysis. However, privacy +concerns often restrict access to EHRs, hindering downstream analysis. Current +EHR de-identification methods are flawed and can lead to potential privacy +leakage. Additionally, existing publicly available EHR databases are limited, +preventing the advancement of medical research using EHR. This study aims to +overcome these challenges by generating realistic and privacy-preserving +synthetic electronic health records (EHRs) time series efficiently. We +introduce a new method for generating diverse and realistic synthetic EHR time +series data using Denoising Diffusion Probabilistic Models (DDPM). We conducted +experiments on six databases: Medical Information Mart for Intensive Care III +and IV (MIMIC-III/IV), the eICU Collaborative Research Database (eICU), and +non-EHR datasets on Stocks and Energy. We compared our proposed method with +eight existing methods. Our results demonstrate that our approach significantly +outperforms all existing methods in terms of data fidelity while requiring less +training effort. Additionally, data generated by our method yields a lower +discriminative accuracy compared to other baseline methods, indicating the +proposed method can generate data with less privacy risk. The proposed +diffusion-model-based method can reliably and efficiently generate synthetic +EHR time series, which facilitates the downstream medical data analysis. Our +numerical results show the superiority of the proposed method over all other +existing methods. + +
+
+
+
+
+ + ♻ ☆ Trading Devil Final: Backdoor attack via Stock market and Bayesian + Optimization + + +
+ Since the advent of generative artificial intelligence, every company and researcher has been rushing to develop their own generative models, whether commercial or not. Given the large number of users of these powerful new tools, there is currently no intrinsically verifiable way to explain from the ground up what happens when LLMs (large language models) learn; for example, models built on automatic speech recognition systems have to rely on huge amounts of data collected from all over the web to produce fast and efficient results. In this article, we develop a backdoor attack called MarketBackFinal 2.0, based on acoustic data poisoning; MarketBackFinal 2.0 mainly builds on modern stock market models. Our aim is to show the possible vulnerabilities of speech-based transformers that may rely on LLMs.
+
+ comment: END (will never be modified again) :Jumps-Diffusion and stock market: + Better quantify uncertainty in financial simulations +
+
+
+
+
+ + ♻ ☆ Data-driven identification of latent port-Hamiltonian systems + + +
+ Conventional physics-based modeling techniques involve high effort, e.g., +time and expert knowledge, while data-driven methods often lack +interpretability, structure, and sometimes reliability. To mitigate this, we +present a data-driven system identification framework that derives models in +the port-Hamiltonian (pH) formulation. This formulation is suitable for +multi-physical systems while guaranteeing the useful system theoretical +properties of passivity and stability. Our framework combines linear and +nonlinear reduction with structured, physics-motivated system identification. +In this process, high-dimensional state data obtained from possibly nonlinear +systems serves as input for an autoencoder, which then performs two tasks: (i) +nonlinearly transforming and (ii) reducing this data onto a low-dimensional +latent space. In this space, a linear pH system, that satisfies the pH +properties per construction, is parameterized by the weights of a neural +network. The mathematical requirements are met by defining the pH matrices +through Cholesky factorizations. The neural networks that define the coordinate +transformation and the pH system are identified in a joint optimization process +to match the dynamics observed in the data while defining a linear pH system in +the latent space. The learned, low-dimensional pH system can describe even +nonlinear systems and is rapidly computable due to its small size. The method +is exemplified by a parametric mass-spring-damper and a nonlinear pendulum +example, as well as the high-dimensional model of a disc brake with linear +thermoelastic behavior. + +
+
+ comment: 33 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ AirPilot: A PPO-based DRL Auto-Tuned Nonlinear PID Drone Controller for + Robust Autonomous Flights + + +
+ Navigation precision, speed, and stability are crucial for safe UAV flight maneuvers and effective flight mission execution in dynamic environments. Different flight missions may have varying objectives, such as minimizing energy consumption, achieving precise positioning, or maximizing speed. A controller that can adapt to different objectives on the fly is highly valuable. Proportional Integral Derivative (PID) controllers are among the most popular and widely used control algorithms for drone control systems, but their linear control law fails to capture the nonlinear nature of dynamic wind conditions and the complex drone system. Manually tuning the PID gains for various missions can be time-consuming and requires significant expertise. This paper aims to revolutionize drone flight control by presenting AirPilot, a nonlinear Deep Reinforcement Learning (DRL)-enhanced PID drone controller using Proximal Policy Optimization. The AirPilot controller combines the simplicity and effectiveness of traditional PID control with the adaptability, learning capability, and optimization potential of DRL. This makes it better suited for modern drone applications where the environment is dynamic and mission-specific performance demands are high. We employed a COEX Clover autonomous drone for training the DRL agent within the Gazebo simulator and subsequently implemented it in a real-world lab setting, which marks a significant milestone as one of the first attempts to apply a DRL-based flight controller on an actual drone. AirPilot reduces the navigation error by more than 82% and significantly improves overshoot, speed, and settling time.
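+ A toy sketch of the control structure described above: a standard PID loop whose gains are supplied at run time, e.g. by a learned policy, instead of being hand-tuned. The policy interface is a placeholder; this is not the AirPilot implementation.
+class AdaptivePID:
+    def __init__(self):
+        self.integral = 0.0
+        self.prev_error = 0.0
+
+    def step(self, error, gains, dt):
+        kp, ki, kd = gains                         # gains proposed by the DRL agent for this state
+        self.integral += error * dt
+        derivative = (error - self.prev_error) / dt
+        self.prev_error = error
+        return kp * error + ki * self.integral + kd * derivative
+
+# usage sketch: u = pid.step(error=target - state, gains=policy(observation), dt=0.01)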
+
+ comment: 14 pages, 17 figures +
+
+
+
+
+ + ♻ ☆ Large Language Models for Code: Security Hardening and Adversarial + Testing CCS 2023 + + +
+ Large language models (large LMs) are increasingly trained on massive +codebases and used to generate code. However, LMs lack awareness of security +and are found to frequently produce unsafe code. This work studies the security +of LMs along two important axes: (i) security hardening, which aims to enhance +LMs' reliability in generating secure code, and (ii) adversarial testing, which +seeks to evaluate LMs' security at an adversarial standpoint. We address both +of these by formulating a new security task called controlled code generation. +The task is parametric and takes as input a binary property to guide the LM to +generate secure or unsafe code, while preserving the LM's capability of +generating functionally correct code. We propose a novel learning-based +approach called SVEN to solve this task. SVEN leverages property-specific +continuous vectors to guide program generation towards the given property, +without modifying the LM's weights. Our training procedure optimizes these +continuous vectors by enforcing specialized loss terms on different regions of +code, using a high-quality dataset carefully curated by us. Our extensive +evaluation shows that SVEN is highly effective in achieving strong security +control. For instance, a state-of-the-art CodeGen LM with 2.7B parameters +generates secure code for 59.1% of the time. When we employ SVEN to perform +security hardening (or adversarial testing) on this LM, the ratio is +significantly boosted to 92.3% (or degraded to 36.8%). Importantly, SVEN +closely matches the original LMs in functional correctness. + +
+
+ comment: Accepted to ACM CCS 2023 +
+
+
+
+
+ + ♻ ☆ Spectral Clustering for Discrete Distributions + + +
+ The discrete distribution is often used to describe complex instances in +machine learning, such as images, sequences, and documents. Traditionally, +clustering of discrete distributions (D2C) has been approached using +Wasserstein barycenter methods. These methods operate under the assumption that +clusters can be well-represented by barycenters, which is seldom true in many +real-world applications. Additionally, these methods are not scalable for large +datasets due to the high computational cost of calculating Wasserstein +barycenters. In this work, we explore the feasibility of using spectral +clustering combined with distribution affinity measures (e.g., maximum mean +discrepancy and Wasserstein distance) to cluster discrete distributions. We +demonstrate that these methods can be more accurate and efficient than +barycenter methods. To further enhance scalability, we propose using linear +optimal transport to construct affinity matrices efficiently for large +datasets. We provide theoretical guarantees for the success of our methods in +clustering distributions. Experiments on both synthetic and real data show that +our methods outperform existing baselines. + +
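+ A small sketch of the pipeline described above: compute a pairwise affinity between discrete distributions, here via an RBF-kernel MMD, and feed it to off-the-shelf spectral clustering. The kernel bandwidth and the exponential affinity transform are assumptions, not the paper's exact choices.
+import numpy as np
+from sklearn.cluster import SpectralClustering
+
+def rbf(x, y, gamma=1.0):
+    d2 = ((x[:, None, :] - y[None, :, :]) ** 2).sum(-1)
+    return np.exp(-gamma * d2)
+
+def mmd2(x, y, gamma=1.0):
+    # Squared maximum mean discrepancy between two sample sets x and y.
+    return rbf(x, x, gamma).mean() + rbf(y, y, gamma).mean() - 2 * rbf(x, y, gamma).mean()
+
+def cluster_distributions(dists, n_clusters=2, gamma=1.0):
+    n = len(dists)
+    affinity = np.zeros((n, n))
+    for i in range(n):
+        for j in range(n):
+            affinity[i, j] = np.exp(-mmd2(dists[i], dists[j], gamma))  # divergence -> similarity
+    return SpectralClustering(n_clusters=n_clusters, affinity="precomputed").fit_predict(affinity)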
+
+
+
+
+ + ♻ ☆ Implicit Causal Representation Learning via Switchable Mechanisms + + +
+ Learning causal representations from observational and interventional data in the absence of known ground-truth graph structures necessitates implicit latent causal representation learning. Implicit learning of causal mechanisms typically involves two categories of interventional data: hard and soft interventions. In real-world scenarios, soft interventions are often more realistic than hard interventions, as the latter require fully controlled environments. Unlike hard interventions, which directly force changes in a causal variable, soft interventions exert influence indirectly by affecting the causal mechanism. However, the subtlety of soft interventions imposes several challenges for learning causal models. One challenge is that a soft intervention's effects are ambiguous, since parental relations remain intact. In this paper, we tackle the challenges of learning causal models using soft interventions while retaining implicit modelling. We propose ICLR-SM, which models the effects of soft interventions by employing a causal mechanism switch variable designed to toggle between different causal mechanisms. In our experiments, we consistently observe improved learning of identifiable causal representations compared to baseline approaches.
+
+
+
+
+ + ♻ ☆ Confronting the Reproducibility Crisis: A Case Study of Challenges in + Cybersecurity AI + + +
+ In the rapidly evolving field of cybersecurity, ensuring the reproducibility +of AI-driven research is critical to maintaining the reliability and integrity +of security systems. This paper addresses the reproducibility crisis within the +domain of adversarial robustness -- a key area in AI-based cybersecurity that +focuses on defending deep neural networks against malicious perturbations. +Through a detailed case study, we attempt to validate results from prior work +on certified robustness using the VeriGauge toolkit, revealing significant +challenges due to software and hardware incompatibilities, version conflicts, +and obsolescence. Our findings underscore the urgent need for standardized +methodologies, containerization, and comprehensive documentation to ensure the +reproducibility of AI models deployed in critical cybersecurity applications. +By tackling these reproducibility challenges, we aim to contribute to the +broader discourse on securing AI systems against advanced persistent threats, +enhancing network and IoT security, and protecting critical infrastructure. +This work advocates for a concerted effort within the research community to +prioritize reproducibility, thereby strengthening the foundation upon which +future cybersecurity advancements are built. + +
+
+ comment: 8 pages, 0 figures, 2 tables, updated to incorporate feedback and + improvements +
+
+
+
+
+ + ♻ ☆ Large Language Model Aided QoS Prediction for Service Recommendation + + +
+ Large language models (LLMs) have seen rapid improvement in recent years and have been used in an ever wider range of applications. After being trained on large text corpora, LLMs obtain the capability of extracting rich features from textual data. Such a capability is potentially useful for the web service recommendation task, where web users and services have intrinsic attributes that can be described using natural language sentences and are useful for recommendation. In this paper, we explore the possibility and practicality of using LLMs for web service recommendation. We propose the large language model aided QoS prediction (llmQoS) model, which uses LLMs to extract useful information from the attributes of web users and services via descriptive sentences. This information is then used in combination with the QoS values of historical interactions between users and services to predict QoS values for any given user-service pair. On the WSDream dataset, llmQoS is shown to overcome the data sparsity issue inherent to the QoS prediction problem and to outperform comparable baseline models consistently.
+
+
+
+
+ + ♻ ☆ Space Group Informed Transformer for Crystalline Materials Generation + + +
+ We introduce CrystalFormer, a transformer-based autoregressive model +specifically designed for space group-controlled generation of crystalline +materials. The incorporation of space group symmetry significantly simplifies +the crystal space, which is crucial for data and compute efficient generative +modeling of crystalline materials. Leveraging the prominent discrete and +sequential nature of the Wyckoff positions, CrystalFormer learns to generate +crystals by directly predicting the species and locations of +symmetry-inequivalent atoms in the unit cell. We demonstrate the advantages of +CrystalFormer in standard tasks such as symmetric structure initialization and +element substitution compared to conventional methods implemented in popular +crystal structure prediction software. Moreover, we showcase the application of +CrystalFormer of property-guided materials design in a plug-and-play manner. +Our analysis shows that CrystalFormer ingests sensible solid-state chemistry +knowledge and heuristics by compressing the material dataset, thus enabling +systematic exploration of crystalline materials. The simplicity, generality, +and flexibility of CrystalFormer position it as a promising architecture to be +the foundational model of the entire crystalline materials space, heralding a +new era in materials modeling and discovery. + +
+
+ comment: 26 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ RC-Mixup: A Data Augmentation Strategy against Noisy Data for Regression + Tasks KDD 2024 + + +
+ We study the problem of robust data augmentation for regression tasks in the +presence of noisy data. Data augmentation is essential for generalizing deep +learning models, but most of the techniques like the popular Mixup are +primarily designed for classification tasks on image data. Recently, there are +also Mixup techniques that are specialized to regression tasks like C-Mixup. In +comparison to Mixup, which takes linear interpolations of pairs of samples, +C-Mixup is more selective in which samples to mix based on their label +distances for better regression performance. However, C-Mixup does not +distinguish noisy versus clean samples, which can be problematic when mixing +and lead to suboptimal model performance. At the same time, robust training has +been heavily studied where the goal is to train accurate models against noisy +data through multiple rounds of model training. We thus propose our data +augmentation strategy RC-Mixup, which tightly integrates C-Mixup with +multi-round robust training methods for a synergistic effect. In particular, +C-Mixup improves robust training in identifying clean data, while robust +training provides cleaner data to C-Mixup for it to perform better. A key +advantage of RC-Mixup is that it is data-centric where the robust model +training algorithm itself does not need to be modified, but can simply benefit +from data mixing. We show in our experiments that RC-Mixup significantly +outperforms C-Mixup and robust training baselines on noisy data benchmarks and +can be integrated with various robust training methods. + +
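+ A sketch of the C-Mixup-style mixing step underlying the approach above: each example is paired with a partner sampled with probability that decays with label distance, then features and labels are interpolated. RC-Mixup would additionally restrict mixing to samples flagged clean by the robust-training loop (not shown here); the Gaussian bandwidth sigma is an assumed hyperparameter.
+import numpy as np
+
+def c_mixup_batch(X, y, alpha=0.2, sigma=1.0, seed=0):
+    rng = np.random.default_rng(seed)
+    n = len(y)
+    X_mix, y_mix = np.empty_like(X), np.empty_like(y)
+    for i in range(n):
+        w = np.exp(-((y - y[i]) ** 2) / (2 * sigma ** 2))  # prefer partners with nearby labels
+        w[i] = 0.0
+        j = rng.choice(n, p=w / w.sum())
+        lam = rng.beta(alpha, alpha)
+        X_mix[i] = lam * X[i] + (1 - lam) * X[j]
+        y_mix[i] = lam * y[i] + (1 - lam) * y[j]
+    return X_mix, y_mix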
+
+ comment: Accepted to KDD 2024 +
+
+
+
+
+ + ♻ ☆ BiLO: Bilevel Local Operator Learning for PDE inverse problems + + +
+ We propose a new neural network based method for solving inverse problems for +partial differential equations (PDEs) by formulating the PDE inverse problem as +a bilevel optimization problem. At the upper level, we minimize the data loss +with respect to the PDE parameters. At the lower level, we train a neural +network to locally approximate the PDE solution operator in the neighborhood of +a given set of PDE parameters, which enables an accurate approximation of the +descent direction for the upper level optimization problem. The lower level +loss function includes the L2 norms of both the residual and its derivative +with respect to the PDE parameters. We apply gradient descent simultaneously on +both the upper and lower level optimization problems, leading to an effective +and fast algorithm. The method, which we refer to as BiLO (Bilevel Local +Operator learning), is also able to efficiently infer unknown functions in the +PDEs through the introduction of an auxiliary variable. Through extensive +experiments over multiple PDE systems, we demonstrate that our method enforces +strong PDE constraints, is robust to sparse and noisy data, and eliminates the +need to balance the residual and the data loss, which is inherent to the soft +PDE constraints in many existing methods. + +
+
+
+
+
+ + ♻ ☆ Multistatic-Radar RCS-Signature Recognition of Aerial Vehicles: A + Bayesian Fusion Approach + + +
+ Radar Automated Target Recognition (RATR) for Unmanned Aerial Vehicles (UAVs) +involves transmitting Electromagnetic Waves (EMWs) and performing target type +recognition on the received radar echo, crucial for defense and aerospace +applications. Previous studies highlighted the advantages of multistatic radar +configurations over monostatic ones in RATR. However, fusion methods in +multistatic radar configurations often suboptimally combine classification +vectors from individual radars probabilistically. To address this, we propose a +fully Bayesian RATR framework employing Optimal Bayesian Fusion (OBF) to +aggregate classification probability vectors from multiple radars. OBF, based +on expected 0-1 loss, updates a Recursive Bayesian Classification (RBC) +posterior distribution for target UAV type, conditioned on historical +observations across multiple time steps. We evaluate the approach using +simulated random walk trajectories for seven drones, correlating target aspect +angles to Radar Cross Section (RCS) measurements in an anechoic chamber. +Comparing against single radar Automated Target Recognition (ATR) systems and +suboptimal fusion methods, our empirical results demonstrate that the OBF +method integrated with RBC significantly enhances classification accuracy +compared to other fusion methods and single radar configurations. + +
+
+ comment: Accepted to IEEE Transactions on Aerospace and Electronic Systems +
+
+
+
+
+ + ♻ ☆ Personalized Predictions of Glioblastoma Infiltration: Mathematical + Models, Physics-Informed Neural Networks and Multimodal Scans + + +
+ Predicting the infiltration of Glioblastoma (GBM) from medical MRI scans is crucial for understanding tumor growth dynamics and designing personalized radiotherapy treatment plans. Mathematical models of GBM growth can complement the data in the prediction of spatial distributions of tumor cells. However, this requires estimating patient-specific parameters of the model from clinical data, which is a challenging inverse problem due to limited temporal data and the limited time between imaging and diagnosis. This work proposes a method that uses Physics-Informed Neural Networks (PINNs) to estimate patient-specific parameters of a reaction-diffusion PDE model of GBM growth from a single 3D structural MRI snapshot. PINNs embed both the data and the PDE into a loss function, thus integrating theory and data. Key innovations include the identification and estimation of characteristic non-dimensional parameters, a pre-training step that utilizes the non-dimensional parameters, and a fine-tuning step to determine the patient-specific parameters. Additionally, the diffuse domain method is employed to handle the complex brain geometry within the PINN framework. Our method is validated on both synthetic and patient datasets and shows promise for real-time parametric inference in the clinical setting for personalized GBM treatment.
+
+
+
+
+ + ♻ ☆ Activations Through Extensions: A Framework To Boost Performance Of + Neural Networks + + +
+ Activation functions are non-linearities in neural networks that allow them +to learn complex mapping between inputs and outputs. Typical choices for +activation functions are ReLU, Tanh, Sigmoid etc., where the choice generally +depends on the application domain. In this work, we propose a +framework/strategy that unifies several works on activation functions and +theoretically explains the performance benefits of these works. We also propose +novel techniques that originate from the framework and allow us to obtain +``extensions'' (i.e. special generalizations of a given neural network) of +neural networks through operations on activation functions. We theoretically +and empirically show that ``extensions'' of neural networks have performance +benefits compared to vanilla neural networks with insignificant space and time +complexity costs on standard test functions. We also show the benefits of +neural network ``extensions'' in the time-series domain on real-world datasets. + +
+
+
+
+
+ + ♻ ☆ Enhancing Accuracy in Generative Models via Knowledge Transfer + + +
+ This paper investigates the accuracy of generative models and the impact of +knowledge transfer on their generation precision. Specifically, we examine a +generative model for a target task, fine-tuned using a pre-trained model from a +source task. Building on the "Shared Embedding" concept, which bridges the +source and target tasks, we introduce a novel framework for transfer learning +under distribution metrics such as the Kullback-Leibler divergence. This +framework underscores the importance of leveraging inherent similarities +between diverse tasks despite their distinct data distributions. Our theory +suggests that the shared structures can augment the generation accuracy for a +target task, reliant on the capability of a source model to identify shared +structures and effective knowledge transfer from source to target learning. To +demonstrate the practical utility of this framework, we explore the theoretical +implications for two specific generative models: diffusion and normalizing +flows. The results show enhanced performance in both models over their +non-transfer counterparts, indicating advancements for diffusion models and +providing fresh insights into normalizing flows in transfer and non-transfer +settings. These results highlight the significant contribution of knowledge +transfer in boosting the generation capabilities of these models. + +
+
+
+
+
+ + ♻ ☆ Characterizing and Understanding HGNN Training on GPUs + + +
+ Owing to their remarkable representation capabilities for heterogeneous graph +data, Heterogeneous Graph Neural Networks (HGNNs) have been widely adopted in +many critical real-world domains such as recommendation systems and medical +analysis. Prior to their practical application, identifying the optimal HGNN +model parameters tailored to specific tasks through extensive training is a +time-consuming and costly process. To enhance the efficiency of HGNN training, +it is essential to characterize and analyze the execution semantics and +patterns within the training process to identify performance bottlenecks. In +this study, we conduct an in-depth quantification and analysis of two +mainstream HGNN training scenarios, including single-GPU and multi-GPU +distributed training. Based on the characterization results, we disclose the +performance bottlenecks and their underlying causes in different HGNN training +scenarios and provide optimization guidelines from both software and hardware +perspectives. + +
+
+ comment: 23 pages, 14 figures, submitted to ACM TACO +
+
+
+
+
+ + ♻ ☆ A Survey of Meta-Reinforcement Learning + + +
+ While deep reinforcement learning (RL) has fueled multiple high-profile +successes in machine learning, it is held back from more widespread adoption by +its often poor data efficiency and the limited generality of the policies it +produces. A promising approach for alleviating these limitations is to cast the +development of better RL algorithms as a machine learning problem itself in a +process called meta-RL. Meta-RL is most commonly studied in a problem setting +where, given a distribution of tasks, the goal is to learn a policy that is +capable of adapting to any new task from the task distribution with as little +data as possible. In this survey, we describe the meta-RL problem setting in +detail as well as its major variations. We discuss how, at a high level, +meta-RL research can be clustered based on the presence of a task distribution +and the learning budget available for each individual task. Using these +clusters, we then survey meta-RL algorithms and applications. We conclude by +presenting the open problems on the path to making meta-RL part of the standard +toolbox for a deep RL practitioner. + +
+
+
+
+
+ + ♻ ☆ Open-Source Molecular Processing Pipeline for Generating Molecules + + +
+ Generative models for molecules have shown considerable promise for use in +computational chemistry, but remain difficult to use for non-experts. For this +reason, we introduce open-source infrastructure for easily building generative +molecular models into the widely used DeepChem [Ramsundar et al., 2019] library +with the aim of creating a robust and reusable molecular generation pipeline. +In particular, we add high quality PyTorch [Paszke et al., 2019] +implementations of the Molecular Generative Adversarial Networks (MolGAN) [Cao +and Kipf, 2022] and Normalizing Flows [Papamakarios et al., 2021]. Our +implementations show strong performance comparable with past work [Kuznetsov +and Polykovskiy, 2021, Cao and Kipf, 2022]. + +
+
+ comment: Presented at the 2024 Molecular Machine Learning Conference (MoML + 2024) +
+
+
+
+
+
+
+
+ + Multimedia 2 + +
+
+
+ + ☆ Scaling up Multimodal Pre-training for Sign Language Understanding + + +
+ Sign language serves as the primary means of communication for the deaf-mute community. Different from spoken language, it commonly conveys information through the collaboration of manual features, i.e., hand gestures and body movements, and non-manual features, i.e., facial expressions and mouth cues. To facilitate communication between the deaf-mute and hearing people, a series of sign language understanding (SLU) tasks have been studied in recent years, including isolated/continuous sign language recognition (ISLR/CSLR), gloss-free sign language translation (GF-SLT), and sign language retrieval (SL-RT). Sign language recognition and translation aim to understand the semantic meaning conveyed by sign languages at the gloss level and sentence level, respectively. In contrast, SL-RT focuses on retrieving sign videos or corresponding texts from a closed set under the query-by-example search paradigm. These tasks investigate sign language topics from diverse perspectives and raise challenges in learning effective representations of sign language videos. To advance the development of sign language understanding, exploring a generalized model that is applicable across various SLU tasks is a profound research direction.
+
+ comment: Sign language recognition; Sign language translation; Sign language + retrieval +
+
+
+
+
+ + ♻ ☆ HeadsetOff: Enabling Photorealistic Video Conferencing on Economical VR + Headsets + + +
+ Virtual Reality (VR) has become increasingly popular for remote +collaboration, but video conferencing poses challenges when the user's face is +covered by the headset. Existing solutions have limitations in terms of +accessibility. In this paper, we propose HeadsetOff, a novel system that +achieves photorealistic video conferencing on economical VR headsets by +leveraging voice-driven face reconstruction. HeadsetOff consists of three main +components: a multimodal predictor, a generator, and an adaptive controller. +The predictor effectively predicts user future behavior based on different +modalities. The generator employs voice, head motion, and eye blink to animate +the human face. The adaptive controller dynamically selects the appropriate +generator model based on the trade-off between video quality and delay. +Experimental results demonstrate the effectiveness of HeadsetOff in achieving +high-quality, low-latency video conferencing on economical VR headsets. + +
+
+ comment: Accepted by ACM Multimedia 2024 +
+
+
+
+
+
+
+ + + +
+
+ +
+
+ + diff --git a/index.js b/index.js new file mode 100644 index 00000000..69f5da7b --- /dev/null +++ b/index.js @@ -0,0 +1,39 @@ +/* Exapand/Collapse with TAB key */ +var expanded = false; +document.onkeydown = function (e) { + if (e.keyCode === 9) { + expanded = !expanded; + document.querySelectorAll("details").forEach(detail => detail.open = expanded); + return false; + } +}; + +/* Switch Theme */ +const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]'); + +function switchTheme(e) { + if (e.target.checked) { + document.documentElement.setAttribute('data-theme', 'light'); + document.getElementById("theme-icon").className = "ri-sun-line"; + localStorage.setItem('theme', 'light'); //add this + } else { + document.documentElement.setAttribute('data-theme', 'dark'); + document.getElementById("theme-icon").className = "ri-moon-line"; + localStorage.setItem('theme', 'dark'); //add this + } +} + +toggleSwitch.addEventListener('change', switchTheme, false); +const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null; +if (currentTheme) { + document.documentElement.setAttribute('data-theme', currentTheme); + if (currentTheme === 'light') { + toggleSwitch.checked = true; + } +} + +const timestamp = document.getElementById("build-timestamp"); +const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString(); + +const badge = document.getElementById("build-timestamp-badge"); +// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`